diff --git a/.github/actions/multi-functest/action.yml b/.github/actions/multi-functest/action.yml
index 74e8194c8..4e4c0b426 100644
--- a/.github/actions/multi-functest/action.yml
+++ b/.github/actions/multi-functest/action.yml
@@ -238,7 +238,10 @@ runs:
           nix-verbose: ${{ inputs.nix-verbose }}
           gh_token: ${{ inputs.gh_token }}
           custom_shell: ${{ inputs.custom_shell }}
-          cflags: "${{ inputs.cflags }} -DMLD_FORCE_RISCV32"
+          # The RV32-IM arithmetic backend is experimental and not picked
+          # up by native/meta.h's defaults; select it explicitly here.
+          # No-op for OPT=0 builds (MLD_CONFIG_ARITH_BACKEND_FILE is unused).
+          cflags: "${{ inputs.cflags }} -DMLD_FORCE_RISCV32 -DMLD_CONFIG_ARITH_BACKEND_FILE=\\\\\\\"native/rv32im/meta.h\\\\\\\""
           ldflags: ${{ inputs.ldflags }}
           cross_prefix: riscv32-unknown-linux-gnu-
           exec_wrapper: "${{ inputs.exec_wrapper != '' && inputs.exec_wrapper || 'qemu-riscv32' }}"
@@ -255,4 +258,3 @@ runs:
           rng_fail: ${{ inputs.rng_fail }}
           extra_args: ${{ inputs.extra_args }}
           extra_env: ${{ inputs.extra_env }}
-
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index afffdd299..88bcaa96e 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -122,8 +122,8 @@ jobs:
           check_namespace: 'false'
       - name: build + test (cross, opt)
         uses: ./.github/actions/multi-functest
-        # There is no native code yet on PPC64LE, riscv32 or AArch64_be, so no point running opt tests
-        if: ${{ matrix.target.mode != 'native' && (matrix.target.arch != 'ppc64le' && matrix.target.arch != 'riscv32' && matrix.target.arch != 'aarch64_be') }}
+        # There is no native code yet on PPC64LE or AArch64_be, so no point running opt tests
+        if: ${{ matrix.target.mode != 'native' && (matrix.target.arch != 'ppc64le' && matrix.target.arch != 'aarch64_be') }}
         with:
           nix-shell: ${{ matrix.target.nix_shell }}
           nix-cache: ${{ matrix.target.mode == 'native' && 'false' || 'true' }}
@@ -134,8 +134,8 @@ jobs:
           opt: 'opt'
       - name: build + test (cross, opt, +debug)
         uses: ./.github/actions/multi-functest
-        # There is no native code yet on PPC64LE, riscv32 or AArch64_be, so no point running opt tests
-        if: ${{ matrix.target.mode != 'native' && (matrix.target.arch != 'ppc64le' && matrix.target.arch != 'riscv32' && matrix.target.arch != 'aarch64_be') }}
+        # There is no native code yet on PPC64LE or AArch64_be, so no point running opt tests
+        if: ${{ matrix.target.mode != 'native' && (matrix.target.arch != 'ppc64le' && matrix.target.arch != 'aarch64_be') }}
         with:
           nix-shell: ${{ matrix.target.nix_shell }}
           nix-cache: ${{ matrix.target.mode == 'native' && 'false' || 'true' }}
@@ -145,6 +145,31 @@ jobs:
           exec_wrapper: ${{ matrix.target.exec_wrapper || '' }}
           cflags: "-DMLDSA_DEBUG"
           opt: 'opt'
+      # The RV32IM backend provides two interchangeable implementations of
+      # the Barrett low(t*q) reduction: a single multiply by q (the default,
+      # exercised by the opt steps above) and a multiply-free shift-add chain
+      # for cores with a slow multiplier. The latter is selected by defining
+      # MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER; test it here too.
+      - name: build + test (riscv32, slow-multiplier, opt)
+        uses: ./.github/actions/multi-functest
+        if: ${{ matrix.target.arch == 'riscv32' }}
+        with:
+          nix-shell: ${{ matrix.target.nix_shell }}
+          nix-cache: 'true'
+          gh_token: ${{ secrets.GITHUB_TOKEN }}
+          compile_mode: ${{ matrix.target.mode }}
+          cflags: "-DMLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER"
+          opt: 'opt'
+      - name: build + test (riscv32, slow-multiplier, opt, +debug)
+        uses: ./.github/actions/multi-functest
+        if: ${{ matrix.target.arch == 'riscv32' }}
+        with:
+          nix-shell: ${{ matrix.target.nix_shell }}
+          nix-cache: 'true'
+          gh_token: ${{ secrets.GITHUB_TOKEN }}
+          compile_mode: ${{ matrix.target.mode }}
+          cflags: "-DMLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER -DMLDSA_DEBUG"
+          opt: 'opt'
   backend_tests:
     name: AArch64 FIPS202 backends (${{ matrix.backend }})
     strategy:
diff --git a/README.md b/README.md
index a8508b602..d2c2e8cd8 100644
--- a/README.md
+++ b/README.md
@@ -23,7 +23,7 @@ mldsa-native allows developers to support ML-DSA with minimal performance and ma
 
 **Maintainability and Safety:** Memory safety, type safety and absence of various classes of timing leakage are automatically checked on every change, using a combination of static model checking (using CBMC) and dynamic instrumentation (using valgrind). This reduces review and maintenance burden and accelerates safe code delivery. See [Formal Verification](#formal-verification) and [Security](#security).
 
-**Architecture Support:** Native backends are added under a unified interface, minimizing duplicated code and reasoning. mldsa-native comes with backends for AArch64 and x86-64. See [Design](#design).
+**Architecture Support:** Native backends are added under a unified interface, minimizing duplicated code and reasoning. mldsa-native comes with backends for AArch64 and x86-64, and experimental backends for Armv8.1-M and RV32-IM. See [Design](#design).
 
 ## Quickstart for Ubuntu
 
@@ -94,6 +94,7 @@ mldsa-native currently offers the following backends:
 * 64-bit Arm backend (using Neon)
 * 64-bit Intel/AMD backend (using AVX2)
 * 32-bit Armv8.1-M backend (using Helium/MVE). This is still experimental and disabled by default.
+* 32-bit RISC-V backend (RV32-IM, base integer + M-extension only). This is still experimental and disabled by default.
 
 If you'd like contribute new backends, please reach out!
 
diff --git a/dev/riscv32/meta.h b/dev/riscv32/meta.h
new file mode 100644
index 000000000..a83cd62c1
--- /dev/null
+++ b/dev/riscv32/meta.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#ifndef MLD_NATIVE_RV32IM_META_H
+#define MLD_NATIVE_RV32IM_META_H
+
+/* Primitives replaced by this backend: NTT, inverse NTT, pointwise product */
+#define MLD_USE_NATIVE_NTT
+#define MLD_USE_NATIVE_INTT
+#define MLD_USE_NATIVE_POINTWISE_MONTGOMERY
+
+/* Identifier for this backend, so that the C source and assembly files
+ * belonging to it can guard themselves on it being selected. */
+#define MLD_ARITH_BACKEND_RV32IM
+
+/* C-only part: wrappers that forward to the RV32-IM assembly kernels. */
+#if !defined(__ASSEMBLER__)
+#include "../api.h"
+#include "src/arith_native_rv32im.h"
+
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_ntt_native(int32_t data[MLDSA_N])
+{
+  mld_ntt_rv32im_asm(data, mld_rv32im_ntt_zetas);
+  return MLD_NATIVE_FUNC_SUCCESS;
+}
+
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_intt_native(int32_t data[MLDSA_N])
+{
+  mld_intt_rv32im_asm(data, mld_rv32im_ntt_zetas);
+  return MLD_NATIVE_FUNC_SUCCESS;
+}
+
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_poly_pointwise_montgomery_native(
+    int32_t a[MLDSA_N], const int32_t b[MLDSA_N])
+{
+  mld_poly_pointwise_montgomery_rv32im_asm(a, b);
+  return MLD_NATIVE_FUNC_SUCCESS;
+}
+
+#endif /* !__ASSEMBLER__ */
+#endif /* !MLD_NATIVE_RV32IM_META_H */
diff --git a/dev/riscv32/src/arith_native_rv32im.h b/dev/riscv32/src/arith_native_rv32im.h
new file mode 100644
index 000000000..9a987fbfd
--- /dev/null
+++ b/dev/riscv32/src/arith_native_rv32im.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#ifndef MLD_NATIVE_RV32IM_SRC_ARITH_NATIVE_RV32IM_H
+#define MLD_NATIVE_RV32IM_SRC_ARITH_NATIVE_RV32IM_H
+
+#include "../../../cbmc.h"
+#include "../../../common.h"
+
+#define mld_rv32im_ntt_zetas MLD_NAMESPACE(rv32im_ntt_zetas)
+
+/*
+ * Forward NTT zeta table for the RV32-IM backend.
+ *
+ * 255 logical entries, each a (zeta, w) Barrett pair: zeta is the plain
+ * centered twiddle omega^{bitrev_8(k)} mod q (omega = NTT root of unity,
+ * |zeta| <= q/2) and w = round(zeta * 2^32 / q) is the Barrett multiplier
+ * used by the constant-twiddle butterfly. The order matches the consumption
+ * order of the 2+2+2+2 forward NTT.
+ */
+MLD_INTERNAL_DATA_DECLARATION const int32_t mld_rv32im_ntt_zetas[510];
+
+#define mld_ntt_rv32im_asm MLD_NAMESPACE(ntt_rv32im_asm)
+void mld_ntt_rv32im_asm(int32_t *r, const int32_t *zetas)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int32_t) * MLDSA_N))
+  requires(array_abs_bound(r, 0, MLDSA_N, MLDSA_Q))
+  requires(zetas == mld_rv32im_ntt_zetas)
+  assigns(memory_slice(r, sizeof(int32_t) * MLDSA_N))
+  /* Forward-NTT output bound MLD_NTT_BOUND = 9 * MLD_FQMUL_BOUND. The
+   * truncating `mulh` Barrett multiply has output bound MLD_FQMUL_BOUND =
+   * 5/4 * MLDSA_Q (vs MLDSA_Q for the rounding `sqrdmulh` used on AArch64),
+   * so the NTT output is bounded by 9 * MLD_FQMUL_BOUND, not 9 * MLDSA_Q.
+   * Spelled out inline to keep this header free of poly.h. */
+  ensures(array_abs_bound(r, 0, MLDSA_N, 9 * ((5 * MLDSA_Q + 3) / 4)))
+);
+
+#define mld_intt_rv32im_asm MLD_NAMESPACE(intt_rv32im_asm)
+void mld_intt_rv32im_asm(int32_t *r, const int32_t *zetas)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int32_t) * MLDSA_N))
+  requires(array_abs_bound(r, 0, MLDSA_N, MLDSA_Q))
+  requires(zetas == mld_rv32im_ntt_zetas)
+  assigns(memory_slice(r, sizeof(int32_t) * MLDSA_N))
+  ensures(array_abs_bound(r, 0, MLDSA_N, MLDSA_Q))
+);
+
+#define mld_poly_pointwise_montgomery_rv32im_asm \
+  MLD_NAMESPACE(poly_pointwise_montgomery_rv32im_asm)
+void mld_poly_pointwise_montgomery_rv32im_asm(int32_t *a, const int32_t *b)
+__contract__(
+  requires(memory_no_alias(a, sizeof(int32_t) * MLDSA_N))
+  requires(memory_no_alias(b, sizeof(int32_t) * MLDSA_N))
+  /* Inputs bounded by MLD_NTT_BOUND = 9 * MLD_FQMUL_BOUND, the guaranteed
+   * output bound of any forward NTT. Spelled out inline to keep this header
+   * free of poly.h. */
+  requires(array_abs_bound(a, 0, MLDSA_N, 9 * ((5 * MLDSA_Q + 3) / 4)))
+  requires(array_abs_bound(b, 0, MLDSA_N, 9 * ((5 * MLDSA_Q + 3) / 4)))
+  assigns(memory_slice(a, sizeof(int32_t) * MLDSA_N))
+  ensures(array_abs_bound(a, 0, MLDSA_N, MLDSA_Q))
+);
+
+#endif /* !MLD_NATIVE_RV32IM_SRC_ARITH_NATIVE_RV32IM_H */
diff --git a/dev/riscv32/src/intt_rv32im_asm.S b/dev/riscv32/src/intt_rv32im_asm.S
new file mode 100644
index 000000000..11222d02d
--- /dev/null
+++ b/dev/riscv32/src/intt_rv32im_asm.S
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * RV32-IM ML-DSA inverse NTT, fast-multiplier variant.
+ *
+ * Thin wrapper: the kernel body is shared via intt_rv32im_asm.i. It is
+ * included without MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER, so low(t*q) is a
+ * single multiply by q; the shift-add variant is intt_rv32im_slowmul_asm.S.
+ * Exactly one of the two is selected, by MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER.
+ */
+
+#include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_RV32IM) && \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
+    !defined(MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER)
+/* simpasm: header-end */
+
+        .text
+        .global MLD_ASM_NAMESPACE(intt_rv32im_asm)
+        .balign 4
+MLD_ASM_FN_SYMBOL(intt_rv32im_asm)
+
+#include "intt_rv32im_asm.i"
+
+/* simpasm: footer-start */
+#endif /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
+          !MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER */
diff --git a/dev/riscv32/src/intt_rv32im_asm.i b/dev/riscv32/src/intt_rv32im_asm.i
new file mode 100644
index 000000000..c84c20f6f
--- /dev/null
+++ b/dev/riscv32/src/intt_rv32im_asm.i
@@ -0,0 +1,413 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * RV32-IM ML-DSA inverse NTT -- shared kernel body.
+ *
+ * This file is #include'd by the thin wrapper .S files
+ *   intt_rv32im_asm.S         (fast multiplier: low(t*q) via a single mul)
+ *   intt_rv32im_slowmul_asm.S (slow multiplier: low(t*q) via shift-add)
+ * which differ only in whether they #define
+ * MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER before the include. It is not a
+ * standalone translation unit: the backend guard, the .global directive,
+ * and the simpasm header/footer markers live in the wrappers.
+ *
+ * Layered structure: 2+2+2+2 (mirror of the forward NTT, with passes
+ * applied in reverse layer order). Each pass merges two C-layers into a
+ * radix-4 inner kernel that holds 4 coefficients in registers.
+ *
+ *   inv-pass-1: C-layers 8, 7   (inner stride =  4 B,  64 outer iters)
+ *   inv-pass-2: C-layers 6, 5   (inner stride = 16 B,  16 outer iters)
+ *   inv-pass-3: C-layers 4, 3   (inner stride = 64 B,   4 outer iters)
+ *   inv-pass-4: C-layers 2, 1   (inner stride = 256 B,  1 outer iter )
+ *
+ * Twiddles: this routine reuses `mld_rv32im_ntt_zetas` (the forward-NTT
+ * table). The forward pass-(5-k) consumes its 3*N_outer pairs in
+ * outer order 0,1,...,N-1; the inv pass-k requires the *same* zetas but
+ * in reverse outer order, with the two "hi" zetas swapped. We implement
+ * this by initializing zeta_ptr at the end of each pass region and
+ * subtracting 24 bytes per outer iter; within the iter the lo zeta is
+ * read from offset 0 and the hi zetas from offsets 8/16 swapped via the
+ * GS kernel argument order. The negation that the C reference applies
+ * (`-mld_zetas[k]`) is absorbed by the GS butterfly form
+ *      a' = a + b
+ *      b' = barrett(b - a, +zeta)
+ * which produces the same result as the canonical
+ *      t  = a; a' = t + b; b' = barrett(t - b, -zeta).
+ *
+ * Modular arithmetic: Barrett multiplication by a constant twiddle
+ * (2-mul kernel  t = hi(a*w), r = low(a*zeta) - low(t*q)), matching the
+ * forward NTT. Each zeta is a (zeta, w) pair (plain centered twiddle and
+ * its Barrett multiplier). The plain-domain result matches the previous
+ * Montgomery convention. The low(t*q) reduction has two bit-identical
+ * forms (see mul_q_sub), selected by
+ * MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER: shift-add or a single multiply.
+ *
+ * Final scaling: after the four passes, every coefficient is multiplied
+ * by the plain twiddle  f = 16382 = R * 2^{-8} mod q  (= 2^24 mod q),
+ * which folds in both the 2^{-8} of the inverse NTT and the R factor of
+ * the previous Montgomery output convention. This uses a rounding Barrett
+ * (see barrett_round): a doubled multiplier round(f*2^33/q) and a (t+1)>>1
+ * round-to-nearest of the quotient, tightening the output to |coef| < q
+ * (measured <= 0.503 q). The truncating Barrett of the butterflies gives
+ * |coef| < 1.01 q, so the rounding form is used here to meet the invntt
+ * output contract of |coef| < q.
+ *
+ * Bounds (after each inv-pass):
+ *
+ *   start                       :  |coef| < q          (= 1*q)
+ *   after inv-pass-1 (C-L 8,7)  :  |coef| < 4*q
+ *   after inv-pass-2 (C-L 6,5)  :  |coef| < 16*q
+ *   after inv-pass-3 (C-L 4,3)  :  |coef| < 64*q
+ *   after inv-pass-4 (C-L 2,1)  :  |coef| < 256*q   (~ 2^31, fits int32)
+ *   after final fqscale         :  |coef| < q       (rounding Barrett)
+ */
+
+/*****************************************************************
+ * Register aliases
+ *****************************************************************/
+
+/* Arguments */
+#define in_ptr a0   /* int32_t *r: coefficients, updated in place   */
+#define zeta_ptr a1 /* const int32_t *zetas: (zeta, w) pair table   */
+
+/* Working pointers / counters */
+#define data t2      /* cursor into the coefficient array   */
+#define outer_end t3 /* end pointer for a pass's outer loop */
+#define inner_end t4 /* end pointer for a pass's inner loop */
+#define scale_end t5 /* end pointer for final-scaling loop  */
+
+/* Coefficient registers */
+#define ca a2 /* gs_radix4: coefficient at offset 0        */
+#define cb a3 /* gs_radix4: coefficient at offset 1*stride */
+#define cc a4 /* gs_radix4: coefficient at offset 2*stride */
+#define cd a5 /* gs_radix4: coefficient at offset 3*stride */
+
+/* Butterfly temporaries */
+#define tmp0 a6
+#define tmp1 a7
+
+/* Loaded zeta pair registers. Each pair is (zeta, w): the plain centered
+ * twiddle and its Barrett multiplier w = round(zeta * 2^32 / q). */
+#define zeta_lo s0
+#define zeta_lo_w s1
+#define zeta_h0 s2
+#define zeta_h0_w s3
+#define zeta_h1 s4
+#define zeta_h1_w s5
+
+/* Constants (used only by the Barrett final-scale post-loop). */
+#define f s6    /* plain fqscale: 16382 = R*2^-8 mod q */
+#define f_w2 s7 /* doubled Barrett mult: round(f*2^33/q) */
+
+/* Constant q register, used only by mul_q_sub when
+ * MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER is undefined. t0 is caller-saved
+ * and otherwise unused, so no extra save/restore is needed. */
+#define q t0 /* MLDSA_Q = 8380417            */
+
+/*****************************************************************
+ * Macros
+ *****************************************************************/
+
+/* mul_q_sub rd, rt :
+ *
+ *   rd = rd - low(rt * q)   (mod 2^32),  clobbers rt.
+ *
+ * Two bit-identical implementations, selected by
+ * MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER:
+ *
+ *   defined   : shift-add, exploiting q = 2^23 - 2^13 + 1, no multiply.
+ *   undefined : single low multiply by q (q held in `q`).
+ *
+ * The reduction is the only multiplier-dependent step; the Barrett kernels,
+ * butterflies, final scaling and zeta table are shared.
+ */
+.macro mul_q_sub rd, rt
+#if defined(MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER)
+        sub  \rd, \rd, \rt        /* - rt                      */
+        slli \rt, \rt, 13         /* rt = orig_rt << 13        */
+        add  \rd, \rd, \rt        /* + (rt<<13)                */
+        slli \rt, \rt, 10         /* rt = orig_rt << 23        */
+        sub  \rd, \rd, \rt        /* - (rt<<23) => - low(rt*q) */
+#else  /* MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER */
+        mul  \rt, \rt, q          /* low(rt * q)               */
+        sub  \rd, \rd, \rt
+#endif /* !MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER */
+.endm
+
+/* barrett rd, ra, rzeta, rw, rt :
+ *
+ *   rd = (ra * rzeta) mod q   (plain domain, |rd| < 1.01 q).  Clobbers: rt.
+ *   t  = hi(ra * rw) ; rd = low(ra * rzeta) - low(t * q).
+ *
+ * Truncating quotient estimate t = hi(ra * w), w = round(rzeta * 2^32 / q);
+ * good enough for the butterfly bound. rt must differ from ra and from rd.
+ */
+.macro barrett rd, ra, rzeta, rw, rt
+        mulh  \rt, \ra, \rw       /* t   = hi(ra * w)          */
+        mul   \rd, \ra, \rzeta    /* azl = low(ra * zeta)      */
+        mul_q_sub \rd, \rt        /* rd  = azl - low(t * q)    */
+.endm
+
+/* barrett_round rd, ra, rf, rf_w2, rt :
+ *
+ *   rd = (ra * rf) mod q   (plain domain, |rd| < q).  Clobbers: rt.
+ *
+ * Rounding Barrett: instead of the truncating hi(ra*w) of `barrett`, it
+ * uses the doubled multiplier  rf_w2 = round(rf * 2^33 / q)  and recovers a
+ * round-to-nearest quotient by  qhat = (hi(ra*rf_w2) + 1) >> 1:
+ *   t    = hi(ra * rf_w2)        ~ floor(2 * ra * rf / q)
+ *   qhat = (t + 1) >> 1          ~ round(ra * rf / q)
+ *   rd   = low(ra * rf) - low(qhat * q)
+ * The round-to-nearest quotient gives the tighter bound |rd| < q (measured
+ * <= 0.503 q), versus |rd| < 1.01 q for the truncating `barrett`.
+ *
+ * rf_w2 fits int32 only because rf is small (here 16382); a general twiddle
+ * up to q/2 would overflow the doubled constant. Final scaling only.
+ */
+.macro barrett_round rd, ra, rf, rf_w2, rt
+        mulh  \rt, \ra, \rf_w2    /* t    = hi(ra * (2*f)~)    */
+        addi  \rt, \rt, 1         /* t + 1, so the shift rounds */
+        srai  \rt, \rt, 1         /* qhat = (t + 1) >> 1       */
+        mul   \rd, \ra, \rf       /* azl  = low(ra * f)        */
+        mul_q_sub \rd, \rt        /* rd   = azl - low(qhat*q)  */
+.endm
+
+/* gs_bfly ra, rb, rzeta, rw, rt0, rt1 :
+ *
+ *   t  = rb - ra
+ *   ra = ra + rb
+ *   rb = barrett(t, +rzeta)
+ *
+ * Gentleman-Sande butterfly. For inputs with |coef| < B, the sum output has
+ * |ra| < 2*B and the product output has |rb| < 1.01 q (see barrett), so the
+ * coefficient bound at most doubles per application once B >= q.
+ *
+ * The algebraic equivalence with the C reference's
+ *      t = ra; ra = t + rb; rb = barrett(t - rb, -zeta)
+ * follows from barrett being linear in its constant:
+ *      barrett(t - rb, -zeta) = -barrett(t - rb, +zeta)
+ *                             =  barrett(rb - t, +zeta)
+ *                             =  barrett(rb - ra, +zeta)       (t == ra)
+ * which is what this macro computes. This lets us reuse the (un-negated)
+ * forward-NTT zeta table.
+ *
+ * Clobbers: rt0, rt1.
+ */
+.macro gs_bfly ra, rb, rzeta, rw, rt0, rt1
+        sub  \rt0, \rb, \ra       /* t  = rb - ra (old ra)     */
+        add  \ra,  \ra, \rb       /* ra = ra + rb              */
+        barrett \rb, \rt0, \rzeta, \rw, \rt1
+.endm
+
+/* gs_radix4 stride :
+ *
+ * Reads four coefficients from offsets [0, s, 2s, 3s] of `data`,
+ * applies the inverse-NTT radix-4 kernel using the loaded zetas,
+ * writes them back.
+ *
+ * Within a single inv-pass:
+ *   - "Inner" layer (the smaller-stride C-layer, run first) pairs
+ *     (a,b) and (c,d). The C reference uses two distinct zetas here
+ *     (k = (1<<L_in)-1-2o and (1<<L_in)-2-2o), which appear in our
+ *     table in fwd order as (h0, h1). With the cursor walked
+ *     backward, position offsets remain (h0=8, h1=16); the inv
+ *     consumption order swaps them: (a,b) gets h1, (c,d) gets h0.
+ *   - "Outer" layer (the larger-stride C-layer, run second) pairs
+ *     (a,c) and (b,d) with a single shared zeta = lo.
+ */
+.macro gs_radix4 stride
+        lw   ca, 0(data)              /* a: byte offset 0        */
+        lw   cb, (1*\stride)(data)    /* b: byte offset 1*stride */
+        lw   cc, (2*\stride)(data)    /* c: byte offset 2*stride */
+        lw   cd, (3*\stride)(data)    /* d: byte offset 3*stride */
+
+        /* Inner C-layer (smaller stride): (a,b) gets h1, (c,d) gets h0. */
+        gs_bfly ca, cb, zeta_h1, zeta_h1_w, tmp0, tmp1
+        gs_bfly cc, cd, zeta_h0, zeta_h0_w, tmp0, tmp1
+
+        /* Outer C-layer (larger stride): (a,c) and (b,d), shared lo. */
+        gs_bfly ca, cc, zeta_lo, zeta_lo_w, tmp0, tmp1
+        gs_bfly cb, cd, zeta_lo, zeta_lo_w, tmp0, tmp1
+
+        sw   ca, 0(data)              /* write back in place     */
+        sw   cb, (1*\stride)(data)
+        sw   cc, (2*\stride)(data)
+        sw   cd, (3*\stride)(data)
+.endm
+
+/* load_outer_zetas_rev:
+ *
+ *   zeta_ptr -= 24
+ *   load (lo, lo_w, h0, h0_w, h1, h1_w) from [zeta_ptr+0..+23]
+ *
+ * Walks the forward-NTT zeta table backward, one outer-iter pair set
+ * (24 bytes) at a time.
+ */
+.macro load_outer_zetas_rev
+        addi zeta_ptr, zeta_ptr, -24     /* back 3 pairs = 6 words */
+        lw   zeta_lo,    0(zeta_ptr)
+        lw   zeta_lo_w,  4(zeta_ptr)
+        lw   zeta_h0,    8(zeta_ptr)
+        lw   zeta_h0_w,  12(zeta_ptr)
+        lw   zeta_h1,    16(zeta_ptr)
+        lw   zeta_h1_w,  20(zeta_ptr)
+.endm
+
+/* save / restore the callee-saved regs s0..s7 we use. */
+.macro save_regs
+        addi sp, sp, -32          /* frame: 8 words for s0..s7 */
+        sw   s0,  0(sp)
+        sw   s1,  4(sp)
+        sw   s2,  8(sp)
+        sw   s3, 12(sp)
+        sw   s4, 16(sp)
+        sw   s5, 20(sp)
+        sw   s6, 24(sp)
+        sw   s7, 28(sp)
+.endm
+
+.macro restore_regs
+        lw   s0,  0(sp)
+        lw   s1,  4(sp)
+        lw   s2,  8(sp)
+        lw   s3, 12(sp)
+        lw   s4, 16(sp)
+        lw   s5, 20(sp)
+        lw   s6, 24(sp)
+        lw   s7, 28(sp)
+        addi sp, sp, 32           /* release the 32 B frame    */
+.endm
+
+/*****************************************************************
+ * Function
+ *
+ * The MLD_ASM_FN_SYMBOL(intt_rv32im_asm) entry label lives in the wrapper
+ * .S file (next to its .global), so it is the first thing in .text.
+ *****************************************************************/
+
+        save_regs
+
+#if !defined(MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER)
+        /* q = 8380417 = 0x007FE001, for the multiply in mul_q_sub (used by
+         * both the butterflies and the final Barrett scaling). */
+        lui  q, 0x7FE             /* q = 0x007FE000            */
+        addi q, q, 1              /* q = 0x007FE001            */
+#endif /* !MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER */
+
+        /* Position zeta_ptr at the END of the table (one past last entry).
+         * The table has 255 pairs = 510 int32 = 2040 bytes. */
+        addi zeta_ptr, zeta_ptr, 2040
+
+        /***************************************************
+         * inv-pass-1: C-layers 8, 7.
+         *   64 outer iters, 1 inner iter each, stride = 4 B.
+         *   Each outer iter handles 4 consecutive coefficients.
+         *
+         * Reads fwd-pass-4's 64 outer iters in reverse order.
+         ***************************************************/
+        mv   data, in_ptr
+        addi outer_end, in_ptr, 1024      /* 256 * 4 B */
+intt_rv32im_p1_outer:
+        load_outer_zetas_rev
+        gs_radix4 4
+        addi data, data, 16               /* next 4 coefficients */
+        bne  data, outer_end, intt_rv32im_p1_outer
+
+        /***************************************************
+         * inv-pass-2: C-layers 6, 5.
+         *   16 outer iters, 4 inner iters each, stride = 16 B.
+         *   Each outer block is 64 B (= 16 coefs).
+         ***************************************************/
+        mv   data, in_ptr
+        addi outer_end, in_ptr, 1024      /* 256 * 4 B */
+intt_rv32im_p2_outer:
+        load_outer_zetas_rev
+        addi inner_end, data, 16          /* 4 * 4 B */
+intt_rv32im_p2_inner:
+        gs_radix4 16
+        addi data, data, 4
+        bne  data, inner_end, intt_rv32im_p2_inner
+        addi data, data, (64 - 16)        /* skip to next 64 B block */
+        bne  data, outer_end, intt_rv32im_p2_outer
+
+        /***************************************************
+         * inv-pass-3: C-layers 4, 3.
+         *   4 outer iters, 16 inner iters each, stride = 64 B.
+         *   Each outer block is 256 B (= 64 coefs).
+         ***************************************************/
+        mv   data, in_ptr
+        addi outer_end, in_ptr, 1024      /* 256 * 4 B */
+intt_rv32im_p3_outer:
+        load_outer_zetas_rev
+        addi inner_end, data, 64          /* 16 * 4 B */
+intt_rv32im_p3_inner:
+        gs_radix4 64
+        addi data, data, 4
+        bne  data, inner_end, intt_rv32im_p3_inner
+        addi data, data, (256 - 64)       /* skip to next 256 B block */
+        bne  data, outer_end, intt_rv32im_p3_outer
+
+        /***************************************************
+         * inv-pass-4: C-layers 2, 1.
+         *   1 outer iter, 64 inner iters, stride = 256 B.
+         ***************************************************/
+        load_outer_zetas_rev
+        mv   data, in_ptr
+        addi inner_end, in_ptr, 256       /* 64 * 4 B */
+intt_rv32im_p4_inner:
+        gs_radix4 256
+        addi data, data, 4
+        bne  data, inner_end, intt_rv32im_p4_inner
+
+        /***************************************************
+         * Final scaling: each coefficient *= 16382  (plain, rounding Barrett).
+         *
+         * f    = 16382 = R * 2^{-8} mod q = 2^24 mod q   (plain twiddle;
+         *        folds in both the 2^{-8} of the inverse NTT and the R
+         *        factor of the previous Montgomery output convention).
+         * f_w2 = round(f * 2^33 / q) = 16791564           (doubled Barrett
+         *        multiplier, fits int32 because f is small).
+         *
+         * Rounding Barrett (see barrett_round) yields |coef| < q, meeting the
+         * invntt output contract that a truncating barrett (< 1.01 q) misses.
+         ***************************************************/
+        li   f,    16382
+        li   f_w2, 16791564
+
+        mv   data, in_ptr
+        addi scale_end, in_ptr, 1024      /* 256 * 4 B */
+intt_rv32im_scale:
+        lw   ca, 0(data)
+        barrett_round cb, ca, f, f_w2, tmp0
+        sw   cb, 0(data)
+        addi data, data, 4
+        bne  data, scale_end, intt_rv32im_scale
+
+        restore_regs
+        ret
+
+/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. */
+#undef in_ptr
+#undef zeta_ptr
+#undef data
+#undef outer_end
+#undef inner_end
+#undef scale_end
+#undef ca
+#undef cb
+#undef cc
+#undef cd
+#undef tmp0
+#undef tmp1
+#undef zeta_lo
+#undef zeta_lo_w
+#undef zeta_h0
+#undef zeta_h0_w
+#undef zeta_h1
+#undef zeta_h1_w
+#undef f
+#undef f_w2
+#undef q
diff --git a/dev/riscv32/src/intt_rv32im_slowmul_asm.S b/dev/riscv32/src/intt_rv32im_slowmul_asm.S
new file mode 100644
index 000000000..6fb9b28b9
--- /dev/null
+++ b/dev/riscv32/src/intt_rv32im_slowmul_asm.S
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * RV32-IM ML-DSA inverse NTT, slow-multiplier variant.
+ *
+ * Thin wrapper: the kernel body is shared via intt_rv32im_asm.i. It is
+ * included with MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER defined, so low(t*q)
+ * is a shift-add chain (exploiting q = 2^23 - 2^13 + 1) instead of a multiply
+ * -- preferred when the multiplier is slow. The fast-multiplier variant lives
+ * in intt_rv32im_asm.S. Exactly one of the two is selected, by
+ * MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER.
+ */
+
+#include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_RV32IM) && \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
+    defined(MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER)
+/* simpasm: header-end */
+
+        .text
+        .global MLD_ASM_NAMESPACE(intt_rv32im_asm)
+        .balign 4
+MLD_ASM_FN_SYMBOL(intt_rv32im_asm)
+
+#define MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER
+#include "intt_rv32im_asm.i"
+
+/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
+ * Don't modify by hand -- this is auto-generated by scripts/autogen. */
+#undef MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER
+
+/* simpasm: footer-start */
+#endif /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
+          MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER */
diff --git a/dev/riscv32/src/ntt_rv32im_asm.S b/dev/riscv32/src/ntt_rv32im_asm.S
new file mode 100644
index 000000000..51dac738b
--- /dev/null
+++ b/dev/riscv32/src/ntt_rv32im_asm.S
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * RV32-IM ML-DSA forward NTT, fast-multiplier variant.
+ *
+ * Thin wrapper: the kernel body is shared via ntt_rv32im_asm.i. It is
+ * included without MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER, so low(t*q) is a
+ * single multiply by q; the shift-add variant is ntt_rv32im_slowmul_asm.S.
+ * Exactly one of the two is selected, by MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER.
+ */
+
+#include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_RV32IM) && \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
+    !defined(MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER)
+/* simpasm: header-end */
+
+        .text
+        .global MLD_ASM_NAMESPACE(ntt_rv32im_asm)
+        .balign 4
+MLD_ASM_FN_SYMBOL(ntt_rv32im_asm)
+
+#include "ntt_rv32im_asm.i"
+
+/* simpasm: footer-start */
+#endif /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
+          !MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER */
diff --git a/dev/riscv32/src/ntt_rv32im_asm.i b/dev/riscv32/src/ntt_rv32im_asm.i
new file mode 100644
index 000000000..7d4fb3ff2
--- /dev/null
+++ b/dev/riscv32/src/ntt_rv32im_asm.i
@@ -0,0 +1,291 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * RV32-IM ML-DSA forward NTT -- shared kernel body.
+ *
+ * This file is #include'd by the thin wrapper .S files
+ *   ntt_rv32im_asm.S         (fast multiplier: low(t*q) via a single mul)
+ *   ntt_rv32im_slowmul_asm.S (slow multiplier: low(t*q) via shift-add)
+ * which differ only in whether they #define
+ * MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER before the include. It is not a
+ * standalone translation unit: the backend guard, the .global directive,
+ * and the simpasm header/footer markers live in the wrappers.
+ *
+ * Layered structure: 2+2+2+2 (four passes, each merging two layers, with
+ * a radix-4 inner kernel holding 4 coefficients in registers).
+ *
+ * Modular arithmetic: Barrett multiplication by a constant twiddle.
+ * Each zeta is provided as a (zeta, w) pair, where zeta is the plain
+ * centered twiddle (w^{bitrev(k)} mod q, |zeta| <= q/2) and
+ * w = round(zeta * 2^32 / q) is the Barrett multiplier, so a Barrett
+ * multiply is 2 multiplies + a sparse "low(t*q)" reduction:
+ *
+ *   t  = hi(a * w)               ~ floor(a * zeta / q)   (mulh truncates)
+ *   r  = low(a * zeta) - low(t * q)        == (a * zeta) mod q
+ *
+ * The low(t*q) reduction has two bit-identical forms (see mul_q_sub),
+ * selected by MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER: a shift-add
+ * exploiting q = 2^23 - 2^13 + 1 when defined, or a single low multiply
+ * by q otherwise.
+ *
+ * The result is in the plain domain (no Montgomery factor), matching the
+ * input/output convention of the previous Montgomery kernel, which folded
+ * R into the twiddle and cancelled it via R^-1. Bound: |r| < 1.01 q.
+ */
+
+/*****************************************************************
+ * Register aliases (RV32 GAS lacks `.req`; use cpp #defines).
+ *****************************************************************/
+
+/* Arguments */
+#define in_ptr a0   /* base of int32_t r[256]       */
+#define zeta_ptr a1 /* zeta cursor                  */
+
+/* Working pointers / counters */
+#define data t2      /* inner data cursor            */
+#define outer_end t3 /* end address for outer loop   */
+#define inner_end t4 /* end address for inner loop   */
+
+/* Coefficient registers (caller-saved) */
+#define ca a2
+#define cb a3
+#define cc a4
+#define cd a5
+
+/* Butterfly temporaries (caller-saved) */
+#define tmp0 a6
+#define tmp1 a7
+
+/* Loaded zeta pair registers (callee-saved; loaded once per outer iter,
+ * used across the inner loop). Each pair is (zeta, w): the plain centered
+ * twiddle and its Barrett multiplier w = round(zeta * 2^32 / q). */
+#define zeta_lo s0
+#define zeta_lo_w s1
+#define zeta_h0 s2
+#define zeta_h0_w s3
+#define zeta_h1 s4
+#define zeta_h1_w s5
+
+/* Constant q register, used only by mul_q_sub when
+ * MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER is undefined. t0 is caller-saved
+ * and otherwise unused, so no extra save/restore is needed. */
+#define q t0 /* MLDSA_Q = 8380417            */
+
+/*****************************************************************
+ * Macros
+ *****************************************************************/
+
+/* mul_q_sub rd, rt :
+ *
+ *   rd = rd - low(rt * q)   (mod 2^32),  clobbers rt.
+ *
+ * Two bit-identical implementations, selected by
+ * MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER:
+ *
+ *   defined   : shift-add, exploiting q = 2^23 - 2^13 + 1, no multiply.
+ *   undefined : single low multiply by q (q held in `q`).
+ *
+ * The reduction is the only multiplier-dependent step; the Barrett kernel,
+ * butterflies and zeta table are shared.
+ */
+.macro mul_q_sub rd, rt
+#if defined(MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER)
+        sub  \rd, \rd, \rt        /* - rt                      */
+        slli \rt, \rt, 13
+        add  \rd, \rd, \rt        /* + (rt<<13)                */
+        slli \rt, \rt, 10
+        sub  \rd, \rd, \rt        /* - (rt<<23) => - low(rt*q) */
+#else  /* MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER */
+        mul  \rt, \rt, q          /* low(rt * q)               */
+        sub  \rd, \rd, \rt
+#endif /* !MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER */
+.endm
+
+/* barrett rd, ra, rzeta, rw, rt :
+ *
+ *   rd = (ra * rzeta) mod q   (plain domain, |rd| < 1.01 q).
+ *
+ * rzeta : plain centered twiddle (constant)
+ * rw    : Barrett multiplier round(rzeta * 2^32 / q) (constant)
+ *   t  = hi(ra * rw)
+ *   rd = low(ra * rzeta) - low(t * q)
+ * with low(t*q) computed by mul_q_sub. Clobbers: rt.
+ */
+.macro barrett rd, ra, rzeta, rw, rt
+        mulh  \rt, \ra, \rw       /* t   = hi(ra * w)          */
+        mul   \rd, \ra, \rzeta    /* azl = low(ra * zeta)      */
+        mul_q_sub \rd, \rt        /* rd  = azl - low(t * q)    */
+.endm
+
+/* ct_bfly ra, rb, rzeta, rw, rt0, rt1 :
+ *
+ *   t  = barrett(rb, rzeta)
+ *   rb = ra - t
+ *   ra = ra + t
+ *
+ * Cooley-Tukey butterfly. Each application grows |coeff| by at most ~q.
+ * Clobbers: rt0, rt1.
+ */
+.macro ct_bfly ra, rb, rzeta, rw, rt0, rt1
+        barrett \rt0, \rb, \rzeta, \rw, \rt1
+        sub  \rb, \ra, \rt0
+        add  \ra, \ra, \rt0
+.endm
+
+/* radix4_kernel stride (in bytes):
+ *
+ * Reads four coefficients from offsets [0, s, 2s, 3s] of `data`, runs
+ * two layers of CT butterflies using the loaded zeta pairs, writes back.
+ */
+.macro radix4_kernel stride
+        lw   ca, 0(data)
+        lw   cb, (1*\stride)(data)
+        lw   cc, (2*\stride)(data)
+        lw   cd, (3*\stride)(data)
+
+        /* "Lo" layer: pair (ca,cc) and (cb,cd), both with zeta_lo. */
+        ct_bfly ca, cc, zeta_lo, zeta_lo_w, tmp0, tmp1
+        ct_bfly cb, cd, zeta_lo, zeta_lo_w, tmp0, tmp1
+
+        /* "Hi" layer: (ca,cb) with zeta_h0, (cc,cd) with zeta_h1. */
+        ct_bfly ca, cb, zeta_h0, zeta_h0_w, tmp0, tmp1
+        ct_bfly cc, cd, zeta_h1, zeta_h1_w, tmp0, tmp1
+
+        sw   ca, 0(data)
+        sw   cb, (1*\stride)(data)
+        sw   cc, (2*\stride)(data)
+        sw   cd, (3*\stride)(data)
+.endm
+
+/* load_outer_zetas: load 3 (zeta, w) pairs (24 bytes) for one outer iter
+ * from `zeta_ptr`, advancing it. */
+.macro load_outer_zetas
+        lw   zeta_lo,    0(zeta_ptr)
+        lw   zeta_lo_w,  4(zeta_ptr)
+        lw   zeta_h0,    8(zeta_ptr)
+        lw   zeta_h0_w,  12(zeta_ptr)
+        lw   zeta_h1,    16(zeta_ptr)
+        lw   zeta_h1_w,  20(zeta_ptr)
+        addi zeta_ptr, zeta_ptr, 24
+.endm
+
+/* save / restore callee-saved s0..s5; 32 B frame keeps sp 16-byte aligned. */
+.macro save_regs
+        addi sp, sp, -32          /* 24 B used + 8 B pad (psABI) */
+        sw   s0,  0(sp)
+        sw   s1,  4(sp)
+        sw   s2,  8(sp)
+        sw   s3, 12(sp)
+        sw   s4, 16(sp)
+        sw   s5, 20(sp)
+.endm
+
+.macro restore_regs
+        lw   s0,  0(sp)
+        lw   s1,  4(sp)
+        lw   s2,  8(sp)
+        lw   s3, 12(sp)
+        lw   s4, 16(sp)
+        lw   s5, 20(sp)
+        addi sp, sp, 32           /* must mirror the frame size in save_regs */
+.endm
+
+/*****************************************************************
+ * Function
+ *
+ * The MLD_ASM_FN_SYMBOL(ntt_rv32im_asm) entry label lives in the wrapper
+ * .S file (next to its .global), so it is the first thing in .text.
+ *****************************************************************/
+
+        save_regs
+
+#if !defined(MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER)
+        /* q = 8380417 = 0x007FE001, for the multiply in mul_q_sub. */
+        lui  q, 0x7FE
+        addi q, q, 1
+#endif
+
+        /***************************************************
+         * Pass 1: C-layers 1, 2.
+         *   1 outer iter, 64 inner iters, butterfly stride = 256 B.
+         ***************************************************/
+        load_outer_zetas
+        mv   data, in_ptr
+        addi inner_end, in_ptr, 256       /* 64 * 4 B */
+ntt_rv32im_p1_loop:
+        radix4_kernel 256
+        addi data, data, 4
+        bne  data, inner_end, ntt_rv32im_p1_loop
+
+        /***************************************************
+         * Pass 2: C-layers 3, 4.
+         *   4 outer iters, 16 inner iters each, stride = 64 B.
+         *   Each outer block is 256 B (= 64 coefs).
+         ***************************************************/
+        mv   data, in_ptr
+        addi outer_end, in_ptr, 1024
+ntt_rv32im_p2_outer:
+        load_outer_zetas
+        addi inner_end, data, 64          /* 16 * 4 B */
+ntt_rv32im_p2_inner:
+        radix4_kernel 64
+        addi data, data, 4
+        bne  data, inner_end, ntt_rv32im_p2_inner
+        addi data, data, (256 - 64)       /* skip to next 256 B block */
+        bne  data, outer_end, ntt_rv32im_p2_outer
+
+        /***************************************************
+         * Pass 3: C-layers 5, 6.
+         *   16 outer iters, 4 inner iters each, stride = 16 B.
+         *   Each outer block is 64 B (= 16 coefs).
+         ***************************************************/
+        mv   data, in_ptr
+        addi outer_end, in_ptr, 1024
+ntt_rv32im_p3_outer:
+        load_outer_zetas
+        addi inner_end, data, 16          /* 4 * 4 B */
+ntt_rv32im_p3_inner:
+        radix4_kernel 16
+        addi data, data, 4
+        bne  data, inner_end, ntt_rv32im_p3_inner
+        addi data, data, (64 - 16)        /* skip to next 64 B block */
+        bne  data, outer_end, ntt_rv32im_p3_outer
+
+        /***************************************************
+         * Pass 4: C-layers 7, 8.
+         *   64 outer iters, 1 inner iter each, stride = 4 B.
+         *   Each outer iter handles 4 consecutive coefficients.
+         ***************************************************/
+        mv   data, in_ptr
+        addi outer_end, in_ptr, 1024
+ntt_rv32im_p4_outer:
+        load_outer_zetas
+        radix4_kernel 4
+        addi data, data, 16
+        bne  data, outer_end, ntt_rv32im_p4_outer
+
+        restore_regs
+        ret
+
+/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. */
+#undef in_ptr
+#undef zeta_ptr
+#undef data
+#undef outer_end
+#undef inner_end
+#undef ca
+#undef cb
+#undef cc
+#undef cd
+#undef tmp0
+#undef tmp1
+#undef zeta_lo
+#undef zeta_lo_w
+#undef zeta_h0
+#undef zeta_h0_w
+#undef zeta_h1
+#undef zeta_h1_w
+#undef q
diff --git a/dev/riscv32/src/ntt_rv32im_slowmul_asm.S b/dev/riscv32/src/ntt_rv32im_slowmul_asm.S
new file mode 100644
index 000000000..b3ae6464b
--- /dev/null
+++ b/dev/riscv32/src/ntt_rv32im_slowmul_asm.S
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * RV32-IM ML-DSA forward NTT, slow-multiplier variant.
+ *
+ * Thin wrapper: the kernel body is shared via ntt_rv32im_asm.i, which here
+ * computes the Barrett low(t*q) reduction with a shift-add chain (exploiting
+ * q = 2^23 - 2^13 + 1), trading the multiply for cheap ALU ops -- preferred
+ * when the multiplier is slow. The fast-multiplier variant lives in
+ * ntt_rv32im_asm.S. Exactly one of the two is selected, by
+ * MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER.
+ */
+
+#include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_RV32IM) && \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
+    defined(MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER)
+/* simpasm: header-end */
+
+        .text
+        .global MLD_ASM_NAMESPACE(ntt_rv32im_asm)
+        .balign 4
+MLD_ASM_FN_SYMBOL(ntt_rv32im_asm)
+
+#define MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER
+#include "ntt_rv32im_asm.i"
+
+/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
+ * Don't modify by hand -- this is auto-generated by scripts/autogen. */
+#undef MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER
+
+/* simpasm: footer-start */
+#endif /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
+          MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER */
diff --git a/dev/riscv32/src/poly_pointwise_montgomery_rv32im_asm.S b/dev/riscv32/src/poly_pointwise_montgomery_rv32im_asm.S
new file mode 100644
index 000000000..4d51c9afc
--- /dev/null
+++ b/dev/riscv32/src/poly_pointwise_montgomery_rv32im_asm.S
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * RV32-IM ML-DSA pointwise polynomial multiplication with Montgomery
+ * reduction. Computes
+ *
+ *     a[i] = (a[i] * b[i]) * R^-1  mod q,    R = 2^32, |result| < q,
+ *
+ * for i in 0..256, in-place in a.
+ *
+ * Modular arithmetic: standard signed Montgomery reduction. Unlike the
+ * NTT, neither operand is constant, so we can't precompute a twisted
+ * form -- the kernel uses 4 multiplies per coefficient:
+ *
+ *     plo = low (a * b)            ; mul
+ *     m   = low (plo * QINV)       ; mul (low 32 of (plo * QINV))
+ *     phi = high(a * b)            ; mulh
+ *     mh  = high(m * q)            ; mulh
+ *     r   = phi - mh               ; sub
+ *
+ * Bounds: requires |a[i]|, |b[i]| < MLD_NTT_BOUND = 9 * MLD_FQMUL_BOUND
+ * (about 11.25*q). The product is then below 2^31 * q, within the safe
+ * input range for `mld_montgomery_reduce` (which is |a| <= 2^31 * q).
+ */
+
+#include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_RV32IM) && \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
+/* simpasm: header-end */
+
+/*****************************************************************
+ * Register aliases
+ *****************************************************************/
+
+/* Arguments */
+#define a_ptr       a0
+#define b_ptr       a1
+
+/* Loop control */
+#define a_end       t0          /* end-of-array sentinel for a_ptr */
+
+/* Per-coef working set (caller-saved) */
+#define a_val       a2
+#define b_val       a3
+#define plo         a4
+#define phi         a5
+#define mlo         a6
+#define mhi         a7
+
+/* Constants (callee-saved) */
+#define q           s0          /* MLDSA_Q = 8380417           */
+#define qinv        s1          /* QINV    = 58728449          */
+
+/*****************************************************************
+ * Function
+ *****************************************************************/
+
+        .text
+        .global MLD_ASM_NAMESPACE(poly_pointwise_montgomery_rv32im_asm)
+        .balign 4
+MLD_ASM_FN_SYMBOL(poly_pointwise_montgomery_rv32im_asm)
+
+        addi sp, sp, -16            /* 8 B used; 16 B keeps sp aligned (psABI) */
+        sw   s0, 0(sp)
+        sw   s1, 4(sp)
+
+        /* q    = 0x007FE001 */
+        lui  q, 0x7FE
+        addi q, q, 1
+        /* qinv = 0x03802001 = 58728449
+         *   lui qinv, 0x3802; addi qinv, qinv, 1  -> 0x03802001 */
+        lui  qinv, 0x3802
+        addi qinv, qinv, 1
+
+        addi a_end, a_ptr, 1024     /* 256 * 4 bytes */
+
+poly_pointwise_montgomery_rv32im_loop:
+        lw   a_val, 0(a_ptr)
+        lw   b_val, 0(b_ptr)
+
+        /* Standard signed Montgomery reduction of a*b:
+         *   plo = (a*b)   low 32
+         *   mlo = plo*QINV low 32
+         *   phi = (a*b)   high 32  (signed)
+         *   mhi = mlo*q   high 32  (signed)
+         *   res = phi - mhi
+         */
+        mul   plo, a_val, b_val
+        mul   mlo, plo,   qinv
+        mulh  phi, a_val, b_val
+        mulh  mhi, mlo,   q
+        sub   a_val, phi, mhi
+
+        sw   a_val, 0(a_ptr)        /* result overwrites a[i] in place */
+
+        addi a_ptr, a_ptr, 4
+        addi b_ptr, b_ptr, 4
+        bne  a_ptr, a_end, poly_pointwise_montgomery_rv32im_loop
+
+        lw   s0, 0(sp)
+        lw   s1, 4(sp)
+        addi sp, sp, 16             /* must mirror the frame size at entry */
+        ret
+
+/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
+ * Don't modify by hand -- this is auto-generated by scripts/autogen. */
+#undef a_ptr
+#undef b_ptr
+#undef a_end
+#undef a_val
+#undef b_val
+#undef plo
+#undef phi
+#undef mlo
+#undef mhi
+#undef q
+#undef qinv
+
+/* simpasm: footer-start */
+#endif /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED */
diff --git a/dev/riscv32/src/rv32im_zetas.c b/dev/riscv32/src/rv32im_zetas.c
new file mode 100644
index 000000000..05cd415a7
--- /dev/null
+++ b/dev/riscv32/src/rv32im_zetas.c
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * WARNING: This file is auto-generated from scripts/autogen
+ *          in the mldsa-native repository.
+ *          Do not modify it directly.
+ */
+
+#include "../../../common.h"
+
+#if defined(MLD_ARITH_BACKEND_RV32IM) && \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
+
+#include "arith_native_rv32im.h"
+
+/*
+ * Table of zeta values used in the RV32-IM forward NTT.
+ * Each entry is a (zeta, w) Barrett pair, with zeta the plain
+ * centered twiddle (|zeta| <= q/2) and w = round(zeta * 2^32 / q)
+ * the Barrett multiplier. See autogen for details.
+ */
+MLD_ALIGN MLD_INTERNAL_DATA_DEFINITION const int32_t
+    mld_rv32im_ntt_zetas[510] = {
+        -3572223, -1830765815, 3765607,  1929875198,  3761513,  1927777021,
+        -3201494, -1640767044, -601683,  -308362795,  3542485,  1815525077,
+        -2883726, -1477910808, 2682288,  1374673747,  2129892,  1091570561,
+        -3145678, -1612161320, 3764867,  1929495947,  -1005239, -515185417,
+        -3201430, -1640734244, 557458,   285697463,   -1221177, -625853735,
+        -3370349, -1727305304, 3602218,  1846138265,  3182878,  1631226336,
+        -4063053, -2082316400, 2740543,  1404529459,  -3586446, -1838055109,
+        2663378,  1364982364,  -3110818, -1594295555, 2101410,  1076973524,
+        -1674615, -858240904,  3704823,  1898723372,  1159875,  594436433,
+        -3524442, -1806278032, 394148,   202001019,   928749,   475984260,
+        -434125,  -222489248,  1095468,  561427818,   -3506380, -1797021249,
+        676590,   346752664,   2071829,  1061813248,  -4018989, -2059733581,
+        -1335936, -684667771,  3241972,  1661512036,  2156050,  1104976547,
+        -3227876, -1654287830, 3415069,  1750224323,  1759347,  901666090,
+        1714295,  878576921,   -817536,  -418987550,  -3574466, -1831915353,
+        2453983,  1257667337,  3756790,  1925356481,  -1935799, -992097815,
+        1460718,  748618600,   -1716988, -879957084,  -3950053, -2024403852,
+        -642628,  -329347125,  -2897314, -1484874664, 3192354,  1636082790,
+        -3585098, -1837364258, 556856,   285388938,   3870317,  1983539117,
+        2815639,  1443016191,  2917338,  1495136972,  1853806,  950076368,
+        2283733,  1170414139,  3345963,  1714807468,  1858416,  952438995,
+        3073009,  1574918427,  1753,     898413,      -1935420, -991903578,
+        1277625,  654783359,   -2659525, -1363007700, -1455890, -746144248,
+        -2635473, -1350681039, 2660408,  1363460238,  -1780227, -912367099,
+        3852015,  1974159335,  -59148,   -30313375,   2772600,  1420958686,
+        4183372,  2143979939,  1182243,  605900043,   87208,    44694137,
+        -3222807, -1651689966, 636927,   326425360,   -3965306, -2032221021,
+        -3121440, -1599739335, -3956745, -2027833504, -2296397, -1176904444,
+        -274060,  -140455867,  -3284915, -1683520342, -3716946, -1904936414,
+        2508980,  1285853323,  -27812,   -14253662,   822541,   421552614,
+        2028118,  1039411342,  1009365,  517299994,   -2454145, -1257750362,
+        1937570,  993005454,   -1979497, -1014493059, 1596822,  818371958,
+        -3815725, -1955560694, -3956944, -2027935492, -3759465, -1926727420,
+        2811291,  1440787840,  -1685153, -863641633,  -3410568, -1747917558,
+        -2983781, -1529189038, 2678278,  1372618620,  -3768948, -1931587462,
+        -1109516, -568627424,  -3551006, -1819892093, 635956,   325927722,
+        4158088,  2131021878,  -250446,  -128353682,  -2455377, -1258381762,
+        1528066,  783134478,   -4146264, -2124962073, -1772588, -908452108,
+        482649,   247357819,   2192938,  1123881663,  -1727088, -885133339,
+        1148858,  588790216,   2387513,  1223601433,  -3611750, -1851023419,
+        -2962264, -1518161567, -268456,  -137583815,  -3180456, -1629985060,
+        -565603,  -289871779,  3747250,  1920467227,  2296099,  1176751719,
+        169688,   86965173,    1239911,  635454918,   -3838479, -1967222129,
+        2462444,  1262003603,  3195676,  1637785316,  2642980,  1354528380,
+        -3334383, -1708872713, 1254190,  642772911,   -12417,   -6363718,
+        -4166425, -2135294594, 2998219,  1536588520,  141835,   72690498,
+        -3488383, -1787797779, -89301,   -45766801,   2513018,  1287922800,
+        1987814,  1018755525,  -1354892, -694382729,  613238,   314284737,
+        -3197248, -1638590967, -1310261, -671509323,  -2218467, -1136965286,
+        1736313,  889861155,   -458740,  -235104446,  -1921994, -985022747,
+        235407,   120646188,   4040196,  2070602178,  -3472069, -1779436847,
+        -3250154, -1665705315, 2039144,  1045062172,  -1879878, -963438279,
+        3258457,  1669960606,  -818761,  -419615363,  -2178965, -1116720494,
+        -2579253, -1321868265, -1623354, -831969619,  2105286,  1078959975,
+        1787943,  916321552,   -2374402, -1216882040, -2033807, -1042326957,
+        -2391089, -1225434135, 586241,   300448763,   -1179613, -604552167,
+        -2254727, -1155548552, 527981,   270590488,   -2743411, -1405999311,
+        3482206,  1784632064,  -1476985, -756955444,  1994046,  1021949428,
+        -4182915, -2143745726, 2491325,  1276805128,  -1393159, -713994583,
+        -1300016, -666258756,  507927,   260312805,   -1187885, -608791570,
+        -2362063, -1210558298, -724804,  -371462360,  -1834526, -940195359,
+        -1317678, -675310538,  -3033742, -1554794072, -338420,  -173440395,
+        2461387,  1261461890,  2647994,  1357098057,  3009748,  1542497137,
+        3035980,  1555941048,  -2612853, -1339088280, 4148469,  2126092136,
+        621164,   318346816,   749577,   384158533,   -4022750, -2061661095,
+        3901472,  1999506068,  3980599,  2040058690,  2569011,  1316619236,
+        -1226661, -628664287,  -1615530, -827959816,  1723229,  883155599,
+        2925816,  1499481951,  1665318,  853476187,   2028038,  1039370342,
+        3374250,  1729304568,  1163598,  596344473,   -3369273, -1726753853,
+        1356448,  695180180,   3994671,  2047270596,  -11879,   -6087993,
+        -2775755, -1422575624, -1370517, -702390549,  3020393,  1547952704,
+        2683270,  1375177022,  3363542,  1723816713,  214880,   110126092,
+        -2778788, -1424130038, 545376,   279505433,   -770441,  -394851342,
+        -3467665, -1777179795, 3105558,  1591599803,  -1103344, -565464272,
+        2312838,  1185330464,  508145,   260424530,   -553718,  -283780712,
+        -653275,  -334803717,  860144,   440824168,   3430436,  1758099917,
+        -459163,  -235321234,  140244,   71875110,    -1514152, -776003547,
+        348812,   178766299,   -2185084, -1119856484, 3123762,  1600929361,
+        -327848,  -168022240,  2358373,  1208667171,  -2193087, -1123958025,
+        1011223,  518252220,   -3014420, -1544891539, -1716814, -879867909,
+        -2354215, -1206536194, 2926054,  1499603926,  -392707,  -201262505,
+        -3818627, -1957047970, -303005,  -155290192,  3531229,  1809756372,
+        -1922253, -985155484,  -3974485, -2036925262, -3773731, -1934038751,
+        -2236726, -1146323031, 1900052,  973777462,   -781875,  -400711272,
+        1744507,  894060583,   1054478,  540420426,   -731434,  -374860238,
+};
+
+#else /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED */
+
+MLD_EMPTY_CU(rv32im_zetas)
+
+#endif /* !(MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED) */
diff --git a/flake.nix b/flake.nix
index 8bd75003b..f96f910e8 100644
--- a/flake.nix
+++ b/flake.nix
@@ -162,7 +162,7 @@
 
           # autogen shell with cross compiler for the "other" architecture
           devShells.cross-autogen = util.mkShell {
-            packages = builtins.attrValues { inherit (config.packages) linters; inherit (pkgs) gcc-arm-embedded; }
+            packages = builtins.attrValues { inherit (config.packages) linters toolchain_riscv32; inherit (pkgs) gcc-arm-embedded; }
               ++ pkgs.lib.optionals pkgs.stdenv.hostPlatform.isx86_64 [ config.packages.toolchain_aarch64 ]
               ++ pkgs.lib.optionals pkgs.stdenv.hostPlatform.isAarch64 [ config.packages.toolchain_x86_64 ];
           };
diff --git a/mldsa/mldsa_native.c b/mldsa/mldsa_native.c
index 9365ed369..1f02ee81b 100644
--- a/mldsa/mldsa_native.c
+++ b/mldsa/mldsa_native.c
@@ -92,6 +92,9 @@
 #include "src/native/x86_64/src/rej_uniform_eta4_avx2.c"
 #include "src/native/x86_64/src/rej_uniform_table.c"
 #endif /* MLD_SYS_X86_64 */
+#if defined(MLD_SYS_RISCV32)
+#include "src/native/rv32im/src/rv32im_zetas.c"
+#endif
 #endif /* MLD_CONFIG_USE_NATIVE_BACKEND_ARITH */
 
 #if defined(MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202)
@@ -805,5 +808,22 @@
 #undef MLD_NATIVE_X86_64_SRC_CONSTS_H
 #undef mld_qdata
 #endif /* MLD_SYS_X86_64 */
+#if defined(MLD_SYS_RISCV32)
+/*
+ * Undefine macros from native code (Arith, RV32IM)
+ */
+/* mldsa/src/native/rv32im/meta.h */
+#undef MLD_ARITH_BACKEND_RV32IM
+#undef MLD_NATIVE_RV32IM_META_H
+#undef MLD_USE_NATIVE_INTT
+#undef MLD_USE_NATIVE_NTT
+#undef MLD_USE_NATIVE_POINTWISE_MONTGOMERY
+/* mldsa/src/native/rv32im/src/arith_native_rv32im.h */
+#undef MLD_NATIVE_RV32IM_SRC_ARITH_NATIVE_RV32IM_H
+#undef mld_intt_rv32im_asm
+#undef mld_ntt_rv32im_asm
+#undef mld_poly_pointwise_montgomery_rv32im_asm
+#undef mld_rv32im_ntt_zetas
+#endif /* MLD_SYS_RISCV32 */
 #endif /* MLD_CONFIG_USE_NATIVE_BACKEND_ARITH */
 #endif /* !MLD_CONFIG_MONOBUILD_KEEP_SHARED_HEADERS */
diff --git a/mldsa/mldsa_native_asm.S b/mldsa/mldsa_native_asm.S
index 4877d5156..902021a31 100644
--- a/mldsa/mldsa_native_asm.S
+++ b/mldsa/mldsa_native_asm.S
@@ -91,6 +91,13 @@
 #include "src/native/x86_64/src/polyz_unpack_17_avx2_asm.S"
 #include "src/native/x86_64/src/polyz_unpack_19_avx2_asm.S"
 #endif /* MLD_SYS_X86_64 */
+#if defined(MLD_SYS_RISCV32)
+#include "src/native/rv32im/src/intt_rv32im_asm.S"
+#include "src/native/rv32im/src/intt_rv32im_slowmul_asm.S"
+#include "src/native/rv32im/src/ntt_rv32im_asm.S"
+#include "src/native/rv32im/src/ntt_rv32im_slowmul_asm.S"
+#include "src/native/rv32im/src/poly_pointwise_montgomery_rv32im_asm.S"
+#endif /* MLD_SYS_RISCV32 */
 #endif /* MLD_CONFIG_USE_NATIVE_BACKEND_ARITH */
 
 #if defined(MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202)
@@ -818,5 +825,22 @@
 #undef MLD_NATIVE_X86_64_SRC_CONSTS_H
 #undef mld_qdata
 #endif /* MLD_SYS_X86_64 */
+#if defined(MLD_SYS_RISCV32)
+/*
+ * Undefine macros from native code (Arith, RV32IM)
+ */
+/* mldsa/src/native/rv32im/meta.h */
+#undef MLD_ARITH_BACKEND_RV32IM
+#undef MLD_NATIVE_RV32IM_META_H
+#undef MLD_USE_NATIVE_INTT
+#undef MLD_USE_NATIVE_NTT
+#undef MLD_USE_NATIVE_POINTWISE_MONTGOMERY
+/* mldsa/src/native/rv32im/src/arith_native_rv32im.h */
+#undef MLD_NATIVE_RV32IM_SRC_ARITH_NATIVE_RV32IM_H
+#undef mld_intt_rv32im_asm
+#undef mld_ntt_rv32im_asm
+#undef mld_poly_pointwise_montgomery_rv32im_asm
+#undef mld_rv32im_ntt_zetas
+#endif /* MLD_SYS_RISCV32 */
 #endif /* MLD_CONFIG_USE_NATIVE_BACKEND_ARITH */
 #endif /* !MLD_CONFIG_MONOBUILD_KEEP_SHARED_HEADERS */
diff --git a/mldsa/src/native/meta.h b/mldsa/src/native/meta.h
index b26232622..fc886cdff 100644
--- a/mldsa/src/native/meta.h
+++ b/mldsa/src/native/meta.h
@@ -21,4 +21,10 @@
 #include "x86_64/meta.h"
 #endif
 
+/* We do not yet include the arithmetic backend for RV32-IM by default
+ * as it is still experimental and undergoing review. */
+/* #if defined(MLD_SYS_RISCV32) */
+/* #include "rv32im/meta.h" */
+/* #endif */
+
 #endif /* !MLD_NATIVE_META_H */
diff --git a/mldsa/src/native/rv32im/meta.h b/mldsa/src/native/rv32im/meta.h
new file mode 100644
index 000000000..a83cd62c1
--- /dev/null
+++ b/mldsa/src/native/rv32im/meta.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#ifndef MLD_NATIVE_RV32IM_META_H
+#define MLD_NATIVE_RV32IM_META_H
+
+/* Set of primitives that this backend replaces */
+#define MLD_USE_NATIVE_NTT
+#define MLD_USE_NATIVE_INTT
+#define MLD_USE_NATIVE_POINTWISE_MONTGOMERY
+
+/* Identifier for this backend so that source and assembly files
+ * in the build can be appropriately guarded. */
+#define MLD_ARITH_BACKEND_RV32IM
+
+
+#if !defined(__ASSEMBLER__)
+#include "../api.h"
+#include "src/arith_native_rv32im.h"
+
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_ntt_native(int32_t data[MLDSA_N])
+{
+  mld_ntt_rv32im_asm(data, mld_rv32im_ntt_zetas);
+  return MLD_NATIVE_FUNC_SUCCESS;
+}
+
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_intt_native(int32_t data[MLDSA_N])
+{
+  mld_intt_rv32im_asm(data, mld_rv32im_ntt_zetas);
+  return MLD_NATIVE_FUNC_SUCCESS;
+}
+
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_poly_pointwise_montgomery_native(
+    int32_t a[MLDSA_N], const int32_t b[MLDSA_N])
+{
+  mld_poly_pointwise_montgomery_rv32im_asm(a, b);
+  return MLD_NATIVE_FUNC_SUCCESS;
+}
+
+#endif /* !__ASSEMBLER__ */
+#endif /* !MLD_NATIVE_RV32IM_META_H */
diff --git a/mldsa/src/native/rv32im/src/arith_native_rv32im.h b/mldsa/src/native/rv32im/src/arith_native_rv32im.h
new file mode 100644
index 000000000..9a987fbfd
--- /dev/null
+++ b/mldsa/src/native/rv32im/src/arith_native_rv32im.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#ifndef MLD_NATIVE_RV32IM_SRC_ARITH_NATIVE_RV32IM_H
+#define MLD_NATIVE_RV32IM_SRC_ARITH_NATIVE_RV32IM_H
+
+#include "../../../cbmc.h"
+#include "../../../common.h"
+
+#define mld_rv32im_ntt_zetas MLD_NAMESPACE(rv32im_ntt_zetas)
+
+/*
+ * Forward NTT zeta table for the RV32-IM backend.
+ *
+ * 255 logical entries, each a (zeta, w) Barrett pair: zeta is the plain
+ * centered twiddle w^{bitrev_8(k)} mod q (|zeta| <= q/2) and
+ * w = round(zeta * 2^32 / q) is the Barrett multiplier used by the
+ * constant-twiddle butterfly. The order matches the consumption order of
+ * the 2+2+2+2 forward NTT.
+ */
+MLD_INTERNAL_DATA_DECLARATION const int32_t mld_rv32im_ntt_zetas[510];
+
+#define mld_ntt_rv32im_asm MLD_NAMESPACE(ntt_rv32im_asm)
+void mld_ntt_rv32im_asm(int32_t *r, const int32_t *zetas)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int32_t) * MLDSA_N))
+  requires(array_abs_bound(r, 0, MLDSA_N, MLDSA_Q))
+  requires(zetas == mld_rv32im_ntt_zetas)
+  assigns(memory_slice(r, sizeof(int32_t) * MLDSA_N))
+  /* Forward-NTT output bound MLD_NTT_BOUND = 9 * MLD_FQMUL_BOUND. The
+   * truncating `mulh` Barrett multiply has output bound MLD_FQMUL_BOUND =
+   * 5/4 * MLDSA_Q (vs MLDSA_Q for the rounding `sqrdmulh` used on AArch64),
+   * so the NTT output is bounded by 9 * MLD_FQMUL_BOUND, not 9 * MLDSA_Q.
+   * Spelled out inline to keep this header free of poly.h. */
+  ensures(array_abs_bound(r, 0, MLDSA_N, 9 * ((5 * MLDSA_Q + 3) / 4)))
+);
+
+#define mld_intt_rv32im_asm MLD_NAMESPACE(intt_rv32im_asm)
+void mld_intt_rv32im_asm(int32_t *r, const int32_t *zetas)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int32_t) * MLDSA_N))
+  requires(array_abs_bound(r, 0, MLDSA_N, MLDSA_Q))
+  requires(zetas == mld_rv32im_ntt_zetas)
+  assigns(memory_slice(r, sizeof(int32_t) * MLDSA_N))
+  ensures(array_abs_bound(r, 0, MLDSA_N, MLDSA_Q))
+);
+
+#define mld_poly_pointwise_montgomery_rv32im_asm \
+  MLD_NAMESPACE(poly_pointwise_montgomery_rv32im_asm)
+void mld_poly_pointwise_montgomery_rv32im_asm(int32_t *a, const int32_t *b)
+__contract__(
+  requires(memory_no_alias(a, sizeof(int32_t) * MLDSA_N))
+  requires(memory_no_alias(b, sizeof(int32_t) * MLDSA_N))
+  /* Inputs bounded by MLD_NTT_BOUND = 9 * MLD_FQMUL_BOUND, the guaranteed
+   * output bound of any forward NTT. Spelled out inline to keep this header
+   * free of poly.h. */
+  requires(array_abs_bound(a, 0, MLDSA_N, 9 * ((5 * MLDSA_Q + 3) / 4)))
+  requires(array_abs_bound(b, 0, MLDSA_N, 9 * ((5 * MLDSA_Q + 3) / 4)))
+  assigns(memory_slice(a, sizeof(int32_t) * MLDSA_N))
+  ensures(array_abs_bound(a, 0, MLDSA_N, MLDSA_Q))
+);
+
+#endif /* !MLD_NATIVE_RV32IM_SRC_ARITH_NATIVE_RV32IM_H */
diff --git a/mldsa/src/native/rv32im/src/intt_rv32im_asm.S b/mldsa/src/native/rv32im/src/intt_rv32im_asm.S
new file mode 100644
index 000000000..a52b11041
--- /dev/null
+++ b/mldsa/src/native/rv32im/src/intt_rv32im_asm.S
@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * RV32-IM ML-DSA inverse NTT, fast-multiplier variant.
+ *
+ * Thin wrapper: the kernel body is shared via intt_rv32im_asm.i, which here
+ * computes the Barrett low(t*q) reduction with a single multiply by q. The
+ * slow-multiplier variant (shift-add) lives in intt_rv32im_slowmul_asm.S.
+ * Exactly one of the two is selected, by MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER.
+ */
+
+#include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_RV32IM) && \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
+    !defined(MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER)
+
+/*
+ * WARNING: This file is auto-derived from the mldsa-native source file
+ *   dev/riscv32/src/intt_rv32im_asm.S using scripts/simpasm. Do not modify it directly.
+ */
+
+.text
+.balign 4
+.global MLD_ASM_NAMESPACE(intt_rv32im_asm)
+MLD_ASM_FN_SYMBOL(intt_rv32im_asm)
+
+        .cfi_startproc
+        addi sp, sp, -0x20  /* 32-byte frame holding s0-s7 */
+        .cfi_adjust_cfa_offset 0x20
+        sw s0, 0x0(sp)
+        sw s1, 0x4(sp)
+        sw s2, 0x8(sp)
+        sw s3, 0xc(sp)
+        sw s4, 0x10(sp)
+        sw s5, 0x14(sp)
+        sw s6, 0x18(sp)
+        sw s7, 0x1c(sp)
+        lui t0, 0x7fe
+        addi t0, t0, 0x1  /* t0 = 0x7fe001 = q */
+        addi a1, a1, 0x7f8  /* a1 = zetas + 510 words; table is read backwards */
+        mv t2, a0  /* t2 = cursor into r */
+        addi t3, a0, 0x400  /* t3 = r + 256 words (end of r) */
+
+Lintt_rv32im_p1_outer:  /* pass 1: 4 adjacent words per iteration, 64 iterations */
+        addi a1, a1, -0x18  /* step back 6 words = three (zeta, w) pairs */
+        lw s0, 0x0(a1)
+        lw s1, 0x4(a1)
+        lw s2, 0x8(a1)
+        lw s3, 0xc(a1)
+        lw s4, 0x10(a1)
+        lw s5, 0x14(a1)
+        lw a2, 0x0(t2)
+        lw a3, 0x4(t2)
+        lw a4, 0x8(t2)
+        lw a5, 0xc(t2)
+        sub a6, a3, a2  /* d = a3 - a2 */
+        add a2, a2, a3  /* sum stays unreduced */
+        mulh a7, a6, s5  /* t = high(d * w) */
+        mul a3, a6, s4  /* low(d * zeta) */
+        mul a7, a7, t0  /* low(t * q) */
+        sub a3, a3, a7  /* a3 = d*zeta - t*q */
+        sub a6, a5, a4  /* same butterfly on (a4, a5) with pair (s2, s3) */
+        add a4, a4, a5
+        mulh a7, a6, s3
+        mul a5, a6, s2
+        mul a7, a7, t0
+        sub a5, a5, a7
+        sub a6, a4, a2  /* second layer: (a2, a4) with pair (s0, s1) */
+        add a2, a2, a4
+        mulh a7, a6, s1
+        mul a4, a6, s0
+        mul a7, a7, t0
+        sub a4, a4, a7
+        sub a6, a5, a3  /* second layer: (a3, a5) with pair (s0, s1) */
+        add a3, a3, a5
+        mulh a7, a6, s1
+        mul a5, a6, s0
+        mul a7, a7, t0
+        sub a5, a5, a7
+        sw a2, 0x0(t2)
+        sw a3, 0x4(t2)
+        sw a4, 0x8(t2)
+        sw a5, 0xc(t2)
+        addi t2, t2, 0x10
+        bne t2, t3, Lintt_rv32im_p1_outer
+        mv t2, a0  /* rewind cursor for pass 2 */
+        addi t3, a0, 0x400
+
+Lintt_rv32im_p2_outer:  /* pass 2: stride 4 words, one twiddle set per 16-word block */
+        addi a1, a1, -0x18
+        lw s0, 0x0(a1)
+        lw s1, 0x4(a1)
+        lw s2, 0x8(a1)
+        lw s3, 0xc(a1)
+        lw s4, 0x10(a1)
+        lw s5, 0x14(a1)
+        addi t4, t2, 0x10  /* inner loop covers 4 words */
+
+Lintt_rv32im_p2_inner:
+        lw a2, 0x0(t2)
+        lw a3, 0x10(t2)
+        lw a4, 0x20(t2)
+        lw a5, 0x30(t2)
+        sub a6, a3, a2
+        add a2, a2, a3
+        mulh a7, a6, s5
+        mul a3, a6, s4
+        mul a7, a7, t0
+        sub a3, a3, a7
+        sub a6, a5, a4
+        add a4, a4, a5
+        mulh a7, a6, s3
+        mul a5, a6, s2
+        mul a7, a7, t0
+        sub a5, a5, a7
+        sub a6, a4, a2
+        add a2, a2, a4
+        mulh a7, a6, s1
+        mul a4, a6, s0
+        mul a7, a7, t0
+        sub a4, a4, a7
+        sub a6, a5, a3
+        add a3, a3, a5
+        mulh a7, a6, s1
+        mul a5, a6, s0
+        mul a7, a7, t0
+        sub a5, a5, a7
+        sw a2, 0x0(t2)
+        sw a3, 0x10(t2)
+        sw a4, 0x20(t2)
+        sw a5, 0x30(t2)
+        addi t2, t2, 0x4
+        bne t2, t4, Lintt_rv32im_p2_inner
+        addi t2, t2, 0x30  /* skip to the next 16-word block */
+        bne t2, t3, Lintt_rv32im_p2_outer
+        mv t2, a0  /* rewind cursor for pass 3 */
+        addi t3, a0, 0x400
+
+Lintt_rv32im_p3_outer:  /* pass 3: stride 16 words, one twiddle set per 64-word block */
+        addi a1, a1, -0x18
+        lw s0, 0x0(a1)
+        lw s1, 0x4(a1)
+        lw s2, 0x8(a1)
+        lw s3, 0xc(a1)
+        lw s4, 0x10(a1)
+        lw s5, 0x14(a1)
+        addi t4, t2, 0x40  /* inner loop covers 16 words */
+
+Lintt_rv32im_p3_inner:
+        lw a2, 0x0(t2)
+        lw a3, 0x40(t2)
+        lw a4, 0x80(t2)
+        lw a5, 0xc0(t2)
+        sub a6, a3, a2
+        add a2, a2, a3
+        mulh a7, a6, s5
+        mul a3, a6, s4
+        mul a7, a7, t0
+        sub a3, a3, a7
+        sub a6, a5, a4
+        add a4, a4, a5
+        mulh a7, a6, s3
+        mul a5, a6, s2
+        mul a7, a7, t0
+        sub a5, a5, a7
+        sub a6, a4, a2
+        add a2, a2, a4
+        mulh a7, a6, s1
+        mul a4, a6, s0
+        mul a7, a7, t0
+        sub a4, a4, a7
+        sub a6, a5, a3
+        add a3, a3, a5
+        mulh a7, a6, s1
+        mul a5, a6, s0
+        mul a7, a7, t0
+        sub a5, a5, a7
+        sw a2, 0x0(t2)
+        sw a3, 0x40(t2)
+        sw a4, 0x80(t2)
+        sw a5, 0xc0(t2)
+        addi t2, t2, 0x4
+        bne t2, t4, Lintt_rv32im_p3_inner
+        addi t2, t2, 0xc0  /* skip to the next 64-word block */
+        bne t2, t3, Lintt_rv32im_p3_outer
+        addi a1, a1, -0x18  /* last 6 words: a1 is back at the table start */
+        lw s0, 0x0(a1)
+        lw s1, 0x4(a1)
+        lw s2, 0x8(a1)
+        lw s3, 0xc(a1)
+        lw s4, 0x10(a1)
+        lw s5, 0x14(a1)
+        mv t2, a0
+        addi t4, a0, 0x100  /* pass 4 covers the first 64 words */
+
+Lintt_rv32im_p4_inner:  /* pass 4: stride 64 words, single twiddle set */
+        lw a2, 0x0(t2)
+        lw a3, 0x100(t2)
+        lw a4, 0x200(t2)
+        lw a5, 0x300(t2)
+        sub a6, a3, a2
+        add a2, a2, a3
+        mulh a7, a6, s5
+        mul a3, a6, s4
+        mul a7, a7, t0
+        sub a3, a3, a7
+        sub a6, a5, a4
+        add a4, a4, a5
+        mulh a7, a6, s3
+        mul a5, a6, s2
+        mul a7, a7, t0
+        sub a5, a5, a7
+        sub a6, a4, a2
+        add a2, a2, a4
+        mulh a7, a6, s1
+        mul a4, a6, s0
+        mul a7, a7, t0
+        sub a4, a4, a7
+        sub a6, a5, a3
+        add a3, a3, a5
+        mulh a7, a6, s1
+        mul a5, a6, s0
+        mul a7, a7, t0
+        sub a5, a5, a7
+        sw a2, 0x0(t2)
+        sw a3, 0x100(t2)
+        sw a4, 0x200(t2)
+        sw a5, 0x300(t2)
+        addi t2, t2, 0x4
+        bne t2, t4, Lintt_rv32im_p4_inner
+        lui s6, 0x4
+        addi s6, s6, -0x2  /* s6 = 0x3ffe = 2^24 mod q, the final scale factor */
+        lui s7, 0x1004
+        addi s7, s7, -0x7f4  /* s7 = 0x100380c, multiplier used with s6 below */
+        mv t2, a0
+        addi t5, a0, 0x400  /* t5 = end of r */
+
+Lintt_rv32im_scale:  /* final pass: multiply every coefficient by s6, reduced */
+        lw a2, 0x0(t2)
+        mulh a6, a2, s7  /* high(r[i] * s7) */
+        addi a6, a6, 0x1
+        srai a6, a6, 0x1  /* t = (high + 1) >> 1 */
+        mul a3, a2, s6  /* low(r[i] * s6) */
+        mul a6, a6, t0  /* low(t * q) */
+        sub a3, a3, a6  /* r[i]*s6 - t*q */
+        sw a3, 0x0(t2)
+        addi t2, t2, 0x4
+        bne t2, t5, Lintt_rv32im_scale
+        lw s0, 0x0(sp)  /* restore callee-saved registers */
+        lw s1, 0x4(sp)
+        lw s2, 0x8(sp)
+        lw s3, 0xc(sp)
+        lw s4, 0x10(sp)
+        lw s5, 0x14(sp)
+        lw s6, 0x18(sp)
+        lw s7, 0x1c(sp)
+        addi sp, sp, 0x20
+        .cfi_adjust_cfa_offset -0x20
+        ret
+        .cfi_endproc
+
+MLD_ASM_FN_SIZE(intt_rv32im_asm)
+
+#endif /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
+          !MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER */
+
+#if defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/mldsa/src/native/rv32im/src/intt_rv32im_slowmul_asm.S b/mldsa/src/native/rv32im/src/intt_rv32im_slowmul_asm.S
new file mode 100644
index 000000000..b26242a04
--- /dev/null
+++ b/mldsa/src/native/rv32im/src/intt_rv32im_slowmul_asm.S
@@ -0,0 +1,323 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * RV32-IM ML-DSA inverse NTT, slow-multiplier variant.
+ *
+ * Thin wrapper: the kernel body is shared via intt_rv32im_asm.i, which here
+ * computes the Barrett low(t*q) reduction with a shift-add chain (exploiting
+ * q = 2^23 - 2^13 + 1), trading the multiply for cheap ALU ops -- preferred
+ * when the multiplier is slow. The fast-multiplier variant lives in
+ * intt_rv32im_asm.S. Exactly one of the two is selected, by
+ * MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER.
+ */
+
+#include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_RV32IM) && \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
+    defined(MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER)
+
+/*
+ * WARNING: This file is auto-derived from the mldsa-native source file
+ *   dev/riscv32/src/intt_rv32im_slowmul_asm.S using scripts/simpasm. Do not modify it directly.
+ */
+
+.text
+.balign 4
+.global MLD_ASM_NAMESPACE(intt_rv32im_asm)
+MLD_ASM_FN_SYMBOL(intt_rv32im_asm)
+
+        .cfi_startproc
+        addi sp, sp, -0x20  /* 32-byte frame holding s0-s7 */
+        .cfi_adjust_cfa_offset 0x20
+        sw s0, 0x0(sp)
+        sw s1, 0x4(sp)
+        sw s2, 0x8(sp)
+        sw s3, 0xc(sp)
+        sw s4, 0x10(sp)
+        sw s5, 0x14(sp)
+        sw s6, 0x18(sp)
+        sw s7, 0x1c(sp)
+        addi a1, a1, 0x7f8  /* a1 = zetas + 510 words; table is read backwards */
+        mv t2, a0  /* t2 = cursor into r */
+        addi t3, a0, 0x400  /* t3 = r + 256 words (end of r) */
+
+Lintt_rv32im_p1_outer:  /* pass 1: 4 adjacent words per iteration, 64 iterations */
+        addi a1, a1, -0x18  /* step back 6 words = three (zeta, w) pairs */
+        lw s0, 0x0(a1)
+        lw s1, 0x4(a1)
+        lw s2, 0x8(a1)
+        lw s3, 0xc(a1)
+        lw s4, 0x10(a1)
+        lw s5, 0x14(a1)
+        lw a2, 0x0(t2)
+        lw a3, 0x4(t2)
+        lw a4, 0x8(t2)
+        lw a5, 0xc(t2)
+        sub a6, a3, a2  /* d = a3 - a2 */
+        add a2, a2, a3  /* sum stays unreduced */
+        mulh a7, a6, s5  /* t = high(d * w) */
+        mul a3, a6, s4  /* low(d * zeta) */
+        sub a3, a3, a7  /* - t */
+        slli a7, a7, 0xd
+        add a3, a3, a7  /* + (t << 13) */
+        slli a7, a7, 0xa
+        sub a3, a3, a7  /* - (t << 23): net a3 = d*zeta - t*(2^23 - 2^13 + 1) */
+        sub a6, a5, a4  /* same butterfly on (a4, a5) with pair (s2, s3) */
+        add a4, a4, a5
+        mulh a7, a6, s3
+        mul a5, a6, s2
+        sub a5, a5, a7
+        slli a7, a7, 0xd
+        add a5, a5, a7
+        slli a7, a7, 0xa
+        sub a5, a5, a7
+        sub a6, a4, a2  /* second layer: (a2, a4) with pair (s0, s1) */
+        add a2, a2, a4
+        mulh a7, a6, s1
+        mul a4, a6, s0
+        sub a4, a4, a7
+        slli a7, a7, 0xd
+        add a4, a4, a7
+        slli a7, a7, 0xa
+        sub a4, a4, a7
+        sub a6, a5, a3  /* second layer: (a3, a5) with pair (s0, s1) */
+        add a3, a3, a5
+        mulh a7, a6, s1
+        mul a5, a6, s0
+        sub a5, a5, a7
+        slli a7, a7, 0xd
+        add a5, a5, a7
+        slli a7, a7, 0xa
+        sub a5, a5, a7
+        sw a2, 0x0(t2)
+        sw a3, 0x4(t2)
+        sw a4, 0x8(t2)
+        sw a5, 0xc(t2)
+        addi t2, t2, 0x10
+        bne t2, t3, Lintt_rv32im_p1_outer
+        mv t2, a0  /* rewind cursor for pass 2 */
+        addi t3, a0, 0x400
+
+Lintt_rv32im_p2_outer:  /* pass 2: stride 4 words, one twiddle set per 16-word block */
+        addi a1, a1, -0x18
+        lw s0, 0x0(a1)
+        lw s1, 0x4(a1)
+        lw s2, 0x8(a1)
+        lw s3, 0xc(a1)
+        lw s4, 0x10(a1)
+        lw s5, 0x14(a1)
+        addi t4, t2, 0x10  /* inner loop covers 4 words */
+
+Lintt_rv32im_p2_inner:
+        lw a2, 0x0(t2)
+        lw a3, 0x10(t2)
+        lw a4, 0x20(t2)
+        lw a5, 0x30(t2)
+        sub a6, a3, a2
+        add a2, a2, a3
+        mulh a7, a6, s5
+        mul a3, a6, s4
+        sub a3, a3, a7
+        slli a7, a7, 0xd
+        add a3, a3, a7
+        slli a7, a7, 0xa
+        sub a3, a3, a7
+        sub a6, a5, a4
+        add a4, a4, a5
+        mulh a7, a6, s3
+        mul a5, a6, s2
+        sub a5, a5, a7
+        slli a7, a7, 0xd
+        add a5, a5, a7
+        slli a7, a7, 0xa
+        sub a5, a5, a7
+        sub a6, a4, a2
+        add a2, a2, a4
+        mulh a7, a6, s1
+        mul a4, a6, s0
+        sub a4, a4, a7
+        slli a7, a7, 0xd
+        add a4, a4, a7
+        slli a7, a7, 0xa
+        sub a4, a4, a7
+        sub a6, a5, a3
+        add a3, a3, a5
+        mulh a7, a6, s1
+        mul a5, a6, s0
+        sub a5, a5, a7
+        slli a7, a7, 0xd
+        add a5, a5, a7
+        slli a7, a7, 0xa
+        sub a5, a5, a7
+        sw a2, 0x0(t2)
+        sw a3, 0x10(t2)
+        sw a4, 0x20(t2)
+        sw a5, 0x30(t2)
+        addi t2, t2, 0x4
+        bne t2, t4, Lintt_rv32im_p2_inner
+        addi t2, t2, 0x30  /* skip to the next 16-word block */
+        bne t2, t3, Lintt_rv32im_p2_outer
+        mv t2, a0  /* rewind cursor for pass 3 */
+        addi t3, a0, 0x400
+
+Lintt_rv32im_p3_outer:  /* pass 3: stride 16 words, one twiddle set per 64-word block */
+        addi a1, a1, -0x18
+        lw s0, 0x0(a1)
+        lw s1, 0x4(a1)
+        lw s2, 0x8(a1)
+        lw s3, 0xc(a1)
+        lw s4, 0x10(a1)
+        lw s5, 0x14(a1)
+        addi t4, t2, 0x40  /* inner loop covers 16 words */
+
+Lintt_rv32im_p3_inner:
+        lw a2, 0x0(t2)
+        lw a3, 0x40(t2)
+        lw a4, 0x80(t2)
+        lw a5, 0xc0(t2)
+        sub a6, a3, a2
+        add a2, a2, a3
+        mulh a7, a6, s5
+        mul a3, a6, s4
+        sub a3, a3, a7
+        slli a7, a7, 0xd
+        add a3, a3, a7
+        slli a7, a7, 0xa
+        sub a3, a3, a7
+        sub a6, a5, a4
+        add a4, a4, a5
+        mulh a7, a6, s3
+        mul a5, a6, s2
+        sub a5, a5, a7
+        slli a7, a7, 0xd
+        add a5, a5, a7
+        slli a7, a7, 0xa
+        sub a5, a5, a7
+        sub a6, a4, a2
+        add a2, a2, a4
+        mulh a7, a6, s1
+        mul a4, a6, s0
+        sub a4, a4, a7
+        slli a7, a7, 0xd
+        add a4, a4, a7
+        slli a7, a7, 0xa
+        sub a4, a4, a7
+        sub a6, a5, a3
+        add a3, a3, a5
+        mulh a7, a6, s1
+        mul a5, a6, s0
+        sub a5, a5, a7
+        slli a7, a7, 0xd
+        add a5, a5, a7
+        slli a7, a7, 0xa
+        sub a5, a5, a7
+        sw a2, 0x0(t2)
+        sw a3, 0x40(t2)
+        sw a4, 0x80(t2)
+        sw a5, 0xc0(t2)
+        addi t2, t2, 0x4
+        bne t2, t4, Lintt_rv32im_p3_inner
+        addi t2, t2, 0xc0  /* skip to the next 64-word block */
+        bne t2, t3, Lintt_rv32im_p3_outer
+        addi a1, a1, -0x18  /* last 6 words: a1 is back at the table start */
+        lw s0, 0x0(a1)
+        lw s1, 0x4(a1)
+        lw s2, 0x8(a1)
+        lw s3, 0xc(a1)
+        lw s4, 0x10(a1)
+        lw s5, 0x14(a1)
+        mv t2, a0
+        addi t4, a0, 0x100  /* pass 4 covers the first 64 words */
+
+Lintt_rv32im_p4_inner:  /* pass 4: stride 64 words, single twiddle set */
+        lw a2, 0x0(t2)
+        lw a3, 0x100(t2)
+        lw a4, 0x200(t2)
+        lw a5, 0x300(t2)
+        sub a6, a3, a2
+        add a2, a2, a3
+        mulh a7, a6, s5
+        mul a3, a6, s4
+        sub a3, a3, a7
+        slli a7, a7, 0xd
+        add a3, a3, a7
+        slli a7, a7, 0xa
+        sub a3, a3, a7
+        sub a6, a5, a4
+        add a4, a4, a5
+        mulh a7, a6, s3
+        mul a5, a6, s2
+        sub a5, a5, a7
+        slli a7, a7, 0xd
+        add a5, a5, a7
+        slli a7, a7, 0xa
+        sub a5, a5, a7
+        sub a6, a4, a2
+        add a2, a2, a4
+        mulh a7, a6, s1
+        mul a4, a6, s0
+        sub a4, a4, a7
+        slli a7, a7, 0xd
+        add a4, a4, a7
+        slli a7, a7, 0xa
+        sub a4, a4, a7
+        sub a6, a5, a3
+        add a3, a3, a5
+        mulh a7, a6, s1
+        mul a5, a6, s0
+        sub a5, a5, a7
+        slli a7, a7, 0xd
+        add a5, a5, a7
+        slli a7, a7, 0xa
+        sub a5, a5, a7
+        sw a2, 0x0(t2)
+        sw a3, 0x100(t2)
+        sw a4, 0x200(t2)
+        sw a5, 0x300(t2)
+        addi t2, t2, 0x4
+        bne t2, t4, Lintt_rv32im_p4_inner
+        lui s6, 0x4
+        addi s6, s6, -0x2  /* s6 = 0x3ffe = 2^24 mod q, the final scale factor */
+        lui s7, 0x1004
+        addi s7, s7, -0x7f4  /* s7 = 0x100380c, multiplier used with s6 below */
+        mv t2, a0
+        addi t5, a0, 0x400  /* t5 = end of r */
+
+Lintt_rv32im_scale:  /* final pass: multiply every coefficient by s6, reduced */
+        lw a2, 0x0(t2)
+        mulh a6, a2, s7  /* high(r[i] * s7) */
+        addi a6, a6, 0x1
+        srai a6, a6, 0x1  /* t = (high + 1) >> 1 */
+        mul a3, a2, s6  /* low(r[i] * s6) */
+        sub a3, a3, a6  /* shift-add chain: subtract t*(2^23 - 2^13 + 1) */
+        slli a6, a6, 0xd
+        add a3, a3, a6
+        slli a6, a6, 0xa
+        sub a3, a3, a6
+        sw a3, 0x0(t2)
+        addi t2, t2, 0x4
+        bne t2, t5, Lintt_rv32im_scale
+        lw s0, 0x0(sp)  /* restore callee-saved registers */
+        lw s1, 0x4(sp)
+        lw s2, 0x8(sp)
+        lw s3, 0xc(sp)
+        lw s4, 0x10(sp)
+        lw s5, 0x14(sp)
+        lw s6, 0x18(sp)
+        lw s7, 0x1c(sp)
+        addi sp, sp, 0x20
+        .cfi_adjust_cfa_offset -0x20
+        ret
+        .cfi_endproc
+
+MLD_ASM_FN_SIZE(intt_rv32im_asm)
+
+#endif /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
+          MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER */
+
+#if defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/mldsa/src/native/rv32im/src/ntt_rv32im_asm.S b/mldsa/src/native/rv32im/src/ntt_rv32im_asm.S
new file mode 100644
index 000000000..3845101f3
--- /dev/null
+++ b/mldsa/src/native/rv32im/src/ntt_rv32im_asm.S
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * RV32-IM ML-DSA forward NTT, fast-multiplier variant.
+ *
+ * Thin wrapper: the kernel body is shared via ntt_rv32im_asm.i, which here
+ * computes the Barrett low(t*q) reduction with a single multiply by q. The
+ * slow-multiplier variant (shift-add) lives in ntt_rv32im_slowmul_asm.S.
+ * Exactly one of the two is selected, by MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER.
+ */
+
+#include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_RV32IM) && \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
+    !defined(MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER)
+
+/*
+ * WARNING: This file is auto-derived from the mldsa-native source file
+ *   dev/riscv32/src/ntt_rv32im_asm.S using scripts/simpasm. Do not modify it directly.
+ */
+
+.text
+.balign 4
+.global MLD_ASM_NAMESPACE(ntt_rv32im_asm)
+MLD_ASM_FN_SYMBOL(ntt_rv32im_asm)
+
+        .cfi_startproc
+        addi sp, sp, -0x20  /* 0x18 bytes used; 0x20 keeps sp 16-byte aligned (psABI) */
+        .cfi_adjust_cfa_offset 0x20
+        sw s0, 0x0(sp)  /* save callee-saved s0-s5 */
+        sw s1, 0x4(sp)
+        sw s2, 0x8(sp)
+        sw s3, 0xc(sp)
+        sw s4, 0x10(sp)
+        sw s5, 0x14(sp)
+        lui t0, 0x7fe
+        addi t0, t0, 0x1  /* t0 = 0x7fe001 = q */
+        lw s0, 0x0(a1)  /* first three (zeta, w) pairs of the table */
+        lw s1, 0x4(a1)
+        lw s2, 0x8(a1)
+        lw s3, 0xc(a1)
+        lw s4, 0x10(a1)
+        lw s5, 0x14(a1)
+        addi a1, a1, 0x18  /* advance table pointer by 6 words */
+        mv t2, a0  /* t2 = cursor into r */
+        addi t4, a0, 0x100  /* pass 1 covers the first 64 words */
+
+Lntt_rv32im_p1_loop:  /* pass 1: stride 64 words, single twiddle set */
+        lw a2, 0x0(t2)
+        lw a3, 0x100(t2)
+        lw a4, 0x200(t2)
+        lw a5, 0x300(t2)
+        mulh a7, a4, s1  /* t = high(a4 * w) */
+        mul a6, a4, s0  /* low(a4 * zeta) */
+        mul a7, a7, t0  /* low(t * q) */
+        sub a6, a6, a7  /* m = a4*zeta - t*q */
+        sub a4, a2, a6  /* a4 = a2 - m */
+        add a2, a2, a6  /* a2 = a2 + m */
+        mulh a7, a5, s1  /* same butterfly on (a3, a5) with pair (s0, s1) */
+        mul a6, a5, s0
+        mul a7, a7, t0
+        sub a6, a6, a7
+        sub a5, a3, a6
+        add a3, a3, a6
+        mulh a7, a3, s3  /* second layer: (a2, a3) with pair (s2, s3) */
+        mul a6, a3, s2
+        mul a7, a7, t0
+        sub a6, a6, a7
+        sub a3, a2, a6
+        add a2, a2, a6
+        mulh a7, a5, s5  /* second layer: (a4, a5) with pair (s4, s5) */
+        mul a6, a5, s4
+        mul a7, a7, t0
+        sub a6, a6, a7
+        sub a5, a4, a6
+        add a4, a4, a6
+        sw a2, 0x0(t2)
+        sw a3, 0x100(t2)
+        sw a4, 0x200(t2)
+        sw a5, 0x300(t2)
+        addi t2, t2, 0x4
+        bne t2, t4, Lntt_rv32im_p1_loop
+        mv t2, a0  /* rewind cursor for pass 2 */
+        addi t3, a0, 0x400  /* t3 = r + 256 words (end of r) */
+
+Lntt_rv32im_p2_outer:  /* pass 2: stride 16 words, one twiddle set per 64-word block */
+        lw s0, 0x0(a1)
+        lw s1, 0x4(a1)
+        lw s2, 0x8(a1)
+        lw s3, 0xc(a1)
+        lw s4, 0x10(a1)
+        lw s5, 0x14(a1)
+        addi a1, a1, 0x18
+        addi t4, t2, 0x40  /* inner loop covers 16 words */
+
+Lntt_rv32im_p2_inner:
+        lw a2, 0x0(t2)
+        lw a3, 0x40(t2)
+        lw a4, 0x80(t2)
+        lw a5, 0xc0(t2)
+        mulh a7, a4, s1
+        mul a6, a4, s0
+        mul a7, a7, t0
+        sub a6, a6, a7
+        sub a4, a2, a6
+        add a2, a2, a6
+        mulh a7, a5, s1
+        mul a6, a5, s0
+        mul a7, a7, t0
+        sub a6, a6, a7
+        sub a5, a3, a6
+        add a3, a3, a6
+        mulh a7, a3, s3
+        mul a6, a3, s2
+        mul a7, a7, t0
+        sub a6, a6, a7
+        sub a3, a2, a6
+        add a2, a2, a6
+        mulh a7, a5, s5
+        mul a6, a5, s4
+        mul a7, a7, t0
+        sub a6, a6, a7
+        sub a5, a4, a6
+        add a4, a4, a6
+        sw a2, 0x0(t2)
+        sw a3, 0x40(t2)
+        sw a4, 0x80(t2)
+        sw a5, 0xc0(t2)
+        addi t2, t2, 0x4
+        bne t2, t4, Lntt_rv32im_p2_inner
+        addi t2, t2, 0xc0  /* skip to the next 64-word block */
+        bne t2, t3, Lntt_rv32im_p2_outer
+        mv t2, a0  /* rewind cursor for pass 3 */
+        addi t3, a0, 0x400
+
+Lntt_rv32im_p3_outer:  /* pass 3: stride 4 words, one twiddle set per 16-word block */
+        lw s0, 0x0(a1)
+        lw s1, 0x4(a1)
+        lw s2, 0x8(a1)
+        lw s3, 0xc(a1)
+        lw s4, 0x10(a1)
+        lw s5, 0x14(a1)
+        addi a1, a1, 0x18
+        addi t4, t2, 0x10  /* inner loop covers 4 words */
+
+Lntt_rv32im_p3_inner:
+        lw a2, 0x0(t2)
+        lw a3, 0x10(t2)
+        lw a4, 0x20(t2)
+        lw a5, 0x30(t2)
+        mulh a7, a4, s1
+        mul a6, a4, s0
+        mul a7, a7, t0
+        sub a6, a6, a7
+        sub a4, a2, a6
+        add a2, a2, a6
+        mulh a7, a5, s1
+        mul a6, a5, s0
+        mul a7, a7, t0
+        sub a6, a6, a7
+        sub a5, a3, a6
+        add a3, a3, a6
+        mulh a7, a3, s3
+        mul a6, a3, s2
+        mul a7, a7, t0
+        sub a6, a6, a7
+        sub a3, a2, a6
+        add a2, a2, a6
+        mulh a7, a5, s5
+        mul a6, a5, s4
+        mul a7, a7, t0
+        sub a6, a6, a7
+        sub a5, a4, a6
+        add a4, a4, a6
+        sw a2, 0x0(t2)
+        sw a3, 0x10(t2)
+        sw a4, 0x20(t2)
+        sw a5, 0x30(t2)
+        addi t2, t2, 0x4
+        bne t2, t4, Lntt_rv32im_p3_inner
+        addi t2, t2, 0x30  /* skip to the next 16-word block */
+        bne t2, t3, Lntt_rv32im_p3_outer
+        mv t2, a0  /* rewind cursor for pass 4 */
+        addi t3, a0, 0x400
+
+Lntt_rv32im_p4_outer:  /* pass 4: 4 adjacent words and a fresh twiddle set per iteration */
+        lw s0, 0x0(a1)
+        lw s1, 0x4(a1)
+        lw s2, 0x8(a1)
+        lw s3, 0xc(a1)
+        lw s4, 0x10(a1)
+        lw s5, 0x14(a1)
+        addi a1, a1, 0x18
+        lw a2, 0x0(t2)
+        lw a3, 0x4(t2)
+        lw a4, 0x8(t2)
+        lw a5, 0xc(t2)
+        mulh a7, a4, s1
+        mul a6, a4, s0
+        mul a7, a7, t0
+        sub a6, a6, a7
+        sub a4, a2, a6
+        add a2, a2, a6
+        mulh a7, a5, s1
+        mul a6, a5, s0
+        mul a7, a7, t0
+        sub a6, a6, a7
+        sub a5, a3, a6
+        add a3, a3, a6
+        mulh a7, a3, s3
+        mul a6, a3, s2
+        mul a7, a7, t0
+        sub a6, a6, a7
+        sub a3, a2, a6
+        add a2, a2, a6
+        mulh a7, a5, s5
+        mul a6, a5, s4
+        mul a7, a7, t0
+        sub a6, a6, a7
+        sub a5, a4, a6
+        add a4, a4, a6
+        sw a2, 0x0(t2)
+        sw a3, 0x4(t2)
+        sw a4, 0x8(t2)
+        sw a5, 0xc(t2)
+        addi t2, t2, 0x10
+        bne t2, t3, Lntt_rv32im_p4_outer
+        lw s0, 0x0(sp)  /* restore callee-saved registers */
+        lw s1, 0x4(sp)
+        lw s2, 0x8(sp)
+        lw s3, 0xc(sp)
+        lw s4, 0x10(sp)
+        lw s5, 0x14(sp)
+        addi sp, sp, 0x20  /* must match the prologue frame size */
+        .cfi_adjust_cfa_offset -0x20
+        ret
+        .cfi_endproc
+
+MLD_ASM_FN_SIZE(ntt_rv32im_asm)
+
+#endif /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
+          !MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER */
+
+#if defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/mldsa/src/native/rv32im/src/ntt_rv32im_slowmul_asm.S b/mldsa/src/native/rv32im/src/ntt_rv32im_slowmul_asm.S
new file mode 100644
index 000000000..624769595
--- /dev/null
+++ b/mldsa/src/native/rv32im/src/ntt_rv32im_slowmul_asm.S
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * RV32-IM ML-DSA forward NTT, slow-multiplier variant.
+ *
+ * Thin wrapper: the kernel body is shared via ntt_rv32im_asm.i, which here
+ * computes the Barrett low(t*q) reduction with a shift-add chain (exploiting
+ * q = 2^23 - 2^13 + 1), trading the multiply for cheap ALU ops -- preferred
+ * when the multiplier is slow. The fast-multiplier variant lives in
+ * ntt_rv32im_asm.S. Exactly one of the two is selected, by
+ * MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER.
+ */
+
+#include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_RV32IM) && \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
+    defined(MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER)
+
+/*
+ * WARNING: This file is auto-derived from the mldsa-native source file
+ *   dev/riscv32/src/ntt_rv32im_slowmul_asm.S using scripts/simpasm. Do not modify it directly.
+ */
+
+.text
+.balign 4
+.global MLD_ASM_NAMESPACE(ntt_rv32im_asm)
+MLD_ASM_FN_SYMBOL(ntt_rv32im_asm)
+
+        .cfi_startproc
+        addi sp, sp, -0x20  /* 0x18 bytes used; 0x20 keeps sp 16-byte aligned (psABI) */
+        .cfi_adjust_cfa_offset 0x20
+        sw s0, 0x0(sp)  /* save callee-saved s0-s5 */
+        sw s1, 0x4(sp)
+        sw s2, 0x8(sp)
+        sw s3, 0xc(sp)
+        sw s4, 0x10(sp)
+        sw s5, 0x14(sp)
+        lw s0, 0x0(a1)  /* first three (zeta, w) pairs of the table */
+        lw s1, 0x4(a1)
+        lw s2, 0x8(a1)
+        lw s3, 0xc(a1)
+        lw s4, 0x10(a1)
+        lw s5, 0x14(a1)
+        addi a1, a1, 0x18  /* advance table pointer by 6 words */
+        mv t2, a0  /* t2 = cursor into r */
+        addi t4, a0, 0x100  /* pass 1 covers the first 64 words */
+
+Lntt_rv32im_p1_loop:  /* pass 1: stride 64 words, single twiddle set */
+        lw a2, 0x0(t2)
+        lw a3, 0x100(t2)
+        lw a4, 0x200(t2)
+        lw a5, 0x300(t2)
+        mulh a7, a4, s1  /* t = high(a4 * w) */
+        mul a6, a4, s0  /* low(a4 * zeta) */
+        sub a6, a6, a7  /* - t */
+        slli a7, a7, 0xd
+        add a6, a6, a7  /* + (t << 13) */
+        slli a7, a7, 0xa
+        sub a6, a6, a7  /* - (t << 23): net m = a4*zeta - t*(2^23 - 2^13 + 1) */
+        sub a4, a2, a6  /* a4 = a2 - m */
+        add a2, a2, a6  /* a2 = a2 + m */
+        mulh a7, a5, s1  /* same butterfly on (a3, a5) with pair (s0, s1) */
+        mul a6, a5, s0
+        sub a6, a6, a7
+        slli a7, a7, 0xd
+        add a6, a6, a7
+        slli a7, a7, 0xa
+        sub a6, a6, a7
+        sub a5, a3, a6
+        add a3, a3, a6
+        mulh a7, a3, s3  /* second layer: (a2, a3) with pair (s2, s3) */
+        mul a6, a3, s2
+        sub a6, a6, a7
+        slli a7, a7, 0xd
+        add a6, a6, a7
+        slli a7, a7, 0xa
+        sub a6, a6, a7
+        sub a3, a2, a6
+        add a2, a2, a6
+        mulh a7, a5, s5  /* second layer: (a4, a5) with pair (s4, s5) */
+        mul a6, a5, s4
+        sub a6, a6, a7
+        slli a7, a7, 0xd
+        add a6, a6, a7
+        slli a7, a7, 0xa
+        sub a6, a6, a7
+        sub a5, a4, a6
+        add a4, a4, a6
+        sw a2, 0x0(t2)
+        sw a3, 0x100(t2)
+        sw a4, 0x200(t2)
+        sw a5, 0x300(t2)
+        addi t2, t2, 0x4
+        bne t2, t4, Lntt_rv32im_p1_loop
+        mv t2, a0  /* rewind cursor for pass 2 */
+        addi t3, a0, 0x400  /* t3 = r + 256 words (end of r) */
+
+Lntt_rv32im_p2_outer:  /* pass 2: stride 16 words, one twiddle set per 64-word block */
+        lw s0, 0x0(a1)
+        lw s1, 0x4(a1)
+        lw s2, 0x8(a1)
+        lw s3, 0xc(a1)
+        lw s4, 0x10(a1)
+        lw s5, 0x14(a1)
+        addi a1, a1, 0x18
+        addi t4, t2, 0x40  /* inner loop covers 16 words */
+
+Lntt_rv32im_p2_inner:
+        lw a2, 0x0(t2)
+        lw a3, 0x40(t2)
+        lw a4, 0x80(t2)
+        lw a5, 0xc0(t2)
+        mulh a7, a4, s1
+        mul a6, a4, s0
+        sub a6, a6, a7
+        slli a7, a7, 0xd
+        add a6, a6, a7
+        slli a7, a7, 0xa
+        sub a6, a6, a7
+        sub a4, a2, a6
+        add a2, a2, a6
+        mulh a7, a5, s1
+        mul a6, a5, s0
+        sub a6, a6, a7
+        slli a7, a7, 0xd
+        add a6, a6, a7
+        slli a7, a7, 0xa
+        sub a6, a6, a7
+        sub a5, a3, a6
+        add a3, a3, a6
+        mulh a7, a3, s3
+        mul a6, a3, s2
+        sub a6, a6, a7
+        slli a7, a7, 0xd
+        add a6, a6, a7
+        slli a7, a7, 0xa
+        sub a6, a6, a7
+        sub a3, a2, a6
+        add a2, a2, a6
+        mulh a7, a5, s5
+        mul a6, a5, s4
+        sub a6, a6, a7
+        slli a7, a7, 0xd
+        add a6, a6, a7
+        slli a7, a7, 0xa
+        sub a6, a6, a7
+        sub a5, a4, a6
+        add a4, a4, a6
+        sw a2, 0x0(t2)
+        sw a3, 0x40(t2)
+        sw a4, 0x80(t2)
+        sw a5, 0xc0(t2)
+        addi t2, t2, 0x4
+        bne t2, t4, Lntt_rv32im_p2_inner
+        addi t2, t2, 0xc0  /* skip to the next 64-word block */
+        bne t2, t3, Lntt_rv32im_p2_outer
+        mv t2, a0  /* rewind cursor for pass 3 */
+        addi t3, a0, 0x400
+
+Lntt_rv32im_p3_outer:  /* pass 3: stride 4 words, one twiddle set per 16-word block */
+        lw s0, 0x0(a1)
+        lw s1, 0x4(a1)
+        lw s2, 0x8(a1)
+        lw s3, 0xc(a1)
+        lw s4, 0x10(a1)
+        lw s5, 0x14(a1)
+        addi a1, a1, 0x18
+        addi t4, t2, 0x10  /* inner loop covers 4 words */
+
+Lntt_rv32im_p3_inner:
+        lw a2, 0x0(t2)
+        lw a3, 0x10(t2)
+        lw a4, 0x20(t2)
+        lw a5, 0x30(t2)
+        mulh a7, a4, s1
+        mul a6, a4, s0
+        sub a6, a6, a7
+        slli a7, a7, 0xd
+        add a6, a6, a7
+        slli a7, a7, 0xa
+        sub a6, a6, a7
+        sub a4, a2, a6
+        add a2, a2, a6
+        mulh a7, a5, s1
+        mul a6, a5, s0
+        sub a6, a6, a7
+        slli a7, a7, 0xd
+        add a6, a6, a7
+        slli a7, a7, 0xa
+        sub a6, a6, a7
+        sub a5, a3, a6
+        add a3, a3, a6
+        mulh a7, a3, s3
+        mul a6, a3, s2
+        sub a6, a6, a7
+        slli a7, a7, 0xd
+        add a6, a6, a7
+        slli a7, a7, 0xa
+        sub a6, a6, a7
+        sub a3, a2, a6
+        add a2, a2, a6
+        mulh a7, a5, s5
+        mul a6, a5, s4
+        sub a6, a6, a7
+        slli a7, a7, 0xd
+        add a6, a6, a7
+        slli a7, a7, 0xa
+        sub a6, a6, a7
+        sub a5, a4, a6
+        add a4, a4, a6
+        sw a2, 0x0(t2)
+        sw a3, 0x10(t2)
+        sw a4, 0x20(t2)
+        sw a5, 0x30(t2)
+        addi t2, t2, 0x4
+        bne t2, t4, Lntt_rv32im_p3_inner
+        addi t2, t2, 0x30  /* skip to the next 16-word block */
+        bne t2, t3, Lntt_rv32im_p3_outer
+        mv t2, a0  /* rewind cursor for pass 4 */
+        addi t3, a0, 0x400
+
+Lntt_rv32im_p4_outer:  /* pass 4: 4 adjacent words and a fresh twiddle set per iteration */
+        lw s0, 0x0(a1)
+        lw s1, 0x4(a1)
+        lw s2, 0x8(a1)
+        lw s3, 0xc(a1)
+        lw s4, 0x10(a1)
+        lw s5, 0x14(a1)
+        addi a1, a1, 0x18
+        lw a2, 0x0(t2)
+        lw a3, 0x4(t2)
+        lw a4, 0x8(t2)
+        lw a5, 0xc(t2)
+        mulh a7, a4, s1
+        mul a6, a4, s0
+        sub a6, a6, a7
+        slli a7, a7, 0xd
+        add a6, a6, a7
+        slli a7, a7, 0xa
+        sub a6, a6, a7
+        sub a4, a2, a6
+        add a2, a2, a6
+        mulh a7, a5, s1
+        mul a6, a5, s0
+        sub a6, a6, a7
+        slli a7, a7, 0xd
+        add a6, a6, a7
+        slli a7, a7, 0xa
+        sub a6, a6, a7
+        sub a5, a3, a6
+        add a3, a3, a6
+        mulh a7, a3, s3
+        mul a6, a3, s2
+        sub a6, a6, a7
+        slli a7, a7, 0xd
+        add a6, a6, a7
+        slli a7, a7, 0xa
+        sub a6, a6, a7
+        sub a3, a2, a6
+        add a2, a2, a6
+        mulh a7, a5, s5
+        mul a6, a5, s4
+        sub a6, a6, a7
+        slli a7, a7, 0xd
+        add a6, a6, a7
+        slli a7, a7, 0xa
+        sub a6, a6, a7
+        sub a5, a4, a6
+        add a4, a4, a6
+        sw a2, 0x0(t2)
+        sw a3, 0x4(t2)
+        sw a4, 0x8(t2)
+        sw a5, 0xc(t2)
+        addi t2, t2, 0x10
+        bne t2, t3, Lntt_rv32im_p4_outer
+        lw s0, 0x0(sp)  /* restore callee-saved registers */
+        lw s1, 0x4(sp)
+        lw s2, 0x8(sp)
+        lw s3, 0xc(sp)
+        lw s4, 0x10(sp)
+        lw s5, 0x14(sp)
+        addi sp, sp, 0x20  /* must match the prologue frame size */
+        .cfi_adjust_cfa_offset -0x20
+        ret
+        .cfi_endproc
+
+MLD_ASM_FN_SIZE(ntt_rv32im_asm)
+
+#endif /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
+          MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER */
+
+#if defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/mldsa/src/native/rv32im/src/poly_pointwise_montgomery_rv32im_asm.S b/mldsa/src/native/rv32im/src/poly_pointwise_montgomery_rv32im_asm.S
new file mode 100644
index 000000000..606a13379
--- /dev/null
+++ b/mldsa/src/native/rv32im/src/poly_pointwise_montgomery_rv32im_asm.S
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * RV32-IM ML-DSA pointwise polynomial multiplication with Montgomery
+ * reduction. Computes
+ *
+ *     a[i] = (a[i] * b[i]) * R^-1  mod q,    R = 2^32, |result| < q,
+ *
+ * for i in 0..256, in-place in a.
+ *
+ * Modular arithmetic: standard signed Montgomery reduction. Unlike the
+ * NTT, neither operand is constant, so we can't precompute a twisted
+ * form -- the kernel uses 4 multiplies per coefficient:
+ *
+ *     plo = low (a * b)            ; mul
+ *     m   = low (plo * QINV)       ; mul (low 32 of (plo * QINV))
+ *     phi = high(a * b)            ; mulh
+ *     mh  = high(m * q)            ; mulh
+ *     r   = phi - mh               ; sub
+ *
+ * Bounds: requires |a[i]|, |b[i]| < MLD_NTT_BOUND = 9 * MLD_FQMUL_BOUND
+ * (about 45q/4). The product is bounded by MLD_NTT_BOUND^2 < 2^31 * q, within
+ * the safe input range for `mld_montgomery_reduce` (|a| <= 2^31 * q).
+ */
+
+#include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_RV32IM) && \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
+
+/*
+ * WARNING: This file is auto-derived from the mldsa-native source file
+ *   dev/riscv32/src/poly_pointwise_montgomery_rv32im_asm.S using scripts/simpasm. Do not modify it directly.
+ */
+
+.text
+.balign 4
+.global MLD_ASM_NAMESPACE(poly_pointwise_montgomery_rv32im_asm)
+MLD_ASM_FN_SYMBOL(poly_pointwise_montgomery_rv32im_asm)
+
+        .cfi_startproc
+        addi sp, sp, -0x10  /* 8 bytes used; 0x10 keeps sp 16-byte aligned (psABI) */
+        .cfi_adjust_cfa_offset 0x10
+        sw s0, 0x0(sp)  /* save callee-saved s0/s1 */
+        sw s1, 0x4(sp)
+        lui s0, 0x7fe
+        addi s0, s0, 0x1  /* s0 = 0x7fe001 = q */
+        lui s1, 0x3802
+        addi s1, s1, 0x1  /* s1 = 0x3802001 = QINV */
+        addi t0, a0, 0x400  /* t0 = a + 256 words: loop end */
+
+Lpoly_pointwise_montgomery_rv32im_loop:  /* one coefficient per iteration */
+        lw a2, 0x0(a0)  /* a[i] */
+        lw a3, 0x0(a1)  /* b[i] */
+        mul a4, a2, a3  /* plo = low(a * b) */
+        mul a6, a4, s1  /* m = low(plo * QINV) */
+        mulh a5, a2, a3  /* phi = high(a * b) */
+        mulh a7, a6, s0  /* mh = high(m * q) */
+        sub a2, a5, a7  /* r = phi - mh */
+        sw a2, 0x0(a0)  /* result overwrites a[i] */
+        addi a0, a0, 0x4
+        addi a1, a1, 0x4
+        bne a0, t0, Lpoly_pointwise_montgomery_rv32im_loop
+        lw s0, 0x0(sp)  /* restore callee-saved registers */
+        lw s1, 0x4(sp)
+        addi sp, sp, 0x10  /* must match the prologue frame size */
+        .cfi_adjust_cfa_offset -0x10
+        ret
+        .cfi_endproc
+
+MLD_ASM_FN_SIZE(poly_pointwise_montgomery_rv32im_asm)
+
+#endif /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED */
+
+#if defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/mldsa/src/native/rv32im/src/rv32im_zetas.c b/mldsa/src/native/rv32im/src/rv32im_zetas.c
new file mode 100644
index 000000000..05cd415a7
--- /dev/null
+++ b/mldsa/src/native/rv32im/src/rv32im_zetas.c
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * WARNING: This file is auto-generated from scripts/autogen
+ *          in the mldsa-native repository.
+ *          Do not modify it directly.
+ */
+
+#include "../../../common.h"
+
+#if defined(MLD_ARITH_BACKEND_RV32IM) && \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
+
+#include "arith_native_rv32im.h"
+
+/*
+ * Table of zeta values used in the RV32-IM forward NTT.
+ * Each entry is a (zeta, w) Barrett pair, with zeta the plain
+ * centered twiddle (|zeta| <= q/2) and w = round(zeta * 2^32 / q)
+ * the Barrett multiplier. See autogen for details.
+ */
+MLD_ALIGN MLD_INTERNAL_DATA_DEFINITION const int32_t
+    mld_rv32im_ntt_zetas[510] = {
+        -3572223, -1830765815, 3765607,  1929875198,  3761513,  1927777021,
+        -3201494, -1640767044, -601683,  -308362795,  3542485,  1815525077,
+        -2883726, -1477910808, 2682288,  1374673747,  2129892,  1091570561,
+        -3145678, -1612161320, 3764867,  1929495947,  -1005239, -515185417,
+        -3201430, -1640734244, 557458,   285697463,   -1221177, -625853735,
+        -3370349, -1727305304, 3602218,  1846138265,  3182878,  1631226336,
+        -4063053, -2082316400, 2740543,  1404529459,  -3586446, -1838055109,
+        2663378,  1364982364,  -3110818, -1594295555, 2101410,  1076973524,
+        -1674615, -858240904,  3704823,  1898723372,  1159875,  594436433,
+        -3524442, -1806278032, 394148,   202001019,   928749,   475984260,
+        -434125,  -222489248,  1095468,  561427818,   -3506380, -1797021249,
+        676590,   346752664,   2071829,  1061813248,  -4018989, -2059733581,
+        -1335936, -684667771,  3241972,  1661512036,  2156050,  1104976547,
+        -3227876, -1654287830, 3415069,  1750224323,  1759347,  901666090,
+        1714295,  878576921,   -817536,  -418987550,  -3574466, -1831915353,
+        2453983,  1257667337,  3756790,  1925356481,  -1935799, -992097815,
+        1460718,  748618600,   -1716988, -879957084,  -3950053, -2024403852,
+        -642628,  -329347125,  -2897314, -1484874664, 3192354,  1636082790,
+        -3585098, -1837364258, 556856,   285388938,   3870317,  1983539117,
+        2815639,  1443016191,  2917338,  1495136972,  1853806,  950076368,
+        2283733,  1170414139,  3345963,  1714807468,  1858416,  952438995,
+        3073009,  1574918427,  1753,     898413,      -1935420, -991903578,
+        1277625,  654783359,   -2659525, -1363007700, -1455890, -746144248,
+        -2635473, -1350681039, 2660408,  1363460238,  -1780227, -912367099,
+        3852015,  1974159335,  -59148,   -30313375,   2772600,  1420958686,
+        4183372,  2143979939,  1182243,  605900043,   87208,    44694137,
+        -3222807, -1651689966, 636927,   326425360,   -3965306, -2032221021,
+        -3121440, -1599739335, -3956745, -2027833504, -2296397, -1176904444,
+        -274060,  -140455867,  -3284915, -1683520342, -3716946, -1904936414,
+        2508980,  1285853323,  -27812,   -14253662,   822541,   421552614,
+        2028118,  1039411342,  1009365,  517299994,   -2454145, -1257750362,
+        1937570,  993005454,   -1979497, -1014493059, 1596822,  818371958,
+        -3815725, -1955560694, -3956944, -2027935492, -3759465, -1926727420,
+        2811291,  1440787840,  -1685153, -863641633,  -3410568, -1747917558,
+        -2983781, -1529189038, 2678278,  1372618620,  -3768948, -1931587462,
+        -1109516, -568627424,  -3551006, -1819892093, 635956,   325927722,
+        4158088,  2131021878,  -250446,  -128353682,  -2455377, -1258381762,
+        1528066,  783134478,   -4146264, -2124962073, -1772588, -908452108,
+        482649,   247357819,   2192938,  1123881663,  -1727088, -885133339,
+        1148858,  588790216,   2387513,  1223601433,  -3611750, -1851023419,
+        -2962264, -1518161567, -268456,  -137583815,  -3180456, -1629985060,
+        -565603,  -289871779,  3747250,  1920467227,  2296099,  1176751719,
+        169688,   86965173,    1239911,  635454918,   -3838479, -1967222129,
+        2462444,  1262003603,  3195676,  1637785316,  2642980,  1354528380,
+        -3334383, -1708872713, 1254190,  642772911,   -12417,   -6363718,
+        -4166425, -2135294594, 2998219,  1536588520,  141835,   72690498,
+        -3488383, -1787797779, -89301,   -45766801,   2513018,  1287922800,
+        1987814,  1018755525,  -1354892, -694382729,  613238,   314284737,
+        -3197248, -1638590967, -1310261, -671509323,  -2218467, -1136965286,
+        1736313,  889861155,   -458740,  -235104446,  -1921994, -985022747,
+        235407,   120646188,   4040196,  2070602178,  -3472069, -1779436847,
+        -3250154, -1665705315, 2039144,  1045062172,  -1879878, -963438279,
+        3258457,  1669960606,  -818761,  -419615363,  -2178965, -1116720494,
+        -2579253, -1321868265, -1623354, -831969619,  2105286,  1078959975,
+        1787943,  916321552,   -2374402, -1216882040, -2033807, -1042326957,
+        -2391089, -1225434135, 586241,   300448763,   -1179613, -604552167,
+        -2254727, -1155548552, 527981,   270590488,   -2743411, -1405999311,
+        3482206,  1784632064,  -1476985, -756955444,  1994046,  1021949428,
+        -4182915, -2143745726, 2491325,  1276805128,  -1393159, -713994583,
+        -1300016, -666258756,  507927,   260312805,   -1187885, -608791570,
+        -2362063, -1210558298, -724804,  -371462360,  -1834526, -940195359,
+        -1317678, -675310538,  -3033742, -1554794072, -338420,  -173440395,
+        2461387,  1261461890,  2647994,  1357098057,  3009748,  1542497137,
+        3035980,  1555941048,  -2612853, -1339088280, 4148469,  2126092136,
+        621164,   318346816,   749577,   384158533,   -4022750, -2061661095,
+        3901472,  1999506068,  3980599,  2040058690,  2569011,  1316619236,
+        -1226661, -628664287,  -1615530, -827959816,  1723229,  883155599,
+        2925816,  1499481951,  1665318,  853476187,   2028038,  1039370342,
+        3374250,  1729304568,  1163598,  596344473,   -3369273, -1726753853,
+        1356448,  695180180,   3994671,  2047270596,  -11879,   -6087993,
+        -2775755, -1422575624, -1370517, -702390549,  3020393,  1547952704,
+        2683270,  1375177022,  3363542,  1723816713,  214880,   110126092,
+        -2778788, -1424130038, 545376,   279505433,   -770441,  -394851342,
+        -3467665, -1777179795, 3105558,  1591599803,  -1103344, -565464272,
+        2312838,  1185330464,  508145,   260424530,   -553718,  -283780712,
+        -653275,  -334803717,  860144,   440824168,   3430436,  1758099917,
+        -459163,  -235321234,  140244,   71875110,    -1514152, -776003547,
+        348812,   178766299,   -2185084, -1119856484, 3123762,  1600929361,
+        -327848,  -168022240,  2358373,  1208667171,  -2193087, -1123958025,
+        1011223,  518252220,   -3014420, -1544891539, -1716814, -879867909,
+        -2354215, -1206536194, 2926054,  1499603926,  -392707,  -201262505,
+        -3818627, -1957047970, -303005,  -155290192,  3531229,  1809756372,
+        -1922253, -985155484,  -3974485, -2036925262, -3773731, -1934038751,
+        -2236726, -1146323031, 1900052,  973777462,   -781875,  -400711272,
+        1744507,  894060583,   1054478,  540420426,   -731434,  -374860238,
+};
+
+#else /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED */
+
+MLD_EMPTY_CU(rv32im_zetas)
+
+#endif /* !(MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED) */
diff --git a/scripts/autogen b/scripts/autogen
index 3ce73146f..37b1bda22 100755
--- a/scripts/autogen
+++ b/scripts/autogen
@@ -712,6 +712,28 @@ def signed_reduce(a):
     return c
 
 
+def ntt_root_of_unity(layer, block, inv=False, scale=False):
+    """Return the un-reduced NTT twiddle for position (layer, block).
+
+    All backends share this helper; each arch-specific generator only
+    post-processes the result through its own prepare_root_for_* function.
+
+    The NTT is negacyclic, so the twiddles it needs are the second half of
+    those of a cyclic NTT of twice the size. Layers are counted from 0
+    up to 7.
+
+    inv:   use the negated exponent, as required by the inverse NTT.
+    scale: additionally multiply by 2^(32 - 8), i.e. the inverse NTT's 2^{-8}
+           combined with the Montgomery factor 2^32 (final scaling twiddle)."""
+    exponent = bitreverse(pow(2, layer) + block, 8)
+    if inv is True:
+        exponent = -exponent
+    twiddle = pow(root_of_unity, exponent, modulus)
+    if scale is True:
+        twiddle = twiddle * pow(2, 32 - 8, modulus)
+    return twiddle
+
+
 def gen_c_zetas():
     """Generate source and header file for zeta values used in
     the reference NTT and invNTT"""
@@ -743,43 +765,52 @@ def gen_c_zeta_file():
     update_file("mldsa/src/zetas.inc", "\n".join(gen()), force_format=True)
 
 
-def prepare_root_for_barrett(root):
-    """Takes a constant that the code needs to Barrett-multiply with,
-    and returns the pair of (a) its signed canonical form, (b) the
-    twisted constant used in the high-mul part of the Barrett multiplication."""
+def prepare_root_for_barrett(root, sqrdmulh=False):
+    """Takes a constant that the code needs to Barrett-multiply with, and
+    returns the pair (z, w): z the signed canonical form of `root`, and w the
+    multiplier for the high-mul step that approximates t = round(a * z / q),
+    with w ~= z * 2^32 / q.
+
+    The exact form of w depends on the high-mul instruction:
+
+      sqrdmulh=True  (AArch64 NEON `sqrdmulh`, which computes
+                      round(2 * a * c / 2^32)): the multiplier is halved, so
+                      the instruction's built-in doubling restores the full
+                      value. Rounding to even before halving keeps it exact.
+      sqrdmulh=False (a plain signed `mulh`): the full round-to-nearest value.
+                      It fits int32 since |z| < q/2 implies |w| < 2^31."""
 
     # Signed canonical reduction
-    root = signed_reduce(root)
-
-    def round_to_even(t):
-        rt = round(t)
-        if rt % 2 == 0:
-            return rt
-        # Make sure to pick a rounding target
-        # that's <= 1 away from x in absolute value.
-        if rt <= t:
-            return rt + 1
-        return rt - 1
-
-    root_twisted = round_to_even((root * 2**32) / modulus) // 2
-    return root, root_twisted
+    z = signed_reduce(root)
+    scaled = (z * 2**32) / modulus
+
+    if sqrdmulh:
+
+        def round_to_even(t):
+            rt = round(t)
+            if rt % 2 == 0:
+                return rt
+            # Make sure to pick a rounding target
+            # that's <= 1 away from t in absolute value.
+            if rt <= t:
+                return rt + 1
+            return rt - 1
+
+        w = round_to_even(scaled) // 2
+    else:
+        w = round(scaled)
 
+    # The multiplier is stored in a signed 32-bit table lane. This holds for
+    # both forms since |z| <= q/2: the mulh value peaks near 2^31 and the
+    # (halved) sqrdmulh value near 2^30.
+    assert -(2**31) <= w < 2**31
 
-def gen_aarch64_root_of_unity_for_block(layer, block, inv=False, scale=False):
-    # We are computing a negacyclic NTT; the twiddles needed here is
-    # the second half of the twiddles for a cyclic NTT of twice the size.
-    # For ease of calculating the roots, layers are numbers 0 through 7
-    # in this function.
-    log = bitreverse(pow(2, layer) + block, 8)
-    if inv is True:
-        log = -log
-    root = pow(root_of_unity, log, modulus)
+    return z, w
 
-    if scale is True:
-        # Integrate scaling by 2**(-8) and Montgomery factor 2**32 into twiddle
-        root = root * pow(2, 32 - 8, modulus)
 
-    root, root_twisted = prepare_root_for_barrett(root)
+def gen_aarch64_root_of_unity_for_block(layer, block, inv=False, scale=False):
+    root = ntt_root_of_unity(layer, block, inv=inv, scale=scale)
+    root, root_twisted = prepare_root_for_barrett(root, sqrdmulh=True)
     return root, root_twisted
 
 
@@ -958,6 +989,79 @@ def _fmt_indexed_rows(data):
         yield ",".join(map(str, row)) + f" /* {i} */,"
 
 
+def gen_rv32im_root_for_block(layer, block):
+    """Return the (z, w) Barrett pair of the forward NTT zeta at (layer, block).
+
+    z is the twiddle in plain, centered form (no Montgomery factor) and w its
+    high-mul companion. The pair is built with sqrdmulh=False because the
+    RV32-IM assembly multiplies with a plain signed `mulh`, not NEON
+    `sqrdmulh`:
+
+        t = mulh(a, w)            ~= round(a * z / q)
+        r = lo(a * z) - lo(t * q)  == (a * z) mod q   (|r| < q + epsilon).
+
+    Since Barrett multiplication yields (a * z) mod q directly, inputs and
+    outputs stay in the same plain domain as with the previous Montgomery
+    kernel, where R in the twiddle was cancelled by the reduction's R^-1."""
+    return prepare_root_for_barrett(ntt_root_of_unity(layer, block), sqrdmulh=False)
+
+
+def gen_rv32im_fwd_ntt_zetas():
+    """Yield the flattened (z, w) pairs in 2+2+2+2 forward NTT consumption order.
+
+    Every pair contributes two consecutive integers: z, then w. The NTT runs
+    4 passes (L1+L2, L3+L4, L5+L6, L7+L8); with 0-indexed layers, pass p
+    merges layers lo = 2p and hi = 2p + 1 and has 2^lo outer iterations.
+
+    Outer iteration o of pass p consumes, in this order:
+        zeta_lo  = (layer lo, block o)
+        zeta_hi0 = (layer hi, block 2*o)
+        zeta_hi1 = (layer hi, block 2*o + 1)
+
+    Total: 1 + 4 + 16 + 64 = 85 outer iters * 3 pairs = 255 pairs."""
+    for pass_idx in range(4):
+        lo_layer, hi_layer = 2 * pass_idx, 2 * pass_idx + 1
+        for outer in range(2**lo_layer):  # 1, 4, 16, 64 outer iterations
+            for layer, block in (
+                (lo_layer, outer),
+                (hi_layer, 2 * outer),
+                (hi_layer, 2 * outer + 1),
+            ):
+                yield from gen_rv32im_root_for_block(layer, block)
+
+
+def gen_rv32im_zeta_file():
+    """Write dev/riscv32/src/rv32im_zetas.c (RV32-IM forward NTT zetas)."""
+    # Lines are assembled in file order; update_file() receives them newline-joined.
+    lines = [
+        *gen_header(),
+        '#include "../../../common.h"',
+        "",
+        "#if defined(MLD_ARITH_BACKEND_RV32IM) && \\",
+        "    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)",
+        "",
+        '#include "arith_native_rv32im.h"',
+        "",
+        "/*",
+        " * Table of zeta values used in the RV32-IM forward NTT.",
+        " * Each entry is a (zeta, w) Barrett pair, with zeta the plain",
+        " * centered twiddle (|zeta| <= q/2) and w = round(zeta * 2^32 / q)",
+        " * the Barrett multiplier. See autogen for details.",
+        " */",
+        *emit_c_array(
+            "const int32_t", "mld_rv32im_ntt_zetas", gen_rv32im_fwd_ntt_zetas()
+        ),
+        "",
+        "#else",
+        "",
+        "MLD_EMPTY_CU(rv32im_zetas)",
+        "",
+        "#endif",
+        "",
+    ]
+    update_file("dev/riscv32/src/rv32im_zetas.c", "\n".join(lines))
+
+
 def gen_aarch64_zeta_file():
     def gen():
         yield from gen_header()
@@ -1551,10 +1655,7 @@ def prepare_root_for_montmul(root, mult):
 
 
 def gen_avx2_root_of_unity_for_block(layer, block, mult=False):
-    # We are computing a negacyclic NTT; the twiddles needed here is
-    # the second half of the twiddles for a cyclic NTT of twice the size.
-    log = bitreverse(pow(2, layer) + block, 8)
-    root = pow(root_of_unity, log, modulus)
+    root = ntt_root_of_unity(layer, block)
     return prepare_root_for_montmul(root, mult)
 
 
@@ -1892,6 +1993,10 @@ def riscv64(c):
     return "/riscv64/" in c
 
 
+def rv32im(c):
+    return "/rv32im/" in c
+
+
 def armv81m(c):
     return "/armv81m/" in c
 
@@ -1937,12 +2042,17 @@ def native_arith_riscv64(c):
     return native_arith(c) and riscv64(c)
 
 
+def native_arith_rv32im(c):
+    return native_arith(c) and rv32im(c)
+
+
 def native_arith_core(c):
     return (
         native_arith(c)
         and not native_arith_x86_64(c)
         and not native_arith_aarch64(c)
         and not native_arith_riscv64(c)
+        and not native_arith_rv32im(c)
     )
 
 
@@ -2048,6 +2158,11 @@ def gen_macro_undefs(extra_notes=None):
         filt=native_arith_x86_64, desc="native code (Arith, X86_64)"
     )
     yield "#endif"
+    yield "#if defined(MLD_SYS_RISCV32)"
+    yield from gen_monolithic_undef_all_core(
+        filt=native_arith_rv32im, desc="native code (Arith, RV32IM)"
+    )
+    yield "#endif"
     yield "#endif"
     yield "#endif"
     yield ""
@@ -2125,6 +2240,10 @@ def gen_monolithic_source_file():
         for c in filter(native_arith_x86_64, c_sources):
             yield f'#include "{c}"'
         yield "#endif"
+        yield "#if defined(MLD_SYS_RISCV32)"
+        for c in filter(native_arith_rv32im, c_sources):
+            yield f'#include "{c}"'
+        yield "#endif"
         yield "#endif"
         yield ""
         yield "#if defined(MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202)"
@@ -2208,6 +2327,10 @@ def gen_monolithic_asm_file():
         for c in filter(native_arith_x86_64, asm_sources):
             yield f'#include "{c}"'
         yield "#endif"
+        yield "#if defined(MLD_SYS_RISCV32)"
+        for c in filter(native_arith_rv32im, asm_sources):
+            yield f'#include "{c}"'
+        yield "#endif"
         yield "#endif"
         yield ""
         yield "#if defined(MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202)"
@@ -2260,6 +2383,8 @@ def get_config_options():
         "MLD_FORCE_RISCV64",
         "MLD_FORCE_RISCV32",
         "MLD_SYS_AARCH64_SLOW_BARREL_SHIFTER",
+        "MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER",
+        "MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER",
         "MLDSA_DEBUG",  # TODO: Rename?
         "MLD_BREAK_PCT",  # Use in PCT breakage test
         "MLD_ALLOW_NONCOMPLIANT_SIGNING_BOUND",  # Internal testing escape hatch
@@ -2359,7 +2484,10 @@ def check_macro_typos():
                 return True
 
         # 5. AWS-LC importer patch
-        if is_autogen or filename == "integration/awslc/awslc.patch":
+        if is_autogen or filename in [
+            "integration/awslc/pre_import.patch",
+            "integration/awslc/post_import.patch",
+        ]:
             return True
 
         if is_autogen or filename == "mldsa/src/common.h":
@@ -2435,6 +2563,12 @@ def check_asm_loop_labels_for_file(filename):
     # Find function symbol name
     res = _RE_FUNC_SYMBOL.search(content)
     if res is None:
+        # Shared .i kernel bodies carry the loop labels but not the function
+        # symbol (which lives in the wrapper .S next to its .global). With the
+        # symbol in another file we cannot derive the label prefix here, so
+        # skip; the wrapper .S (which does carry the symbol) is checked instead.
+        if filename.endswith(".i"):
+            return
         raise Exception(f"Could not find function symbol in assembly file {filename}")
     funcname = res.group(1)
     lbl_prefix = re.sub(r"(_(aarch64|avx2|mve))?_asm$", "", funcname) + "_"
@@ -2532,7 +2666,7 @@ def normalize_asm_macro_syntax():
 
 # Architectures autogen knows how to (cross-)compile assembly for. Used to
 # expand a bare `--force-cross` (no value) into "force all architectures".
-FORCE_CROSS_ALL_ARCHES = {"aarch64", "x86_64", "armv81m"}
+FORCE_CROSS_ALL_ARCHES = {"aarch64", "x86_64", "armv81m", "riscv32"}
 
 
 def resolve_force_cross(value):
@@ -2563,6 +2697,11 @@ def update_via_simpasm(
     force_cross=(),
     x86_64_syntax="att",
 ):
+    # force_cross: set of source architectures for which a missing cross
+    # toolchain is a hard error rather than a silent skip. Pass None or an
+    # empty set to skip silently for every arch.
+    if force_cross is None:
+        force_cross = set()
     _, infile = os.path.split(infile_full)
     if outfile is None:
         outfile = infile
@@ -2579,6 +2718,8 @@ def update_via_simpasm(
         source_arch = "x86_64"
     elif "armv81m" in infile_full:
         source_arch = "armv81m"
+    elif "riscv32" in infile_full or "rv32im" in infile_full:
+        source_arch = "riscv32"
     else:
         raise Exception(f"Could not detect architecture of source file {infile_full}.")
     # Check native architecture
@@ -2595,6 +2736,14 @@ def update_via_simpasm(
             if source_arch not in force_cross:
                 return
             raise Exception(f"Could not find cross toolchain {cross_prefix}")
+    # RISC-V 32-bit is always cross-compiled.
+    elif source_arch == "riscv32":
+        cross_prefix = "riscv32-unknown-linux-gnu-"
+        cross_gcc = cross_prefix + "gcc"
+        if shutil.which(cross_gcc) is None:
+            if source_arch not in force_cross:
+                return
+            raise Exception(f"Could not find cross toolchain {cross_prefix}")
     elif native_arch != source_arch:
         cross_prefix = f"{source_arch}-unknown-linux-gnu-"
         cross_gcc = cross_prefix + "gcc"
@@ -2613,6 +2762,8 @@ def update_via_simpasm(
                 arch = "aarch64"
             elif "armv81m" in infile_full:
                 arch = "armv81m"
+            elif "riscv32" in infile_full or "rv32im" in infile_full:
+                arch = "riscv32"
             else:
                 arch = "x86_64"
 
@@ -2924,8 +3075,11 @@ def update_via_remove(filename):
     update_file(filename, None)
 
 
-# Only synchronize sources, but not README.md, Makefile and so on
-SYNCHRONIZED_EXTENSIONS = (".c", ".h", ".i", ".inc", ".S")
+# Only synchronize sources, but not README.md, Makefile and so on.
+# Note: .i files are dev-only kernel bodies #include'd by wrapper .S files;
+# they are flattened into the synchronized .S output by simpasm and are not
+# copied to the backend mirror themselves.
+SYNCHRONIZED_EXTENSIONS = (".c", ".h", ".inc", ".S")
 
 
 def synchronize_file(f, in_dir, out_dir, delete=False, no_simplify=False, **kwargs):
@@ -3019,6 +3173,14 @@ def synchronize_backends(
             ),
         )
 
+        update_via_copy(
+            "dev/riscv32/meta.h",
+            "mldsa/src/native/rv32im/meta.h",
+            transform=lambda c: adjust_header_guard_for_filename(
+                c, "mldsa/src/native/rv32im/meta.h"
+            ),
+        )
+
     synchronize_backend(
         f"dev/aarch64_{ty}/src",
         "mldsa/src/native/aarch64/src",
@@ -3100,6 +3262,14 @@ def synchronize_backends(
         no_simplify=no_simplify,
         cflags="-Idev/fips202/armv81m -Imldsa/src/fips202/native/armv81m -march=armv8.1-m.main+mve -mthumb",
     )
+    synchronize_backend(
+        "dev/riscv32/src",
+        "mldsa/src/native/rv32im/src",
+        delete=delete,
+        force_cross=force_cross,
+        no_simplify=no_simplify,
+        cflags="-Idev/riscv32/src -Imldsa/src/native/rv32im/src -march=rv32im -mabi=ilp32",
+    )
 
 
 def adjust_header_guard_for_filename(content, header_file):
@@ -4204,6 +4374,7 @@ def _main():
     def gen_zeta_tables():
         gen_c_zeta_file()
         gen_aarch64_zeta_file()
+        gen_rv32im_zeta_file()
         gen_aarch64_hol_light_zeta_file()
         gen_aarch64_rej_uniform_table()
         gen_hol_light_rej_uniform_table()
diff --git a/scripts/cfify b/scripts/cfify
index f37a6fa30..58ba047e0 100755
--- a/scripts/cfify
+++ b/scripts/cfify
@@ -160,6 +160,19 @@ ARMV81M_ADD_SP_PATTERN = re.compile(
 ARMV81M_BX_LR_PATTERN = re.compile(r"(\s*)bx\s+lr\s*$", re.IGNORECASE)
 
 
+# -----------------------------------------------------------------------------
+# riscv32 module-scope constants
+# -----------------------------------------------------------------------------
+# `addi sp, sp, -OFF` (allocate) and `addi sp, sp, +OFF` (free).
+RISCV32_SUB_SP_PATTERN = re.compile(
+    r"(\s*)addi\s+sp,\s*sp,\s*-(0x[0-9a-fA-F]+|\d+)", re.IGNORECASE
+)
+RISCV32_ADD_SP_PATTERN = re.compile(
+    r"(\s*)addi\s+sp,\s*sp,\s*(0x[0-9a-fA-F]+|\d+)", re.IGNORECASE
+)
+RISCV32_RET_PATTERN = re.compile(r"(\s*)ret\s*$", re.IGNORECASE)
+
+
 def armv81m_parse_reg(s):
     """Parse a single register token, returning its canonical name
     (e.g. 'r14' -> 'lr'). Raises ValueError on unrecognised input."""
@@ -524,6 +537,44 @@ def add_cfi_directives(text, arch):
                 i += 1
                 continue
 
+        elif arch == "riscv32":
+            # addi sp, sp, -OFF — stack allocation
+            match = RISCV32_SUB_SP_PATTERN.match(line)
+            if match:
+                indent, offset_str = match.groups()
+                offset = (
+                    int(offset_str, 16)
+                    if offset_str.lower().startswith("0x")
+                    else int(offset_str)
+                )
+                result.append(line)
+                result.append(f"{indent}.cfi_adjust_cfa_offset {offset:#x}")
+                i += 1
+                continue
+
+            # addi sp, sp, +OFF — stack deallocation
+            match = RISCV32_ADD_SP_PATTERN.match(line)
+            if match:
+                indent, offset_str = match.groups()
+                offset = (
+                    int(offset_str, 16)
+                    if offset_str.lower().startswith("0x")
+                    else int(offset_str)
+                )
+                result.append(line)
+                result.append(f"{indent}.cfi_adjust_cfa_offset -{offset:#x}")
+                i += 1
+                continue
+
+            # ret — function return
+            match = RISCV32_RET_PATTERN.match(line)
+            if match:
+                indent = match.group(1)
+                result.append(line)
+                result.append(f"{indent}.cfi_endproc")
+                i += 1
+                continue
+
         result.append(line)
         i += 1
 
@@ -543,7 +594,7 @@ def main():
     )
     parser.add_argument(
         "--arch",
-        choices=["aarch64", "x86_64", "armv81m"],
+        choices=["aarch64", "x86_64", "armv81m", "riscv32"],
         default="aarch64",
         help="Target architecture (default: aarch64)",
     )
diff --git a/scripts/simpasm b/scripts/simpasm
index dc34079a1..62cca9adf 100755
--- a/scripts/simpasm
+++ b/scripts/simpasm
@@ -256,6 +256,11 @@ def simplify(logger, args, asm_input, asm_output=None):
         # Armv8.1-M requires explicit triple for Thumb disassembly
         if args.arch == "armv81m":
             cmd += ["--triple=thumbv8.1m.main-none-eabi"]
+        # RISC-V 32-bit needs an explicit triple, and the M extension must be
+        # enabled via --mattr, so that llvm-objdump decodes mul/mulh instead
+        # of marking them as illegal.
+        if args.arch == "riscv32":
+            cmd += ["--triple=riscv32", "--mattr=+m"]
 
         # Add syntax option if specified
         if args.syntax and args.syntax.lower() != "att":
diff --git a/test/mk/components.mk b/test/mk/components.mk
index 67698aabe..00df4d67c 100644
--- a/test/mk/components.mk
+++ b/test/mk/components.mk
@@ -10,7 +10,7 @@ endif
 
 SOURCES += $(wildcard mldsa/src/*.c)
 ifeq ($(OPT),1)
-	SOURCES += $(wildcard mldsa/src/native/aarch64/src/*.[csS]) $(wildcard mldsa/src/native/x86_64/src/*.[csS])
+	SOURCES += $(wildcard mldsa/src/native/aarch64/src/*.[csS]) $(wildcard mldsa/src/native/x86_64/src/*.[csS]) $(wildcard mldsa/src/native/rv32im/src/*.[csS])
 	CFLAGS += -DMLD_CONFIG_USE_NATIVE_BACKEND_ARITH -DMLD_CONFIG_USE_NATIVE_BACKEND_FIPS202
 endif