diff --git a/.github/actions/multi-functest/action.yml b/.github/actions/multi-functest/action.yml index 74e8194c8..4e4c0b426 100644 --- a/.github/actions/multi-functest/action.yml +++ b/.github/actions/multi-functest/action.yml @@ -238,7 +238,10 @@ runs: nix-verbose: ${{ inputs.nix-verbose }} gh_token: ${{ inputs.gh_token }} custom_shell: ${{ inputs.custom_shell }} - cflags: "${{ inputs.cflags }} -DMLD_FORCE_RISCV32" + # The RV32-IM arithmetic backend is experimental and not picked + # up by native/meta.h's defaults; select it explicitly here. + # No-op for OPT=0 builds (MLD_CONFIG_ARITH_BACKEND_FILE is unused). + cflags: "${{ inputs.cflags }} -DMLD_FORCE_RISCV32 -DMLD_CONFIG_ARITH_BACKEND_FILE=\\\\\\\"native/rv32im/meta.h\\\\\\\"" ldflags: ${{ inputs.ldflags }} cross_prefix: riscv32-unknown-linux-gnu- exec_wrapper: "${{ inputs.exec_wrapper != '' && inputs.exec_wrapper || 'qemu-riscv32' }}" @@ -255,4 +258,3 @@ runs: rng_fail: ${{ inputs.rng_fail }} extra_args: ${{ inputs.extra_args }} extra_env: ${{ inputs.extra_env }} - diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index afffdd299..88bcaa96e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -122,8 +122,8 @@ jobs: check_namespace: 'false' - name: build + test (cross, opt) uses: ./.github/actions/multi-functest - # There is no native code yet on PPC64LE, riscv32 or AArch64_be, so no point running opt tests - if: ${{ matrix.target.mode != 'native' && (matrix.target.arch != 'ppc64le' && matrix.target.arch != 'riscv32' && matrix.target.arch != 'aarch64_be') }} + # There is no native code yet on PPC64LE or AArch64_be, so no point running opt tests + if: ${{ matrix.target.mode != 'native' && (matrix.target.arch != 'ppc64le' && matrix.target.arch != 'aarch64_be') }} with: nix-shell: ${{ matrix.target.nix_shell }} nix-cache: ${{ matrix.target.mode == 'native' && 'false' || 'true' }} @@ -134,8 +134,8 @@ jobs: opt: 'opt' - name: build + test (cross, opt, +debug) uses: 
./.github/actions/multi-functest - # There is no native code yet on PPC64LE, riscv32 or AArch64_be, so no point running opt tests - if: ${{ matrix.target.mode != 'native' && (matrix.target.arch != 'ppc64le' && matrix.target.arch != 'riscv32' && matrix.target.arch != 'aarch64_be') }} + # There is no native code yet on PPC64LE or AArch64_be, so no point running opt tests + if: ${{ matrix.target.mode != 'native' && (matrix.target.arch != 'ppc64le' && matrix.target.arch != 'aarch64_be') }} with: nix-shell: ${{ matrix.target.nix_shell }} nix-cache: ${{ matrix.target.mode == 'native' && 'false' || 'true' }} @@ -145,6 +145,31 @@ jobs: exec_wrapper: ${{ matrix.target.exec_wrapper || '' }} cflags: "-DMLDSA_DEBUG" opt: 'opt' + # The RV32IM backend has two interchangeable variants of the Barrett + # low(t*q) reduction, selected by MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER: + # a single multiply by q (default, exercised by the opt steps above) and + # a multiply-free shift-add chain (for cores with a slow multiplier). + # Exercise the slow-multiplier variant here so both are covered. 
+ - name: build + test (riscv32, slow-multiplier, opt) + uses: ./.github/actions/multi-functest + if: ${{ matrix.target.arch == 'riscv32' }} + with: + nix-shell: ${{ matrix.target.nix_shell }} + nix-cache: 'true' + gh_token: ${{ secrets.GITHUB_TOKEN }} + compile_mode: ${{ matrix.target.mode }} + cflags: "-DMLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER" + opt: 'opt' + - name: build + test (riscv32, slow-multiplier, opt, +debug) + uses: ./.github/actions/multi-functest + if: ${{ matrix.target.arch == 'riscv32' }} + with: + nix-shell: ${{ matrix.target.nix_shell }} + nix-cache: 'true' + gh_token: ${{ secrets.GITHUB_TOKEN }} + compile_mode: ${{ matrix.target.mode }} + cflags: "-DMLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER -DMLDSA_DEBUG" + opt: 'opt' backend_tests: name: AArch64 FIPS202 backends (${{ matrix.backend }}) strategy: diff --git a/README.md b/README.md index a8508b602..d2c2e8cd8 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ mldsa-native allows developers to support ML-DSA with minimal performance and ma **Maintainability and Safety:** Memory safety, type safety and absence of various classes of timing leakage are automatically checked on every change, using a combination of static model checking (using CBMC) and dynamic instrumentation (using valgrind). This reduces review and maintenance burden and accelerates safe code delivery. See [Formal Verification](#formal-verification) and [Security](#security). -**Architecture Support:** Native backends are added under a unified interface, minimizing duplicated code and reasoning. mldsa-native comes with backends for AArch64 and x86-64. See [Design](#design). +**Architecture Support:** Native backends are added under a unified interface, minimizing duplicated code and reasoning. mldsa-native comes with backends for AArch64 and x86-64, and experimental backends for Armv8.1-M and RV32-IM. See [Design](#design). 
## Quickstart for Ubuntu @@ -94,6 +94,7 @@ mldsa-native currently offers the following backends: * 64-bit Arm backend (using Neon) * 64-bit Intel/AMD backend (using AVX2) * 32-bit Armv8.1-M backend (using Helium/MVE). This is still experimental and disabled by default. +* 32-bit RISC-V backend (RV32-IM, base integer + M-extension only). This is still experimental and disabled by default. If you'd like contribute new backends, please reach out! diff --git a/dev/riscv32/meta.h b/dev/riscv32/meta.h new file mode 100644 index 000000000..a83cd62c1 --- /dev/null +++ b/dev/riscv32/meta.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLD_NATIVE_RV32IM_META_H +#define MLD_NATIVE_RV32IM_META_H + +/* Set of primitives that this backend replaces */ +#define MLD_USE_NATIVE_NTT +#define MLD_USE_NATIVE_INTT +#define MLD_USE_NATIVE_POINTWISE_MONTGOMERY + +/* Identifier for this backend so that source and assembly files + * in the build can be appropriately guarded. 
*/
+#define MLD_ARITH_BACKEND_RV32IM
+
+
+#if !defined(__ASSEMBLER__)
+#include "../api.h"
+#include "src/arith_native_rv32im.h"
+
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_ntt_native(int32_t data[MLDSA_N])
+{
+  mld_ntt_rv32im_asm(data, mld_rv32im_ntt_zetas);
+  return MLD_NATIVE_FUNC_SUCCESS;
+}
+
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_intt_native(int32_t data[MLDSA_N])
+{
+  mld_intt_rv32im_asm(data, mld_rv32im_ntt_zetas);
+  return MLD_NATIVE_FUNC_SUCCESS;
+}
+
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_poly_pointwise_montgomery_native(
+    int32_t a[MLDSA_N], const int32_t b[MLDSA_N])
+{
+  mld_poly_pointwise_montgomery_rv32im_asm(a, b);
+  return MLD_NATIVE_FUNC_SUCCESS;
+}
+
+#endif /* !__ASSEMBLER__ */
+#endif /* !MLD_NATIVE_RV32IM_META_H */
diff --git a/dev/riscv32/src/arith_native_rv32im.h b/dev/riscv32/src/arith_native_rv32im.h
new file mode 100644
index 000000000..9a987fbfd
--- /dev/null
+++ b/dev/riscv32/src/arith_native_rv32im.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#ifndef MLD_NATIVE_RV32IM_SRC_ARITH_NATIVE_RV32IM_H
+#define MLD_NATIVE_RV32IM_SRC_ARITH_NATIVE_RV32IM_H
+
+#include "../../../cbmc.h"
+#include "../../../common.h"
+
+#define mld_rv32im_ntt_zetas MLD_NAMESPACE(rv32im_ntt_zetas)
+
+/*
+ * Forward NTT zeta table for the RV32-IM backend.
+ *
+ * 255 logical entries, each a (zeta, w) Barrett pair: zeta is the plain
+ * centered twiddle omega^{bitrev_8(k)} mod q (omega: the NTT root of unity;
+ * |zeta| <= q/2) and w = round(zeta * 2^32 / q) is the Barrett multiplier
+ * used by the constant-twiddle butterfly. The order matches the
+ * consumption order of the 2+2+2+2 forward NTT.
+ */ +MLD_INTERNAL_DATA_DECLARATION const int32_t mld_rv32im_ntt_zetas[510]; + +#define mld_ntt_rv32im_asm MLD_NAMESPACE(ntt_rv32im_asm) +void mld_ntt_rv32im_asm(int32_t *r, const int32_t *zetas) +__contract__( + requires(memory_no_alias(r, sizeof(int32_t) * MLDSA_N)) + requires(array_abs_bound(r, 0, MLDSA_N, MLDSA_Q)) + requires(zetas == mld_rv32im_ntt_zetas) + assigns(memory_slice(r, sizeof(int32_t) * MLDSA_N)) + /* Forward-NTT output bound MLD_NTT_BOUND = 9 * MLD_FQMUL_BOUND. The + * truncating `mulh` Barrett multiply has output bound MLD_FQMUL_BOUND = + * 5/4 * MLDSA_Q (vs MLDSA_Q for the rounding `sqrdmulh` used on AArch64), + * so the NTT output is bounded by 9 * MLD_FQMUL_BOUND, not 9 * MLDSA_Q. + * Spelled out inline to keep this header free of poly.h. */ + ensures(array_abs_bound(r, 0, MLDSA_N, 9 * ((5 * MLDSA_Q + 3) / 4))) +); + +#define mld_intt_rv32im_asm MLD_NAMESPACE(intt_rv32im_asm) +void mld_intt_rv32im_asm(int32_t *r, const int32_t *zetas) +__contract__( + requires(memory_no_alias(r, sizeof(int32_t) * MLDSA_N)) + requires(array_abs_bound(r, 0, MLDSA_N, MLDSA_Q)) + requires(zetas == mld_rv32im_ntt_zetas) + assigns(memory_slice(r, sizeof(int32_t) * MLDSA_N)) + ensures(array_abs_bound(r, 0, MLDSA_N, MLDSA_Q)) +); + +#define mld_poly_pointwise_montgomery_rv32im_asm \ + MLD_NAMESPACE(poly_pointwise_montgomery_rv32im_asm) +void mld_poly_pointwise_montgomery_rv32im_asm(int32_t *a, const int32_t *b) +__contract__( + requires(memory_no_alias(a, sizeof(int32_t) * MLDSA_N)) + requires(memory_no_alias(b, sizeof(int32_t) * MLDSA_N)) + /* Inputs bounded by MLD_NTT_BOUND = 9 * MLD_FQMUL_BOUND, the guaranteed + * output bound of any forward NTT. Spelled out inline to keep this header + * free of poly.h. 
*/ + requires(array_abs_bound(a, 0, MLDSA_N, 9 * ((5 * MLDSA_Q + 3) / 4))) + requires(array_abs_bound(b, 0, MLDSA_N, 9 * ((5 * MLDSA_Q + 3) / 4))) + assigns(memory_slice(a, sizeof(int32_t) * MLDSA_N)) + ensures(array_abs_bound(a, 0, MLDSA_N, MLDSA_Q)) +); + +#endif /* !MLD_NATIVE_RV32IM_SRC_ARITH_NATIVE_RV32IM_H */ diff --git a/dev/riscv32/src/intt_rv32im_asm.S b/dev/riscv32/src/intt_rv32im_asm.S new file mode 100644 index 000000000..11222d02d --- /dev/null +++ b/dev/riscv32/src/intt_rv32im_asm.S @@ -0,0 +1,30 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * RV32-IM ML-DSA inverse NTT, fast-multiplier variant. + * + * Thin wrapper: the kernel body is shared via intt_rv32im_asm.i, which here + * computes the Barrett low(t*q) reduction with a single multiply by q. The + * slow-multiplier variant (shift-add) lives in intt_rv32im_slowmul_asm.S. + * Exactly one of the two is selected, by MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER. + */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_RV32IM) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ + !defined(MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER) +/* simpasm: header-end */ + + .text + .global MLD_ASM_NAMESPACE(intt_rv32im_asm) + .balign 4 +MLD_ASM_FN_SYMBOL(intt_rv32im_asm) + +#include "intt_rv32im_asm.i" + +/* simpasm: footer-start */ +#endif /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ + !MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER */ diff --git a/dev/riscv32/src/intt_rv32im_asm.i b/dev/riscv32/src/intt_rv32im_asm.i new file mode 100644 index 000000000..c84c20f6f --- /dev/null +++ b/dev/riscv32/src/intt_rv32im_asm.i @@ -0,0 +1,413 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * RV32-IM ML-DSA inverse NTT -- shared kernel body. 
+ * + * This file is #include'd by the thin wrapper .S files + * intt_rv32im_asm.S (fast multiplier: low(t*q) via a single mul) + * intt_rv32im_slowmul_asm.S (slow multiplier: low(t*q) via shift-add) + * which differ only in whether they #define + * MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER before the include. It is not a + * standalone translation unit: the backend guard, the .global directive, + * and the simpasm header/footer markers live in the wrappers. + * + * Layered structure: 2+2+2+2 (mirror of the forward NTT, with passes + * applied in reverse layer order). Each pass merges two C-layers into a + * radix-4 inner kernel that holds 4 coefficients in registers. + * + * inv-pass-1: C-layers 8, 7 (inner stride = 4 B, 64 outer iters) + * inv-pass-2: C-layers 6, 5 (inner stride = 16 B, 16 outer iters) + * inv-pass-3: C-layers 4, 3 (inner stride = 64 B, 4 outer iters) + * inv-pass-4: C-layers 2, 1 (inner stride = 256 B, 1 outer iter ) + * + * Twiddles: this routine reuses `mld_rv32im_ntt_zetas` (the forward-NTT + * table). The forward pass-(5-k) consumes its 3*N_outer pairs in + * outer order 0,1,...,N-1; the inv pass-k requires the *same* zetas but + * in reverse outer order, with the two "hi" zetas swapped. We implement + * this by initializing zeta_ptr at the end of each pass region and + * subtracting 24 bytes per outer iter; within the iter the lo zeta is + * read from offset 0 and the hi zetas from offsets 8/16 swapped via the + * GS kernel argument order. The negation that the C reference applies + * (`-mld_zetas[k]`) is absorbed by the GS butterfly form + * a' = a + b + * b' = barrett(b - a, +zeta) + * which produces the same result as the canonical + * t = a; a' = t + b; b' = barrett(t - b, -zeta). + * + * Modular arithmetic: Barrett multiplication by a constant twiddle + * (2-mul kernel t = hi(a*w), r = low(a*zeta) - low(t*q)), matching the + * forward NTT. Each zeta is a (zeta, w) pair (plain centered twiddle and + * its Barrett multiplier). 
The plain-domain result matches the previous + * Montgomery convention. The low(t*q) reduction has two bit-identical + * forms (see mul_q_sub), selected by + * MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER: shift-add or a single multiply. + * + * Final scaling: after the four passes, every coefficient is multiplied + * by the plain twiddle f = 16382 = R * 2^{-8} mod q (= 2^24 mod q), + * which folds in both the 2^{-8} of the inverse NTT and the R factor of + * the previous Montgomery output convention. This uses a rounding Barrett + * (see barrett_round): a doubled multiplier round(f*2^33/q) and a (t+1)>>1 + * round-to-nearest of the quotient, tightening the output to |coef| < q + * (measured <= 0.503 q). The truncating Barrett of the butterflies gives + * |coef| < 1.01 q, so the rounding form is used here to meet the invntt + * output contract of |coef| < q. + * + * Bounds (after each inv-pass): + * + * start : |coef| < q (= 1*q) + * after inv-pass-1 (C-L 8,7) : |coef| < 4*q + * after inv-pass-2 (C-L 6,5) : |coef| < 16*q + * after inv-pass-3 (C-L 4,3) : |coef| < 64*q + * after inv-pass-4 (C-L 2,1) : |coef| < 256*q (~ 2^31, fits int32) + * after final fqscale : |coef| < q (rounding Barrett) + */ + +/***************************************************************** + * Register aliases + *****************************************************************/ + +/* Arguments */ +#define in_ptr a0 +#define zeta_ptr a1 + +/* Working pointers / counters */ +#define data t2 +#define outer_end t3 +#define inner_end t4 +#define scale_end t5 /* end pointer for final-scaling loop */ + +/* Coefficient registers */ +#define ca a2 +#define cb a3 +#define cc a4 +#define cd a5 + +/* Butterfly temporaries */ +#define tmp0 a6 +#define tmp1 a7 + +/* Loaded zeta pair registers. Each pair is (zeta, w): the plain centered + * twiddle and its Barrett multiplier w = round(zeta * 2^32 / q). 
*/ +#define zeta_lo s0 +#define zeta_lo_w s1 +#define zeta_h0 s2 +#define zeta_h0_w s3 +#define zeta_h1 s4 +#define zeta_h1_w s5 + +/* Constants (used only by the Barrett final-scale post-loop). */ +#define f s6 /* plain fqscale: 16382 = R*2^-8 mod q */ +#define f_w2 s7 /* doubled Barrett mult: round(f*2^33/q) */ + +/* Constant q register, used only by mul_q_sub when + * MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER is undefined. t0 is caller-saved + * and otherwise unused, so no extra save/restore is needed. */ +#define q t0 /* MLDSA_Q = 8380417 */ + +/***************************************************************** + * Macros + *****************************************************************/ + +/* mul_q_sub rd, rt : + * + * rd = rd - low(rt * q) (mod 2^32), clobbers rt. + * + * Two bit-identical implementations, selected by + * MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER: + * + * defined : shift-add, exploiting q = 2^23 - 2^13 + 1, no multiply. + * undefined : single low multiply by q (q held in `q`). + * + * The reduction is the only multiplier-dependent step; the Barrett kernels, + * butterflies, final scaling and zeta table are shared. + */ +.macro mul_q_sub rd, rt +#if defined(MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER) + sub \rd, \rd, \rt /* - rt */ + slli \rt, \rt, 13 + add \rd, \rd, \rt /* + (rt<<13) */ + slli \rt, \rt, 10 + sub \rd, \rd, \rt /* - (rt<<23) => - low(rt*q) */ +#else /* MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER */ + mul \rt, \rt, q /* low(rt * q) */ + sub \rd, \rd, \rt +#endif /* !MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER */ +.endm + +/* barrett rd, ra, rzeta, rw, rt : + * + * rd = (ra * rzeta) mod q (plain domain, |rd| < 1.01 q). Clobbers: rt. + * t = hi(ra * rw) ; rd = low(ra * rzeta) - low(t * q). + * + * Uses a truncating quotient estimate t = hi(ra * w) with + * w = round(rzeta * 2^32 / q). Good enough for the butterfly bound. 
+ */ +.macro barrett rd, ra, rzeta, rw, rt + mulh \rt, \ra, \rw /* t = hi(ra * w) */ + mul \rd, \ra, \rzeta /* azl = low(ra * zeta) */ + mul_q_sub \rd, \rt /* rd = azl - low(t * q) */ +.endm + +/* barrett_round rd, ra, rf, rf_w2, rt : + * + * rd = (ra * rf) mod q (plain domain, |rd| < q). Clobbers: rt. + * + * Rounding Barrett: instead of the truncating hi(ra*w) of `barrett`, it + * uses the doubled multiplier rf_w2 = round(rf * 2^33 / q) and recovers a + * round-to-nearest quotient by qhat = (hi(ra*rf_w2) + 1) >> 1: + * t = hi(ra * rf_w2) ~ floor(2 * ra * rf / q) + * qhat = (t + 1) >> 1 ~ round(ra * rf / q) + * rd = low(ra * rf) - low(qhat * q) + * The round-to-nearest quotient gives the tighter bound |rd| < q (measured + * <= 0.503 q), versus |rd| < 1.01 q for the truncating `barrett`. + * + * rf_w2 fits int32 only because rf is small (here 16382); a general twiddle + * up to q/2 would overflow the doubled constant. Final scaling only. + */ +.macro barrett_round rd, ra, rf, rf_w2, rt + mulh \rt, \ra, \rf_w2 /* t = hi(ra * (2*f)~) */ + addi \rt, \rt, 1 + srai \rt, \rt, 1 /* qhat = (t + 1) >> 1 */ + mul \rd, \ra, \rf /* azl = low(ra * f) */ + mul_q_sub \rd, \rt /* rd = azl - low(qhat*q) */ +.endm + +/* gs_bfly ra, rb, rzeta, rw, rt0, rt1 : + * + * t = rb - ra + * ra = ra + rb + * rb = barrett(t, +rzeta) + * + * Gentleman-Sande butterfly. Each application grows |coef| by a factor of 2 + * (or by ~q, whichever is greater): the additive part doubles, the + * multiplicative part is bounded by ~q. + * + * The algebraic equivalence with the C reference's + * t = ra; ra = t + rb; rb = barrett(t - rb, -zeta) + * follows from barrett being linear in its constant: + * barrett(t - rb, -zeta) = -barrett(t - rb, +zeta) + * = barrett(rb - t, +zeta) + * = barrett(rb - ra, +zeta) (t == ra) + * which is what this macro computes. This lets us reuse the (un-negated) + * forward-NTT zeta table. + * + * Clobbers: rt0, rt1. 
+ */ +.macro gs_bfly ra, rb, rzeta, rw, rt0, rt1 + sub \rt0, \rb, \ra + add \ra, \ra, \rb + barrett \rb, \rt0, \rzeta, \rw, \rt1 +.endm + +/* gs_radix4 stride : + * + * Reads four coefficients from offsets [0, s, 2s, 3s] of `data`, + * applies the inverse-NTT radix-4 kernel using the loaded zetas, + * writes them back. + * + * Within a single inv-pass: + * - "Inner" layer (the smaller-stride C-layer, run first) pairs + * (a,b) and (c,d). The C reference uses two distinct zetas here + * (k = (1< - low(rt*q) */ +#else /* MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER */ + mul \rt, \rt, q /* low(rt * q) */ + sub \rd, \rd, \rt +#endif /* !MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER */ +.endm + +/* barrett rd, ra, rzeta, rw, rt : + * + * rd = (ra * rzeta) mod q (plain domain, |rd| < 1.01 q). + * + * rzeta : plain centered twiddle (constant) + * rw : Barrett multiplier round(rzeta * 2^32 / q) (constant) + * t = hi(ra * rw) + * rd = low(ra * rzeta) - low(t * q) + * with low(t*q) computed by mul_q_sub. Clobbers: rt. + */ +.macro barrett rd, ra, rzeta, rw, rt + mulh \rt, \ra, \rw /* t = hi(ra * w) */ + mul \rd, \ra, \rzeta /* azl = low(ra * zeta) */ + mul_q_sub \rd, \rt /* rd = azl - low(t * q) */ +.endm + +/* ct_bfly ra, rb, rzeta, rw, rt0, rt1 : + * + * t = barrett(rb, rzeta) + * rb = ra - t + * ra = ra + t + * + * Cooley-Tukey butterfly. Each application grows |coeff| by at most ~q. + * Clobbers: rt0, rt1. + */ +.macro ct_bfly ra, rb, rzeta, rw, rt0, rt1 + barrett \rt0, \rb, \rzeta, \rw, \rt1 + sub \rb, \ra, \rt0 + add \ra, \ra, \rt0 +.endm + +/* radix4_kernel stride (in bytes): + * + * Reads four coefficients from offsets [0, s, 2s, 3s] of `data`, runs + * two layers of CT butterflies using the loaded zeta pairs, writes back. + */ +.macro radix4_kernel stride + lw ca, 0(data) + lw cb, (1*\stride)(data) + lw cc, (2*\stride)(data) + lw cd, (3*\stride)(data) + + /* "Lo" layer: pair (ca,cc) and (cb,cd), both with zeta_lo. 
*/
+        ct_bfly ca, cc, zeta_lo, zeta_lo_w, tmp0, tmp1
+        ct_bfly cb, cd, zeta_lo, zeta_lo_w, tmp0, tmp1
+
+        /* "Hi" layer: (ca,cb) with zeta_h0, (cc,cd) with zeta_h1. */
+        ct_bfly ca, cb, zeta_h0, zeta_h0_w, tmp0, tmp1
+        ct_bfly cc, cd, zeta_h1, zeta_h1_w, tmp0, tmp1
+
+        sw ca, 0(data)
+        sw cb, (1*\stride)(data)
+        sw cc, (2*\stride)(data)
+        sw cd, (3*\stride)(data)
+.endm
+
+/* load_outer_zetas: load 3 (zeta, w) pairs (24 bytes) for one outer iter
+ * from `zeta_ptr`, advancing it. */
+.macro load_outer_zetas
+        lw zeta_lo, 0(zeta_ptr)
+        lw zeta_lo_w, 4(zeta_ptr)
+        lw zeta_h0, 8(zeta_ptr)
+        lw zeta_h0_w, 12(zeta_ptr)
+        lw zeta_h1, 16(zeta_ptr)
+        lw zeta_h1_w, 20(zeta_ptr)
+        addi zeta_ptr, zeta_ptr, 24
+.endm
+
+/* save / restore callee-saved s0..s5; 32 B frame keeps sp 16 B aligned. */
+.macro save_regs
+        addi sp, sp, -32        /* 24 B used; padded to the psABI's 16 B multiple */
+        sw s0, 0(sp)
+        sw s1, 4(sp)
+        sw s2, 8(sp)
+        sw s3, 12(sp)
+        sw s4, 16(sp)
+        sw s5, 20(sp)
+.endm
+
+.macro restore_regs
+        lw s0, 0(sp)
+        lw s1, 4(sp)
+        lw s2, 8(sp)
+        lw s3, 12(sp)
+        lw s4, 16(sp)
+        lw s5, 20(sp)
+        addi sp, sp, 32         /* must match the frame size in save_regs */
+.endm
+
+/*****************************************************************
+ * Function
+ *
+ * The MLD_ASM_FN_SYMBOL(ntt_rv32im_asm) entry label lives in the wrapper
+ * .S file (next to its .global), so it is the first thing in .text.
+ *****************************************************************/
+
+        save_regs
+
+#if !defined(MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER)
+        /* q = 8380417 = 0x007FE001, for the multiply in mul_q_sub. */
+        lui q, 0x7FE
+        addi q, q, 1
+#endif
+
+        /***************************************************
+         * Pass 1: C-layers 1, 2.
+         * 1 outer iter, 64 inner iters, butterfly stride = 256 B.
+         ***************************************************/
+        load_outer_zetas
+        mv data, in_ptr
+        addi inner_end, in_ptr, 256 /* 64 * 4 B */
+ntt_rv32im_p1_loop:
+        radix4_kernel 256
+        addi data, data, 4
+        bne data, inner_end, ntt_rv32im_p1_loop
+
+        /***************************************************
+         * Pass 2: C-layers 3, 4.
+ * 4 outer iters, 16 inner iters each, stride = 64 B. + * Each outer block is 256 B (= 64 coefs). + ***************************************************/ + mv data, in_ptr + addi outer_end, in_ptr, 1024 +ntt_rv32im_p2_outer: + load_outer_zetas + addi inner_end, data, 64 /* 16 * 4 B */ +ntt_rv32im_p2_inner: + radix4_kernel 64 + addi data, data, 4 + bne data, inner_end, ntt_rv32im_p2_inner + addi data, data, (256 - 64) /* skip to next 256 B block */ + bne data, outer_end, ntt_rv32im_p2_outer + + /*************************************************** + * Pass 3: C-layers 5, 6. + * 16 outer iters, 4 inner iters each, stride = 16 B. + * Each outer block is 64 B (= 16 coefs). + ***************************************************/ + mv data, in_ptr + addi outer_end, in_ptr, 1024 +ntt_rv32im_p3_outer: + load_outer_zetas + addi inner_end, data, 16 /* 4 * 4 B */ +ntt_rv32im_p3_inner: + radix4_kernel 16 + addi data, data, 4 + bne data, inner_end, ntt_rv32im_p3_inner + addi data, data, (64 - 16) /* skip to next 64 B block */ + bne data, outer_end, ntt_rv32im_p3_outer + + /*************************************************** + * Pass 4: C-layers 7, 8. + * 64 outer iters, 1 inner iter each, stride = 4 B. + * Each outer iter handles 4 consecutive coefficients. + ***************************************************/ + mv data, in_ptr + addi outer_end, in_ptr, 1024 +ntt_rv32im_p4_outer: + load_outer_zetas + radix4_kernel 4 + addi data, data, 16 + bne data, outer_end, ntt_rv32im_p4_outer + + restore_regs + ret + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. 
*/ +#undef in_ptr +#undef zeta_ptr +#undef data +#undef outer_end +#undef inner_end +#undef ca +#undef cb +#undef cc +#undef cd +#undef tmp0 +#undef tmp1 +#undef zeta_lo +#undef zeta_lo_w +#undef zeta_h0 +#undef zeta_h0_w +#undef zeta_h1 +#undef zeta_h1_w +#undef q diff --git a/dev/riscv32/src/ntt_rv32im_slowmul_asm.S b/dev/riscv32/src/ntt_rv32im_slowmul_asm.S new file mode 100644 index 000000000..b3ae6464b --- /dev/null +++ b/dev/riscv32/src/ntt_rv32im_slowmul_asm.S @@ -0,0 +1,37 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * RV32-IM ML-DSA forward NTT, slow-multiplier variant. + * + * Thin wrapper: the kernel body is shared via ntt_rv32im_asm.i, which here + * computes the Barrett low(t*q) reduction with a shift-add chain (exploiting + * q = 2^23 - 2^13 + 1), trading the multiply for cheap ALU ops -- preferred + * when the multiplier is slow. The fast-multiplier variant lives in + * ntt_rv32im_asm.S. Exactly one of the two is selected, by + * MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER. + */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_RV32IM) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ + defined(MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER) +/* simpasm: header-end */ + + .text + .global MLD_ASM_NAMESPACE(ntt_rv32im_asm) + .balign 4 +MLD_ASM_FN_SYMBOL(ntt_rv32im_asm) + +#define MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER +#include "ntt_rv32im_asm.i" + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. 
*/
+#undef MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER
+
+/* simpasm: footer-start */
+#endif /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
+          MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER */
diff --git a/dev/riscv32/src/poly_pointwise_montgomery_rv32im_asm.S b/dev/riscv32/src/poly_pointwise_montgomery_rv32im_asm.S
new file mode 100644
index 000000000..4d51c9afc
--- /dev/null
+++ b/dev/riscv32/src/poly_pointwise_montgomery_rv32im_asm.S
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * RV32-IM ML-DSA pointwise polynomial multiplication with Montgomery
+ * reduction. Computes
+ *
+ *   a[i] = (a[i] * b[i]) * R^-1 mod q,   R = 2^32,   |result| < q,
+ *
+ * for 0 <= i < 256, in-place in a.
+ *
+ * Modular arithmetic: standard signed Montgomery reduction. Unlike the
+ * NTT, neither operand is constant, so we can't precompute a twisted
+ * form -- the kernel uses 4 multiplies per coefficient:
+ *
+ *   plo = low (a * b)          ; mul
+ *   m   = low (plo * QINV)     ; mul  (low 32 of (plo * QINV))
+ *   phi = high(a * b)          ; mulh
+ *   mh  = high(m * q)          ; mulh
+ *   r   = phi - mh             ; sub
+ *
+ * Bounds: requires |a[i]|, |b[i]| < MLD_NTT_BOUND = 9 * MLD_FQMUL_BOUND
+ * (~ 11.25*q < 12*q). The product is bounded by (12*q)^2 < 2^31 * q, well
+ * within the safe input range for `mld_montgomery_reduce` (|a| <= 2^31 * q).
+ */
+
+#include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_RV32IM) && \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
+/* simpasm: header-end */
+
+/*****************************************************************
+ * Register aliases
+ *****************************************************************/
+
+/* Arguments */
+#define a_ptr a0
+#define b_ptr a1
+
+/* Loop control */
+#define a_end t0 /* end-of-array sentinel for a_ptr */
+
+/* Per-coef working set (caller-saved) */
+#define a_val a2
+#define b_val a3
+#define plo a4
+#define phi a5
+#define mlo a6
+#define mhi a7
+
+/* Constants (callee-saved) */
+#define q s0 /* MLDSA_Q = 8380417 */
+#define qinv s1 /* QINV = 58728449 */
+
+/*****************************************************************
+ * Function
+ *****************************************************************/
+
+        .text
+        .global MLD_ASM_NAMESPACE(poly_pointwise_montgomery_rv32im_asm)
+        .balign 4
+MLD_ASM_FN_SYMBOL(poly_pointwise_montgomery_rv32im_asm)
+
+        addi sp, sp, -16        /* 8 B used; 16 B frame keeps sp psABI-aligned */
+        sw s0, 0(sp)
+        sw s1, 4(sp)
+
+        /* q = 0x007FE001 */
+        lui q, 0x7FE
+        addi q, q, 1
+        /* qinv = 0x03802001 = 58728449
+         *   lui qinv, 0x3802; addi qinv, qinv, 1 -> 0x03802001 */
+        lui qinv, 0x3802
+        addi qinv, qinv, 1
+
+        addi a_end, a_ptr, 1024 /* 256 * 4 bytes */
+
+poly_pointwise_montgomery_rv32im_loop:
+        lw a_val, 0(a_ptr)
+        lw b_val, 0(b_ptr)
+
+        /* Standard signed Montgomery reduction of a*b:
+         *   plo = (a*b)     low  32
+         *   mlo = plo*QINV  low  32
+         *   phi = (a*b)     high 32 (signed)
+         *   mhi = mlo*q     high 32 (signed)
+         *   res = phi - mhi
+         */
+        mul plo, a_val, b_val
+        mul mlo, plo, qinv
+        mulh phi, a_val, b_val
+        mulh mhi, mlo, q
+        sub a_val, phi, mhi
+
+        sw a_val, 0(a_ptr)
+
+        addi a_ptr, a_ptr, 4
+        addi b_ptr, b_ptr, 4
+        bne a_ptr, a_end, poly_pointwise_montgomery_rv32im_loop
+
+        lw s0, 0(sp)
+        lw s1, 4(sp)
+        addi sp, sp, 16         /* must match the prologue's frame size */
+        ret
+
+/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
+ * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef a_ptr +#undef b_ptr +#undef a_end +#undef a_val +#undef b_val +#undef plo +#undef phi +#undef mlo +#undef mhi +#undef q +#undef qinv + +/* simpasm: footer-start */ +#endif /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/riscv32/src/rv32im_zetas.c b/dev/riscv32/src/rv32im_zetas.c new file mode 100644 index 000000000..05cd415a7 --- /dev/null +++ b/dev/riscv32/src/rv32im_zetas.c @@ -0,0 +1,118 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * WARNING: This file is auto-generated from scripts/autogen + * in the mldsa-native repository. + * Do not modify it directly. + */ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_RV32IM) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +#include "arith_native_rv32im.h" + +/* + * Table of zeta values used in the RV32-IM forward NTT. + * Each entry is a (zeta, w) Barrett pair, with zeta the plain + * centered twiddle (|zeta| <= q/2) and w = round(zeta * 2^32 / q) + * the Barrett multiplier. See autogen for details. 
+ */ +MLD_ALIGN MLD_INTERNAL_DATA_DEFINITION const int32_t + mld_rv32im_ntt_zetas[510] = { + -3572223, -1830765815, 3765607, 1929875198, 3761513, 1927777021, + -3201494, -1640767044, -601683, -308362795, 3542485, 1815525077, + -2883726, -1477910808, 2682288, 1374673747, 2129892, 1091570561, + -3145678, -1612161320, 3764867, 1929495947, -1005239, -515185417, + -3201430, -1640734244, 557458, 285697463, -1221177, -625853735, + -3370349, -1727305304, 3602218, 1846138265, 3182878, 1631226336, + -4063053, -2082316400, 2740543, 1404529459, -3586446, -1838055109, + 2663378, 1364982364, -3110818, -1594295555, 2101410, 1076973524, + -1674615, -858240904, 3704823, 1898723372, 1159875, 594436433, + -3524442, -1806278032, 394148, 202001019, 928749, 475984260, + -434125, -222489248, 1095468, 561427818, -3506380, -1797021249, + 676590, 346752664, 2071829, 1061813248, -4018989, -2059733581, + -1335936, -684667771, 3241972, 1661512036, 2156050, 1104976547, + -3227876, -1654287830, 3415069, 1750224323, 1759347, 901666090, + 1714295, 878576921, -817536, -418987550, -3574466, -1831915353, + 2453983, 1257667337, 3756790, 1925356481, -1935799, -992097815, + 1460718, 748618600, -1716988, -879957084, -3950053, -2024403852, + -642628, -329347125, -2897314, -1484874664, 3192354, 1636082790, + -3585098, -1837364258, 556856, 285388938, 3870317, 1983539117, + 2815639, 1443016191, 2917338, 1495136972, 1853806, 950076368, + 2283733, 1170414139, 3345963, 1714807468, 1858416, 952438995, + 3073009, 1574918427, 1753, 898413, -1935420, -991903578, + 1277625, 654783359, -2659525, -1363007700, -1455890, -746144248, + -2635473, -1350681039, 2660408, 1363460238, -1780227, -912367099, + 3852015, 1974159335, -59148, -30313375, 2772600, 1420958686, + 4183372, 2143979939, 1182243, 605900043, 87208, 44694137, + -3222807, -1651689966, 636927, 326425360, -3965306, -2032221021, + -3121440, -1599739335, -3956745, -2027833504, -2296397, -1176904444, + -274060, -140455867, -3284915, -1683520342, -3716946, 
-1904936414, + 2508980, 1285853323, -27812, -14253662, 822541, 421552614, + 2028118, 1039411342, 1009365, 517299994, -2454145, -1257750362, + 1937570, 993005454, -1979497, -1014493059, 1596822, 818371958, + -3815725, -1955560694, -3956944, -2027935492, -3759465, -1926727420, + 2811291, 1440787840, -1685153, -863641633, -3410568, -1747917558, + -2983781, -1529189038, 2678278, 1372618620, -3768948, -1931587462, + -1109516, -568627424, -3551006, -1819892093, 635956, 325927722, + 4158088, 2131021878, -250446, -128353682, -2455377, -1258381762, + 1528066, 783134478, -4146264, -2124962073, -1772588, -908452108, + 482649, 247357819, 2192938, 1123881663, -1727088, -885133339, + 1148858, 588790216, 2387513, 1223601433, -3611750, -1851023419, + -2962264, -1518161567, -268456, -137583815, -3180456, -1629985060, + -565603, -289871779, 3747250, 1920467227, 2296099, 1176751719, + 169688, 86965173, 1239911, 635454918, -3838479, -1967222129, + 2462444, 1262003603, 3195676, 1637785316, 2642980, 1354528380, + -3334383, -1708872713, 1254190, 642772911, -12417, -6363718, + -4166425, -2135294594, 2998219, 1536588520, 141835, 72690498, + -3488383, -1787797779, -89301, -45766801, 2513018, 1287922800, + 1987814, 1018755525, -1354892, -694382729, 613238, 314284737, + -3197248, -1638590967, -1310261, -671509323, -2218467, -1136965286, + 1736313, 889861155, -458740, -235104446, -1921994, -985022747, + 235407, 120646188, 4040196, 2070602178, -3472069, -1779436847, + -3250154, -1665705315, 2039144, 1045062172, -1879878, -963438279, + 3258457, 1669960606, -818761, -419615363, -2178965, -1116720494, + -2579253, -1321868265, -1623354, -831969619, 2105286, 1078959975, + 1787943, 916321552, -2374402, -1216882040, -2033807, -1042326957, + -2391089, -1225434135, 586241, 300448763, -1179613, -604552167, + -2254727, -1155548552, 527981, 270590488, -2743411, -1405999311, + 3482206, 1784632064, -1476985, -756955444, 1994046, 1021949428, + -4182915, -2143745726, 2491325, 1276805128, -1393159, -713994583, 
+ -1300016, -666258756, 507927, 260312805, -1187885, -608791570, + -2362063, -1210558298, -724804, -371462360, -1834526, -940195359, + -1317678, -675310538, -3033742, -1554794072, -338420, -173440395, + 2461387, 1261461890, 2647994, 1357098057, 3009748, 1542497137, + 3035980, 1555941048, -2612853, -1339088280, 4148469, 2126092136, + 621164, 318346816, 749577, 384158533, -4022750, -2061661095, + 3901472, 1999506068, 3980599, 2040058690, 2569011, 1316619236, + -1226661, -628664287, -1615530, -827959816, 1723229, 883155599, + 2925816, 1499481951, 1665318, 853476187, 2028038, 1039370342, + 3374250, 1729304568, 1163598, 596344473, -3369273, -1726753853, + 1356448, 695180180, 3994671, 2047270596, -11879, -6087993, + -2775755, -1422575624, -1370517, -702390549, 3020393, 1547952704, + 2683270, 1375177022, 3363542, 1723816713, 214880, 110126092, + -2778788, -1424130038, 545376, 279505433, -770441, -394851342, + -3467665, -1777179795, 3105558, 1591599803, -1103344, -565464272, + 2312838, 1185330464, 508145, 260424530, -553718, -283780712, + -653275, -334803717, 860144, 440824168, 3430436, 1758099917, + -459163, -235321234, 140244, 71875110, -1514152, -776003547, + 348812, 178766299, -2185084, -1119856484, 3123762, 1600929361, + -327848, -168022240, 2358373, 1208667171, -2193087, -1123958025, + 1011223, 518252220, -3014420, -1544891539, -1716814, -879867909, + -2354215, -1206536194, 2926054, 1499603926, -392707, -201262505, + -3818627, -1957047970, -303005, -155290192, 3531229, 1809756372, + -1922253, -985155484, -3974485, -2036925262, -3773731, -1934038751, + -2236726, -1146323031, 1900052, 973777462, -781875, -400711272, + 1744507, 894060583, 1054478, 540420426, -731434, -374860238, +}; + +#else /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ + +MLD_EMPTY_CU(rv32im_zetas) + +#endif /* !(MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED) */ diff --git a/flake.nix b/flake.nix index 8bd75003b..f96f910e8 100644 --- a/flake.nix +++ b/flake.nix @@ 
-162,7 +162,7 @@ # autogen shell with cross compiler for the "other" architecture devShells.cross-autogen = util.mkShell { - packages = builtins.attrValues { inherit (config.packages) linters; inherit (pkgs) gcc-arm-embedded; } + packages = builtins.attrValues { inherit (config.packages) linters toolchain_riscv32; inherit (pkgs) gcc-arm-embedded; } ++ pkgs.lib.optionals pkgs.stdenv.hostPlatform.isx86_64 [ config.packages.toolchain_aarch64 ] ++ pkgs.lib.optionals pkgs.stdenv.hostPlatform.isAarch64 [ config.packages.toolchain_x86_64 ]; }; diff --git a/mldsa/mldsa_native.c b/mldsa/mldsa_native.c index 9365ed369..1f02ee81b 100644 --- a/mldsa/mldsa_native.c +++ b/mldsa/mldsa_native.c @@ -92,6 +92,9 @@ #include "src/native/x86_64/src/rej_uniform_eta4_avx2.c" #include "src/native/x86_64/src/rej_uniform_table.c" #endif /* MLD_SYS_X86_64 */ +#if defined(MLD_SYS_RISCV32) +#include "src/native/rv32im/src/rv32im_zetas.c" +#endif #endif /* MLD_CONFIG_USE_NATIVE_BACKEND_ARITH */ #if defined(MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202) @@ -805,5 +808,22 @@ #undef MLD_NATIVE_X86_64_SRC_CONSTS_H #undef mld_qdata #endif /* MLD_SYS_X86_64 */ +#if defined(MLD_SYS_RISCV32) +/* + * Undefine macros from native code (Arith, RV32IM) + */ +/* mldsa/src/native/rv32im/meta.h */ +#undef MLD_ARITH_BACKEND_RV32IM +#undef MLD_NATIVE_RV32IM_META_H +#undef MLD_USE_NATIVE_INTT +#undef MLD_USE_NATIVE_NTT +#undef MLD_USE_NATIVE_POINTWISE_MONTGOMERY +/* mldsa/src/native/rv32im/src/arith_native_rv32im.h */ +#undef MLD_NATIVE_RV32IM_SRC_ARITH_NATIVE_RV32IM_H +#undef mld_intt_rv32im_asm +#undef mld_ntt_rv32im_asm +#undef mld_poly_pointwise_montgomery_rv32im_asm +#undef mld_rv32im_ntt_zetas +#endif /* MLD_SYS_RISCV32 */ #endif /* MLD_CONFIG_USE_NATIVE_BACKEND_ARITH */ #endif /* !MLD_CONFIG_MONOBUILD_KEEP_SHARED_HEADERS */ diff --git a/mldsa/mldsa_native_asm.S b/mldsa/mldsa_native_asm.S index 4877d5156..902021a31 100644 --- a/mldsa/mldsa_native_asm.S +++ b/mldsa/mldsa_native_asm.S @@ -91,6 +91,13 @@ #include 
"src/native/x86_64/src/polyz_unpack_17_avx2_asm.S" #include "src/native/x86_64/src/polyz_unpack_19_avx2_asm.S" #endif /* MLD_SYS_X86_64 */ +#if defined(MLD_SYS_RISCV32) +#include "src/native/rv32im/src/intt_rv32im_asm.S" +#include "src/native/rv32im/src/intt_rv32im_slowmul_asm.S" +#include "src/native/rv32im/src/ntt_rv32im_asm.S" +#include "src/native/rv32im/src/ntt_rv32im_slowmul_asm.S" +#include "src/native/rv32im/src/poly_pointwise_montgomery_rv32im_asm.S" +#endif /* MLD_SYS_RISCV32 */ #endif /* MLD_CONFIG_USE_NATIVE_BACKEND_ARITH */ #if defined(MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202) @@ -818,5 +825,22 @@ #undef MLD_NATIVE_X86_64_SRC_CONSTS_H #undef mld_qdata #endif /* MLD_SYS_X86_64 */ +#if defined(MLD_SYS_RISCV32) +/* + * Undefine macros from native code (Arith, RV32IM) + */ +/* mldsa/src/native/rv32im/meta.h */ +#undef MLD_ARITH_BACKEND_RV32IM +#undef MLD_NATIVE_RV32IM_META_H +#undef MLD_USE_NATIVE_INTT +#undef MLD_USE_NATIVE_NTT +#undef MLD_USE_NATIVE_POINTWISE_MONTGOMERY +/* mldsa/src/native/rv32im/src/arith_native_rv32im.h */ +#undef MLD_NATIVE_RV32IM_SRC_ARITH_NATIVE_RV32IM_H +#undef mld_intt_rv32im_asm +#undef mld_ntt_rv32im_asm +#undef mld_poly_pointwise_montgomery_rv32im_asm +#undef mld_rv32im_ntt_zetas +#endif /* MLD_SYS_RISCV32 */ #endif /* MLD_CONFIG_USE_NATIVE_BACKEND_ARITH */ #endif /* !MLD_CONFIG_MONOBUILD_KEEP_SHARED_HEADERS */ diff --git a/mldsa/src/native/meta.h b/mldsa/src/native/meta.h index b26232622..fc886cdff 100644 --- a/mldsa/src/native/meta.h +++ b/mldsa/src/native/meta.h @@ -21,4 +21,10 @@ #include "x86_64/meta.h" #endif +/* We do not yet include the arithmetic backend for RV32-IM by default + * as it is still experimental and undergoing review. 
*/ +/* #if defined(MLD_SYS_RISCV32) */ +/* #include "rv32im/meta.h" */ +/* #endif */ + #endif /* !MLD_NATIVE_META_H */ diff --git a/mldsa/src/native/rv32im/meta.h b/mldsa/src/native/rv32im/meta.h new file mode 100644 index 000000000..a83cd62c1 --- /dev/null +++ b/mldsa/src/native/rv32im/meta.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLD_NATIVE_RV32IM_META_H +#define MLD_NATIVE_RV32IM_META_H + +/* Set of primitives that this backend replaces */ +#define MLD_USE_NATIVE_NTT +#define MLD_USE_NATIVE_INTT +#define MLD_USE_NATIVE_POINTWISE_MONTGOMERY + +/* Identifier for this backend so that source and assembly files + * in the build can be appropriately guarded. */ +#define MLD_ARITH_BACKEND_RV32IM + + +#if !defined(__ASSEMBLER__) +#include "../api.h" +#include "src/arith_native_rv32im.h" + +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_ntt_native(int32_t data[MLDSA_N]) +{ + mld_ntt_rv32im_asm(data, mld_rv32im_ntt_zetas); /* in-place forward NTT */ + return MLD_NATIVE_FUNC_SUCCESS; /* void kernel: always succeeds */ +} + +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_intt_native(int32_t data[MLDSA_N]) +{ + mld_intt_rv32im_asm(data, mld_rv32im_ntt_zetas); /* table walked from its end */ + return MLD_NATIVE_FUNC_SUCCESS; /* void kernel: always succeeds */ +} + +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_poly_pointwise_montgomery_native( + int32_t a[MLDSA_N], const int32_t b[MLDSA_N]) +{ + mld_poly_pointwise_montgomery_rv32im_asm(a, b); /* result is written to a */ + return MLD_NATIVE_FUNC_SUCCESS; /* void kernel: always succeeds */ +} + +#endif /* !__ASSEMBLER__ */ +#endif /* !MLD_NATIVE_RV32IM_META_H */ diff --git a/mldsa/src/native/rv32im/src/arith_native_rv32im.h b/mldsa/src/native/rv32im/src/arith_native_rv32im.h new file mode 100644 index 000000000..9a987fbfd --- /dev/null +++ b/mldsa/src/native/rv32im/src/arith_native_rv32im.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef
MLD_NATIVE_RV32IM_SRC_ARITH_NATIVE_RV32IM_H +#define MLD_NATIVE_RV32IM_SRC_ARITH_NATIVE_RV32IM_H + +#include "../../../cbmc.h" +#include "../../../common.h" + +#define mld_rv32im_ntt_zetas MLD_NAMESPACE(rv32im_ntt_zetas) + +/* + * Forward NTT zeta table for the RV32-IM backend. + * + * 255 logical entries, each a (zeta, w) Barrett pair: zeta is the plain + * centered twiddle root^{bitrev_8(k)} mod q (|zeta| <= q/2), with root the + * NTT root of unity, and w = round(zeta * 2^32 / q) is the Barrett multiplier + * used by the constant-twiddle butterfly. The order matches the consumption + * order of the 2+2+2+2 forward NTT; the inverse NTT walks it from the end. + */ +MLD_INTERNAL_DATA_DECLARATION const int32_t mld_rv32im_ntt_zetas[510]; + +#define mld_ntt_rv32im_asm MLD_NAMESPACE(ntt_rv32im_asm) +void mld_ntt_rv32im_asm(int32_t *r, const int32_t *zetas) +__contract__( + requires(memory_no_alias(r, sizeof(int32_t) * MLDSA_N)) + requires(array_abs_bound(r, 0, MLDSA_N, MLDSA_Q)) + requires(zetas == mld_rv32im_ntt_zetas) + assigns(memory_slice(r, sizeof(int32_t) * MLDSA_N)) + /* Forward-NTT output bound MLD_NTT_BOUND = 9 * MLD_FQMUL_BOUND. The + * truncating `mulh` Barrett multiply has output bound MLD_FQMUL_BOUND = + * 5/4 * MLDSA_Q (vs MLDSA_Q for the rounding `sqrdmulh` used on AArch64), + * so the NTT output is bounded by 9 * MLD_FQMUL_BOUND, not 9 * MLDSA_Q. + * Spelled out inline to keep this header free of poly.h.
*/ + ensures(array_abs_bound(r, 0, MLDSA_N, 9 * ((5 * MLDSA_Q + 3) / 4))) +); + +#define mld_intt_rv32im_asm MLD_NAMESPACE(intt_rv32im_asm) +void mld_intt_rv32im_asm(int32_t *r, const int32_t *zetas) +__contract__( + requires(memory_no_alias(r, sizeof(int32_t) * MLDSA_N)) + requires(array_abs_bound(r, 0, MLDSA_N, MLDSA_Q)) + requires(zetas == mld_rv32im_ntt_zetas) /* shared with the forward NTT */ + assigns(memory_slice(r, sizeof(int32_t) * MLDSA_N)) + ensures(array_abs_bound(r, 0, MLDSA_N, MLDSA_Q)) +); + +#define mld_poly_pointwise_montgomery_rv32im_asm \ + MLD_NAMESPACE(poly_pointwise_montgomery_rv32im_asm) +void mld_poly_pointwise_montgomery_rv32im_asm(int32_t *a, const int32_t *b) +__contract__( + requires(memory_no_alias(a, sizeof(int32_t) * MLDSA_N)) + requires(memory_no_alias(b, sizeof(int32_t) * MLDSA_N)) + /* Inputs bounded by MLD_NTT_BOUND = 9 * MLD_FQMUL_BOUND, the guaranteed + * output bound of any forward NTT. Spelled out inline to keep this header + * free of poly.h. */ + requires(array_abs_bound(a, 0, MLDSA_N, 9 * ((5 * MLDSA_Q + 3) / 4))) + requires(array_abs_bound(b, 0, MLDSA_N, 9 * ((5 * MLDSA_Q + 3) / 4))) + assigns(memory_slice(a, sizeof(int32_t) * MLDSA_N)) + ensures(array_abs_bound(a, 0, MLDSA_N, MLDSA_Q)) +); + +#endif /* !MLD_NATIVE_RV32IM_SRC_ARITH_NATIVE_RV32IM_H */ diff --git a/mldsa/src/native/rv32im/src/intt_rv32im_asm.S b/mldsa/src/native/rv32im/src/intt_rv32im_asm.S new file mode 100644 index 000000000..a52b11041 --- /dev/null +++ b/mldsa/src/native/rv32im/src/intt_rv32im_asm.S @@ -0,0 +1,272 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * RV32-IM ML-DSA inverse NTT, fast-multiplier variant. + * + * Thin wrapper: the kernel body is shared via intt_rv32im_asm.i, which here + * computes the Barrett low(t*q) reduction with a single multiply by q. The + * slow-multiplier variant (shift-add) lives in intt_rv32im_slowmul_asm.S.
+ * Exactly one of the two is selected, by MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER. + */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_RV32IM) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ + !defined(MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER) + +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/riscv32/src/intt_rv32im_asm.S using scripts/simpasm. Do not modify it directly. + */ + +.text +.balign 4 +.global MLD_ASM_NAMESPACE(intt_rv32im_asm) +MLD_ASM_FN_SYMBOL(intt_rv32im_asm) + + .cfi_startproc + addi sp, sp, -0x20 + .cfi_adjust_cfa_offset 0x20 + sw s0, 0x0(sp) + sw s1, 0x4(sp) + sw s2, 0x8(sp) + sw s3, 0xc(sp) + sw s4, 0x10(sp) + sw s5, 0x14(sp) + sw s6, 0x18(sp) + sw s7, 0x1c(sp) + lui t0, 0x7fe + addi t0, t0, 0x1 + addi a1, a1, 0x7f8 + mv t2, a0 + addi t3, a0, 0x400 + +Lintt_rv32im_p1_outer: + addi a1, a1, -0x18 + lw s0, 0x0(a1) + lw s1, 0x4(a1) + lw s2, 0x8(a1) + lw s3, 0xc(a1) + lw s4, 0x10(a1) + lw s5, 0x14(a1) + lw a2, 0x0(t2) + lw a3, 0x4(t2) + lw a4, 0x8(t2) + lw a5, 0xc(t2) + sub a6, a3, a2 + add a2, a2, a3 + mulh a7, a6, s5 + mul a3, a6, s4 + mul a7, a7, t0 + sub a3, a3, a7 + sub a6, a5, a4 + add a4, a4, a5 + mulh a7, a6, s3 + mul a5, a6, s2 + mul a7, a7, t0 + sub a5, a5, a7 + sub a6, a4, a2 + add a2, a2, a4 + mulh a7, a6, s1 + mul a4, a6, s0 + mul a7, a7, t0 + sub a4, a4, a7 + sub a6, a5, a3 + add a3, a3, a5 + mulh a7, a6, s1 + mul a5, a6, s0 + mul a7, a7, t0 + sub a5, a5, a7 + sw a2, 0x0(t2) + sw a3, 0x4(t2) + sw a4, 0x8(t2) + sw a5, 0xc(t2) + addi t2, t2, 0x10 + bne t2, t3, Lintt_rv32im_p1_outer + mv t2, a0 + addi t3, a0, 0x400 + +Lintt_rv32im_p2_outer: + addi a1, a1, -0x18 + lw s0, 0x0(a1) + lw s1, 0x4(a1) + lw s2, 0x8(a1) + lw s3, 0xc(a1) + lw s4, 0x10(a1) + lw s5, 0x14(a1) + addi t4, t2, 0x10 + +Lintt_rv32im_p2_inner: + lw a2, 0x0(t2) + lw a3, 0x10(t2) + lw a4, 0x20(t2) + lw a5, 0x30(t2) + sub a6, a3, a2 + add a2, a2, a3 + mulh a7, a6, s5 + mul a3, a6, s4 + mul a7, a7, t0 + sub a3, a3, a7 + sub 
a6, a5, a4 + add a4, a4, a5 + mulh a7, a6, s3 + mul a5, a6, s2 + mul a7, a7, t0 + sub a5, a5, a7 + sub a6, a4, a2 + add a2, a2, a4 + mulh a7, a6, s1 + mul a4, a6, s0 + mul a7, a7, t0 + sub a4, a4, a7 + sub a6, a5, a3 + add a3, a3, a5 + mulh a7, a6, s1 + mul a5, a6, s0 + mul a7, a7, t0 + sub a5, a5, a7 + sw a2, 0x0(t2) + sw a3, 0x10(t2) + sw a4, 0x20(t2) + sw a5, 0x30(t2) + addi t2, t2, 0x4 + bne t2, t4, Lintt_rv32im_p2_inner + addi t2, t2, 0x30 + bne t2, t3, Lintt_rv32im_p2_outer + mv t2, a0 + addi t3, a0, 0x400 + +Lintt_rv32im_p3_outer: + addi a1, a1, -0x18 + lw s0, 0x0(a1) + lw s1, 0x4(a1) + lw s2, 0x8(a1) + lw s3, 0xc(a1) + lw s4, 0x10(a1) + lw s5, 0x14(a1) + addi t4, t2, 0x40 + +Lintt_rv32im_p3_inner: + lw a2, 0x0(t2) + lw a3, 0x40(t2) + lw a4, 0x80(t2) + lw a5, 0xc0(t2) + sub a6, a3, a2 + add a2, a2, a3 + mulh a7, a6, s5 + mul a3, a6, s4 + mul a7, a7, t0 + sub a3, a3, a7 + sub a6, a5, a4 + add a4, a4, a5 + mulh a7, a6, s3 + mul a5, a6, s2 + mul a7, a7, t0 + sub a5, a5, a7 + sub a6, a4, a2 + add a2, a2, a4 + mulh a7, a6, s1 + mul a4, a6, s0 + mul a7, a7, t0 + sub a4, a4, a7 + sub a6, a5, a3 + add a3, a3, a5 + mulh a7, a6, s1 + mul a5, a6, s0 + mul a7, a7, t0 + sub a5, a5, a7 + sw a2, 0x0(t2) + sw a3, 0x40(t2) + sw a4, 0x80(t2) + sw a5, 0xc0(t2) + addi t2, t2, 0x4 + bne t2, t4, Lintt_rv32im_p3_inner + addi t2, t2, 0xc0 + bne t2, t3, Lintt_rv32im_p3_outer + addi a1, a1, -0x18 + lw s0, 0x0(a1) + lw s1, 0x4(a1) + lw s2, 0x8(a1) + lw s3, 0xc(a1) + lw s4, 0x10(a1) + lw s5, 0x14(a1) + mv t2, a0 + addi t4, a0, 0x100 + +Lintt_rv32im_p4_inner: + lw a2, 0x0(t2) + lw a3, 0x100(t2) + lw a4, 0x200(t2) + lw a5, 0x300(t2) + sub a6, a3, a2 + add a2, a2, a3 + mulh a7, a6, s5 + mul a3, a6, s4 + mul a7, a7, t0 + sub a3, a3, a7 + sub a6, a5, a4 + add a4, a4, a5 + mulh a7, a6, s3 + mul a5, a6, s2 + mul a7, a7, t0 + sub a5, a5, a7 + sub a6, a4, a2 + add a2, a2, a4 + mulh a7, a6, s1 + mul a4, a6, s0 + mul a7, a7, t0 + sub a4, a4, a7 + sub a6, a5, a3 + add a3, a3, a5 + mulh a7, a6, s1 
+ mul a5, a6, s0 + mul a7, a7, t0 + sub a5, a5, a7 + sw a2, 0x0(t2) + sw a3, 0x100(t2) + sw a4, 0x200(t2) + sw a5, 0x300(t2) + addi t2, t2, 0x4 + bne t2, t4, Lintt_rv32im_p4_inner + lui s6, 0x4 + addi s6, s6, -0x2 + lui s7, 0x1004 + addi s7, s7, -0x7f4 + mv t2, a0 + addi t5, a0, 0x400 + +Lintt_rv32im_scale: + lw a2, 0x0(t2) + mulh a6, a2, s7 + addi a6, a6, 0x1 + srai a6, a6, 0x1 + mul a3, a2, s6 + mul a6, a6, t0 + sub a3, a3, a6 + sw a3, 0x0(t2) + addi t2, t2, 0x4 + bne t2, t5, Lintt_rv32im_scale + lw s0, 0x0(sp) + lw s1, 0x4(sp) + lw s2, 0x8(sp) + lw s3, 0xc(sp) + lw s4, 0x10(sp) + lw s5, 0x14(sp) + lw s6, 0x18(sp) + lw s7, 0x1c(sp) + addi sp, sp, 0x20 + .cfi_adjust_cfa_offset -0x20 + ret + .cfi_endproc + +MLD_ASM_FN_SIZE(intt_rv32im_asm) + +#endif /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ + !MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER */ + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/mldsa/src/native/rv32im/src/intt_rv32im_slowmul_asm.S b/mldsa/src/native/rv32im/src/intt_rv32im_slowmul_asm.S new file mode 100644 index 000000000..b26242a04 --- /dev/null +++ b/mldsa/src/native/rv32im/src/intt_rv32im_slowmul_asm.S @@ -0,0 +1,323 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * RV32-IM ML-DSA inverse NTT, slow-multiplier variant. + * + * Thin wrapper: the kernel body is shared via intt_rv32im_asm.i, which here + * computes the Barrett low(t*q) reduction with a shift-add chain (exploiting + * q = 2^23 - 2^13 + 1), trading the multiply for cheap ALU ops -- preferred + * when the multiplier is slow. The fast-multiplier variant lives in + * intt_rv32im_asm.S. Exactly one of the two is selected, by + * MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER. 
+ */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_RV32IM) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ + defined(MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER) + +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/riscv32/src/intt_rv32im_slowmul_asm.S using scripts/simpasm. Do not modify it directly. + */ + +.text +.balign 4 +.global MLD_ASM_NAMESPACE(intt_rv32im_asm) +MLD_ASM_FN_SYMBOL(intt_rv32im_asm) + + .cfi_startproc + addi sp, sp, -0x20 + .cfi_adjust_cfa_offset 0x20 + sw s0, 0x0(sp) + sw s1, 0x4(sp) + sw s2, 0x8(sp) + sw s3, 0xc(sp) + sw s4, 0x10(sp) + sw s5, 0x14(sp) + sw s6, 0x18(sp) + sw s7, 0x1c(sp) + addi a1, a1, 0x7f8 + mv t2, a0 + addi t3, a0, 0x400 + +Lintt_rv32im_p1_outer: + addi a1, a1, -0x18 + lw s0, 0x0(a1) + lw s1, 0x4(a1) + lw s2, 0x8(a1) + lw s3, 0xc(a1) + lw s4, 0x10(a1) + lw s5, 0x14(a1) + lw a2, 0x0(t2) + lw a3, 0x4(t2) + lw a4, 0x8(t2) + lw a5, 0xc(t2) + sub a6, a3, a2 + add a2, a2, a3 + mulh a7, a6, s5 + mul a3, a6, s4 + sub a3, a3, a7 + slli a7, a7, 0xd + add a3, a3, a7 + slli a7, a7, 0xa + sub a3, a3, a7 + sub a6, a5, a4 + add a4, a4, a5 + mulh a7, a6, s3 + mul a5, a6, s2 + sub a5, a5, a7 + slli a7, a7, 0xd + add a5, a5, a7 + slli a7, a7, 0xa + sub a5, a5, a7 + sub a6, a4, a2 + add a2, a2, a4 + mulh a7, a6, s1 + mul a4, a6, s0 + sub a4, a4, a7 + slli a7, a7, 0xd + add a4, a4, a7 + slli a7, a7, 0xa + sub a4, a4, a7 + sub a6, a5, a3 + add a3, a3, a5 + mulh a7, a6, s1 + mul a5, a6, s0 + sub a5, a5, a7 + slli a7, a7, 0xd + add a5, a5, a7 + slli a7, a7, 0xa + sub a5, a5, a7 + sw a2, 0x0(t2) + sw a3, 0x4(t2) + sw a4, 0x8(t2) + sw a5, 0xc(t2) + addi t2, t2, 0x10 + bne t2, t3, Lintt_rv32im_p1_outer + mv t2, a0 + addi t3, a0, 0x400 + +Lintt_rv32im_p2_outer: + addi a1, a1, -0x18 + lw s0, 0x0(a1) + lw s1, 0x4(a1) + lw s2, 0x8(a1) + lw s3, 0xc(a1) + lw s4, 0x10(a1) + lw s5, 0x14(a1) + addi t4, t2, 0x10 + +Lintt_rv32im_p2_inner: + lw a2, 0x0(t2) + lw a3, 0x10(t2) + lw a4, 0x20(t2) + lw a5, 0x30(t2) 
+ sub a6, a3, a2 + add a2, a2, a3 + mulh a7, a6, s5 + mul a3, a6, s4 + sub a3, a3, a7 + slli a7, a7, 0xd + add a3, a3, a7 + slli a7, a7, 0xa + sub a3, a3, a7 + sub a6, a5, a4 + add a4, a4, a5 + mulh a7, a6, s3 + mul a5, a6, s2 + sub a5, a5, a7 + slli a7, a7, 0xd + add a5, a5, a7 + slli a7, a7, 0xa + sub a5, a5, a7 + sub a6, a4, a2 + add a2, a2, a4 + mulh a7, a6, s1 + mul a4, a6, s0 + sub a4, a4, a7 + slli a7, a7, 0xd + add a4, a4, a7 + slli a7, a7, 0xa + sub a4, a4, a7 + sub a6, a5, a3 + add a3, a3, a5 + mulh a7, a6, s1 + mul a5, a6, s0 + sub a5, a5, a7 + slli a7, a7, 0xd + add a5, a5, a7 + slli a7, a7, 0xa + sub a5, a5, a7 + sw a2, 0x0(t2) + sw a3, 0x10(t2) + sw a4, 0x20(t2) + sw a5, 0x30(t2) + addi t2, t2, 0x4 + bne t2, t4, Lintt_rv32im_p2_inner + addi t2, t2, 0x30 + bne t2, t3, Lintt_rv32im_p2_outer + mv t2, a0 + addi t3, a0, 0x400 + +Lintt_rv32im_p3_outer: + addi a1, a1, -0x18 + lw s0, 0x0(a1) + lw s1, 0x4(a1) + lw s2, 0x8(a1) + lw s3, 0xc(a1) + lw s4, 0x10(a1) + lw s5, 0x14(a1) + addi t4, t2, 0x40 + +Lintt_rv32im_p3_inner: + lw a2, 0x0(t2) + lw a3, 0x40(t2) + lw a4, 0x80(t2) + lw a5, 0xc0(t2) + sub a6, a3, a2 + add a2, a2, a3 + mulh a7, a6, s5 + mul a3, a6, s4 + sub a3, a3, a7 + slli a7, a7, 0xd + add a3, a3, a7 + slli a7, a7, 0xa + sub a3, a3, a7 + sub a6, a5, a4 + add a4, a4, a5 + mulh a7, a6, s3 + mul a5, a6, s2 + sub a5, a5, a7 + slli a7, a7, 0xd + add a5, a5, a7 + slli a7, a7, 0xa + sub a5, a5, a7 + sub a6, a4, a2 + add a2, a2, a4 + mulh a7, a6, s1 + mul a4, a6, s0 + sub a4, a4, a7 + slli a7, a7, 0xd + add a4, a4, a7 + slli a7, a7, 0xa + sub a4, a4, a7 + sub a6, a5, a3 + add a3, a3, a5 + mulh a7, a6, s1 + mul a5, a6, s0 + sub a5, a5, a7 + slli a7, a7, 0xd + add a5, a5, a7 + slli a7, a7, 0xa + sub a5, a5, a7 + sw a2, 0x0(t2) + sw a3, 0x40(t2) + sw a4, 0x80(t2) + sw a5, 0xc0(t2) + addi t2, t2, 0x4 + bne t2, t4, Lintt_rv32im_p3_inner + addi t2, t2, 0xc0 + bne t2, t3, Lintt_rv32im_p3_outer + addi a1, a1, -0x18 + lw s0, 0x0(a1) + lw s1, 0x4(a1) + lw s2, 
0x8(a1) + lw s3, 0xc(a1) + lw s4, 0x10(a1) + lw s5, 0x14(a1) + mv t2, a0 + addi t4, a0, 0x100 + +Lintt_rv32im_p4_inner: + lw a2, 0x0(t2) + lw a3, 0x100(t2) + lw a4, 0x200(t2) + lw a5, 0x300(t2) + sub a6, a3, a2 + add a2, a2, a3 + mulh a7, a6, s5 + mul a3, a6, s4 + sub a3, a3, a7 + slli a7, a7, 0xd + add a3, a3, a7 + slli a7, a7, 0xa + sub a3, a3, a7 + sub a6, a5, a4 + add a4, a4, a5 + mulh a7, a6, s3 + mul a5, a6, s2 + sub a5, a5, a7 + slli a7, a7, 0xd + add a5, a5, a7 + slli a7, a7, 0xa + sub a5, a5, a7 + sub a6, a4, a2 + add a2, a2, a4 + mulh a7, a6, s1 + mul a4, a6, s0 + sub a4, a4, a7 + slli a7, a7, 0xd + add a4, a4, a7 + slli a7, a7, 0xa + sub a4, a4, a7 + sub a6, a5, a3 + add a3, a3, a5 + mulh a7, a6, s1 + mul a5, a6, s0 + sub a5, a5, a7 + slli a7, a7, 0xd + add a5, a5, a7 + slli a7, a7, 0xa + sub a5, a5, a7 + sw a2, 0x0(t2) + sw a3, 0x100(t2) + sw a4, 0x200(t2) + sw a5, 0x300(t2) + addi t2, t2, 0x4 + bne t2, t4, Lintt_rv32im_p4_inner + lui s6, 0x4 + addi s6, s6, -0x2 + lui s7, 0x1004 + addi s7, s7, -0x7f4 + mv t2, a0 + addi t5, a0, 0x400 + +Lintt_rv32im_scale: + lw a2, 0x0(t2) + mulh a6, a2, s7 + addi a6, a6, 0x1 + srai a6, a6, 0x1 + mul a3, a2, s6 + sub a3, a3, a6 + slli a6, a6, 0xd + add a3, a3, a6 + slli a6, a6, 0xa + sub a3, a3, a6 + sw a3, 0x0(t2) + addi t2, t2, 0x4 + bne t2, t5, Lintt_rv32im_scale + lw s0, 0x0(sp) + lw s1, 0x4(sp) + lw s2, 0x8(sp) + lw s3, 0xc(sp) + lw s4, 0x10(sp) + lw s5, 0x14(sp) + lw s6, 0x18(sp) + lw s7, 0x1c(sp) + addi sp, sp, 0x20 + .cfi_adjust_cfa_offset -0x20 + ret + .cfi_endproc + +MLD_ASM_FN_SIZE(intt_rv32im_asm) + +#endif /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ + MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER */ + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/mldsa/src/native/rv32im/src/ntt_rv32im_asm.S b/mldsa/src/native/rv32im/src/ntt_rv32im_asm.S new file mode 100644 index 000000000..3845101f3 --- /dev/null +++ b/mldsa/src/native/rv32im/src/ntt_rv32im_asm.S @@ -0,0 
+1,249 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * RV32-IM ML-DSA forward NTT, fast-multiplier variant. + * + * Thin wrapper: the kernel body is shared via ntt_rv32im_asm.i, which here + * computes the Barrett low(t*q) reduction with a single multiply by q. The + * slow-multiplier variant (shift-add) lives in ntt_rv32im_slowmul_asm.S. + * Exactly one of the two is selected, by MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER. + */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_RV32IM) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ + !defined(MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER) + +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/riscv32/src/ntt_rv32im_asm.S using scripts/simpasm. Do not modify it directly. + */ + +.text +.balign 4 +.global MLD_ASM_NAMESPACE(ntt_rv32im_asm) +MLD_ASM_FN_SYMBOL(ntt_rv32im_asm) + + .cfi_startproc + addi sp, sp, -0x18 + .cfi_adjust_cfa_offset 0x18 + sw s0, 0x0(sp) + sw s1, 0x4(sp) + sw s2, 0x8(sp) + sw s3, 0xc(sp) + sw s4, 0x10(sp) + sw s5, 0x14(sp) + lui t0, 0x7fe + addi t0, t0, 0x1 + lw s0, 0x0(a1) + lw s1, 0x4(a1) + lw s2, 0x8(a1) + lw s3, 0xc(a1) + lw s4, 0x10(a1) + lw s5, 0x14(a1) + addi a1, a1, 0x18 + mv t2, a0 + addi t4, a0, 0x100 + +Lntt_rv32im_p1_loop: + lw a2, 0x0(t2) + lw a3, 0x100(t2) + lw a4, 0x200(t2) + lw a5, 0x300(t2) + mulh a7, a4, s1 + mul a6, a4, s0 + mul a7, a7, t0 + sub a6, a6, a7 + sub a4, a2, a6 + add a2, a2, a6 + mulh a7, a5, s1 + mul a6, a5, s0 + mul a7, a7, t0 + sub a6, a6, a7 + sub a5, a3, a6 + add a3, a3, a6 + mulh a7, a3, s3 + mul a6, a3, s2 + mul a7, a7, t0 + sub a6, a6, a7 + sub a3, a2, a6 + add a2, a2, a6 + mulh a7, a5, s5 + mul a6, a5, s4 + mul a7, a7, t0 + sub a6, a6, a7 + sub a5, a4, a6 + add a4, a4, a6 + sw a2, 0x0(t2) + sw a3, 0x100(t2) + sw a4, 0x200(t2) + sw a5, 0x300(t2) + addi t2, t2, 0x4 + bne t2, t4, Lntt_rv32im_p1_loop + mv t2, a0 + addi t3, a0, 0x400 + 
+Lntt_rv32im_p2_outer: + lw s0, 0x0(a1) + lw s1, 0x4(a1) + lw s2, 0x8(a1) + lw s3, 0xc(a1) + lw s4, 0x10(a1) + lw s5, 0x14(a1) + addi a1, a1, 0x18 + addi t4, t2, 0x40 + +Lntt_rv32im_p2_inner: + lw a2, 0x0(t2) + lw a3, 0x40(t2) + lw a4, 0x80(t2) + lw a5, 0xc0(t2) + mulh a7, a4, s1 + mul a6, a4, s0 + mul a7, a7, t0 + sub a6, a6, a7 + sub a4, a2, a6 + add a2, a2, a6 + mulh a7, a5, s1 + mul a6, a5, s0 + mul a7, a7, t0 + sub a6, a6, a7 + sub a5, a3, a6 + add a3, a3, a6 + mulh a7, a3, s3 + mul a6, a3, s2 + mul a7, a7, t0 + sub a6, a6, a7 + sub a3, a2, a6 + add a2, a2, a6 + mulh a7, a5, s5 + mul a6, a5, s4 + mul a7, a7, t0 + sub a6, a6, a7 + sub a5, a4, a6 + add a4, a4, a6 + sw a2, 0x0(t2) + sw a3, 0x40(t2) + sw a4, 0x80(t2) + sw a5, 0xc0(t2) + addi t2, t2, 0x4 + bne t2, t4, Lntt_rv32im_p2_inner + addi t2, t2, 0xc0 + bne t2, t3, Lntt_rv32im_p2_outer + mv t2, a0 + addi t3, a0, 0x400 + +Lntt_rv32im_p3_outer: + lw s0, 0x0(a1) + lw s1, 0x4(a1) + lw s2, 0x8(a1) + lw s3, 0xc(a1) + lw s4, 0x10(a1) + lw s5, 0x14(a1) + addi a1, a1, 0x18 + addi t4, t2, 0x10 + +Lntt_rv32im_p3_inner: + lw a2, 0x0(t2) + lw a3, 0x10(t2) + lw a4, 0x20(t2) + lw a5, 0x30(t2) + mulh a7, a4, s1 + mul a6, a4, s0 + mul a7, a7, t0 + sub a6, a6, a7 + sub a4, a2, a6 + add a2, a2, a6 + mulh a7, a5, s1 + mul a6, a5, s0 + mul a7, a7, t0 + sub a6, a6, a7 + sub a5, a3, a6 + add a3, a3, a6 + mulh a7, a3, s3 + mul a6, a3, s2 + mul a7, a7, t0 + sub a6, a6, a7 + sub a3, a2, a6 + add a2, a2, a6 + mulh a7, a5, s5 + mul a6, a5, s4 + mul a7, a7, t0 + sub a6, a6, a7 + sub a5, a4, a6 + add a4, a4, a6 + sw a2, 0x0(t2) + sw a3, 0x10(t2) + sw a4, 0x20(t2) + sw a5, 0x30(t2) + addi t2, t2, 0x4 + bne t2, t4, Lntt_rv32im_p3_inner + addi t2, t2, 0x30 + bne t2, t3, Lntt_rv32im_p3_outer + mv t2, a0 + addi t3, a0, 0x400 + +Lntt_rv32im_p4_outer: + lw s0, 0x0(a1) + lw s1, 0x4(a1) + lw s2, 0x8(a1) + lw s3, 0xc(a1) + lw s4, 0x10(a1) + lw s5, 0x14(a1) + addi a1, a1, 0x18 + lw a2, 0x0(t2) + lw a3, 0x4(t2) + lw a4, 0x8(t2) + lw a5, 0xc(t2) + 
mulh a7, a4, s1 + mul a6, a4, s0 + mul a7, a7, t0 + sub a6, a6, a7 + sub a4, a2, a6 + add a2, a2, a6 + mulh a7, a5, s1 + mul a6, a5, s0 + mul a7, a7, t0 + sub a6, a6, a7 + sub a5, a3, a6 + add a3, a3, a6 + mulh a7, a3, s3 + mul a6, a3, s2 + mul a7, a7, t0 + sub a6, a6, a7 + sub a3, a2, a6 + add a2, a2, a6 + mulh a7, a5, s5 + mul a6, a5, s4 + mul a7, a7, t0 + sub a6, a6, a7 + sub a5, a4, a6 + add a4, a4, a6 + sw a2, 0x0(t2) + sw a3, 0x4(t2) + sw a4, 0x8(t2) + sw a5, 0xc(t2) + addi t2, t2, 0x10 + bne t2, t3, Lntt_rv32im_p4_outer + lw s0, 0x0(sp) + lw s1, 0x4(sp) + lw s2, 0x8(sp) + lw s3, 0xc(sp) + lw s4, 0x10(sp) + lw s5, 0x14(sp) + addi sp, sp, 0x18 + .cfi_adjust_cfa_offset -0x18 + ret + .cfi_endproc + +MLD_ASM_FN_SIZE(ntt_rv32im_asm) + +#endif /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ + !MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER */ + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/mldsa/src/native/rv32im/src/ntt_rv32im_slowmul_asm.S b/mldsa/src/native/rv32im/src/ntt_rv32im_slowmul_asm.S new file mode 100644 index 000000000..624769595 --- /dev/null +++ b/mldsa/src/native/rv32im/src/ntt_rv32im_slowmul_asm.S @@ -0,0 +1,297 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * RV32-IM ML-DSA forward NTT, slow-multiplier variant. + * + * Thin wrapper: the kernel body is shared via ntt_rv32im_asm.i, which here + * computes the Barrett low(t*q) reduction with a shift-add chain (exploiting + * q = 2^23 - 2^13 + 1), trading the multiply for cheap ALU ops -- preferred + * when the multiplier is slow. The fast-multiplier variant lives in + * ntt_rv32im_asm.S. Exactly one of the two is selected, by + * MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER. 
+ */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_RV32IM) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ + defined(MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER) + +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/riscv32/src/ntt_rv32im_slowmul_asm.S using scripts/simpasm. Do not modify it directly. + */ + +.text +.balign 4 +.global MLD_ASM_NAMESPACE(ntt_rv32im_asm) +MLD_ASM_FN_SYMBOL(ntt_rv32im_asm) + + .cfi_startproc + addi sp, sp, -0x18 + .cfi_adjust_cfa_offset 0x18 + sw s0, 0x0(sp) + sw s1, 0x4(sp) + sw s2, 0x8(sp) + sw s3, 0xc(sp) + sw s4, 0x10(sp) + sw s5, 0x14(sp) + lw s0, 0x0(a1) + lw s1, 0x4(a1) + lw s2, 0x8(a1) + lw s3, 0xc(a1) + lw s4, 0x10(a1) + lw s5, 0x14(a1) + addi a1, a1, 0x18 + mv t2, a0 + addi t4, a0, 0x100 + +Lntt_rv32im_p1_loop: + lw a2, 0x0(t2) + lw a3, 0x100(t2) + lw a4, 0x200(t2) + lw a5, 0x300(t2) + mulh a7, a4, s1 + mul a6, a4, s0 + sub a6, a6, a7 + slli a7, a7, 0xd + add a6, a6, a7 + slli a7, a7, 0xa + sub a6, a6, a7 + sub a4, a2, a6 + add a2, a2, a6 + mulh a7, a5, s1 + mul a6, a5, s0 + sub a6, a6, a7 + slli a7, a7, 0xd + add a6, a6, a7 + slli a7, a7, 0xa + sub a6, a6, a7 + sub a5, a3, a6 + add a3, a3, a6 + mulh a7, a3, s3 + mul a6, a3, s2 + sub a6, a6, a7 + slli a7, a7, 0xd + add a6, a6, a7 + slli a7, a7, 0xa + sub a6, a6, a7 + sub a3, a2, a6 + add a2, a2, a6 + mulh a7, a5, s5 + mul a6, a5, s4 + sub a6, a6, a7 + slli a7, a7, 0xd + add a6, a6, a7 + slli a7, a7, 0xa + sub a6, a6, a7 + sub a5, a4, a6 + add a4, a4, a6 + sw a2, 0x0(t2) + sw a3, 0x100(t2) + sw a4, 0x200(t2) + sw a5, 0x300(t2) + addi t2, t2, 0x4 + bne t2, t4, Lntt_rv32im_p1_loop + mv t2, a0 + addi t3, a0, 0x400 + +Lntt_rv32im_p2_outer: + lw s0, 0x0(a1) + lw s1, 0x4(a1) + lw s2, 0x8(a1) + lw s3, 0xc(a1) + lw s4, 0x10(a1) + lw s5, 0x14(a1) + addi a1, a1, 0x18 + addi t4, t2, 0x40 + +Lntt_rv32im_p2_inner: + lw a2, 0x0(t2) + lw a3, 0x40(t2) + lw a4, 0x80(t2) + lw a5, 0xc0(t2) + mulh a7, a4, s1 + mul a6, a4, s0 + sub a6, a6, a7 + 
slli a7, a7, 0xd + add a6, a6, a7 + slli a7, a7, 0xa + sub a6, a6, a7 + sub a4, a2, a6 + add a2, a2, a6 + mulh a7, a5, s1 + mul a6, a5, s0 + sub a6, a6, a7 + slli a7, a7, 0xd + add a6, a6, a7 + slli a7, a7, 0xa + sub a6, a6, a7 + sub a5, a3, a6 + add a3, a3, a6 + mulh a7, a3, s3 + mul a6, a3, s2 + sub a6, a6, a7 + slli a7, a7, 0xd + add a6, a6, a7 + slli a7, a7, 0xa + sub a6, a6, a7 + sub a3, a2, a6 + add a2, a2, a6 + mulh a7, a5, s5 + mul a6, a5, s4 + sub a6, a6, a7 + slli a7, a7, 0xd + add a6, a6, a7 + slli a7, a7, 0xa + sub a6, a6, a7 + sub a5, a4, a6 + add a4, a4, a6 + sw a2, 0x0(t2) + sw a3, 0x40(t2) + sw a4, 0x80(t2) + sw a5, 0xc0(t2) + addi t2, t2, 0x4 + bne t2, t4, Lntt_rv32im_p2_inner + addi t2, t2, 0xc0 + bne t2, t3, Lntt_rv32im_p2_outer + mv t2, a0 + addi t3, a0, 0x400 + +Lntt_rv32im_p3_outer: + lw s0, 0x0(a1) + lw s1, 0x4(a1) + lw s2, 0x8(a1) + lw s3, 0xc(a1) + lw s4, 0x10(a1) + lw s5, 0x14(a1) + addi a1, a1, 0x18 + addi t4, t2, 0x10 + +Lntt_rv32im_p3_inner: + lw a2, 0x0(t2) + lw a3, 0x10(t2) + lw a4, 0x20(t2) + lw a5, 0x30(t2) + mulh a7, a4, s1 + mul a6, a4, s0 + sub a6, a6, a7 + slli a7, a7, 0xd + add a6, a6, a7 + slli a7, a7, 0xa + sub a6, a6, a7 + sub a4, a2, a6 + add a2, a2, a6 + mulh a7, a5, s1 + mul a6, a5, s0 + sub a6, a6, a7 + slli a7, a7, 0xd + add a6, a6, a7 + slli a7, a7, 0xa + sub a6, a6, a7 + sub a5, a3, a6 + add a3, a3, a6 + mulh a7, a3, s3 + mul a6, a3, s2 + sub a6, a6, a7 + slli a7, a7, 0xd + add a6, a6, a7 + slli a7, a7, 0xa + sub a6, a6, a7 + sub a3, a2, a6 + add a2, a2, a6 + mulh a7, a5, s5 + mul a6, a5, s4 + sub a6, a6, a7 + slli a7, a7, 0xd + add a6, a6, a7 + slli a7, a7, 0xa + sub a6, a6, a7 + sub a5, a4, a6 + add a4, a4, a6 + sw a2, 0x0(t2) + sw a3, 0x10(t2) + sw a4, 0x20(t2) + sw a5, 0x30(t2) + addi t2, t2, 0x4 + bne t2, t4, Lntt_rv32im_p3_inner + addi t2, t2, 0x30 + bne t2, t3, Lntt_rv32im_p3_outer + mv t2, a0 + addi t3, a0, 0x400 + +Lntt_rv32im_p4_outer: + lw s0, 0x0(a1) + lw s1, 0x4(a1) + lw s2, 0x8(a1) + lw s3, 0xc(a1) + lw 
s4, 0x10(a1) + lw s5, 0x14(a1) + addi a1, a1, 0x18 + lw a2, 0x0(t2) + lw a3, 0x4(t2) + lw a4, 0x8(t2) + lw a5, 0xc(t2) + mulh a7, a4, s1 + mul a6, a4, s0 + sub a6, a6, a7 + slli a7, a7, 0xd + add a6, a6, a7 + slli a7, a7, 0xa + sub a6, a6, a7 + sub a4, a2, a6 + add a2, a2, a6 + mulh a7, a5, s1 + mul a6, a5, s0 + sub a6, a6, a7 + slli a7, a7, 0xd + add a6, a6, a7 + slli a7, a7, 0xa + sub a6, a6, a7 + sub a5, a3, a6 + add a3, a3, a6 + mulh a7, a3, s3 + mul a6, a3, s2 + sub a6, a6, a7 + slli a7, a7, 0xd + add a6, a6, a7 + slli a7, a7, 0xa + sub a6, a6, a7 + sub a3, a2, a6 + add a2, a2, a6 + mulh a7, a5, s5 + mul a6, a5, s4 + sub a6, a6, a7 + slli a7, a7, 0xd + add a6, a6, a7 + slli a7, a7, 0xa + sub a6, a6, a7 + sub a5, a4, a6 + add a4, a4, a6 + sw a2, 0x0(t2) + sw a3, 0x4(t2) + sw a4, 0x8(t2) + sw a5, 0xc(t2) + addi t2, t2, 0x10 + bne t2, t3, Lntt_rv32im_p4_outer + lw s0, 0x0(sp) + lw s1, 0x4(sp) + lw s2, 0x8(sp) + lw s3, 0xc(sp) + lw s4, 0x10(sp) + lw s5, 0x14(sp) + addi sp, sp, 0x18 + .cfi_adjust_cfa_offset -0x18 + ret + .cfi_endproc + +MLD_ASM_FN_SIZE(ntt_rv32im_asm) + +#endif /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ + MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER */ + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/mldsa/src/native/rv32im/src/poly_pointwise_montgomery_rv32im_asm.S b/mldsa/src/native/rv32im/src/poly_pointwise_montgomery_rv32im_asm.S new file mode 100644 index 000000000..606a13379 --- /dev/null +++ b/mldsa/src/native/rv32im/src/poly_pointwise_montgomery_rv32im_asm.S @@ -0,0 +1,79 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * RV32-IM ML-DSA pointwise polynomial multiplication with Montgomery + * reduction. Computes + * + * a[i] = (a[i] * b[i]) * R^-1 mod q, R = 2^32, |result| < q, + * + * for i in 0..256, in-place in a. + * + * Modular arithmetic: standard signed Montgomery reduction. 
Unlike the + * NTT, neither operand is constant, so we can't precompute a twisted + * form -- the kernel uses 4 multiplies per coefficient: + * + * plo = low (a * b) ; mul + * m = low (plo * QINV) ; mul (low 32 of (plo * QINV)) + * phi = high(a * b) ; mulh + * mh = high(m * q) ; mulh + * r = phi - mh ; sub + * + * Bounds: requires |a[i]|, |b[i]| < MLD_NTT_BOUND = 9*q. The product + * is bounded by (9q)^2 < 2^31 * q, well within the safe input range + * for `mld_montgomery_reduce` (which is |a| <= 2^31 * q). + */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_RV32IM) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/riscv32/src/poly_pointwise_montgomery_rv32im_asm.S using scripts/simpasm. Do not modify it directly. + */ + +.text +.balign 4 +.global MLD_ASM_NAMESPACE(poly_pointwise_montgomery_rv32im_asm) +MLD_ASM_FN_SYMBOL(poly_pointwise_montgomery_rv32im_asm) + + .cfi_startproc + addi sp, sp, -0x8 + .cfi_adjust_cfa_offset 0x8 + sw s0, 0x0(sp) + sw s1, 0x4(sp) + lui s0, 0x7fe + addi s0, s0, 0x1 + lui s1, 0x3802 + addi s1, s1, 0x1 + addi t0, a0, 0x400 + +Lpoly_pointwise_montgomery_rv32im_loop: + lw a2, 0x0(a0) + lw a3, 0x0(a1) + mul a4, a2, a3 + mul a6, a4, s1 + mulh a5, a2, a3 + mulh a7, a6, s0 + sub a2, a5, a7 + sw a2, 0x0(a0) + addi a0, a0, 0x4 + addi a1, a1, 0x4 + bne a0, t0, Lpoly_pointwise_montgomery_rv32im_loop + lw s0, 0x0(sp) + lw s1, 0x4(sp) + addi sp, sp, 0x8 + .cfi_adjust_cfa_offset -0x8 + ret + .cfi_endproc + +MLD_ASM_FN_SIZE(poly_pointwise_montgomery_rv32im_asm) + +#endif /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/mldsa/src/native/rv32im/src/rv32im_zetas.c b/mldsa/src/native/rv32im/src/rv32im_zetas.c new file mode 100644 index 000000000..05cd415a7 --- /dev/null +++ b/mldsa/src/native/rv32im/src/rv32im_zetas.c @@ -0,0 +1,118 @@ +/* + * Copyright (c) 
The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * WARNING: This file is auto-generated from scripts/autogen + * in the mldsa-native repository. + * Do not modify it directly. + */ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_RV32IM) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +#include "arith_native_rv32im.h" + +/* + * Table of zeta values used in the RV32-IM forward NTT. + * Each entry is a (zeta, w) Barrett pair, with zeta the plain + * centered twiddle (|zeta| <= q/2) and w = round(zeta * 2^32 / q) + * the Barrett multiplier. See autogen for details. + */ +MLD_ALIGN MLD_INTERNAL_DATA_DEFINITION const int32_t + mld_rv32im_ntt_zetas[510] = { + -3572223, -1830765815, 3765607, 1929875198, 3761513, 1927777021, + -3201494, -1640767044, -601683, -308362795, 3542485, 1815525077, + -2883726, -1477910808, 2682288, 1374673747, 2129892, 1091570561, + -3145678, -1612161320, 3764867, 1929495947, -1005239, -515185417, + -3201430, -1640734244, 557458, 285697463, -1221177, -625853735, + -3370349, -1727305304, 3602218, 1846138265, 3182878, 1631226336, + -4063053, -2082316400, 2740543, 1404529459, -3586446, -1838055109, + 2663378, 1364982364, -3110818, -1594295555, 2101410, 1076973524, + -1674615, -858240904, 3704823, 1898723372, 1159875, 594436433, + -3524442, -1806278032, 394148, 202001019, 928749, 475984260, + -434125, -222489248, 1095468, 561427818, -3506380, -1797021249, + 676590, 346752664, 2071829, 1061813248, -4018989, -2059733581, + -1335936, -684667771, 3241972, 1661512036, 2156050, 1104976547, + -3227876, -1654287830, 3415069, 1750224323, 1759347, 901666090, + 1714295, 878576921, -817536, -418987550, -3574466, -1831915353, + 2453983, 1257667337, 3756790, 1925356481, -1935799, -992097815, + 1460718, 748618600, -1716988, -879957084, -3950053, -2024403852, + -642628, -329347125, -2897314, -1484874664, 3192354, 1636082790, + -3585098, -1837364258, 556856, 285388938, 3870317, 1983539117, + 
2815639, 1443016191, 2917338, 1495136972, 1853806, 950076368, + 2283733, 1170414139, 3345963, 1714807468, 1858416, 952438995, + 3073009, 1574918427, 1753, 898413, -1935420, -991903578, + 1277625, 654783359, -2659525, -1363007700, -1455890, -746144248, + -2635473, -1350681039, 2660408, 1363460238, -1780227, -912367099, + 3852015, 1974159335, -59148, -30313375, 2772600, 1420958686, + 4183372, 2143979939, 1182243, 605900043, 87208, 44694137, + -3222807, -1651689966, 636927, 326425360, -3965306, -2032221021, + -3121440, -1599739335, -3956745, -2027833504, -2296397, -1176904444, + -274060, -140455867, -3284915, -1683520342, -3716946, -1904936414, + 2508980, 1285853323, -27812, -14253662, 822541, 421552614, + 2028118, 1039411342, 1009365, 517299994, -2454145, -1257750362, + 1937570, 993005454, -1979497, -1014493059, 1596822, 818371958, + -3815725, -1955560694, -3956944, -2027935492, -3759465, -1926727420, + 2811291, 1440787840, -1685153, -863641633, -3410568, -1747917558, + -2983781, -1529189038, 2678278, 1372618620, -3768948, -1931587462, + -1109516, -568627424, -3551006, -1819892093, 635956, 325927722, + 4158088, 2131021878, -250446, -128353682, -2455377, -1258381762, + 1528066, 783134478, -4146264, -2124962073, -1772588, -908452108, + 482649, 247357819, 2192938, 1123881663, -1727088, -885133339, + 1148858, 588790216, 2387513, 1223601433, -3611750, -1851023419, + -2962264, -1518161567, -268456, -137583815, -3180456, -1629985060, + -565603, -289871779, 3747250, 1920467227, 2296099, 1176751719, + 169688, 86965173, 1239911, 635454918, -3838479, -1967222129, + 2462444, 1262003603, 3195676, 1637785316, 2642980, 1354528380, + -3334383, -1708872713, 1254190, 642772911, -12417, -6363718, + -4166425, -2135294594, 2998219, 1536588520, 141835, 72690498, + -3488383, -1787797779, -89301, -45766801, 2513018, 1287922800, + 1987814, 1018755525, -1354892, -694382729, 613238, 314284737, + -3197248, -1638590967, -1310261, -671509323, -2218467, -1136965286, + 1736313, 889861155, -458740, 
-235104446, -1921994, -985022747, + 235407, 120646188, 4040196, 2070602178, -3472069, -1779436847, + -3250154, -1665705315, 2039144, 1045062172, -1879878, -963438279, + 3258457, 1669960606, -818761, -419615363, -2178965, -1116720494, + -2579253, -1321868265, -1623354, -831969619, 2105286, 1078959975, + 1787943, 916321552, -2374402, -1216882040, -2033807, -1042326957, + -2391089, -1225434135, 586241, 300448763, -1179613, -604552167, + -2254727, -1155548552, 527981, 270590488, -2743411, -1405999311, + 3482206, 1784632064, -1476985, -756955444, 1994046, 1021949428, + -4182915, -2143745726, 2491325, 1276805128, -1393159, -713994583, + -1300016, -666258756, 507927, 260312805, -1187885, -608791570, + -2362063, -1210558298, -724804, -371462360, -1834526, -940195359, + -1317678, -675310538, -3033742, -1554794072, -338420, -173440395, + 2461387, 1261461890, 2647994, 1357098057, 3009748, 1542497137, + 3035980, 1555941048, -2612853, -1339088280, 4148469, 2126092136, + 621164, 318346816, 749577, 384158533, -4022750, -2061661095, + 3901472, 1999506068, 3980599, 2040058690, 2569011, 1316619236, + -1226661, -628664287, -1615530, -827959816, 1723229, 883155599, + 2925816, 1499481951, 1665318, 853476187, 2028038, 1039370342, + 3374250, 1729304568, 1163598, 596344473, -3369273, -1726753853, + 1356448, 695180180, 3994671, 2047270596, -11879, -6087993, + -2775755, -1422575624, -1370517, -702390549, 3020393, 1547952704, + 2683270, 1375177022, 3363542, 1723816713, 214880, 110126092, + -2778788, -1424130038, 545376, 279505433, -770441, -394851342, + -3467665, -1777179795, 3105558, 1591599803, -1103344, -565464272, + 2312838, 1185330464, 508145, 260424530, -553718, -283780712, + -653275, -334803717, 860144, 440824168, 3430436, 1758099917, + -459163, -235321234, 140244, 71875110, -1514152, -776003547, + 348812, 178766299, -2185084, -1119856484, 3123762, 1600929361, + -327848, -168022240, 2358373, 1208667171, -2193087, -1123958025, + 1011223, 518252220, -3014420, -1544891539, -1716814, 
-879867909, + -2354215, -1206536194, 2926054, 1499603926, -392707, -201262505, + -3818627, -1957047970, -303005, -155290192, 3531229, 1809756372, + -1922253, -985155484, -3974485, -2036925262, -3773731, -1934038751, + -2236726, -1146323031, 1900052, 973777462, -781875, -400711272, + 1744507, 894060583, 1054478, 540420426, -731434, -374860238, +}; + +#else /* MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ + +MLD_EMPTY_CU(rv32im_zetas) + +#endif /* !(MLD_ARITH_BACKEND_RV32IM && !MLD_CONFIG_MULTILEVEL_NO_SHARED) */ diff --git a/scripts/autogen b/scripts/autogen index 3ce73146f..37b1bda22 100755 --- a/scripts/autogen +++ b/scripts/autogen @@ -712,6 +712,28 @@ def signed_reduce(a): return c +def ntt_root_of_unity(layer, block, inv=False, scale=False): + """Root of unity (un-reduced) for the given (layer, block) of the NTT. + + Shared by all backends; the arch-specific generators differ only in how + they post-process this value via their prepare_root_for_* helper. + + We are computing a negacyclic NTT; the twiddles needed are the second + half of the twiddles for a cyclic NTT of twice the size. Layers are + numbered 0 through 7. + + inv: negate the exponent (inverse NTT). 
+ scale: fold in the 2^{-8} of the inverse NTT and the Montgomery factor + 2^32 (used for the final scaling twiddle).""" + log = bitreverse(pow(2, layer) + block, 8) + if inv is True: + log = -log + root = pow(root_of_unity, log, modulus) + if scale is True: + root = root * pow(2, 32 - 8, modulus) + return root + + def gen_c_zetas(): """Generate source and header file for zeta values used in the reference NTT and invNTT""" @@ -743,43 +765,52 @@ def gen_c_zeta_file(): update_file("mldsa/src/zetas.inc", "\n".join(gen()), force_format=True) -def prepare_root_for_barrett(root): - """Takes a constant that the code needs to Barrett-multiply with, - and returns the pair of (a) its signed canonical form, (b) the - twisted constant used in the high-mul part of the Barrett multiplication.""" +def prepare_root_for_barrett(root, sqrdmulh=False): + """Takes a constant that the code needs to Barrett-multiply with, and + returns the pair (z, w): z the signed canonical form of `root`, and w the + multiplier for the high-mul step that approximates t = round(a * z / q), + with w ~= z * 2^32 / q. + + The exact form of w depends on the high-mul instruction: + + sqrdmulh=True (AArch64 NEON `sqrdmulh`, which computes + round(2 * a * c / 2^32)): the multiplier is halved, so + the instruction's built-in doubling restores the full + value. Rounding to even before halving keeps it exact. + sqrdmulh=False (a plain signed `mulh`): the full round-to-nearest value. + It fits int32 since |z| <= q/2 implies |w| <= 2^31.""" # Signed canonical reduction - root = signed_reduce(root) - - def round_to_even(t): - rt = round(t) - if rt % 2 == 0: - return rt - # Make sure to pick a rounding target - # that's <= 1 away from x in absolute value. 
- if rt <= t: - return rt + 1 - return rt - 1 - - root_twisted = round_to_even((root * 2**32) / modulus) // 2 - return root, root_twisted + z = signed_reduce(root) + scaled = (z * 2**32) / modulus + + if sqrdmulh: + + def round_to_even(t): + rt = round(t) + if rt % 2 == 0: + return rt + # Make sure to pick a rounding target + # that's <= 1 away from x in absolute value. + if rt <= t: + return rt + 1 + return rt - 1 + + w = round_to_even(scaled) // 2 + else: + w = round(scaled) + # The multiplier is stored in a signed 32-bit table lane. This holds for + # both forms since |z| <= q/2: the mulh value peaks near 2^31 and the + # (halved) sqrdmulh value near 2^30. + assert -(2**31) <= w < 2**31 -def gen_aarch64_root_of_unity_for_block(layer, block, inv=False, scale=False): - # We are computing a negacyclic NTT; the twiddles needed here is - # the second half of the twiddles for a cyclic NTT of twice the size. - # For ease of calculating the roots, layers are numbers 0 through 7 - # in this function. - log = bitreverse(pow(2, layer) + block, 8) - if inv is True: - log = -log - root = pow(root_of_unity, log, modulus) + return z, w - if scale is True: - # Integrate scaling by 2**(-8) and Montgomery factor 2**32 into twiddle - root = root * pow(2, 32 - 8, modulus) - root, root_twisted = prepare_root_for_barrett(root) +def gen_aarch64_root_of_unity_for_block(layer, block, inv=False, scale=False): + root = ntt_root_of_unity(layer, block, inv=inv, scale=scale) + root, root_twisted = prepare_root_for_barrett(root, sqrdmulh=True) return root, root_twisted @@ -958,6 +989,79 @@ def _fmt_indexed_rows(data): yield ",".join(map(str, row)) + f" /* {i} */," +def gen_rv32im_root_for_block(layer, block): + """Forward NTT zeta for the given (layer, block), in plain (non-Montgomery) + centered form, returned as the (z, w) Barrett pair consumed by the RV32-IM + assembly. 
The RV32-IM kernel uses a plain signed `mulh` (not NEON + `sqrdmulh`), hence sqrdmulh=False: + + t = mulh(a, w) ~= round(a * z / q) + r = lo(a * z) - lo(t * q) == (a * z) mod q (|r| < q + epsilon). + + No Montgomery factor is folded in: the Barrett multiplication computes + (a * z) mod q directly, which matches the input/output domain of the + previous Montgomery kernel (R was folded into the twiddle and cancelled + by R^-1, so both conventions are plain-domain).""" + root = ntt_root_of_unity(layer, block) + return prepare_root_for_barrett(root, sqrdmulh=False) + + +def gen_rv32im_fwd_ntt_zetas(): + """Yield flattened (z, w) pairs in the order consumed by the 2+2+2+2 forward NTT. + + Each of the 4 passes (L1+L2, L3+L4, L5+L6, L7+L8) emits one set of + 3 pairs per outer iteration. Layers are 0-indexed here: + + pass p uses layers (lo, hi) = (2p, 2p+1) + + For outer index o in pass p: + zeta_lo = layer lo, block o + zeta_hi0 = layer hi, block 2*o + zeta_hi1 = layer hi, block 2*o + 1 + + Total: 1 + 4 + 16 + 64 = 85 outer iters * 3 pairs = 255 pairs.""" + for p in range(4): + lo = 2 * p + hi = 2 * p + 1 + n_outer = 1 << lo # 1, 4, 16, 64 + for o in range(n_outer): + yield from gen_rv32im_root_for_block(lo, o) + yield from gen_rv32im_root_for_block(hi, 2 * o + 0) + yield from gen_rv32im_root_for_block(hi, 2 * o + 1) + + +def gen_rv32im_zeta_file(): + def gen(): + yield from gen_header() + yield '#include "../../../common.h"' + yield "" + yield "#if defined(MLD_ARITH_BACKEND_RV32IM) && \\" + yield " !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)" + yield "" + yield '#include "arith_native_rv32im.h"' + yield "" + yield "/*" + yield " * Table of zeta values used in the RV32-IM forward NTT." + yield " * Each entry is a (zeta, w) Barrett pair, with zeta the plain" + yield " * centered twiddle (|zeta| <= q/2) and w = round(zeta * 2^32 / q)" + yield " * the Barrett multiplier. See autogen for details." 
+ yield " */" + yield from emit_c_array( + "const int32_t", + "mld_rv32im_ntt_zetas", + gen_rv32im_fwd_ntt_zetas(), + ) + yield "" + yield "#else" + yield "" + yield "MLD_EMPTY_CU(rv32im_zetas)" + yield "" + yield "#endif" + yield "" + + update_file("dev/riscv32/src/rv32im_zetas.c", "\n".join(gen())) + + def gen_aarch64_zeta_file(): def gen(): yield from gen_header() @@ -1551,10 +1655,7 @@ def prepare_root_for_montmul(root, mult): def gen_avx2_root_of_unity_for_block(layer, block, mult=False): - # We are computing a negacyclic NTT; the twiddles needed here is - # the second half of the twiddles for a cyclic NTT of twice the size. - log = bitreverse(pow(2, layer) + block, 8) - root = pow(root_of_unity, log, modulus) + root = ntt_root_of_unity(layer, block) return prepare_root_for_montmul(root, mult) @@ -1892,6 +1993,10 @@ def riscv64(c): return "/riscv64/" in c +def rv32im(c): + return "/rv32im/" in c + + def armv81m(c): return "/armv81m/" in c @@ -1937,12 +2042,17 @@ def native_arith_riscv64(c): return native_arith(c) and riscv64(c) +def native_arith_rv32im(c): + return native_arith(c) and rv32im(c) + + def native_arith_core(c): return ( native_arith(c) and not native_arith_x86_64(c) and not native_arith_aarch64(c) and not native_arith_riscv64(c) + and not native_arith_rv32im(c) ) @@ -2048,6 +2158,11 @@ def gen_macro_undefs(extra_notes=None): filt=native_arith_x86_64, desc="native code (Arith, X86_64)" ) yield "#endif" + yield "#if defined(MLD_SYS_RISCV32)" + yield from gen_monolithic_undef_all_core( + filt=native_arith_rv32im, desc="native code (Arith, RV32IM)" + ) + yield "#endif" yield "#endif" yield "#endif" yield "" @@ -2125,6 +2240,10 @@ def gen_monolithic_source_file(): for c in filter(native_arith_x86_64, c_sources): yield f'#include "{c}"' yield "#endif" + yield "#if defined(MLD_SYS_RISCV32)" + for c in filter(native_arith_rv32im, c_sources): + yield f'#include "{c}"' + yield "#endif" yield "#endif" yield "" yield "#if 
defined(MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202)" @@ -2208,6 +2327,10 @@ def gen_monolithic_asm_file(): for c in filter(native_arith_x86_64, asm_sources): yield f'#include "{c}"' yield "#endif" + yield "#if defined(MLD_SYS_RISCV32)" + for c in filter(native_arith_rv32im, asm_sources): + yield f'#include "{c}"' + yield "#endif" yield "#endif" yield "" yield "#if defined(MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202)" @@ -2260,6 +2383,8 @@ def get_config_options(): "MLD_FORCE_RISCV64", "MLD_FORCE_RISCV32", "MLD_SYS_AARCH64_SLOW_BARREL_SHIFTER", + "MLD_USE_NATIVE_RV32IM_SLOW_MULTIPLIER", + "MLD_RV32IM_INTERNAL_USE_SLOW_MULTIPLIER", "MLDSA_DEBUG", # TODO: Rename? "MLD_BREAK_PCT", # Use in PCT breakage test "MLD_ALLOW_NONCOMPLIANT_SIGNING_BOUND", # Internal testing escape hatch @@ -2359,7 +2484,10 @@ def check_macro_typos(): return True # 5. AWS-LC importer patch - if is_autogen or filename == "integration/awslc/awslc.patch": + if is_autogen or filename in [ + "integration/awslc/pre_import.patch", + "integration/awslc/post_import.patch", + ]: return True if is_autogen or filename == "mldsa/src/common.h": @@ -2435,6 +2563,12 @@ def check_asm_loop_labels_for_file(filename): # Find function symbol name res = _RE_FUNC_SYMBOL.search(content) if res is None: + # Shared .i kernel bodies carry the loop labels but not the function + # symbol (which lives in the wrapper .S next to its .global). With the + # symbol in another file we cannot derive the label prefix here, so + # skip; the wrapper .S (which does carry the symbol) is checked instead. + if filename.endswith(".i"): + return raise Exception(f"Could not find function symbol in assembly file {filename}") funcname = res.group(1) lbl_prefix = re.sub(r"(_(aarch64|avx2|mve))?_asm$", "", funcname) + "_" @@ -2532,7 +2666,7 @@ def normalize_asm_macro_syntax(): # Architectures autogen knows how to (cross-)compile assembly for. Used to # expand a bare `--force-cross` (no value) into "force all architectures". 
-FORCE_CROSS_ALL_ARCHES = {"aarch64", "x86_64", "armv81m"} +FORCE_CROSS_ALL_ARCHES = {"aarch64", "x86_64", "armv81m", "riscv32"} def resolve_force_cross(value): @@ -2563,6 +2697,11 @@ def update_via_simpasm( force_cross=(), x86_64_syntax="att", ): + # force_cross: set of source architectures for which a missing cross + # toolchain is a hard error rather than a silent skip. Pass None or an + # empty set to skip silently for every arch. + if force_cross is None: + force_cross = set() _, infile = os.path.split(infile_full) if outfile is None: outfile = infile @@ -2579,6 +2718,8 @@ def update_via_simpasm( source_arch = "x86_64" elif "armv81m" in infile_full: source_arch = "armv81m" + elif "riscv32" in infile_full or "rv32im" in infile_full: + source_arch = "riscv32" else: raise Exception(f"Could not detect architecture of source file {infile_full}.") # Check native architecture @@ -2595,6 +2736,14 @@ def update_via_simpasm( if source_arch not in force_cross: return raise Exception(f"Could not find cross toolchain {cross_prefix}") + # RISC-V 32-bit is always cross-compiled. + elif source_arch == "riscv32": + cross_prefix = "riscv32-unknown-linux-gnu-" + cross_gcc = cross_prefix + "gcc" + if shutil.which(cross_gcc) is None: + if source_arch not in force_cross: + return + raise Exception(f"Could not find cross toolchain {cross_prefix}") elif native_arch != source_arch: cross_prefix = f"{source_arch}-unknown-linux-gnu-" cross_gcc = cross_prefix + "gcc" @@ -2613,6 +2762,8 @@ def update_via_simpasm( arch = "aarch64" elif "armv81m" in infile_full: arch = "armv81m" + elif "riscv32" in infile_full or "rv32im" in infile_full: + arch = "riscv32" else: arch = "x86_64" @@ -2924,8 +3075,11 @@ def update_via_remove(filename): update_file(filename, None) -# Only synchronize sources, but not README.md, Makefile and so on -SYNCHRONIZED_EXTENSIONS = (".c", ".h", ".i", ".inc", ".S") +# Only synchronize sources, but not README.md, Makefile and so on. 
+# Note: .i files are dev-only kernel bodies #include'd by wrapper .S files; +# they are flattened into the synchronized .S output by simpasm and are not +# copied to the backend mirror themselves. +SYNCHRONIZED_EXTENSIONS = (".c", ".h", ".inc", ".S") def synchronize_file(f, in_dir, out_dir, delete=False, no_simplify=False, **kwargs): @@ -3019,6 +3173,14 @@ def synchronize_backends( ), ) + update_via_copy( + "dev/riscv32/meta.h", + "mldsa/src/native/rv32im/meta.h", + transform=lambda c: adjust_header_guard_for_filename( + c, "mldsa/src/native/rv32im/meta.h" + ), + ) + synchronize_backend( f"dev/aarch64_{ty}/src", "mldsa/src/native/aarch64/src", @@ -3100,6 +3262,14 @@ def synchronize_backends( no_simplify=no_simplify, cflags="-Idev/fips202/armv81m -Imldsa/src/fips202/native/armv81m -march=armv8.1-m.main+mve -mthumb", ) + synchronize_backend( + "dev/riscv32/src", + "mldsa/src/native/rv32im/src", + delete=delete, + force_cross=force_cross, + no_simplify=no_simplify, + cflags="-Idev/riscv32/src -Imldsa/src/native/rv32im/src -march=rv32im -mabi=ilp32", + ) def adjust_header_guard_for_filename(content, header_file): @@ -4204,6 +4374,7 @@ def _main(): def gen_zeta_tables(): gen_c_zeta_file() gen_aarch64_zeta_file() + gen_rv32im_zeta_file() gen_aarch64_hol_light_zeta_file() gen_aarch64_rej_uniform_table() gen_hol_light_rej_uniform_table() diff --git a/scripts/cfify b/scripts/cfify index f37a6fa30..58ba047e0 100755 --- a/scripts/cfify +++ b/scripts/cfify @@ -160,6 +160,19 @@ ARMV81M_ADD_SP_PATTERN = re.compile( ARMV81M_BX_LR_PATTERN = re.compile(r"(\s*)bx\s+lr\s*$", re.IGNORECASE) +# ----------------------------------------------------------------------------- +# riscv32 module-scope constants +# ----------------------------------------------------------------------------- +# `addi sp, sp, -OFF` (allocate) and `addi sp, sp, +OFF` (free). 
+RISCV32_SUB_SP_PATTERN = re.compile( + r"(\s*)addi\s+sp,\s*sp,\s*-(0x[0-9a-fA-F]+|\d+)", re.IGNORECASE +) +RISCV32_ADD_SP_PATTERN = re.compile( + r"(\s*)addi\s+sp,\s*sp,\s*(0x[0-9a-fA-F]+|\d+)", re.IGNORECASE +) +RISCV32_RET_PATTERN = re.compile(r"(\s*)ret\s*$", re.IGNORECASE) + + def armv81m_parse_reg(s): """Parse a single register token, returning its canonical name (e.g. 'r14' -> 'lr'). Raises ValueError on unrecognised input.""" @@ -524,6 +537,44 @@ def add_cfi_directives(text, arch): i += 1 continue + elif arch == "riscv32": + # addi sp, sp, -OFF — stack allocation + match = RISCV32_SUB_SP_PATTERN.match(line) + if match: + indent, offset_str = match.groups() + offset = ( + int(offset_str, 16) + if offset_str.lower().startswith("0x") + else int(offset_str) + ) + result.append(line) + result.append(f"{indent}.cfi_adjust_cfa_offset {offset:#x}") + i += 1 + continue + + # addi sp, sp, +OFF — stack deallocation + match = RISCV32_ADD_SP_PATTERN.match(line) + if match: + indent, offset_str = match.groups() + offset = ( + int(offset_str, 16) + if offset_str.lower().startswith("0x") + else int(offset_str) + ) + result.append(line) + result.append(f"{indent}.cfi_adjust_cfa_offset -{offset:#x}") + i += 1 + continue + + # ret — function return + match = RISCV32_RET_PATTERN.match(line) + if match: + indent = match.group(1) + result.append(line) + result.append(f"{indent}.cfi_endproc") + i += 1 + continue + result.append(line) i += 1 @@ -543,7 +594,7 @@ def main(): ) parser.add_argument( "--arch", - choices=["aarch64", "x86_64", "armv81m"], + choices=["aarch64", "x86_64", "armv81m", "riscv32"], default="aarch64", help="Target architecture (default: aarch64)", ) diff --git a/scripts/simpasm b/scripts/simpasm index dc34079a1..62cca9adf 100755 --- a/scripts/simpasm +++ b/scripts/simpasm @@ -256,6 +256,11 @@ def simplify(logger, args, asm_input, asm_output=None): # Armv8.1-M requires explicit triple for Thumb disassembly if args.arch == "armv81m": cmd += 
["--triple=thumbv8.1m.main-none-eabi"] + # RISC-V 32-bit ILP32 needs an explicit triple so llvm-objdump + # decodes the M extension (mul/mulh) instead of marking them + # as illegal. + if args.arch == "riscv32": + cmd += ["--triple=riscv32", "--mattr=+m"] # Add syntax option if specified if args.syntax and args.syntax.lower() != "att": diff --git a/test/mk/components.mk b/test/mk/components.mk index 67698aabe..00df4d67c 100644 --- a/test/mk/components.mk +++ b/test/mk/components.mk @@ -10,7 +10,7 @@ endif SOURCES += $(wildcard mldsa/src/*.c) ifeq ($(OPT),1) - SOURCES += $(wildcard mldsa/src/native/aarch64/src/*.[csS]) $(wildcard mldsa/src/native/x86_64/src/*.[csS]) + SOURCES += $(wildcard mldsa/src/native/aarch64/src/*.[csS]) $(wildcard mldsa/src/native/x86_64/src/*.[csS]) $(wildcard mldsa/src/native/rv32im/src/*.[csS]) CFLAGS += -DMLD_CONFIG_USE_NATIVE_BACKEND_ARITH -DMLD_CONFIG_USE_NATIVE_BACKEND_FIPS202 endif