diff --git a/mldsa/src/rounding.h b/mldsa/src/rounding.h index 21fae3c98..9438a329e 100644 --- a/mldsa/src/rounding.h +++ b/mldsa/src/rounding.h @@ -91,78 +91,34 @@ __contract__( ) { /* - * The goal is to compute f1 = round-(f / (2*GAMMA2)), which can be computed - * alternatively as round-(f / (128B)) = round-(ceil(f / 128) / B) where - * B = 2*GAMMA2 / 128. Here round-() denotes "round half down". - * - * The equality round-(f / (128B)) = round-(ceil(f / 128) / B) can deduced - * as follows. Since changing f to align-up(f, 128) can move f onto but not - * across a rounding boundary for division by 128*B (note that we need B to be - * even for this to work), and round- rounds down on the boundary, we have - * - * round-(f / (128B)) = round-(align-up(f, 128) / (128B)) - * = round-((align-up(f, 128) / 128) / B) - * = round-(ceil(f / 128) / B). + * Compute a1 = round-(a / (2*GAMMA2)) with a single Barrett-style high + * multiplication, where round-() denotes "round half down". This mirrors the + * AArch64 backend (which uses sqdmulh + srshr) and is exact for + * 0 <= a < MLDSA_Q. See proofs/isabelle/compress for a formalization of the + * argument. */ - *a1 = (a + 127) >> 7; - /* We know a >= 0 and a < MLDSA_Q, so... */ - /* check-magic: 65472 == round((MLDSA_Q-1)/128) */ - mld_assert(*a1 >= 0 && *a1 <= 65472); - #if MLD_CONFIG_PARAMETER_SET == 44 - /* check-magic: 1488 == 2 * intdiv(intdiv(MLDSA_Q - 1, 88), 128) */ - /* check-magic: 11275 == floor(2**24 / 1488) */ - /* check-magic: 1560281088 == 1 / (1 / 1488 - 11275 / 2**24) */ + /* check-magic: 1477838209 == floor(2**48 / 190464) */ /* - * Compute f1 = round-(f1' / B) ≈ round(f1' * 11275 / 2^24). This is exact for - * 0 <= f1' < 2^16. - * - * To see this, consider the (signed) error f1' * (1 / B - 11275 / 2^24) - * between f1' / B and the (under-)approximation f1' * 11275 / 2^24. Because - * eps := 1 / B - 11275 / 2^24 is 1 / 1560281088 ≈ 2^(-30.54) < 2^(-30), we - * have 0 <= f1' * eps < 2^16 * 2^(-30) = 1 / 2^14 < 1 / 2^11 < 1 / B (note - * that f1' is non-negative). - * - * On the other hand, 1 / B is the spacing between the integral multiples - * of 1 / B, which includes all rounding boundaries n + 0.5 (since B is even). - * Hence, if f1' / B is not of the form n + 0.5, then it is at least 1 / B - * away from the nearest rounding boundary, so moving from f1' / B to - * f1' * 11275 / 2^24 does not affect the rounding result, no matter the type - * of rounding used in either side. In particular, we have round-(f1' / B) = - * round(f1' * 11275 / 2^24) as claimed. - * - * As for the remaining case where f1' / B _is_ of the form n + 0.5, because - * f1' * 11275 / 2^24 is slightly but strictly below f1' / B = n + 0.5 (note - * that f1' and thus the error f1' * eps cannot be 0 here), it is always - * rounded down to n. More precisely, we have round-(f1' / B) = - * round(f1' * 11275 / 2^24), where the round-down on the LHS is essential, - * and on the RHS the type of rounding again does not matter. This concludes - * the proof. - * - * See proofs/isabelle/compress for a formalization of the above argument. + * a1 = round-(a / (2*GAMMA2)) = round(a * 1477838209 / 2^48). Half is rounded + * down since 1477838209 / 2^48 < 1 / (2*GAMMA2). */ - *a1 = (*a1 * 11275 + (1 << 23)) >> 24; + *a1 = (int32_t)(((int64_t)a * 1477838209 + ((int64_t)1 << 47)) >> 48); mld_assert(*a1 >= 0 && *a1 <= 44); *a1 = mld_ct_sel_int32(0, *a1, mld_ct_cmask_neg_i32(43 - *a1)); mld_assert(*a1 >= 0 && *a1 <= 43); -#else /* MLD_CONFIG_PARAMETER_SET == 44 */ - /* check-magic: 4092 == 2 * intdiv(intdiv(MLDSA_Q - 1, 32), 128) */ - /* check-magic: 1025 == floor(2**22 / 4092) */ - /* check-magic: 4290772992 == 1 / (1 / 4092 - 1025 / 2**22) */ +#else /* MLD_CONFIG_PARAMETER_SET == 44 */ + /* check-magic: 1074791425 == floor(2**49 / 523776) */ /* - * Compute f1 = round-(f1' / B) ≈ round(f1' * 1025 / 2^22). This is exact for - * 0 <= f1' < 2^16. Following the same argument above, it suffices to show - * that f1' * eps < 1 / B, where eps := 1 / B - 1025 / 2^22. Indeed, we have - * eps = 1 / 4290772992 ≈ 2^(-31.99) < 2^(-31), therefore f1' * eps < - * 2^16 * 2^(-31) = 1 / 2^15 < 1 / 2^12 < 1 / B. + * a1 = round-(a / (2*GAMMA2)) = round(a * 1074791425 / 2^49). Half is rounded + * down since 1074791425 / 2^49 < 1 / (2*GAMMA2). */ - *a1 = (*a1 * 1025 + (1 << 21)) >> 22; + *a1 = (int32_t)(((int64_t)a * 1074791425 + ((int64_t)1 << 48)) >> 49); mld_assert(*a1 >= 0 && *a1 <= 16); *a1 &= 15; mld_assert(*a1 >= 0 && *a1 <= 15); - #endif /* MLD_CONFIG_PARAMETER_SET != 44 */ *a0 = a - *a1 * 2 * MLDSA_GAMMA2; diff --git a/proofs/isabelle/compress/ML-DSA_Compress.thy b/proofs/isabelle/compress/ML-DSA_Compress.thy index 89a1c595b..7ec2a2cc7 100644 --- a/proofs/isabelle/compress/ML-DSA_Compress.thy +++ b/proofs/isabelle/compress/ML-DSA_Compress.thy @@ -13,8 +13,8 @@ text \ \<^item> Parameter set 44 (ML-DSA-44): \<^verbatim>\GAMMA2 = 95232\, divisor \<^verbatim>\2*GAMMA2 = 190464\ The implementations use different Barrett division strategies: - \<^item> C and AVX2: Factor out \<^verbatim>\128\ first, then Barrett divide by \<^verbatim>\4092\ or \<^verbatim>\1488\ - \<^item> AArch64: Barrett divide directly by \<^verbatim>\523776\ or \<^verbatim>\190464\ + \<^item> AVX2: Factor out \<^verbatim>\128\ first, then Barrett divide by \<^verbatim>\4092\ or \<^verbatim>\1488\ + \<^item> C and AArch64: Barrett divide directly by \<^verbatim>\523776\ or \<^verbatim>\190464\ \<^bold>\Limitation:\ The results below operate on unbounded integers (@{typ int}), not fixed-width machine words. Connecting these results to the actual implementations @@ -22,10 +22,10 @@ text \ \<^verbatim>\mulhi\, overflow behavior) which is left for future work. \ -subsection \C and AVX2 implementations\ +subsection \AVX2 implementation\ text \ - The C reference \<^file>\../../../mldsa/src/rounding.h\ and AVX2 implementations + The AVX2 implementations \<^file>\../../../dev/x86_64/src/poly_decompose_32_avx2.c\ and \<^file>\../../../dev/x86_64/src/poly_decompose_88_avx2.c\ first compute \<^verbatim>\ceil(f / 128)\, then Barrett divide by \<^verbatim>\B = 2*GAMMA2 / 128\. @@ -41,13 +41,14 @@ corollary barrett_decompose_88_c_avx2: shows \round_half_down (rat_of_int a / 1488) = (a * 11275 + 2^23) div 2^24\ using barrett_division_correct[where b=1488 and N=24, simplified barrett_division, eval] assms by simp -subsection \AArch64 implementation\ +subsection \C and AArch64 implementations\ text \ - The AArch64 implementations + The C reference \<^file>\../../../mldsa/src/rounding.h\ and the AArch64 implementations \<^file>\../../../dev/aarch64_clean/src/poly_decompose_32_aarch64_asm.S\ and \<^file>\../../../dev/aarch64_clean/src/poly_decompose_88_aarch64_asm.S\ - Barrett divide directly by \<^verbatim>\2*GAMMA2\ using \<^verbatim>\sqdmulh\ and \<^verbatim>\srshr\. + Barrett divide directly by \<^verbatim>\2*GAMMA2\ (the AArch64 backend using \<^verbatim>\sqdmulh\ and \<^verbatim>\srshr\). + The corollaries below carry the suffix \<^verbatim>\aarch64\ for historical reasons but apply to the C reference as well. \ corollary barrett_decompose_32_aarch64: diff --git a/proofs/isabelle/neon_ntt/Barrett_Division_Even.thy b/proofs/isabelle/neon_ntt/Barrett_Division_Even.thy index 41e298d45..425955aef 100644 --- a/proofs/isabelle/neon_ntt/Barrett_Division_Even.thy +++ b/proofs/isabelle/neon_ntt/Barrett_Division_Even.thy @@ -555,7 +555,9 @@ lemma %visible \2 * MLDSA_GAMMA2_88 = 190464\ text\The following results are at the heart of the correctness of the AArch64 assembly computing \cite[Algorithm~36]{FIPS204} in \texttt{mldsa-native}, proved -against the instruction model in \autoref{sec:aarch64_decompose}:\ +against the instruction model in \autoref{sec:aarch64_decompose}. The C reference +implementation uses the same direct Barrett-division strategy (the corollary names +carry the \texttt{aarch64} suffix for historical reasons but apply to it as well):\ corollary barrett_decompose_32_aarch64: assumes \a \ 0\ and \a < MLDSA_Q\ @@ -585,8 +587,8 @@ proof - thus ?thesis unfolding g using m by simp qed -text \The C and AVX2 backends in mldsa-native instead divide by \<^term>\128\ first --- rounding \<^emph>\up\, -i.e.\ taking \<^term>\\a /\<^sub>\ 128\\ --- and then Barrett-divide by \<^term>\(2 * \\<^sub>2) div 128\:\ +text \The AVX2 backend in mldsa-native instead divides by \<^term>\128\ first --- rounding \<^emph>\up\, +i.e.\ taking \<^term>\\a /\<^sub>\ 128\\ --- and then Barrett-divides by \<^term>\(2 * \\<^sub>2) div 128\:\ corollary barrett_decompose_32_c_avx2: assumes \a \ 0\ and \a < MLDSA_Q\