diff --git a/.github/workflows/hol_light.yml b/.github/workflows/hol_light.yml index ef89ac54a..6c7dd4300 100644 --- a/.github/workflows/hol_light.yml +++ b/.github/workflows/hol_light.yml @@ -205,6 +205,10 @@ jobs: needs: ["mldsa_specs.ml", "mldsa_utils.ml", "subroutine_signatures.ml"] - name: poly_chknorm_avx2_asm needs: ["mldsa_specs.ml", "mldsa_utils.ml", "subroutine_signatures.ml"] + - name: poly_decompose_32_avx2_asm + needs: ["mldsa_specs.ml", "mldsa_utils.ml", "subroutine_signatures.ml"] + - name: poly_decompose_88_avx2_asm + needs: ["mldsa_specs.ml", "mldsa_utils.ml", "subroutine_signatures.ml"] - name: polyz_unpack_17_avx2_asm needs: ["mldsa_specs.ml", "mldsa_utils.ml", "subroutine_signatures.ml"] - name: polyz_unpack_19_avx2_asm diff --git a/BIBLIOGRAPHY.md b/BIBLIOGRAPHY.md index 2ed637232..f4ec4dc75 100644 --- a/BIBLIOGRAPHY.md +++ b/BIBLIOGRAPHY.md @@ -272,8 +272,8 @@ source code and documentation. - [dev/x86_64/src/pointwise_avx2_asm.S](dev/x86_64/src/pointwise_avx2_asm.S) - [dev/x86_64/src/poly_caddq_avx2_asm.S](dev/x86_64/src/poly_caddq_avx2_asm.S) - [dev/x86_64/src/poly_chknorm_avx2_asm.S](dev/x86_64/src/poly_chknorm_avx2_asm.S) - - [dev/x86_64/src/poly_decompose_32_avx2.c](dev/x86_64/src/poly_decompose_32_avx2.c) - - [dev/x86_64/src/poly_decompose_88_avx2.c](dev/x86_64/src/poly_decompose_88_avx2.c) + - [dev/x86_64/src/poly_decompose_32_avx2_asm.S](dev/x86_64/src/poly_decompose_32_avx2_asm.S) + - [dev/x86_64/src/poly_decompose_88_avx2_asm.S](dev/x86_64/src/poly_decompose_88_avx2_asm.S) - [dev/x86_64/src/poly_use_hint_32_avx2.c](dev/x86_64/src/poly_use_hint_32_avx2.c) - [dev/x86_64/src/poly_use_hint_88_avx2.c](dev/x86_64/src/poly_use_hint_88_avx2.c) - [dev/x86_64/src/polyz_unpack_17_avx2_asm.S](dev/x86_64/src/polyz_unpack_17_avx2_asm.S) @@ -290,8 +290,8 @@ source code and documentation. 
- [mldsa/src/native/x86_64/src/pointwise_avx2_asm.S](mldsa/src/native/x86_64/src/pointwise_avx2_asm.S) - [mldsa/src/native/x86_64/src/poly_caddq_avx2_asm.S](mldsa/src/native/x86_64/src/poly_caddq_avx2_asm.S) - [mldsa/src/native/x86_64/src/poly_chknorm_avx2_asm.S](mldsa/src/native/x86_64/src/poly_chknorm_avx2_asm.S) - - [mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c](mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c) - - [mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c](mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c) + - [mldsa/src/native/x86_64/src/poly_decompose_32_avx2_asm.S](mldsa/src/native/x86_64/src/poly_decompose_32_avx2_asm.S) + - [mldsa/src/native/x86_64/src/poly_decompose_88_avx2_asm.S](mldsa/src/native/x86_64/src/poly_decompose_88_avx2_asm.S) - [mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c](mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c) - [mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c](mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c) - [mldsa/src/native/x86_64/src/polyz_unpack_17_avx2_asm.S](mldsa/src/native/x86_64/src/polyz_unpack_17_avx2_asm.S) @@ -308,6 +308,8 @@ source code and documentation. 
- [proofs/hol_light/x86_64/mldsa/pointwise_avx2_asm.S](proofs/hol_light/x86_64/mldsa/pointwise_avx2_asm.S) - [proofs/hol_light/x86_64/mldsa/poly_caddq_avx2_asm.S](proofs/hol_light/x86_64/mldsa/poly_caddq_avx2_asm.S) - [proofs/hol_light/x86_64/mldsa/poly_chknorm_avx2_asm.S](proofs/hol_light/x86_64/mldsa/poly_chknorm_avx2_asm.S) + - [proofs/hol_light/x86_64/mldsa/poly_decompose_32_avx2_asm.S](proofs/hol_light/x86_64/mldsa/poly_decompose_32_avx2_asm.S) + - [proofs/hol_light/x86_64/mldsa/poly_decompose_88_avx2_asm.S](proofs/hol_light/x86_64/mldsa/poly_decompose_88_avx2_asm.S) - [proofs/hol_light/x86_64/mldsa/polyz_unpack_17_avx2_asm.S](proofs/hol_light/x86_64/mldsa/polyz_unpack_17_avx2_asm.S) - [proofs/hol_light/x86_64/mldsa/polyz_unpack_19_avx2_asm.S](proofs/hol_light/x86_64/mldsa/polyz_unpack_19_avx2_asm.S) diff --git a/dev/x86_64/meta.h b/dev/x86_64/meta.h index 55924ffec..6e2641c0e 100644 --- a/dev/x86_64/meta.h +++ b/dev/x86_64/meta.h @@ -153,7 +153,7 @@ static MLD_INLINE int mld_poly_decompose_32_native(int32_t *a1, int32_t *a0) { return MLD_NATIVE_FUNC_FALLBACK; } - mld_poly_decompose_32_avx2(a1, a0); + mld_poly_decompose_32_avx2_asm(a1, a0); return MLD_NATIVE_FUNC_SUCCESS; } #endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \ @@ -167,7 +167,7 @@ static MLD_INLINE int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0) { return MLD_NATIVE_FUNC_FALLBACK; } - mld_poly_decompose_88_avx2(a1, a0); + mld_poly_decompose_88_avx2_asm(a1, a0); return MLD_NATIVE_FUNC_SUCCESS; } #endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44 \ diff --git a/dev/x86_64/src/arith_native_x86_64.h b/dev/x86_64/src/arith_native_x86_64.h index 6ec3c1434..ce91fafc2 100644 --- a/dev/x86_64/src/arith_native_x86_64.h +++ b/dev/x86_64/src/arith_native_x86_64.h @@ -95,11 +95,39 @@ unsigned mld_rej_uniform_eta4_avx2( #endif /* !MLD_CONFIG_NO_KEYPAIR_API */ #if !defined(MLD_CONFIG_NO_SIGN_API) -#define mld_poly_decompose_32_avx2 
MLD_NAMESPACE(mld_poly_decompose_32_avx2) -void mld_poly_decompose_32_avx2(int32_t *a1, int32_t *a0); +#define mld_poly_decompose_32_avx2_asm MLD_NAMESPACE(poly_decompose_32_avx2_asm) +MLD_SYSV_ABI +void mld_poly_decompose_32_avx2_asm(int32_t *a1, int32_t *a0) +/* This must be kept in sync with the HOL-Light specification + * in proofs/hol_light/x86_64/proofs/poly_decompose_32_avx2_asm.ml */ +__contract__( + requires(memory_no_alias(a1, sizeof(int32_t) * MLDSA_N)) + requires(memory_no_alias(a0, sizeof(int32_t) * MLDSA_N)) + requires(array_bound(a0, 0, MLDSA_N, 0, MLDSA_Q)) + assigns(memory_slice(a1, sizeof(int32_t) * MLDSA_N)) + assigns(memory_slice(a0, sizeof(int32_t) * MLDSA_N)) + /* check-magic: 16 == (MLDSA_Q - 1) / (2 * ((MLDSA_Q - 1) / 32)) */ + ensures(array_bound(a1, 0, MLDSA_N, 0, 16)) + /* check-magic: 261889 == (MLDSA_Q - 1) / 32 + 1 */ + ensures(array_abs_bound(a0, 0, MLDSA_N, 261889)) +); -#define mld_poly_decompose_88_avx2 MLD_NAMESPACE(mld_poly_decompose_88_avx2) -void mld_poly_decompose_88_avx2(int32_t *a1, int32_t *a0); +#define mld_poly_decompose_88_avx2_asm MLD_NAMESPACE(poly_decompose_88_avx2_asm) +MLD_SYSV_ABI +void mld_poly_decompose_88_avx2_asm(int32_t *a1, int32_t *a0) +/* This must be kept in sync with the HOL-Light specification + * in proofs/hol_light/x86_64/proofs/poly_decompose_88_avx2_asm.ml */ +__contract__( + requires(memory_no_alias(a1, sizeof(int32_t) * MLDSA_N)) + requires(memory_no_alias(a0, sizeof(int32_t) * MLDSA_N)) + requires(array_bound(a0, 0, MLDSA_N, 0, MLDSA_Q)) + assigns(memory_slice(a1, sizeof(int32_t) * MLDSA_N)) + assigns(memory_slice(a0, sizeof(int32_t) * MLDSA_N)) + /* check-magic: 44 == (MLDSA_Q - 1) / (2 * ((MLDSA_Q - 1) / 88)) */ + ensures(array_bound(a1, 0, MLDSA_N, 0, 44)) + /* check-magic: 95233 == (MLDSA_Q - 1) / 88 + 1 */ + ensures(array_abs_bound(a0, 0, MLDSA_N, 95233)) +); #endif /* !MLD_CONFIG_NO_SIGN_API */ #define mld_poly_caddq_avx2_asm MLD_NAMESPACE(poly_caddq_avx2_asm) diff --git 
a/dev/x86_64/src/poly_decompose_32_avx2.c b/dev/x86_64/src/poly_decompose_32_avx2.c deleted file mode 100644 index c97d0c729..000000000 --- a/dev/x86_64/src/poly_decompose_32_avx2.c +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Copyright (c) The mldsa-native project authors - * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - */ - -/* References - * ========== - * - * - [REF_AVX2] - * CRYSTALS-Dilithium optimized AVX2 implementation - * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé - * https://github.com/pq-crystals/dilithium/tree/master/avx2 - */ - -/* - * This file is derived from the public domain - * AVX2 Dilithium implementation @[REF_AVX2]. - * - * The algorithm for Decompose(r) (more specifically the handling for the - * wrap-around cases) are modified. See the "Reference" section in the comments - * below for a more detailed comparison. - */ - -#include "../../../common.h" - -#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ - !defined(MLD_CONFIG_NO_SIGN_API) && \ - !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ - (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ - (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)) - -#include <immintrin.h> -#include "arith_native_x86_64.h" -#include "consts.h" - -/* - * Reference: The reference implementation has the input polynomial as a - * separate argument that may be aliased with either of the outputs. - * Removing the aliasing eases CBMC proofs. 
- */ -void mld_poly_decompose_32_avx2(int32_t *a1, int32_t *a0) -{ - unsigned int i; - __m256i f, f0, f1, t; - const __m256i q_bound = _mm256_set1_epi32(31 * ((MLDSA_Q - 1) / 32)); - /* check-magic: 1025 == floor(2**22 / 4092) */ - const __m256i v = _mm256_set1_epi32(1025); - const __m256i alpha = _mm256_set1_epi32(2 * ((MLDSA_Q - 1) / 32)); - const __m256i off = _mm256_set1_epi32(127); - const __m256i shift = _mm256_set1_epi32(512); - - for (i = 0; i < MLDSA_N / 8; i++) - { - f = _mm256_load_si256((__m256i *)&a0[8 * i]); - - /* check-magic: 4092 == intdiv(2 * intdiv(MLDSA_Q - 1, 32), 128) */ - /* - * Compute f1 = round-(f / (2*GAMMA2)) as round-(f / (128B)) = - * round-(ceil(f / 128) / B) where B = 2*GAMMA2 / 128 = 4092. See - * mld_decompose() in mldsa/src/rounding.h for more details. - * - * range: 0 <= f <= Q-1 = 32*GAMMA2 = 16*128*B - */ - - /* Compute f1' = ceil(f / 128) as floor((f + 127) / 2^7) */ - f1 = _mm256_add_epi32(f, off); - f1 = _mm256_srli_epi32(f1, 7); - /* - * range: 0 <= f1' <= (Q-1)/128 = 16B - * - * Also, f1' <= (Q-1)/128 = 2^16 - 2^6 < 2^16 ensures that the odd-index - * 16-bit lanes are all 0, so no bits will be dropped in the input of the - * _mm256_mulhi_epu16() below. - */ - - /* - * Compute f1 = round-(f1' / B) ≈ round(f1' * 1025 / 2^22). This is exact - * for 0 <= f1' < 2^16. See mld_decompose() in mldsa/src/rounding.h for the - * proof, and proofs/isabelle/compress for a formalization of the argument. - * - * round(f1' * 1025 / 2^22) is in turn computed in 2 steps as - * round(floor(f1' * 1025 / 2^16) / 2^6). The mulhi computes f1'' = - * floor(f1' * 1025 / 2^16). As for the next step f1 = round(f1'' / 2^6), - * because AVX2 doesn't have rounding right-shift (e.g. urshr in Neon), we - * simulate it using mulhrs with a power of 2, in this case mulhrs(f1'', - * 2^9) = round(f1'' * 2^9 / 2^15). (Note that the denominator is 2^15, - * not 2^16 as in mulhi.) 
- */ - f1 = _mm256_mulhi_epu16(f1, v); - /* - * range: 0 <= f1'' = floor(f1' * 1025 / 2^16) - * <= f1' * 1025 / 2^16 - * < 2^16 * 1025 / 2^16 = 1025 - * - * Because 0 <= f1'' < 2^15, the multiplication in mulhrs is unsigned, that - * is, no erroneous sign-extension occurs. - */ - f1 = _mm256_mulhrs_epi16(f1, shift); - /* - * range: 0 <= f1 = round-(f1' / B) <= round-(16B / B) = 16 - * - * Note that the odd-index 16-bit lanes are still all 0 right now, so - * reinterpreting f1 as 8 lanes of int32_t (as done in the following) does - * not affect its value. - */ - - /* - * If f1 = 16, i.e. f > 31*GAMMA2, proceed as if f' = f - Q was given - * instead. (For f = 31*GAMMA2 + 1 thus f' = -GAMMA2, we still round it to 0 - * like other "wrapped around" cases.) - * - * Reference: They handle wrap-around in a somewhat convoluted way. Most - * notably, they compute remainder f0 with quotient f1 that's - * already wrapped around, so is off by q (instead of by 1) from - * what it should be ultimately. They detect the need for - * correction by checking if f0 is abnormally large. - * - * Our approach is closer to Algorithm 36 in the specification, - * in that we compute f0 normally and correct f1, f0 in the way - * they prescribed. The only real difference is that we check for - * wrap-around by examining f directly, instead of some other - * intermediates computed from it. - */ - - /* Check for wrap-around */ - t = _mm256_cmpgt_epi32(f, q_bound); - - /* Compute remainder f0 */ - f0 = _mm256_mullo_epi32(f1, alpha); - f0 = _mm256_sub_epi32(f, f0); - /* - * range: -GAMMA2 < f0 <= GAMMA2 - * - * This holds since f1 = round-(f / (2*GAMMA2)) was computed exactly. 
- */ - - /* If wrap-around is required, set f1 = 0 and f0 -= 1 */ - f1 = _mm256_andnot_si256(t, f1); - f0 = _mm256_add_epi32(f0, t); - /* range: 0 <= f1 <= 15, -GAMMA2 <= f0 <= GAMMA2 */ - - _mm256_store_si256((__m256i *)&a1[8 * i], f1); - _mm256_store_si256((__m256i *)&a0[8 * i], f0); - } -} - -#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_NO_SIGN_API && \ - !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ - (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \ - || MLD_CONFIG_PARAMETER_SET == 87) */ - -MLD_EMPTY_CU(avx2_poly_decompose_32) - -#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_NO_SIGN_API && \ - !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ - (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \ - || MLD_CONFIG_PARAMETER_SET == 87)) */ diff --git a/dev/x86_64/src/poly_decompose_32_avx2_asm.S b/dev/x86_64/src/poly_decompose_32_avx2_asm.S new file mode 100644 index 000000000..f81706233 --- /dev/null +++ b/dev/x86_64/src/poly_decompose_32_avx2_asm.S @@ -0,0 +1,154 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + * + * The algorithm for Decompose(r) (more specifically the handling for the + * wrap-around cases) is modified. See the AVX2 intrinsics version + * (poly_decompose_32_avx2.c, predecessor of this file) for a more detailed + * comparison. 
+ */ + + +/************************************************* + * Name: mld_poly_decompose_32_avx2_asm + * + * Description: For all coefficients c of the input polynomial, + * compute low and high bits c0, c1 such that c mod^+ Q = c1*ALPHA + c0 + * with -ALPHA/2 < c0 <= ALPHA/2 except if c1 = (Q-1)/ALPHA where we + * set c1 = 0 and -ALPHA/2 <= c0 = c mod^+ Q - Q < 0. + * Assumes coefficients to be standard representatives. + * For ML-DSA-65 / ML-DSA-87 (gamma2 = (Q-1)/32, alpha = 2*gamma2). + * + * Arguments: - int32_t *a1: pointer to output polynomial with coefficients c1 + * - int32_t *a0: pointer to input/output polynomial. On input, + * holds the standard representatives c. On output, + * holds the low bits c0. + **************************************************/ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_NO_SIGN_API) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ + (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ + (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)) + +/* simpasm: header-end */ + +/* + * Per-block decompose. 
+ * + * f - loaded coefficient block (clobbered) + * f1 - scratch holding the high-part output + * f0 - scratch holding the low-part output + * t - scratch holding the wrap-around mask + * + * Constants are passed in registers: + * ymm10 = off (broadcast 127) + * ymm11 = v (broadcast 1025) + * ymm12 = shift (broadcast 512) + * ymm13 = q_bound (broadcast 31 * ((Q-1)/32) = 8118528) + * ymm14 = alpha (broadcast 2 * ((Q-1)/32) = 523776) + */ +.macro decompose32_block off_in, off_out_a1, f, f1, f0, t + vmovdqa \off_in(%rsi), \f + vpaddd %ymm10, \f, \f1 /* f1 = f + 127 */ + vpsrld $7, \f1, \f1 /* f1 = (f + 127) >> 7 */ + vpmulhuw %ymm11, \f1, \f1 /* f1'' = mulhi_u16(f1, 1025) */ + vpmulhrsw %ymm12, \f1, \f1 /* f1 = mulhrs_i16(f1'', 512) */ + vpcmpgtd %ymm13, \f, \t /* t = (f > 31*GAMMA2) */ + vpmulld %ymm14, \f1, \f0 /* f0 = f1 * (2*GAMMA2) */ + vpsubd \f0, \f, \f0 /* f0 = f - f0 */ + vpandn \f1, \t, \f1 /* f1 = ~t & f1 */ + vpaddd \t, \f0, \f0 /* f0 = f0 + t */ + vmovdqa \f1, \off_out_a1(%rdi) + vmovdqa \f0, \off_in(%rsi) +.endm + +.text +.global MLD_ASM_NAMESPACE(poly_decompose_32_avx2_asm) +.balign 16 +MLD_ASM_FN_SYMBOL(poly_decompose_32_avx2_asm) + + /* Build broadcast constants. Each is constructed via mov-immediate + * into a GP reg, vmovd, vpbroadcastd. Avoiding .rodata keeps the + * routine self-contained for HOL-Light. 
*/ + movl $127, %eax + vmovd %eax, %xmm10 + vpbroadcastd %xmm10, %ymm10 /* off */ + + movl $1025, %eax + vmovd %eax, %xmm11 + vpbroadcastd %xmm11, %ymm11 /* v */ + + movl $512, %eax + vmovd %eax, %xmm12 + vpbroadcastd %xmm12, %ymm12 /* shift */ + + movl $8118528, %eax /* 31 * ((Q-1)/32) */ + vmovd %eax, %xmm13 + vpbroadcastd %xmm13, %ymm13 /* q_bound */ + + movl $523776, %eax /* 2 * ((Q-1)/32) */ + vmovd %eax, %xmm14 + vpbroadcastd %xmm14, %ymm14 /* alpha */ + + decompose32_block 0, 0, %ymm0, %ymm1, %ymm2, %ymm3 + decompose32_block 32, 32, %ymm0, %ymm1, %ymm2, %ymm3 + decompose32_block 64, 64, %ymm0, %ymm1, %ymm2, %ymm3 + decompose32_block 96, 96, %ymm0, %ymm1, %ymm2, %ymm3 + decompose32_block 128, 128, %ymm0, %ymm1, %ymm2, %ymm3 + decompose32_block 160, 160, %ymm0, %ymm1, %ymm2, %ymm3 + decompose32_block 192, 192, %ymm0, %ymm1, %ymm2, %ymm3 + decompose32_block 224, 224, %ymm0, %ymm1, %ymm2, %ymm3 + decompose32_block 256, 256, %ymm0, %ymm1, %ymm2, %ymm3 + decompose32_block 288, 288, %ymm0, %ymm1, %ymm2, %ymm3 + decompose32_block 320, 320, %ymm0, %ymm1, %ymm2, %ymm3 + decompose32_block 352, 352, %ymm0, %ymm1, %ymm2, %ymm3 + decompose32_block 384, 384, %ymm0, %ymm1, %ymm2, %ymm3 + decompose32_block 416, 416, %ymm0, %ymm1, %ymm2, %ymm3 + decompose32_block 448, 448, %ymm0, %ymm1, %ymm2, %ymm3 + decompose32_block 480, 480, %ymm0, %ymm1, %ymm2, %ymm3 + decompose32_block 512, 512, %ymm0, %ymm1, %ymm2, %ymm3 + decompose32_block 544, 544, %ymm0, %ymm1, %ymm2, %ymm3 + decompose32_block 576, 576, %ymm0, %ymm1, %ymm2, %ymm3 + decompose32_block 608, 608, %ymm0, %ymm1, %ymm2, %ymm3 + decompose32_block 640, 640, %ymm0, %ymm1, %ymm2, %ymm3 + decompose32_block 672, 672, %ymm0, %ymm1, %ymm2, %ymm3 + decompose32_block 704, 704, %ymm0, %ymm1, %ymm2, %ymm3 + decompose32_block 736, 736, %ymm0, %ymm1, %ymm2, %ymm3 + decompose32_block 768, 768, %ymm0, %ymm1, %ymm2, %ymm3 + decompose32_block 800, 800, %ymm0, %ymm1, %ymm2, %ymm3 + decompose32_block 832, 832, %ymm0, %ymm1, %ymm2, %ymm3 + 
decompose32_block 864, 864, %ymm0, %ymm1, %ymm2, %ymm3 + decompose32_block 896, 896, %ymm0, %ymm1, %ymm2, %ymm3 + decompose32_block 928, 928, %ymm0, %ymm1, %ymm2, %ymm3 + decompose32_block 960, 960, %ymm0, %ymm1, %ymm2, %ymm3 + decompose32_block 992, 992, %ymm0, %ymm1, %ymm2, %ymm3 + + ret + +/* simpasm: footer-start */ + +#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_NO_SIGN_API && \ + !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ + (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \ + || MLD_CONFIG_PARAMETER_SET == 87) */ + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/dev/x86_64/src/poly_decompose_88_avx2.c b/dev/x86_64/src/poly_decompose_88_avx2.c deleted file mode 100644 index d6a28d2d1..000000000 --- a/dev/x86_64/src/poly_decompose_88_avx2.c +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Copyright (c) The mldsa-native project authors - * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - */ - -/* References - * ========== - * - * - [REF_AVX2] - * CRYSTALS-Dilithium optimized AVX2 implementation - * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé - * https://github.com/pq-crystals/dilithium/tree/master/avx2 - */ - -/* - * This file is derived from the public domain - * AVX2 Dilithium implementation @[REF_AVX2]. - * - * The algorithm for Decompose(r) (more specifically the handling for the - * wrap-around cases) are modified. See the "Reference" section in the comments - * below for a more detailed comparison. - */ - -#include "../../../common.h" - -#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ - !defined(MLD_CONFIG_NO_SIGN_API) && \ - !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ - (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ - MLD_CONFIG_PARAMETER_SET == 44) - -#include <immintrin.h> -#include "arith_native_x86_64.h" -#include "consts.h" - -/* - * Reference: The reference implementation has the input polynomial as a - * separate argument that may be aliased with either of the outputs. 
- * Removing the aliasing eases CBMC proofs. - */ - -void mld_poly_decompose_88_avx2(int32_t *a1, int32_t *a0) -{ - unsigned int i; - __m256i f, f0, f1, t; - const __m256i q_bound = _mm256_set1_epi32(87 * ((MLDSA_Q - 1) / 88)); - /* check-magic: 11275 == floor(2**24 / 1488) */ - const __m256i v = _mm256_set1_epi32(11275); - const __m256i alpha = _mm256_set1_epi32(2 * ((MLDSA_Q - 1) / 88)); - const __m256i off = _mm256_set1_epi32(127); - const __m256i shift = _mm256_set1_epi32(128); - - for (i = 0; i < MLDSA_N / 8; i++) - { - f = _mm256_load_si256((__m256i *)&a0[8 * i]); - - /* check-magic: 1488 == intdiv(2 * intdiv(MLDSA_Q - 1, 88), 128) */ - /* - * Compute f1 = round-(f / (2*GAMMA2)) as round-(f / (128B)) = - * round-(ceil(f / 128) / B) where B = 2*GAMMA2 / 128 = 1488. See - * mld_decompose() in mldsa/src/rounding.h for more details. - * - * range: 0 <= f <= Q-1 = 88*GAMMA2 = 44*128*B - */ - - /* Compute f1' = ceil(f / 128) as floor((f + 127) / 2^7) */ - f1 = _mm256_add_epi32(f, off); - f1 = _mm256_srli_epi32(f1, 7); - /* - * range: 0 <= f1' <= (Q-1)/128 = 44B - * - * Also, f1' <= (Q-1)/128 = 2^16 - 2^6 < 2^16 ensures that the odd-index - * 16-bit lanes are all 0, so no bits will be dropped in the input of the - * _mm256_mulhi_epu16() below. - */ - - /* - * Compute f1 = round-(f1' / B) ≈ round(f1' * 11275 / 2^24). This is exact - * for 0 <= f1' < 2^16. See mld_decompose() in mldsa/src/rounding.h for the - * proof, and proofs/isabelle/compress for a formalization of the argument. - * - * round(f1' * 11275 / 2^24) is in turn computed in 2 steps as - * round(floor(f1' * 11275 / 2^16) / 2^8). The mulhi computes f1'' = - * floor(f1' * 11275 / 2^16). As for the next step f1 = round(f1'' / 2^8), - * because AVX2 doesn't have rounding right-shift (e.g. urshr in Neon), we - * simulate it using mulhrs with a power of 2, in this case mulhrs(f1'', - * 2^7) = round(f1'' * 2^7 / 2^15). (Note that the denominator is 2^15, - * not 2^16 as in mulhi.) 
- */ - f1 = _mm256_mulhi_epu16(f1, v); - /* - * range: 0 <= f1'' = floor(f1' * 11275 / 2^16) - * <= f1' * 11275 / 2^16 - * < 2^16 * 11275 / 2^16 = 11275 - * - * Because 0 <= f1'' < 2^15, the multiplication in mulhrs is unsigned, that - * is, no erroneous sign-extension occurs. - */ - f1 = _mm256_mulhrs_epi16(f1, shift); - /* - * range: 0 <= f1 = round-(f1' / B) <= round-(44B / B) = 44 - * - * Note that the odd-index 16-bit lanes are still all 0 right now, so - * reinterpreting f1 as 8 lanes of int32_t (as done in the following) does - * not affect its value. - */ - - /* - * If f1 = 44, i.e. f > 87*GAMMA2, proceed as if f' = f - Q was given - * instead. (For f = 87*GAMMA2 + 1 thus f' = -GAMMA2, we still round it to 0 - * like other "wrapped around" cases.) - * - * Reference: They handle wrap-around in a somewhat convoluted way. Most - * notably, they compute remainder f0 with quotient f1 that's - * already wrapped around, so is off by q (instead of by 1) from - * what it should be ultimately. They detect the need for - * correction by checking if f0 is abnormally large. - * - * Our approach is closer to Algorithm 36 in the specification, - * in that we compute f0 normally and correct f1, f0 in the way - * they prescribed. The only real difference is that we check for - * wrap-around by examining f directly, instead of some other - * intermediates computed from it. - */ - - /* Check for wrap-around */ - t = _mm256_cmpgt_epi32(f, q_bound); - - /* Compute remainder f0 */ - f0 = _mm256_mullo_epi32(f1, alpha); - f0 = _mm256_sub_epi32(f, f0); - /* - * range: -GAMMA2 < f0 <= GAMMA2 - * - * This holds since f1 = round-(f / (2*GAMMA2)) was computed exactly. 
- */ - - /* If wrap-around is required, set f1 = 0 and f0 -= 1 */ - f1 = _mm256_andnot_si256(t, f1); - f0 = _mm256_add_epi32(f0, t); - /* range: 0 <= f1 <= 43, -GAMMA2 <= f0 <= GAMMA2 */ - - _mm256_store_si256((__m256i *)&a1[8 * i], f1); - _mm256_store_si256((__m256i *)&a0[8 * i], f0); - } -} -#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_NO_SIGN_API && \ - !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ - (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44) \ - */ - -MLD_EMPTY_CU(avx2_poly_decompose_88) - -#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_NO_SIGN_API && \ - !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ - (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \ - 44)) */ diff --git a/dev/x86_64/src/poly_decompose_88_avx2_asm.S b/dev/x86_64/src/poly_decompose_88_avx2_asm.S new file mode 100644 index 000000000..b1061b3c1 --- /dev/null +++ b/dev/x86_64/src/poly_decompose_88_avx2_asm.S @@ -0,0 +1,146 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + * + * The algorithm for Decompose(r) (more specifically the handling for the + * wrap-around cases) is modified. See the AVX2 intrinsics version + * (poly_decompose_88_avx2.c, predecessor of this file) for a more detailed + * comparison. 
+ */ + + +/************************************************* + * Name: mld_poly_decompose_88_avx2_asm + * + * Description: For all coefficients c of the input polynomial, + * compute low and high bits c0, c1 such that c mod^+ Q = c1*ALPHA + c0 + * with -ALPHA/2 < c0 <= ALPHA/2 except if c1 = (Q-1)/ALPHA where we + * set c1 = 0 and -ALPHA/2 <= c0 = c mod^+ Q - Q < 0. + * Assumes coefficients to be standard representatives. + * For ML-DSA-44 (gamma2 = (Q-1)/88, alpha = 2*gamma2). + * + * Arguments: - int32_t *a1: pointer to output polynomial with coefficients c1 + * - int32_t *a0: pointer to input/output polynomial. On input, + * holds the standard representatives c. On output, + * holds the low bits c0. + **************************************************/ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_NO_SIGN_API) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ + (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ + MLD_CONFIG_PARAMETER_SET == 44) + +/* simpasm: header-end */ + +/* + * Per-block decompose. See poly_decompose_32_avx2_asm.S for documentation; + * the only differences are the broadcast constant values. 
+ * + * ymm10 = off (broadcast 127) + * ymm11 = v (broadcast 11275) + * ymm12 = shift (broadcast 128) + * ymm13 = q_bound (broadcast 87 * ((Q-1)/88) = 8285184) + * ymm14 = alpha (broadcast 2 * ((Q-1)/88) = 190464) + */ +.macro decompose88_block off_in, off_out_a1, f, f1, f0, t + vmovdqa \off_in(%rsi), \f + vpaddd %ymm10, \f, \f1 + vpsrld $7, \f1, \f1 + vpmulhuw %ymm11, \f1, \f1 + vpmulhrsw %ymm12, \f1, \f1 + vpcmpgtd %ymm13, \f, \t + vpmulld %ymm14, \f1, \f0 + vpsubd \f0, \f, \f0 + vpandn \f1, \t, \f1 + vpaddd \t, \f0, \f0 + vmovdqa \f1, \off_out_a1(%rdi) + vmovdqa \f0, \off_in(%rsi) +.endm + +.text +.global MLD_ASM_NAMESPACE(poly_decompose_88_avx2_asm) +.balign 16 +MLD_ASM_FN_SYMBOL(poly_decompose_88_avx2_asm) + + movl $127, %eax + vmovd %eax, %xmm10 + vpbroadcastd %xmm10, %ymm10 /* off */ + + movl $11275, %eax + vmovd %eax, %xmm11 + vpbroadcastd %xmm11, %ymm11 /* v */ + + movl $128, %eax + vmovd %eax, %xmm12 + vpbroadcastd %xmm12, %ymm12 /* shift */ + + movl $8285184, %eax /* 87 * ((Q-1)/88) */ + vmovd %eax, %xmm13 + vpbroadcastd %xmm13, %ymm13 /* q_bound */ + + movl $190464, %eax /* 2 * ((Q-1)/88) */ + vmovd %eax, %xmm14 + vpbroadcastd %xmm14, %ymm14 /* alpha */ + + decompose88_block 0, 0, %ymm0, %ymm1, %ymm2, %ymm3 + decompose88_block 32, 32, %ymm0, %ymm1, %ymm2, %ymm3 + decompose88_block 64, 64, %ymm0, %ymm1, %ymm2, %ymm3 + decompose88_block 96, 96, %ymm0, %ymm1, %ymm2, %ymm3 + decompose88_block 128, 128, %ymm0, %ymm1, %ymm2, %ymm3 + decompose88_block 160, 160, %ymm0, %ymm1, %ymm2, %ymm3 + decompose88_block 192, 192, %ymm0, %ymm1, %ymm2, %ymm3 + decompose88_block 224, 224, %ymm0, %ymm1, %ymm2, %ymm3 + decompose88_block 256, 256, %ymm0, %ymm1, %ymm2, %ymm3 + decompose88_block 288, 288, %ymm0, %ymm1, %ymm2, %ymm3 + decompose88_block 320, 320, %ymm0, %ymm1, %ymm2, %ymm3 + decompose88_block 352, 352, %ymm0, %ymm1, %ymm2, %ymm3 + decompose88_block 384, 384, %ymm0, %ymm1, %ymm2, %ymm3 + decompose88_block 416, 416, %ymm0, %ymm1, %ymm2, %ymm3 + decompose88_block 448, 
448, %ymm0, %ymm1, %ymm2, %ymm3 + decompose88_block 480, 480, %ymm0, %ymm1, %ymm2, %ymm3 + decompose88_block 512, 512, %ymm0, %ymm1, %ymm2, %ymm3 + decompose88_block 544, 544, %ymm0, %ymm1, %ymm2, %ymm3 + decompose88_block 576, 576, %ymm0, %ymm1, %ymm2, %ymm3 + decompose88_block 608, 608, %ymm0, %ymm1, %ymm2, %ymm3 + decompose88_block 640, 640, %ymm0, %ymm1, %ymm2, %ymm3 + decompose88_block 672, 672, %ymm0, %ymm1, %ymm2, %ymm3 + decompose88_block 704, 704, %ymm0, %ymm1, %ymm2, %ymm3 + decompose88_block 736, 736, %ymm0, %ymm1, %ymm2, %ymm3 + decompose88_block 768, 768, %ymm0, %ymm1, %ymm2, %ymm3 + decompose88_block 800, 800, %ymm0, %ymm1, %ymm2, %ymm3 + decompose88_block 832, 832, %ymm0, %ymm1, %ymm2, %ymm3 + decompose88_block 864, 864, %ymm0, %ymm1, %ymm2, %ymm3 + decompose88_block 896, 896, %ymm0, %ymm1, %ymm2, %ymm3 + decompose88_block 928, 928, %ymm0, %ymm1, %ymm2, %ymm3 + decompose88_block 960, 960, %ymm0, %ymm1, %ymm2, %ymm3 + decompose88_block 992, 992, %ymm0, %ymm1, %ymm2, %ymm3 + + ret + +/* simpasm: footer-start */ + +#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_NO_SIGN_API && \ + !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ + (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44) \ + */ + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/mldsa/mldsa_native.c b/mldsa/mldsa_native.c index 9365ed369..6ef14fe9b 100644 --- a/mldsa/mldsa_native.c +++ b/mldsa/mldsa_native.c @@ -83,8 +83,6 @@ #endif /* MLD_SYS_AARCH64 */ #if defined(MLD_SYS_X86_64) #include "src/native/x86_64/src/consts.c" -#include "src/native/x86_64/src/poly_decompose_32_avx2.c" -#include "src/native/x86_64/src/poly_decompose_88_avx2.c" #include "src/native/x86_64/src/poly_use_hint_32_avx2.c" #include "src/native/x86_64/src/poly_use_hint_88_avx2.c" #include "src/native/x86_64/src/rej_uniform_avx2.c" @@ -785,8 +783,8 @@ #undef mld_pointwise_avx2_asm #undef mld_poly_caddq_avx2_asm #undef mld_poly_chknorm_avx2_asm -#undef 
mld_poly_decompose_32_avx2 -#undef mld_poly_decompose_88_avx2 +#undef mld_poly_decompose_32_avx2_asm +#undef mld_poly_decompose_88_avx2_asm #undef mld_poly_use_hint_32_avx2 #undef mld_poly_use_hint_88_avx2 #undef mld_polyz_unpack_17_avx2_asm diff --git a/mldsa/mldsa_native_asm.S b/mldsa/mldsa_native_asm.S index 4877d5156..72ab8101f 100644 --- a/mldsa/mldsa_native_asm.S +++ b/mldsa/mldsa_native_asm.S @@ -88,6 +88,8 @@ #include "src/native/x86_64/src/pointwise_avx2_asm.S" #include "src/native/x86_64/src/poly_caddq_avx2_asm.S" #include "src/native/x86_64/src/poly_chknorm_avx2_asm.S" +#include "src/native/x86_64/src/poly_decompose_32_avx2_asm.S" +#include "src/native/x86_64/src/poly_decompose_88_avx2_asm.S" #include "src/native/x86_64/src/polyz_unpack_17_avx2_asm.S" #include "src/native/x86_64/src/polyz_unpack_19_avx2_asm.S" #endif /* MLD_SYS_X86_64 */ @@ -798,8 +800,8 @@ #undef mld_pointwise_avx2_asm #undef mld_poly_caddq_avx2_asm #undef mld_poly_chknorm_avx2_asm -#undef mld_poly_decompose_32_avx2 -#undef mld_poly_decompose_88_avx2 +#undef mld_poly_decompose_32_avx2_asm +#undef mld_poly_decompose_88_avx2_asm #undef mld_poly_use_hint_32_avx2 #undef mld_poly_use_hint_88_avx2 #undef mld_polyz_unpack_17_avx2_asm diff --git a/mldsa/src/native/x86_64/meta.h b/mldsa/src/native/x86_64/meta.h index 55924ffec..6e2641c0e 100644 --- a/mldsa/src/native/x86_64/meta.h +++ b/mldsa/src/native/x86_64/meta.h @@ -153,7 +153,7 @@ static MLD_INLINE int mld_poly_decompose_32_native(int32_t *a1, int32_t *a0) { return MLD_NATIVE_FUNC_FALLBACK; } - mld_poly_decompose_32_avx2(a1, a0); + mld_poly_decompose_32_avx2_asm(a1, a0); return MLD_NATIVE_FUNC_SUCCESS; } #endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \ @@ -167,7 +167,7 @@ static MLD_INLINE int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0) { return MLD_NATIVE_FUNC_FALLBACK; } - mld_poly_decompose_88_avx2(a1, a0); + mld_poly_decompose_88_avx2_asm(a1, a0); return MLD_NATIVE_FUNC_SUCCESS; } #endif /* 
MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44 \ diff --git a/mldsa/src/native/x86_64/src/arith_native_x86_64.h b/mldsa/src/native/x86_64/src/arith_native_x86_64.h index 6ec3c1434..ce91fafc2 100644 --- a/mldsa/src/native/x86_64/src/arith_native_x86_64.h +++ b/mldsa/src/native/x86_64/src/arith_native_x86_64.h @@ -95,11 +95,39 @@ unsigned mld_rej_uniform_eta4_avx2( #endif /* !MLD_CONFIG_NO_KEYPAIR_API */ #if !defined(MLD_CONFIG_NO_SIGN_API) -#define mld_poly_decompose_32_avx2 MLD_NAMESPACE(mld_poly_decompose_32_avx2) -void mld_poly_decompose_32_avx2(int32_t *a1, int32_t *a0); +#define mld_poly_decompose_32_avx2_asm MLD_NAMESPACE(poly_decompose_32_avx2_asm) +MLD_SYSV_ABI +void mld_poly_decompose_32_avx2_asm(int32_t *a1, int32_t *a0) +/* This must be kept in sync with the HOL-Light specification + * in proofs/hol_light/x86_64/proofs/poly_decompose_32_avx2_asm.ml */ +__contract__( + requires(memory_no_alias(a1, sizeof(int32_t) * MLDSA_N)) + requires(memory_no_alias(a0, sizeof(int32_t) * MLDSA_N)) + requires(array_bound(a0, 0, MLDSA_N, 0, MLDSA_Q)) + assigns(memory_slice(a1, sizeof(int32_t) * MLDSA_N)) + assigns(memory_slice(a0, sizeof(int32_t) * MLDSA_N)) + /* check-magic: 16 == (MLDSA_Q - 1) / (2 * ((MLDSA_Q - 1) / 32)) */ + ensures(array_bound(a1, 0, MLDSA_N, 0, 16)) + /* check-magic: 261889 == (MLDSA_Q - 1) / 32 + 1 */ + ensures(array_abs_bound(a0, 0, MLDSA_N, 261889)) +); -#define mld_poly_decompose_88_avx2 MLD_NAMESPACE(mld_poly_decompose_88_avx2) -void mld_poly_decompose_88_avx2(int32_t *a1, int32_t *a0); +#define mld_poly_decompose_88_avx2_asm MLD_NAMESPACE(poly_decompose_88_avx2_asm) +MLD_SYSV_ABI +void mld_poly_decompose_88_avx2_asm(int32_t *a1, int32_t *a0) +/* This must be kept in sync with the HOL-Light specification + * in proofs/hol_light/x86_64/proofs/poly_decompose_88_avx2_asm.ml */ +__contract__( + requires(memory_no_alias(a1, sizeof(int32_t) * MLDSA_N)) + requires(memory_no_alias(a0, sizeof(int32_t) * MLDSA_N)) + 
requires(array_bound(a0, 0, MLDSA_N, 0, MLDSA_Q)) + assigns(memory_slice(a1, sizeof(int32_t) * MLDSA_N)) + assigns(memory_slice(a0, sizeof(int32_t) * MLDSA_N)) + /* check-magic: 44 == (MLDSA_Q - 1) / (2 * ((MLDSA_Q - 1) / 88)) */ + ensures(array_bound(a1, 0, MLDSA_N, 0, 44)) + /* check-magic: 95233 == (MLDSA_Q - 1) / 88 + 1 */ + ensures(array_abs_bound(a0, 0, MLDSA_N, 95233)) +); #endif /* !MLD_CONFIG_NO_SIGN_API */ #define mld_poly_caddq_avx2_asm MLD_NAMESPACE(poly_caddq_avx2_asm) diff --git a/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c b/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c deleted file mode 100644 index c97d0c729..000000000 --- a/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Copyright (c) The mldsa-native project authors - * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - */ - -/* References - * ========== - * - * - [REF_AVX2] - * CRYSTALS-Dilithium optimized AVX2 implementation - * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé - * https://github.com/pq-crystals/dilithium/tree/master/avx2 - */ - -/* - * This file is derived from the public domain - * AVX2 Dilithium implementation @[REF_AVX2]. - * - * The algorithm for Decompose(r) (more specifically the handling for the - * wrap-around cases) are modified. See the "Reference" section in the comments - * below for a more detailed comparison. - */ - -#include "../../../common.h" - -#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ - !defined(MLD_CONFIG_NO_SIGN_API) && \ - !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ - (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ - (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)) - -#include <immintrin.h> -#include "arith_native_x86_64.h" -#include "consts.h" - -/* - * Reference: The reference implementation has the input polynomial as a - * separate argument that may be aliased with either of the outputs. - * Removing the aliasing eases CBMC proofs. 
- */ -void mld_poly_decompose_32_avx2(int32_t *a1, int32_t *a0) -{ - unsigned int i; - __m256i f, f0, f1, t; - const __m256i q_bound = _mm256_set1_epi32(31 * ((MLDSA_Q - 1) / 32)); - /* check-magic: 1025 == floor(2**22 / 4092) */ - const __m256i v = _mm256_set1_epi32(1025); - const __m256i alpha = _mm256_set1_epi32(2 * ((MLDSA_Q - 1) / 32)); - const __m256i off = _mm256_set1_epi32(127); - const __m256i shift = _mm256_set1_epi32(512); - - for (i = 0; i < MLDSA_N / 8; i++) - { - f = _mm256_load_si256((__m256i *)&a0[8 * i]); - - /* check-magic: 4092 == intdiv(2 * intdiv(MLDSA_Q - 1, 32), 128) */ - /* - * Compute f1 = round-(f / (2*GAMMA2)) as round-(f / (128B)) = - * round-(ceil(f / 128) / B) where B = 2*GAMMA2 / 128 = 4092. See - * mld_decompose() in mldsa/src/rounding.h for more details. - * - * range: 0 <= f <= Q-1 = 32*GAMMA2 = 16*128*B - */ - - /* Compute f1' = ceil(f / 128) as floor((f + 127) / 2^7) */ - f1 = _mm256_add_epi32(f, off); - f1 = _mm256_srli_epi32(f1, 7); - /* - * range: 0 <= f1' <= (Q-1)/128 = 16B - * - * Also, f1' <= (Q-1)/128 = 2^16 - 2^6 < 2^16 ensures that the odd-index - * 16-bit lanes are all 0, so no bits will be dropped in the input of the - * _mm256_mulhi_epu16() below. - */ - - /* - * Compute f1 = round-(f1' / B) ≈ round(f1' * 1025 / 2^22). This is exact - * for 0 <= f1' < 2^16. See mld_decompose() in mldsa/src/rounding.h for the - * proof, and proofs/isabelle/compress for a formalization of the argument. - * - * round(f1' * 1025 / 2^22) is in turn computed in 2 steps as - * round(floor(f1' * 1025 / 2^16) / 2^6). The mulhi computes f1'' = - * floor(f1' * 1025 / 2^16). As for the next step f1 = round(f1'' / 2^6), - * because AVX2 doesn't have rounding right-shift (e.g. urshr in Neon), we - * simulate it using mulhrs with a power of 2, in this case mulhrs(f1'', - * 2^9) = round(f1'' * 2^9 / 2^15). (Note that the denominator is 2^15, - * not 2^16 as in mulhi.) 
- */ - f1 = _mm256_mulhi_epu16(f1, v); - /* - * range: 0 <= f1'' = floor(f1' * 1025 / 2^16) - * <= f1' * 1025 / 2^16 - * < 2^16 * 1025 / 2^16 = 1025 - * - * Because 0 <= f1'' < 2^15, the multiplication in mulhrs is unsigned, that - * is, no erroneous sign-extension occurs. - */ - f1 = _mm256_mulhrs_epi16(f1, shift); - /* - * range: 0 <= f1 = round-(f1' / B) <= round-(16B / B) = 16 - * - * Note that the odd-index 16-bit lanes are still all 0 right now, so - * reinterpreting f1 as 8 lanes of int32_t (as done in the following) does - * not affect its value. - */ - - /* - * If f1 = 16, i.e. f > 31*GAMMA2, proceed as if f' = f - Q was given - * instead. (For f = 31*GAMMA2 + 1 thus f' = -GAMMA2, we still round it to 0 - * like other "wrapped around" cases.) - * - * Reference: They handle wrap-around in a somewhat convoluted way. Most - * notably, they compute remainder f0 with quotient f1 that's - * already wrapped around, so is off by q (instead of by 1) from - * what it should be ultimately. They detect the need for - * correction by checking if f0 is abnormally large. - * - * Our approach is closer to Algorithm 36 in the specification, - * in that we compute f0 normally and correct f1, f0 in the way - * they prescribed. The only real difference is that we check for - * wrap-around by examining f directly, instead of some other - * intermediates computed from it. - */ - - /* Check for wrap-around */ - t = _mm256_cmpgt_epi32(f, q_bound); - - /* Compute remainder f0 */ - f0 = _mm256_mullo_epi32(f1, alpha); - f0 = _mm256_sub_epi32(f, f0); - /* - * range: -GAMMA2 < f0 <= GAMMA2 - * - * This holds since f1 = round-(f / (2*GAMMA2)) was computed exactly. 
- */ - - /* If wrap-around is required, set f1 = 0 and f0 -= 1 */ - f1 = _mm256_andnot_si256(t, f1); - f0 = _mm256_add_epi32(f0, t); - /* range: 0 <= f1 <= 15, -GAMMA2 <= f0 <= GAMMA2 */ - - _mm256_store_si256((__m256i *)&a1[8 * i], f1); - _mm256_store_si256((__m256i *)&a0[8 * i], f0); - } -} - -#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_NO_SIGN_API && \ - !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ - (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \ - || MLD_CONFIG_PARAMETER_SET == 87) */ - -MLD_EMPTY_CU(avx2_poly_decompose_32) - -#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_NO_SIGN_API && \ - !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ - (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \ - || MLD_CONFIG_PARAMETER_SET == 87)) */ diff --git a/mldsa/src/native/x86_64/src/poly_decompose_32_avx2_asm.S b/mldsa/src/native/x86_64/src/poly_decompose_32_avx2_asm.S new file mode 100644 index 000000000..1c473a355 --- /dev/null +++ b/mldsa/src/native/x86_64/src/poly_decompose_32_avx2_asm.S @@ -0,0 +1,478 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + * + * The algorithm for Decompose(r) (more specifically the handling for the + * wrap-around cases) is modified. See the AVX2 intrinsics version + * (poly_decompose_32_avx2.c, predecessor of this file) for a more detailed + * comparison. 
+ */ + + +/************************************************* + * Name: mld_poly_decompose_32_avx2_asm + * + * Description: For all coefficients c of the input polynomial, + * compute high and low bits c1, c0 such that c mod^+ Q = c1*ALPHA + c0 + * with -ALPHA/2 < c0 <= ALPHA/2 except if c1 = (Q-1)/ALPHA where we + * set c1 = 0 and -ALPHA/2 <= c0 = c mod^+ Q - Q < 0. + * Assumes coefficients to be standard representatives. + * For ML-DSA-65 / ML-DSA-87 (gamma2 = (Q-1)/32, alpha = 2*gamma2). + * + * Arguments: - int32_t *a1: pointer to output polynomial with coefficients c1 + * - int32_t *a0: pointer to input/output polynomial. On input, + * holds the standard representatives c. On output, + * holds the low bits c0. + **************************************************/ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_NO_SIGN_API) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ + (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ + (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)) + + +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/x86_64/src/poly_decompose_32_avx2_asm.S using scripts/simpasm. Do not modify it directly. 
+ */ + +.text +.balign 4 +.global MLD_ASM_NAMESPACE(poly_decompose_32_avx2_asm) +MLD_ASM_FN_SYMBOL(poly_decompose_32_avx2_asm) + + .cfi_startproc + movl $0x7f, %eax + vmovd %eax, %xmm10 + vpbroadcastd %xmm10, %ymm10 + movl $0x401, %eax # imm = 0x401 + vmovd %eax, %xmm11 + vpbroadcastd %xmm11, %ymm11 + movl $0x200, %eax # imm = 0x200 + vmovd %eax, %xmm12 + vpbroadcastd %xmm12, %ymm12 + movl $0x7be100, %eax # imm = 0x7BE100 + vmovd %eax, %xmm13 + vpbroadcastd %xmm13, %ymm13 + movl $0x7fe00, %eax # imm = 0x7FE00 + vmovd %eax, %xmm14 + vpbroadcastd %xmm14, %ymm14 + vmovdqa (%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, (%rdi) + vmovdqa %ymm2, (%rsi) + vmovdqa 0x20(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x20(%rdi) + vmovdqa %ymm2, 0x20(%rsi) + vmovdqa 0x40(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x40(%rdi) + vmovdqa %ymm2, 0x40(%rsi) + vmovdqa 0x60(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x60(%rdi) + vmovdqa %ymm2, 0x60(%rsi) + vmovdqa 
0x80(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x80(%rdi) + vmovdqa %ymm2, 0x80(%rsi) + vmovdqa 0xa0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0xa0(%rdi) + vmovdqa %ymm2, 0xa0(%rsi) + vmovdqa 0xc0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0xc0(%rdi) + vmovdqa %ymm2, 0xc0(%rsi) + vmovdqa 0xe0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0xe0(%rdi) + vmovdqa %ymm2, 0xe0(%rsi) + vmovdqa 0x100(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x100(%rdi) + vmovdqa %ymm2, 0x100(%rsi) + vmovdqa 0x120(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + 
vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x120(%rdi) + vmovdqa %ymm2, 0x120(%rsi) + vmovdqa 0x140(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x140(%rdi) + vmovdqa %ymm2, 0x140(%rsi) + vmovdqa 0x160(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x160(%rdi) + vmovdqa %ymm2, 0x160(%rsi) + vmovdqa 0x180(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x180(%rdi) + vmovdqa %ymm2, 0x180(%rsi) + vmovdqa 0x1a0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x1a0(%rdi) + vmovdqa %ymm2, 0x1a0(%rsi) + vmovdqa 0x1c0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x1c0(%rdi) + vmovdqa %ymm2, 0x1c0(%rsi) + vmovdqa 0x1e0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, 
%ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x1e0(%rdi) + vmovdqa %ymm2, 0x1e0(%rsi) + vmovdqa 0x200(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x200(%rdi) + vmovdqa %ymm2, 0x200(%rsi) + vmovdqa 0x220(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x220(%rdi) + vmovdqa %ymm2, 0x220(%rsi) + vmovdqa 0x240(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x240(%rdi) + vmovdqa %ymm2, 0x240(%rsi) + vmovdqa 0x260(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x260(%rdi) + vmovdqa %ymm2, 0x260(%rsi) + vmovdqa 0x280(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn 
%ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x280(%rdi) + vmovdqa %ymm2, 0x280(%rsi) + vmovdqa 0x2a0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x2a0(%rdi) + vmovdqa %ymm2, 0x2a0(%rsi) + vmovdqa 0x2c0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x2c0(%rdi) + vmovdqa %ymm2, 0x2c0(%rsi) + vmovdqa 0x2e0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x2e0(%rdi) + vmovdqa %ymm2, 0x2e0(%rsi) + vmovdqa 0x300(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x300(%rdi) + vmovdqa %ymm2, 0x300(%rsi) + vmovdqa 0x320(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x320(%rdi) + vmovdqa %ymm2, 0x320(%rsi) + vmovdqa 0x340(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + 
vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x340(%rdi) + vmovdqa %ymm2, 0x340(%rsi) + vmovdqa 0x360(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x360(%rdi) + vmovdqa %ymm2, 0x360(%rsi) + vmovdqa 0x380(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x380(%rdi) + vmovdqa %ymm2, 0x380(%rsi) + vmovdqa 0x3a0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x3a0(%rdi) + vmovdqa %ymm2, 0x3a0(%rsi) + vmovdqa 0x3c0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x3c0(%rdi) + vmovdqa %ymm2, 0x3c0(%rsi) + vmovdqa 0x3e0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, 
%ymm2, %ymm2 + vmovdqa %ymm1, 0x3e0(%rdi) + vmovdqa %ymm2, 0x3e0(%rsi) + retq + .cfi_endproc + +MLD_ASM_FN_SIZE(poly_decompose_32_avx2_asm) + + +#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_NO_SIGN_API && \ + !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ + (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \ + || MLD_CONFIG_PARAMETER_SET == 87) */ + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c b/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c deleted file mode 100644 index d6a28d2d1..000000000 --- a/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Copyright (c) The mldsa-native project authors - * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - */ - -/* References - * ========== - * - * - [REF_AVX2] - * CRYSTALS-Dilithium optimized AVX2 implementation - * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé - * https://github.com/pq-crystals/dilithium/tree/master/avx2 - */ - -/* - * This file is derived from the public domain - * AVX2 Dilithium implementation @[REF_AVX2]. - * - * The algorithm for Decompose(r) (more specifically the handling for the - * wrap-around cases) are modified. See the "Reference" section in the comments - * below for a more detailed comparison. - */ - -#include "../../../common.h" - -#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ - !defined(MLD_CONFIG_NO_SIGN_API) && \ - !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ - (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ - MLD_CONFIG_PARAMETER_SET == 44) - -#include <immintrin.h> -#include "arith_native_x86_64.h" -#include "consts.h" - -/* - * Reference: The reference implementation has the input polynomial as a - * separate argument that may be aliased with either of the outputs. - * Removing the aliasing eases CBMC proofs. 
- */ - -void mld_poly_decompose_88_avx2(int32_t *a1, int32_t *a0) -{ - unsigned int i; - __m256i f, f0, f1, t; - const __m256i q_bound = _mm256_set1_epi32(87 * ((MLDSA_Q - 1) / 88)); - /* check-magic: 11275 == floor(2**24 / 1488) */ - const __m256i v = _mm256_set1_epi32(11275); - const __m256i alpha = _mm256_set1_epi32(2 * ((MLDSA_Q - 1) / 88)); - const __m256i off = _mm256_set1_epi32(127); - const __m256i shift = _mm256_set1_epi32(128); - - for (i = 0; i < MLDSA_N / 8; i++) - { - f = _mm256_load_si256((__m256i *)&a0[8 * i]); - - /* check-magic: 1488 == intdiv(2 * intdiv(MLDSA_Q - 1, 88), 128) */ - /* - * Compute f1 = round-(f / (2*GAMMA2)) as round-(f / (128B)) = - * round-(ceil(f / 128) / B) where B = 2*GAMMA2 / 128 = 1488. See - * mld_decompose() in mldsa/src/rounding.h for more details. - * - * range: 0 <= f <= Q-1 = 88*GAMMA2 = 44*128*B - */ - - /* Compute f1' = ceil(f / 128) as floor((f + 127) / 2^7) */ - f1 = _mm256_add_epi32(f, off); - f1 = _mm256_srli_epi32(f1, 7); - /* - * range: 0 <= f1' <= (Q-1)/128 = 44B - * - * Also, f1' <= (Q-1)/128 = 2^16 - 2^6 < 2^16 ensures that the odd-index - * 16-bit lanes are all 0, so no bits will be dropped in the input of the - * _mm256_mulhi_epu16() below. - */ - - /* - * Compute f1 = round-(f1' / B) ≈ round(f1' * 11275 / 2^24). This is exact - * for 0 <= f1' < 2^16. See mld_decompose() in mldsa/src/rounding.h for the - * proof, and proofs/isabelle/compress for a formalization of the argument. - * - * round(f1' * 11275 / 2^24) is in turn computed in 2 steps as - * round(floor(f1' * 11275 / 2^16) / 2^8). The mulhi computes f1'' = - * floor(f1' * 11275 / 2^16). As for the next step f1 = round(f1'' / 2^8), - * because AVX2 doesn't have rounding right-shift (e.g. urshr in Neon), we - * simulate it using mulhrs with a power of 2, in this case mulhrs(f1'', - * 2^7) = round(f1'' * 2^7 / 2^15). (Note that the denominator is 2^15, - * not 2^16 as in mulhi.) 
- */ - f1 = _mm256_mulhi_epu16(f1, v); - /* - * range: 0 <= f1'' = floor(f1' * 11275 / 2^16) - * <= f1' * 11275 / 2^16 - * < 2^16 * 11275 / 2^16 = 11275 - * - * Because 0 <= f1'' < 2^15, the multiplication in mulhrs is unsigned, that - * is, no erroneous sign-extension occurs. - */ - f1 = _mm256_mulhrs_epi16(f1, shift); - /* - * range: 0 <= f1 = round-(f1' / B) <= round-(44B / B) = 44 - * - * Note that the odd-index 16-bit lanes are still all 0 right now, so - * reinterpreting f1 as 8 lanes of int32_t (as done in the following) does - * not affect its value. - */ - - /* - * If f1 = 44, i.e. f > 87*GAMMA2, proceed as if f' = f - Q was given - * instead. (For f = 87*GAMMA2 + 1 thus f' = -GAMMA2, we still round it to 0 - * like other "wrapped around" cases.) - * - * Reference: They handle wrap-around in a somewhat convoluted way. Most - * notably, they compute remainder f0 with quotient f1 that's - * already wrapped around, so is off by q (instead of by 1) from - * what it should be ultimately. They detect the need for - * correction by checking if f0 is abnormally large. - * - * Our approach is closer to Algorithm 36 in the specification, - * in that we compute f0 normally and correct f1, f0 in the way - * they prescribed. The only real difference is that we check for - * wrap-around by examining f directly, instead of some other - * intermediates computed from it. - */ - - /* Check for wrap-around */ - t = _mm256_cmpgt_epi32(f, q_bound); - - /* Compute remainder f0 */ - f0 = _mm256_mullo_epi32(f1, alpha); - f0 = _mm256_sub_epi32(f, f0); - /* - * range: -GAMMA2 < f0 <= GAMMA2 - * - * This holds since f1 = round-(f / (2*GAMMA2)) was computed exactly. 
- */ - - /* If wrap-around is required, set f1 = 0 and f0 -= 1 */ - f1 = _mm256_andnot_si256(t, f1); - f0 = _mm256_add_epi32(f0, t); - /* range: 0 <= f1 <= 43, -GAMMA2 <= f0 <= GAMMA2 */ - - _mm256_store_si256((__m256i *)&a1[8 * i], f1); - _mm256_store_si256((__m256i *)&a0[8 * i], f0); - } -} -#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_NO_SIGN_API && \ - !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ - (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44) \ - */ - -MLD_EMPTY_CU(avx2_poly_decompose_88) - -#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_NO_SIGN_API && \ - !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ - (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \ - 44)) */ diff --git a/mldsa/src/native/x86_64/src/poly_decompose_88_avx2_asm.S b/mldsa/src/native/x86_64/src/poly_decompose_88_avx2_asm.S new file mode 100644 index 000000000..9a64ec314 --- /dev/null +++ b/mldsa/src/native/x86_64/src/poly_decompose_88_avx2_asm.S @@ -0,0 +1,478 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + * + * The algorithm for Decompose(r) (more specifically the handling for the + * wrap-around cases) is modified. See the AVX2 intrinsics version + * (poly_decompose_88_avx2.c, predecessor of this file) for a more detailed + * comparison. 
+ */ + + +/************************************************* + * Name: mld_poly_decompose_88_avx2_asm + * + * Description: For all coefficients c of the input polynomial, + * compute high and low bits c1, c0 such that c mod^+ Q = c1*ALPHA + c0 + * with -ALPHA/2 < c0 <= ALPHA/2 except if c1 = (Q-1)/ALPHA where we + * set c1 = 0 and -ALPHA/2 <= c0 = c mod^+ Q - Q < 0. + * Assumes coefficients to be standard representatives. + * For ML-DSA-44 (gamma2 = (Q-1)/88, alpha = 2*gamma2). + * + * Arguments: - int32_t *a1: pointer to output polynomial with coefficients c1 + * - int32_t *a0: pointer to input/output polynomial. On input, + * holds the standard representatives c. On output, + * holds the low bits c0. + **************************************************/ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_NO_SIGN_API) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ + (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ + MLD_CONFIG_PARAMETER_SET == 44) + + +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/x86_64/src/poly_decompose_88_avx2_asm.S using scripts/simpasm. Do not modify it directly. 
+ */ + +.text +.balign 4 +.global MLD_ASM_NAMESPACE(poly_decompose_88_avx2_asm) +MLD_ASM_FN_SYMBOL(poly_decompose_88_avx2_asm) + + .cfi_startproc + movl $0x7f, %eax + vmovd %eax, %xmm10 + vpbroadcastd %xmm10, %ymm10 + movl $0x2c0b, %eax # imm = 0x2C0B + vmovd %eax, %xmm11 + vpbroadcastd %xmm11, %ymm11 + movl $0x80, %eax + vmovd %eax, %xmm12 + vpbroadcastd %xmm12, %ymm12 + movl $0x7e6c00, %eax # imm = 0x7E6C00 + vmovd %eax, %xmm13 + vpbroadcastd %xmm13, %ymm13 + movl $0x2e800, %eax # imm = 0x2E800 + vmovd %eax, %xmm14 + vpbroadcastd %xmm14, %ymm14 + vmovdqa (%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, (%rdi) + vmovdqa %ymm2, (%rsi) + vmovdqa 0x20(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x20(%rdi) + vmovdqa %ymm2, 0x20(%rsi) + vmovdqa 0x40(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x40(%rdi) + vmovdqa %ymm2, 0x40(%rsi) + vmovdqa 0x60(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x60(%rdi) + vmovdqa %ymm2, 0x60(%rsi) + vmovdqa 0x80(%rsi), %ymm0 + 
vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x80(%rdi) + vmovdqa %ymm2, 0x80(%rsi) + vmovdqa 0xa0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0xa0(%rdi) + vmovdqa %ymm2, 0xa0(%rsi) + vmovdqa 0xc0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0xc0(%rdi) + vmovdqa %ymm2, 0xc0(%rsi) + vmovdqa 0xe0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0xe0(%rdi) + vmovdqa %ymm2, 0xe0(%rsi) + vmovdqa 0x100(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x100(%rdi) + vmovdqa %ymm2, 0x100(%rsi) + vmovdqa 0x120(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, 
%ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x120(%rdi) + vmovdqa %ymm2, 0x120(%rsi) + vmovdqa 0x140(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x140(%rdi) + vmovdqa %ymm2, 0x140(%rsi) + vmovdqa 0x160(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x160(%rdi) + vmovdqa %ymm2, 0x160(%rsi) + vmovdqa 0x180(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x180(%rdi) + vmovdqa %ymm2, 0x180(%rsi) + vmovdqa 0x1a0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x1a0(%rdi) + vmovdqa %ymm2, 0x1a0(%rsi) + vmovdqa 0x1c0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x1c0(%rdi) + vmovdqa %ymm2, 0x1c0(%rsi) + vmovdqa 0x1e0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, 
%ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x1e0(%rdi) + vmovdqa %ymm2, 0x1e0(%rsi) + vmovdqa 0x200(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x200(%rdi) + vmovdqa %ymm2, 0x200(%rsi) + vmovdqa 0x220(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x220(%rdi) + vmovdqa %ymm2, 0x220(%rsi) + vmovdqa 0x240(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x240(%rdi) + vmovdqa %ymm2, 0x240(%rsi) + vmovdqa 0x260(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x260(%rdi) + vmovdqa %ymm2, 0x260(%rsi) + vmovdqa 0x280(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 
+ vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x280(%rdi) + vmovdqa %ymm2, 0x280(%rsi) + vmovdqa 0x2a0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x2a0(%rdi) + vmovdqa %ymm2, 0x2a0(%rsi) + vmovdqa 0x2c0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x2c0(%rdi) + vmovdqa %ymm2, 0x2c0(%rsi) + vmovdqa 0x2e0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x2e0(%rdi) + vmovdqa %ymm2, 0x2e0(%rsi) + vmovdqa 0x300(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x300(%rdi) + vmovdqa %ymm2, 0x300(%rsi) + vmovdqa 0x320(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x320(%rdi) + vmovdqa %ymm2, 0x320(%rsi) + vmovdqa 0x340(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, 
%ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x340(%rdi) + vmovdqa %ymm2, 0x340(%rsi) + vmovdqa 0x360(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x360(%rdi) + vmovdqa %ymm2, 0x360(%rsi) + vmovdqa 0x380(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x380(%rdi) + vmovdqa %ymm2, 0x380(%rsi) + vmovdqa 0x3a0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x3a0(%rdi) + vmovdqa %ymm2, 0x3a0(%rsi) + vmovdqa 0x3c0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x3c0(%rdi) + vmovdqa %ymm2, 0x3c0(%rsi) + vmovdqa 0x3e0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa 
%ymm1, 0x3e0(%rdi) + vmovdqa %ymm2, 0x3e0(%rsi) + retq + .cfi_endproc + +MLD_ASM_FN_SIZE(poly_decompose_88_avx2_asm) + + +#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_NO_SIGN_API && \ + !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ + (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44) \ + */ + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/nix/s2n_bignum/default.nix b/nix/s2n_bignum/default.nix index 77a58d11d..75bcb6483 100644 --- a/nix/s2n_bignum/default.nix +++ b/nix/s2n_bignum/default.nix @@ -4,12 +4,12 @@ { stdenv, fetchFromGitHub, writeText, ... }: stdenv.mkDerivation rec { pname = "s2n_bignum"; - version = "2f8b8d8562ef001508d497f6b31a4bdd2add0c8e"; + version = "9061e8b76522beafa5ca020f3c8d99b23eba4fbc"; src = fetchFromGitHub { owner = "awslabs"; repo = "s2n-bignum"; rev = "${version}"; - hash = "sha256-rz6qzDMUapxOtu0lsj9uWhPnURMNcCCVC79Zs7SdrZA="; + hash = "sha256-NvtrVfiz5yxfdNvD0P1wSQrn37znuWMbNWxys4jZlU4="; }; setupHook = writeText "setup-hook.sh" '' export S2N_BIGNUM_DIR="$1" diff --git a/proofs/cbmc/poly_decompose_native_x86_64/Makefile b/proofs/cbmc/poly_decompose_native_x86_64/Makefile new file mode 100644 index 000000000..d6dd6d9b1 --- /dev/null +++ b/proofs/cbmc/poly_decompose_native_x86_64/Makefile @@ -0,0 +1,53 @@ +# Copyright (c) The mldsa-native project authors +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +include ../Makefile_params.common + +HARNESS_ENTRY = harness +HARNESS_FILE = poly_decompose_native_x86_64_harness + +# This should be a unique identifier for this proof, and will appear on the +# Litani dashboard. It can be human-readable and contain spaces if you wish. +PROOF_UID = poly_decompose_native_x86_64 + +# We need to set MLD_CHECK_APIS as otherwise mldsa/src/native/api.h won't be +# included, which contains the CBMC specifications. 
+DEFINES += -DMLD_CONFIG_USE_NATIVE_BACKEND_ARITH -DMLD_CONFIG_ARITH_BACKEND_FILE="\"$(SRCDIR)/mldsa/src/native/x86_64/meta.h\"" -DMLD_CHECK_APIS +INCLUDES += + +REMOVE_FUNCTION_BODY += +UNWINDSET += + +PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c +PROJECT_SOURCES += $(SRCDIR)/mldsa/src/poly_kl.c + +ifeq ($(MLD_CONFIG_PARAMETER_SET),44) + CHECK_FUNCTION_CONTRACTS=mld_poly_decompose_88_native + USE_FUNCTION_CONTRACTS=mld_poly_decompose_88_avx2_asm +else ifeq ($(MLD_CONFIG_PARAMETER_SET),65) + CHECK_FUNCTION_CONTRACTS=mld_poly_decompose_32_native + USE_FUNCTION_CONTRACTS=mld_poly_decompose_32_avx2_asm +else ifeq ($(MLD_CONFIG_PARAMETER_SET),87) + CHECK_FUNCTION_CONTRACTS=mld_poly_decompose_32_native + USE_FUNCTION_CONTRACTS=mld_poly_decompose_32_avx2_asm +endif +USE_FUNCTION_CONTRACTS+=mld_sys_check_capability +APPLY_LOOP_CONTRACTS=on +USE_DYNAMIC_FRAMES=1 + +# Disable any setting of EXTERNAL_SAT_SOLVER, and choose SMT backend instead +EXTERNAL_SAT_SOLVER= +CBMCFLAGS=--smt2 + +FUNCTION_NAME = poly_decompose_native_x86_64 + +# If this proof is found to consume huge amounts of RAM, you can set the +# EXPENSIVE variable. With new enough versions of the proof tools, this will +# restrict the number of EXPENSIVE CBMC jobs running at once. See the +# documentation in Makefile.common under the "Job Pools" heading for details. +# EXPENSIVE = true + +# This function is large enough to need... 
+CBMC_OBJECT_BITS = 8 + +include ../Makefile.common diff --git a/proofs/cbmc/poly_decompose_native_x86_64/poly_decompose_native_x86_64_harness.c b/proofs/cbmc/poly_decompose_native_x86_64/poly_decompose_native_x86_64_harness.c new file mode 100644 index 000000000..21973c4af --- /dev/null +++ b/proofs/cbmc/poly_decompose_native_x86_64/poly_decompose_native_x86_64_harness.c @@ -0,0 +1,24 @@ +// Copyright (c) The mldsa-native project authors +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +#include +#include "cbmc.h" +#include "params.h" + +#if MLDSA_GAMMA2 == ((MLDSA_Q - 1) / 88) +int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0); +#else +int mld_poly_decompose_32_native(int32_t *a1, int32_t *a0); +#endif + +void harness(void) +{ + int32_t *a1, *a0; + int t; + +#if MLDSA_GAMMA2 == ((MLDSA_Q - 1) / 88) + t = mld_poly_decompose_88_native(a1, a0); +#else + t = mld_poly_decompose_32_native(a1, a0); +#endif +} diff --git a/proofs/hol_light/README.md b/proofs/hol_light/README.md index 606c4e657..8958b80ec 100644 --- a/proofs/hol_light/README.md +++ b/proofs/hol_light/README.md @@ -168,6 +168,8 @@ All routines listed below have been proven correct, memory-safe, and secret-inde * x86_64 pointwise multiplication-accumulation (l=5): [pointwise_acc_l5_avx2_asm.S](x86_64/mldsa/pointwise_acc_l5_avx2_asm.S) * x86_64 pointwise multiplication-accumulation (l=7): [pointwise_acc_l7_avx2_asm.S](x86_64/mldsa/pointwise_acc_l7_avx2_asm.S) * x86_64 poly_chknorm: [poly_chknorm_avx2_asm.S](x86_64/mldsa/poly_chknorm_avx2_asm.S) + * x86_64 poly_decompose (l=5,7): [poly_decompose_32_avx2_asm.S](x86_64/mldsa/poly_decompose_32_avx2_asm.S) + * x86_64 poly_decompose (l=4): [poly_decompose_88_avx2_asm.S](x86_64/mldsa/poly_decompose_88_avx2_asm.S) * x86_64 polyz_unpack (l=4): [polyz_unpack_17_avx2_asm.S](x86_64/mldsa/polyz_unpack_17_avx2_asm.S) * x86_64 polyz_unpack (l=5,7): [polyz_unpack_19_avx2_asm.S](x86_64/mldsa/polyz_unpack_19_avx2_asm.S) - FIPS202: diff --git 
a/proofs/hol_light/x86_64/Makefile b/proofs/hol_light/x86_64/Makefile index 693078496..2fbd9bbb9 100644 --- a/proofs/hol_light/x86_64/Makefile +++ b/proofs/hol_light/x86_64/Makefile @@ -55,6 +55,8 @@ OBJ = mldsa/ntt_avx2_asm.o \ mldsa/nttunpack_avx2_asm.o \ mldsa/poly_caddq_avx2_asm.o \ mldsa/poly_chknorm_avx2_asm.o \ + mldsa/poly_decompose_32_avx2_asm.o \ + mldsa/poly_decompose_88_avx2_asm.o \ mldsa/polyz_unpack_17_avx2_asm.o \ mldsa/polyz_unpack_19_avx2_asm.o \ mldsa/pointwise_avx2_asm.o \ diff --git a/proofs/hol_light/x86_64/mldsa/poly_decompose_32_avx2_asm.S b/proofs/hol_light/x86_64/mldsa/poly_decompose_32_avx2_asm.S new file mode 100644 index 000000000..1bdf112e3 --- /dev/null +++ b/proofs/hol_light/x86_64/mldsa/poly_decompose_32_avx2_asm.S @@ -0,0 +1,466 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + * + * The algorithm for Decompose(r) (more specifically the handling for the + * wrap-around cases) is modified. See the AVX2 intrinsics version + * (poly_decompose_32_avx2.c, predecessor of this file) for a more detailed + * comparison. + */ + + +/************************************************* + * Name: mld_poly_decompose_32_avx2_asm + * + * Description: For all coefficients c of the input polynomial, + * compute high and low bits c0, c1 such c mod^+ Q = c1*ALPHA + c0 + * with -ALPHA/2 < c0 <= ALPHA/2 except if c1 = (Q-1)/ALPHA where we + * set c1 = 0 and -ALPHA/2 <= c0 = c mod^+ Q - Q < 0. + * Assumes coefficients to be standard representatives. + * For ML-DSA-65 / ML-DSA-87 (gamma2 = (Q-1)/32, alpha = 2*gamma2). 
+ * + * Arguments: - int32_t *a1: pointer to output polynomial with coefficients c1 + * - int32_t *a0: pointer to input/output polynomial. On input, + * holds the standard representatives c. On output, + * holds the low bits c0. + **************************************************/ + + + + +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/x86_64/src/poly_decompose_32_avx2_asm.S using scripts/simpasm. Do not modify it directly. + */ + +.text +.balign 4 +#ifdef __APPLE__ +.global _mld_poly_decompose_32_avx2_asm +_mld_poly_decompose_32_avx2_asm: +#else +.global mld_poly_decompose_32_avx2_asm +mld_poly_decompose_32_avx2_asm: +#endif + + .cfi_startproc + endbr64 + movl $0x7f, %eax + vmovd %eax, %xmm10 + vpbroadcastd %xmm10, %ymm10 + movl $0x401, %eax # imm = 0x401 + vmovd %eax, %xmm11 + vpbroadcastd %xmm11, %ymm11 + movl $0x200, %eax # imm = 0x200 + vmovd %eax, %xmm12 + vpbroadcastd %xmm12, %ymm12 + movl $0x7be100, %eax # imm = 0x7BE100 + vmovd %eax, %xmm13 + vpbroadcastd %xmm13, %ymm13 + movl $0x7fe00, %eax # imm = 0x7FE00 + vmovd %eax, %xmm14 + vpbroadcastd %xmm14, %ymm14 + vmovdqa (%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, (%rdi) + vmovdqa %ymm2, (%rsi) + vmovdqa 0x20(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x20(%rdi) + vmovdqa %ymm2, 0x20(%rsi) + vmovdqa 0x40(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, 
%ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x40(%rdi) + vmovdqa %ymm2, 0x40(%rsi) + vmovdqa 0x60(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x60(%rdi) + vmovdqa %ymm2, 0x60(%rsi) + vmovdqa 0x80(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x80(%rdi) + vmovdqa %ymm2, 0x80(%rsi) + vmovdqa 0xa0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0xa0(%rdi) + vmovdqa %ymm2, 0xa0(%rsi) + vmovdqa 0xc0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0xc0(%rdi) + vmovdqa %ymm2, 0xc0(%rsi) + vmovdqa 0xe0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0xe0(%rdi) + vmovdqa %ymm2, 0xe0(%rsi) + vmovdqa 0x100(%rsi), %ymm0 + 
vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x100(%rdi) + vmovdqa %ymm2, 0x100(%rsi) + vmovdqa 0x120(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x120(%rdi) + vmovdqa %ymm2, 0x120(%rsi) + vmovdqa 0x140(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x140(%rdi) + vmovdqa %ymm2, 0x140(%rsi) + vmovdqa 0x160(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x160(%rdi) + vmovdqa %ymm2, 0x160(%rsi) + vmovdqa 0x180(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x180(%rdi) + vmovdqa %ymm2, 0x180(%rsi) + vmovdqa 0x1a0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, 
%ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x1a0(%rdi) + vmovdqa %ymm2, 0x1a0(%rsi) + vmovdqa 0x1c0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x1c0(%rdi) + vmovdqa %ymm2, 0x1c0(%rsi) + vmovdqa 0x1e0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x1e0(%rdi) + vmovdqa %ymm2, 0x1e0(%rsi) + vmovdqa 0x200(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x200(%rdi) + vmovdqa %ymm2, 0x200(%rsi) + vmovdqa 0x220(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x220(%rdi) + vmovdqa %ymm2, 0x220(%rsi) + vmovdqa 0x240(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x240(%rdi) + vmovdqa %ymm2, 0x240(%rsi) + vmovdqa 0x260(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld 
$0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x260(%rdi) + vmovdqa %ymm2, 0x260(%rsi) + vmovdqa 0x280(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x280(%rdi) + vmovdqa %ymm2, 0x280(%rsi) + vmovdqa 0x2a0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x2a0(%rdi) + vmovdqa %ymm2, 0x2a0(%rsi) + vmovdqa 0x2c0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x2c0(%rdi) + vmovdqa %ymm2, 0x2c0(%rsi) + vmovdqa 0x2e0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x2e0(%rdi) + vmovdqa %ymm2, 0x2e0(%rsi) + vmovdqa 0x300(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, 
%ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x300(%rdi) + vmovdqa %ymm2, 0x300(%rsi) + vmovdqa 0x320(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x320(%rdi) + vmovdqa %ymm2, 0x320(%rsi) + vmovdqa 0x340(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x340(%rdi) + vmovdqa %ymm2, 0x340(%rsi) + vmovdqa 0x360(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x360(%rdi) + vmovdqa %ymm2, 0x360(%rsi) + vmovdqa 0x380(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x380(%rdi) + vmovdqa %ymm2, 0x380(%rsi) + vmovdqa 0x3a0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x3a0(%rdi) + vmovdqa %ymm2, 0x3a0(%rsi) + vmovdqa 0x3c0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, 
%ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x3c0(%rdi) + vmovdqa %ymm2, 0x3c0(%rsi) + vmovdqa 0x3e0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x3e0(%rdi) + vmovdqa %ymm2, 0x3e0(%rsi) + retq + .cfi_endproc + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/proofs/hol_light/x86_64/mldsa/poly_decompose_88_avx2_asm.S b/proofs/hol_light/x86_64/mldsa/poly_decompose_88_avx2_asm.S new file mode 100644 index 000000000..3fb54d7c5 --- /dev/null +++ b/proofs/hol_light/x86_64/mldsa/poly_decompose_88_avx2_asm.S @@ -0,0 +1,466 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + * + * The algorithm for Decompose(r) (more specifically the handling for the + * wrap-around cases) is modified. See the AVX2 intrinsics version + * (poly_decompose_88_avx2.c, predecessor of this file) for a more detailed + * comparison. 
+ */ + + +/************************************************* + * Name: mld_poly_decompose_88_avx2_asm + * + * Description: For all coefficients c of the input polynomial, + * compute high and low bits c0, c1 such c mod^+ Q = c1*ALPHA + c0 + * with -ALPHA/2 < c0 <= ALPHA/2 except if c1 = (Q-1)/ALPHA where we + * set c1 = 0 and -ALPHA/2 <= c0 = c mod^+ Q - Q < 0. + * Assumes coefficients to be standard representatives. + * For ML-DSA-44 (gamma2 = (Q-1)/88, alpha = 2*gamma2). + * + * Arguments: - int32_t *a1: pointer to output polynomial with coefficients c1 + * - int32_t *a0: pointer to input/output polynomial. On input, + * holds the standard representatives c. On output, + * holds the low bits c0. + **************************************************/ + + + + +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/x86_64/src/poly_decompose_88_avx2_asm.S using scripts/simpasm. Do not modify it directly. + */ + +.text +.balign 4 +#ifdef __APPLE__ +.global _mld_poly_decompose_88_avx2_asm +_mld_poly_decompose_88_avx2_asm: +#else +.global mld_poly_decompose_88_avx2_asm +mld_poly_decompose_88_avx2_asm: +#endif + + .cfi_startproc + endbr64 + movl $0x7f, %eax + vmovd %eax, %xmm10 + vpbroadcastd %xmm10, %ymm10 + movl $0x2c0b, %eax # imm = 0x2C0B + vmovd %eax, %xmm11 + vpbroadcastd %xmm11, %ymm11 + movl $0x80, %eax + vmovd %eax, %xmm12 + vpbroadcastd %xmm12, %ymm12 + movl $0x7e6c00, %eax # imm = 0x7E6C00 + vmovd %eax, %xmm13 + vpbroadcastd %xmm13, %ymm13 + movl $0x2e800, %eax # imm = 0x2E800 + vmovd %eax, %xmm14 + vpbroadcastd %xmm14, %ymm14 + vmovdqa (%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, (%rdi) + vmovdqa %ymm2, (%rsi) + vmovdqa 0x20(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld 
$0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x20(%rdi) + vmovdqa %ymm2, 0x20(%rsi) + vmovdqa 0x40(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x40(%rdi) + vmovdqa %ymm2, 0x40(%rsi) + vmovdqa 0x60(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x60(%rdi) + vmovdqa %ymm2, 0x60(%rsi) + vmovdqa 0x80(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x80(%rdi) + vmovdqa %ymm2, 0x80(%rsi) + vmovdqa 0xa0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0xa0(%rdi) + vmovdqa %ymm2, 0xa0(%rsi) + vmovdqa 0xc0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd 
%ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0xc0(%rdi) + vmovdqa %ymm2, 0xc0(%rsi) + vmovdqa 0xe0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0xe0(%rdi) + vmovdqa %ymm2, 0xe0(%rsi) + vmovdqa 0x100(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x100(%rdi) + vmovdqa %ymm2, 0x100(%rsi) + vmovdqa 0x120(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x120(%rdi) + vmovdqa %ymm2, 0x120(%rsi) + vmovdqa 0x140(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x140(%rdi) + vmovdqa %ymm2, 0x140(%rsi) + vmovdqa 0x160(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x160(%rdi) + vmovdqa %ymm2, 0x160(%rsi) + vmovdqa 0x180(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + 
vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x180(%rdi) + vmovdqa %ymm2, 0x180(%rsi) + vmovdqa 0x1a0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x1a0(%rdi) + vmovdqa %ymm2, 0x1a0(%rsi) + vmovdqa 0x1c0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x1c0(%rdi) + vmovdqa %ymm2, 0x1c0(%rsi) + vmovdqa 0x1e0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x1e0(%rdi) + vmovdqa %ymm2, 0x1e0(%rsi) + vmovdqa 0x200(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x200(%rdi) + vmovdqa %ymm2, 0x200(%rsi) + vmovdqa 0x220(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 
0x220(%rdi) + vmovdqa %ymm2, 0x220(%rsi) + vmovdqa 0x240(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x240(%rdi) + vmovdqa %ymm2, 0x240(%rsi) + vmovdqa 0x260(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x260(%rdi) + vmovdqa %ymm2, 0x260(%rsi) + vmovdqa 0x280(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x280(%rdi) + vmovdqa %ymm2, 0x280(%rsi) + vmovdqa 0x2a0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x2a0(%rdi) + vmovdqa %ymm2, 0x2a0(%rsi) + vmovdqa 0x2c0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x2c0(%rdi) + vmovdqa %ymm2, 0x2c0(%rsi) + vmovdqa 0x2e0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + 
vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x2e0(%rdi) + vmovdqa %ymm2, 0x2e0(%rsi) + vmovdqa 0x300(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x300(%rdi) + vmovdqa %ymm2, 0x300(%rsi) + vmovdqa 0x320(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x320(%rdi) + vmovdqa %ymm2, 0x320(%rsi) + vmovdqa 0x340(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x340(%rdi) + vmovdqa %ymm2, 0x340(%rsi) + vmovdqa 0x360(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x360(%rdi) + vmovdqa %ymm2, 0x360(%rsi) + vmovdqa 0x380(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x380(%rdi) + vmovdqa %ymm2, 
0x380(%rsi) + vmovdqa 0x3a0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x3a0(%rdi) + vmovdqa %ymm2, 0x3a0(%rsi) + vmovdqa 0x3c0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x3c0(%rdi) + vmovdqa %ymm2, 0x3c0(%rsi) + vmovdqa 0x3e0(%rsi), %ymm0 + vpaddd %ymm10, %ymm0, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm11, %ymm1, %ymm1 + vpmulhrsw %ymm12, %ymm1, %ymm1 + vpcmpgtd %ymm13, %ymm0, %ymm3 + vpmulld %ymm14, %ymm1, %ymm2 + vpsubd %ymm2, %ymm0, %ymm2 + vpandn %ymm1, %ymm3, %ymm1 + vpaddd %ymm3, %ymm2, %ymm2 + vmovdqa %ymm1, 0x3e0(%rdi) + vmovdqa %ymm2, 0x3e0(%rsi) + retq + .cfi_endproc + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/proofs/hol_light/x86_64/proofs/dump_bytecode.ml b/proofs/hol_light/x86_64/proofs/dump_bytecode.ml index d5b7ad12f..b091bdb41 100644 --- a/proofs/hol_light/x86_64/proofs/dump_bytecode.ml +++ b/proofs/hol_light/x86_64/proofs/dump_bytecode.ml @@ -45,6 +45,14 @@ print_string "=== bytecode start: x86_64/mldsa/poly_chknorm_avx2_asm.o ========= print_literal_from_elf "x86_64/mldsa/poly_chknorm_avx2_asm.o";; print_string "==== bytecode end =====================================\n\n";; +print_string "=== bytecode start: x86_64/mldsa/poly_decompose_32_avx2_asm.o ================\n";; +print_literal_from_elf "x86_64/mldsa/poly_decompose_32_avx2_asm.o";; +print_string "==== bytecode end =====================================\n\n";; + +print_string "=== bytecode start: 
x86_64/mldsa/poly_decompose_88_avx2_asm.o ================\n";; +print_literal_from_elf "x86_64/mldsa/poly_decompose_88_avx2_asm.o";; +print_string "==== bytecode end =====================================\n\n";; + print_string "=== bytecode start: x86_64/mldsa/polyz_unpack_17_avx2_asm.o ================\n";; print_literal_from_elf "x86_64/mldsa/polyz_unpack_17_avx2_asm.o";; print_string "==== bytecode end =====================================\n\n";; diff --git a/proofs/hol_light/x86_64/proofs/mldsa_utils.ml b/proofs/hol_light/x86_64/proofs/mldsa_utils.ml index 88b6bf6ac..a12b11b80 100644 --- a/proofs/hol_light/x86_64/proofs/mldsa_utils.ml +++ b/proofs/hol_light/x86_64/proofs/mldsa_utils.ml @@ -77,6 +77,120 @@ let MAP_UNTIL_TARGET_PC f n = fun (asl, w) -> core n (asl, w);; (* ------------------------------------------------------------------------- *) +(* Word-arithmetic helper lemmas shared by the poly_decompose_{32,88} AVX2 *) +(* proofs. These are x86-only and live here (rather than common/) so they do *) +(* not leak into the AArch64 proofs, which carry their own copies. *) +(* ------------------------------------------------------------------------- *) + +(* val of the right-shift-by-7 of (x+127), for in-range x. *) +let H_T = prove( + `!x:int32. val x < 8380417 ==> + val(word_ushr (word_add (x:int32) (word 127)) 7) = (val x + 127) DIV 128`, + GEN_TAC THEN DISCH_TAC THEN + REWRITE_TAC[VAL_WORD_USHR; VAL_WORD_ADD; VAL_WORD; DIMINDEX_32] THEN + CONV_TAC NUM_REDUCE_CONV THEN + SUBGOAL_THEN `(val(x:int32) + 127) MOD 4294967296 = val x + 127` SUBST1_TAC THENL + [MATCH_MP_TAC MOD_LT THEN ASM_ARITH_TAC; REWRITE_TAC[]]);; + +(* The shifted value is at most 65472 and therefore fits in 16 bits. *) +let T_BOUND = prove( + `!x:int32. val x < 8380417 ==> (val x + 127) DIV 128 < 65473`, + GEN_TAC THEN DISCH_TAC THEN + SIMP_TAC[RDIV_LT_EQ; ARITH_RULE `~(128 = 0)`] THEN ASM_ARITH_TAC);; + +(* DIV bound helper: pins a quotient from a multiplicative bracket.
*) +let DIV_BOUNDS_EQ = prove( + `!b d q. ~(d = 0) /\ q * d <= b /\ b < (q + 1) * d ==> b DIV d = q`, + REPEAT STRIP_TAC THEN MATCH_MP_TAC(ARITH_RULE `q <= r /\ r < q + 1 ==> r = q`) THEN + CONJ_TAC THENL + [ASM_SIMP_TAC[LE_RDIV_EQ] THEN ASM_ARITH_TAC; + ASM_SIMP_TAC[RDIV_LT_EQ] THEN ASM_ARITH_TAC]);; + +(* ival = val for an int32 whose unsigned value is below 2^31 (sign bit clear). *) +let IVAL_EQ_VAL = prove( + `!x:int32. val x < 2 EXP 31 ==> ival x = &(val x)`, + GEN_TAC THEN REWRITE_TAC[IVAL_VAL; DIMINDEX_32] THEN + CONV_TAC(ONCE_DEPTH_CONV NUM_EXP_CONV) THEN + DISCH_TAC THEN + SUBGOAL_THEN `bit (32 - 1) (x:int32) = F` ASSUME_TAC THENL + [REWRITE_TAC[BIT_VAL; DIMINDEX_32] THEN CONV_TAC NUM_REDUCE_CONV THEN ASM_ARITH_TAC; + ASM_REWRITE_TAC[bitval] THEN INT_ARITH_TAC]);; + +(* val of a 16->32 sign extension when the source is below 2^15. *) +let VAL_SX_16_32 = prove( + `!w:16 word. val w < 32768 ==> val(word_sx w:int32) = val w`, + GEN_TAC THEN DISCH_TAC THEN + SUBGOAL_THEN `bit 15 (w:16 word) = F` ASSUME_TAC THENL + [REWRITE_TAC[BIT_VAL; DIMINDEX_16] THEN CONV_TAC NUM_REDUCE_CONV THEN + SUBGOAL_THEN `val(w:16 word) DIV 32768 = 0` SUBST1_TAC THENL + [MATCH_MP_TAC DIV_LT THEN ASM_REWRITE_TAC[]; CONV_TAC NUM_REDUCE_CONV]; ALL_TAC] THEN + SUBGOAL_THEN `ival(w:16 word) = &(val w)` ASSUME_TAC THENL + [MP_TAC(ISPEC `w:16 word` VAL_IVAL) THEN + REWRITE_TAC[DIMINDEX_16; ARITH_RULE `16 - 1 = 15`] THEN + ASM_REWRITE_TAC[BITVAL_CLAUSES; INT_MUL_RZERO; INT_ADD_RID] THEN INT_ARITH_TAC; ALL_TAC] THEN + SUBGOAL_THEN `ival(word_sx (w:16 word):int32) = &(val w)` ASSUME_TAC THENL + [MP_TAC(ISPECL [`w:16 word`] (INST_TYPE [`:16`,`:M`; `:32`,`:N`] IVAL_WORD_SX)) THEN + REWRITE_TAC[DIMINDEX_16; DIMINDEX_32] THEN ANTS_TAC THENL [ARITH_TAC; ALL_TAC] THEN + ASM_REWRITE_TAC[]; ALL_TAC] THEN + MP_TAC(ISPEC `word_sx (w:16 word):int32` VAL_IVAL) THEN + REWRITE_TAC[DIMINDEX_32; ARITH_RULE `32 - 1 = 31`] THEN + SUBGOAL_THEN `bit 31 (word_sx (w:16 word):int32) = F` SUBST1_TAC THENL + [MP_TAC(ISPEC `word_sx (w:16 word):int32`
MSB_IVAL) THEN + REWRITE_TAC[DIMINDEX_32; ARITH_RULE `32 - 1 = 31`] THEN DISCH_THEN SUBST1_TAC THEN + ASM_REWRITE_TAC[] THEN INT_ARITH_TAC; ALL_TAC] THEN + REWRITE_TAC[BITVAL_CLAUSES; INT_MUL_RZERO; INT_ADD_RID] THEN + ASM_REWRITE_TAC[] THEN REWRITE_TAC[INT_OF_NUM_EQ] THEN ASM_MESON_TAC[]);; + +(* Signed comparison against a non-negative bound below 2^31: word_igt + reduces to a comparison on the signed interpretation. Parameterized over + the threshold b (variant-specific GAMMA2-derived constant). *) +let IGT_BOUND_GEN = prove( + `!x:int32 b. b < 2147483648 ==> (word_igt x (word b) <=> ival x > &b)`, + REPEAT GEN_TAC THEN DISCH_TAC THEN + SUBGOAL_THEN `ival(word b:int32) = &b` ASSUME_TAC THENL + [MP_TAC(ISPEC `word b:int32` IVAL_EQ_VAL) THEN + REWRITE_TAC[VAL_WORD; DIMINDEX_32] THEN + SUBGOAL_THEN `b MOD 2 EXP 32 = b` SUBST1_TAC THENL + [MATCH_MP_TAC MOD_LT THEN UNDISCH_TAC `b < 2147483648` THEN ARITH_TAC; + ANTS_TAC THENL [UNDISCH_TAC `b < 2147483648` THEN ARITH_TAC; SIMP_TAC[]]]; + ASM_REWRITE_TAC[WORD_IGT; irelational2; GT]]);; + +(* High 16 bits of a 16x16->32 unsigned multiply (VPMULHUW lane semantics). + Parameterized over the multiplier m (the Barrett magic constant). *) +let MULHI_LANE_GEN = prove( + `!t:int32 m. 
val t < 65536 /\ m < 65536 ==> + val(word_subword (word_mul (word_zx (word_subword t (0,16):16 word):int32) + (word m)) (16,16):16 word) = + (val t * m) DIV 65536`, + REPEAT GEN_TAC THEN STRIP_TAC THEN + SUBGOAL_THEN `val(t:int32) * m < 4294967296` ASSUME_TAC THENL + [MATCH_MP_TAC LET_TRANS THEN EXISTS_TAC `65535 * 65535` THEN + CONJ_TAC THENL [MATCH_MP_TAC LE_MULT2 THEN ASM_ARITH_TAC; ARITH_TAC]; + ALL_TAC] THEN + REWRITE_TAC[VAL_WORD_SUBWORD; VAL_WORD_MUL; VAL_WORD_ZX_GEN; VAL_WORD; + DIMINDEX_16; DIMINDEX_32] THEN + CONV_TAC NUM_REDUCE_CONV THEN + SUBGOAL_THEN `val(t:int32) DIV 1 = val t /\ val(t:int32) MOD 65536 = val t` + (fun th -> REWRITE_TAC[th]) THENL + [ASM_SIMP_TAC[DIV_1; MOD_LT]; ALL_TAC] THEN + SUBGOAL_THEN `val(t:int32) MOD 4294967296 = val t /\ m MOD 4294967296 = m` + (fun th -> REWRITE_TAC[th]) THENL + [CONJ_TAC THEN MATCH_MP_TAC MOD_LT THEN ASM_ARITH_TAC; ALL_TAC] THEN + SUBGOAL_THEN `(val(t:int32) * m) MOD 4294967296 = val t * m` SUBST1_TAC THENL + [MATCH_MP_TAC MOD_LT THEN ASM_REWRITE_TAC[]; ALL_TAC] THEN + MATCH_MP_TAC MOD_LT THEN + SIMP_TAC[RDIV_LT_EQ; ARITH_RULE `~(65536 = 0)`] THEN ASM_ARITH_TAC);; + +(* word_not distributes over word_join at the 64/128/256-bit AVX2 lane widths. *) +let WORD_NOT_JOIN_64 = WORD_BLAST + `!a b : int32. word_not ((word_join:int32->int32->int64) a b) = + word_join (word_not a) (word_not b)`;; +let WORD_NOT_JOIN_128 = WORD_BLAST + `!a b : int64. word_not ((word_join:int64->int64->int128) a b) = + word_join (word_not a) (word_not b)`;; +let WORD_NOT_JOIN_256 = WORD_BLAST + `!a b : int128. word_not ((word_join:int128->int128->int256) a b) = + word_join (word_not a) (word_not b)`;; (* Coefficient (un)packing helpers shared across the polyz_unpack proofs. 
*) (* ------------------------------------------------------------------------- *) diff --git a/proofs/hol_light/x86_64/proofs/poly_decompose_32_avx2_asm.ml b/proofs/hol_light/x86_64/proofs/poly_decompose_32_avx2_asm.ml new file mode 100644 index 000000000..455229c9c --- /dev/null +++ b/proofs/hol_light/x86_64/proofs/poly_decompose_32_avx2_asm.ml @@ -0,0 +1,1300 @@ +(* + * Copyright (c) The mldsa-native project authors + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(* ========================================================================= *) +(* Functional correctness of poly_decompose_32 (x86_64 AVX2): *) +(* Decompose polynomial coefficients into (a1, a0) where *) +(* a mod^+ Q = a1*2*GAMMA2 + a0 for GAMMA2 = (Q-1)/32 = 261888 *) +(* (ML-DSA-65 / ML-DSA-87). *) +(* *) +(* The high-bits quotient is computed with the AVX2 mulhi/mulhrs Barrett *) +(* sequence (VPADDD/VPSRLD/VPMULHUW/VPMULHRSW), which differs from the *) +(* AArch64 SQDMULH+SRSHR path but computes the same round-half-down *) +(* quotient. 
*) +(* ========================================================================= *) + +needs "s2n_bignum/x86/proofs/base.ml";; +needs "mldsa_native/common/mldsa_specs.ml";; +needs "mldsa_native/x86_64/proofs/mldsa_utils.ml";; + + +(**** print_literal_from_elf "x86_64/mldsa/poly_decompose_32_avx2_asm.o";; + ****) + +let mldsa_decompose32_mc = define_assert_from_elf "mldsa_decompose32_mc" "x86_64/mldsa/poly_decompose_32_avx2_asm.o" +(*** BYTECODE START ***) +[ + 0xf3; 0x0f; 0x1e; 0xfa; (* ENDBR64 *) + 0xb8; 0x7f; 0x00; 0x00; 0x00; + (* MOV (% eax) (Imm32 (word 127)) *) + 0xc5; 0x79; 0x6e; 0xd0; (* VMOVD (%_% xmm10) (% eax) *) + 0xc4; 0x42; 0x7d; 0x58; 0xd2; + (* VPBROADCASTD (%_% ymm10) (%_% xmm10) *) + 0xb8; 0x01; 0x04; 0x00; 0x00; + (* MOV (% eax) (Imm32 (word 1025)) *) + 0xc5; 0x79; 0x6e; 0xd8; (* VMOVD (%_% xmm11) (% eax) *) + 0xc4; 0x42; 0x7d; 0x58; 0xdb; + (* VPBROADCASTD (%_% ymm11) (%_% xmm11) *) + 0xb8; 0x00; 0x02; 0x00; 0x00; + (* MOV (% eax) (Imm32 (word 512)) *) + 0xc5; 0x79; 0x6e; 0xe0; (* VMOVD (%_% xmm12) (% eax) *) + 0xc4; 0x42; 0x7d; 0x58; 0xe4; + (* VPBROADCASTD (%_% ymm12) (%_% xmm12) *) + 0xb8; 0x00; 0xe1; 0x7b; 0x00; + (* MOV (% eax) (Imm32 (word 8118528)) *) + 0xc5; 0x79; 0x6e; 0xe8; (* VMOVD (%_% xmm13) (% eax) *) + 0xc4; 0x42; 0x7d; 0x58; 0xed; + (* VPBROADCASTD (%_% ymm13) (%_% xmm13) *) + 0xb8; 0x00; 0xfe; 0x07; 0x00; + (* MOV (% eax) (Imm32 (word 523776)) *) + 0xc5; 0x79; 0x6e; 0xf0; (* VMOVD (%_% xmm14) (% eax) *) + 0xc4; 0x42; 0x7d; 0x58; 0xf6; + (* VPBROADCASTD (%_% ymm14) (%_% xmm14) *) + 0xc5; 0xfd; 0x6f; 0x06; (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,0))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD 
(%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x0f; (* VMOVDQA (Memop Word256 (%% (rdi,0))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x16; (* VMOVDQA (Memop Word256 (%% (rsi,0))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x46; 0x20; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,32))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x4f; 0x20; + (* VMOVDQA (Memop Word256 (%% (rdi,32))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x56; 0x20; + (* VMOVDQA (Memop Word256 (%% (rsi,32))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x46; 0x40; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,64))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 
0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x4f; 0x40; + (* VMOVDQA (Memop Word256 (%% (rdi,64))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x56; 0x40; + (* VMOVDQA (Memop Word256 (%% (rsi,64))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x46; 0x60; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,96))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x4f; 0x60; + (* VMOVDQA (Memop Word256 (%% (rdi,96))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x56; 0x60; + (* VMOVDQA (Memop Word256 (%% (rsi,96))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,128))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; 
+ (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,128))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,128))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,160))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,160))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,160))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0xc0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,192))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* 
VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0xc0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,192))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0xc0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,192))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0xe0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,224))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0xe0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,224))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0xe0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,224))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0x00; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,256))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* 
VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0x00; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,256))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0x00; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,256))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0x20; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,288))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0x20; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,288))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0x20; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,288))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0x40; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,320))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* 
VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0x40; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,320))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0x40; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,320))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0x60; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,352))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0x60; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,352))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0x60; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,352))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0x80; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,384))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD 
(%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0x80; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,384))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0x80; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,384))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0xa0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,416))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0xa0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,416))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0xa0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,416))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,448))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% 
ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,448))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,448))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0xe0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,480))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0xe0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,480))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0xe0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,480))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0x00; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop 
Word256 (%% (rsi,512))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0x00; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,512))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0x00; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,512))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0x20; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,544))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0x20; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,544))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0x20; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,544))) (%_% ymm2) *) + 
0xc5; 0xfd; 0x6f; 0x86; 0x40; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,576))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0x40; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,576))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0x40; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,576))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0x60; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,608))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0x60; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,608))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0x60; 
0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,608))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0x80; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,640))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0x80; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,640))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0x80; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,640))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0xa0; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,672))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0xa0; 0x02; 0x00; 0x00; + (* VMOVDQA 
(Memop Word256 (%% (rdi,672))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0xa0; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,672))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0xc0; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,704))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0xc0; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,704))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0xc0; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,704))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0xe0; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,736))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) 
(%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0xe0; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,736))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0xe0; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,736))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0x00; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,768))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0x00; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,768))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0x00; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,768))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0x20; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,800))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% 
ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0x20; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,800))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0x20; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,800))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0x40; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,832))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0x40; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,832))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0x40; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,832))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0x60; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,864))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) 
(%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0x60; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,864))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0x60; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,864))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0x80; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,896))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0x80; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,896))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0x80; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,896))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0xa0; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,928))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% 
ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0xa0; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,928))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0xa0; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,928))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0xc0; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,960))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0xc0; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,960))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0xc0; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,960))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0xe0; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,992))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) 
(%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0xe0; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,992))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0xe0; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,992))) (%_% ymm2) *) + 0xc3 (* RET *) +];; +(*** BYTECODE END ***) + +let mldsa_decompose32_tmc = define_trimmed "mldsa_decompose32_tmc" mldsa_decompose32_mc;; +let MLDSA_DECOMPOSE32_EXEC = X86_MK_CORE_EXEC_RULE mldsa_decompose32_tmc;; + +(* ========================================================================= *) +(* Word-level lane functions matching the AVX2 instruction sequence. *) +(* ========================================================================= *) + +(* High-bits quotient h, x86 mulhi/mulhrs path (matches VPMULHUW+VPMULHRSW). 
*) +let decompose32_h = define + `decompose32_h (y:int32) : int32 = + word_join + (word_subword + (word_add + (word_ushr + (word_mul + (word_sx + (word_subword + (word_mul + (word_zx (word_subword (word_ushr (word_add y (word 127)) 7) (16,16):16 word):int32) + (word 0:int32)) (16,16):16 word):int32) + (word 0:int32)) 14) + (word 1:int32)) (1,16):16 word) + (word_subword + (word_add + (word_ushr + (word_mul + (word_sx + (word_subword + (word_mul + (word_zx (word_subword (word_ushr (word_add y (word 127)) 7) (0,16):16 word):int32) + (word 1025:int32)) (16,16):16 word):int32) + (word 512:int32)) 14) + (word 1:int32)) (1,16):16 word) :int32`;; +(* a1 lane: h with every bit cleared when y > 8118528 as a signed compare (matches VPCMPGTD+VPANDN). *) +let decompose32_a1 = define + `decompose32_a1 (y:int32) : int32 = + word_and (word_not (if word_igt y (word 8118528) then word 4294967295 else word 0)) + (decompose32_h y)`;; +(* a0 lane: y - h * 523776 plus the same compare mask, i.e. one less when y > 8118528 (matches VPMULLD+VPSUBD+VPADDD). *) +let decompose32_a0 = define + `decompose32_a0 (y:int32) : int32 = + word_add (word_sub y (word_mul (decompose32_h y) (word 523776))) + (if word_igt y (word 8118528) then word 4294967295 else word 0)`;; + +(* The unsigned high-16 multiply (VPMULHUW) on a sub-2^16 lane, specialized + to the Barrett magic constant 1025 from the shared MULHI_LANE_GEN. *) +let MULHI_LANE = prove( + `!t:int32. val t < 65536 ==> + val(word_subword (word_mul (word_zx (word_subword t (0,16):16 word):int32) + (word 1025)) (16,16):16 word) = + (val t * 1025) DIV 65536`, + GEN_TAC THEN DISCH_TAC THEN MATCH_MP_TAC MULHI_LANE_GEN THEN + ASM_REWRITE_TAC[] THEN ARITH_TAC);; + +(* The high 16-bit lane of the VPMULHUW/VPMULHRSW chain is identically zero + (the high subword of (x+127)>>7 is multiplied by zero). 
*) +let HI16_ZERO = + let rhs = rand(concl decompose32_h) in + let hi16 = rand(rator rhs) in + prove(mk_eq(hi16, `word 0:16 word`), + SUBGOAL_THEN `word_mul (word_zx (word_subword (word_ushr (word_add (y:int32) (word 127)) 7) (16,16):16 word):int32) + (word 0):int32 = word 0` SUBST1_TAC THENL + [CONV_TAC WORD_RULE; ALL_TAC] THEN CONV_TAC WORD_REDUCE_CONV);; + +(* Numerical form of decompose32_h: the nested mulhi/mulhrs floors. *) +let H_NUM = prove( + `!x:int32. val x < 8380417 ==> + val(decompose32_h x) = + (((((val x + 127) DIV 128 * 1025) DIV 65536) * 512) DIV 16384 + 1) DIV 2`, + GEN_TAC THEN DISCH_TAC THEN + GEN_REWRITE_TAC (LAND_CONV o RAND_CONV) [decompose32_h] THEN + REWRITE_TAC[HI16_ZERO] THEN + REWRITE_TAC[VAL_WORD_JOIN; DIMINDEX_16; DIMINDEX_32; VAL_WORD_0; ADD_CLAUSES; MULT_CLAUSES] THEN + ABBREV_TAC `m:16 word = word_subword (word_mul (word_zx (word_subword (word_ushr (word_add (x:int32) (word 127)) 7) (0,16):16 word):int32) (word 1025)) (16,16)` THEN + SUBGOAL_THEN `val(m:16 word) = ((val(x:int32) + 127) DIV 128 * 1025) DIV 65536` ASSUME_TAC THENL + [EXPAND_TAC "m" THEN + MP_TAC(SPEC `word_ushr (word_add (x:int32) (word 127)) 7` MULHI_LANE) THEN + ASM_SIMP_TAC[H_T] THEN ANTS_TAC THENL + [ASM_SIMP_TAC[H_T] THEN MP_TAC(SPEC `x:int32` T_BOUND) THEN ASM_ARITH_TAC; + DISCH_THEN SUBST1_TAC THEN ASM_SIMP_TAC[H_T]]; ALL_TAC] THEN + SUBGOAL_THEN `val(m:16 word) < 1025` ASSUME_TAC THENL + [ASM_REWRITE_TAC[] THEN SIMP_TAC[RDIV_LT_EQ; ARITH_RULE `~(65536 = 0)`] THEN + MP_TAC(SPEC `x:int32` T_BOUND) THEN ASM_ARITH_TAC; ALL_TAC] THEN + REWRITE_TAC[VAL_WORD_SUBWORD; VAL_WORD_ADD; VAL_WORD_USHR; VAL_WORD_MUL; VAL_WORD; + DIMINDEX_16; DIMINDEX_32] THEN + CONV_TAC NUM_REDUCE_CONV THEN + SUBGOAL_THEN `val(word_sx (m:16 word):int32) = val m` SUBST1_TAC THENL + [MATCH_MP_TAC VAL_SX_16_32 THEN ASM_ARITH_TAC; ALL_TAC] THEN + ASM_REWRITE_TAC[] THEN + SUBGOAL_THEN `((val(x:int32) + 127) DIV 128 * 1025) DIV 65536 < 1025` ASSUME_TAC THENL + [ASM_MESON_TAC[]; ALL_TAC] THEN + 
ABBREV_TAC `q = ((val(x:int32) + 127) DIV 128 * 1025) DIV 65536` THEN + SUBGOAL_THEN `(q * 512) MOD 4294967296 = q * 512` SUBST1_TAC THENL + [MATCH_MP_TAC MOD_LT THEN ASM_ARITH_TAC; ALL_TAC] THEN + SUBGOAL_THEN `(q * 512) DIV 16384 < 33` ASSUME_TAC THENL + [SIMP_TAC[RDIV_LT_EQ; ARITH_RULE `~(16384 = 0)`] THEN ASM_ARITH_TAC; ALL_TAC] THEN + SUBGOAL_THEN `((q * 512) DIV 16384 + 1) MOD 4294967296 = (q * 512) DIV 16384 + 1` SUBST1_TAC THENL + [MATCH_MP_TAC MOD_LT THEN ASM_ARITH_TAC; ALL_TAC] THEN + SUBGOAL_THEN `((q * 512) DIV 16384 + 1) DIV 2 < 65536` ASSUME_TAC THENL + [ASM_ARITH_TAC; ALL_TAC] THEN + ASM_SIMP_TAC[MOD_LT; ARITH_RULE `n < 65536 ==> n < 4294967296`]);; + +(* Collapse the two trailing right-shift floors into one. *) +let ADD32_DIV = prove( + `!c1. (c1 + 32) DIV 32 = c1 DIV 32 + 1`, + GEN_TAC THEN SUBGOAL_THEN `c1 + 32 = c1 + 1 * 32` SUBST1_TAC THENL + [ARITH_TAC; SIMP_TAC[DIV_MULT_ADD; ARITH_RULE `~(32 = 0)`]]);; + +let DIVMUL_HELP = prove(`!a. (a * 512) DIV 16384 = a DIV 32`, + GEN_TAC THEN REWRITE_TAC[ARITH_RULE `16384 = 512 * 32`] THEN + GEN_REWRITE_TAC (LAND_CONV o RAND_CONV) [MULT_SYM] THEN + ONCE_REWRITE_TAC[MULT_SYM] THEN + SIMP_TAC[DIV_MULT2; ARITH_RULE `~(512 = 0)`]);; + +let E_COLLAPSE = prove( + `!c. ((((c * 1025) DIV 65536) * 512) DIV 16384 + 1) DIV 2 = + ((c * 1025) DIV 65536 + 32) DIV 64`, + GEN_TAC THEN REWRITE_TAC[DIVMUL_HELP; GSYM ADD32_DIV] THEN + REWRITE_TAC[ARITH_RULE `64 = 32 * 2`; DIV_DIV]);; + +(* Barrett-style correctness of the mulhi/mulhrs quotient. *) +let BARRETT_CORE = prove( + `!c. 
c < 65473 ==> ((c * 1025) DIV 65536 + 32) DIV 64 = (c + 2045) DIV 4092`, + GEN_TAC THEN DISCH_TAC THEN + ABBREV_TAC `c1 = (c * 1025) DIV 65536` THEN + MP_TAC(SPECL [`c * 1025`; `65536`] DIVISION) THEN ANTS_TAC THENL [ARITH_TAC; ALL_TAC] THEN + ASM_REWRITE_TAC[] THEN STRIP_TAC THEN + MATCH_MP_TAC DIV_BOUNDS_EQ THEN CONV_TAC NUM_REDUCE_CONV THEN + MP_TAC(SPECL [`c + 2045`; `4092`] DIVISION) THEN ANTS_TAC THENL [ARITH_TAC; ALL_TAC] THEN + ABBREV_TAC `q = (c + 2045) DIV 4092` THEN + ABBREV_TAC `t = (c + 2045) MOD 4092` THEN STRIP_TAC THEN + SUBGOAL_THEN `q <= 16` ASSUME_TAC THENL + [SUBGOAL_THEN `q * 4092 <= c + 2045` MP_TAC THENL [ASM_ARITH_TAC; ASM_ARITH_TAC]; ALL_TAC] THEN + ASM_ARITH_TAC);; + +(* round-half-down closed form. *) +let ROUND_CLOSED = prove( + `!r. (if r MOD 523776 * 2 <= 523776 then r DIV 523776 else r DIV 523776 + 1) = + (r + 261887) DIV 523776`, + GEN_TAC THEN + MP_TAC(SPECL [`r:num`; `523776`] DIVISION) THEN ANTS_TAC THENL [ARITH_TAC; STRIP_TAC] THEN + ABBREV_TAC `q = r DIV 523776` THEN ABBREV_TAC `m = r MOD 523776` THEN + COND_CASES_TAC THENL + [SUBGOAL_THEN `(r + 261887) DIV 523776 = q` (fun th -> REWRITE_TAC[th]) THEN + MATCH_MP_TAC DIV_UNIQ THEN EXISTS_TAC `m + 261887` THEN ASM_ARITH_TAC; + SUBGOAL_THEN `261889 <= m` ASSUME_TAC THENL [ASM_ARITH_TAC; ALL_TAC] THEN + SUBGOAL_THEN `(r + 261887) DIV 523776 = q + 1` (fun th -> REWRITE_TAC[th]) THEN + MATCH_MP_TAC DIV_UNIQ THEN EXISTS_TAC `m - 261889` THEN ASM_ARITH_TAC]);; + +let C_TO_R = prove( + `!r. (((r + 127) DIV 128) + 2045) DIV 4092 = (r + 261887) DIV 523776`, + GEN_TAC THEN + SUBGOAL_THEN `(r + 127) DIV 128 + 2045 = (r + 261887) DIV 128` SUBST1_TAC THENL + [SUBGOAL_THEN `r + 261887 = (r + 127) + 2045 * 128` SUBST1_TAC THENL + [ARITH_TAC; SIMP_TAC[DIV_MULT_ADD; ARITH_RULE `~(128 = 0)`]]; ALL_TAC] THEN + REWRITE_TAC[DIV_DIV] THEN REWRITE_TAC[ARITH_RULE `128 * 4092 = 523776`]);; + +let H_ROUND = prove( + `!r. 
r < 8380417 ==> + (((((r + 127) DIV 128 * 1025) DIV 65536) * 512) DIV 16384 + 1) DIV 2 = + (if r MOD 523776 * 2 <= 523776 then r DIV 523776 else r DIV 523776 + 1)`, + GEN_TAC THEN DISCH_TAC THEN + REWRITE_TAC[E_COLLAPSE] THEN + SUBGOAL_THEN `(r + 127) DIV 128 < 65473` ASSUME_TAC THENL + [SIMP_TAC[RDIV_LT_EQ; ARITH_RULE `~(128 = 0)`] THEN ASM_ARITH_TAC; ALL_TAC] THEN + ASM_SIMP_TAC[BARRETT_CORE] THEN + REWRITE_TAC[C_TO_R; ROUND_CLOSED]);; + +(* Correctness of the high-bits quotient: same round-half-down form proven for + AArch64 (h32), enabling reuse of the spec-connection lemmas below. *) +let H32_CORRECT = prove( + `!x:int32. val x < 8380417 ==> + val(decompose32_h x) = (if val x MOD 523776 * 2 <= 523776 + then val x DIV 523776 + else val x DIV 523776 + 1)`, + GEN_TAC THEN DISCH_TAC THEN ASM_SIMP_TAC[H_NUM] THEN ASM_SIMP_TAC[H_ROUND]);; + +(* The wrap-around test in word form, specialized to the threshold 8118528 + from the shared IGT_BOUND_GEN. *) +let IGT_BOUND = + GEN_ALL(MP (SPECL [`x:int32`; `8118528`] IGT_BOUND_GEN) + (ARITH_RULE `8118528 < 2147483648`));; + +(* a1 = 0 on wrap-around, else h. *) +let DECOMPOSE32_A1_CASES = prove( + `!x:int32. decompose32_a1 x = if ival x > &8118528 then word 0 else decompose32_h x`, + GEN_TAC THEN REWRITE_TAC[decompose32_a1; GSYM IGT_BOUND] THEN + ABBREV_TAC `h = decompose32_h x` THEN + COND_CASES_TAC THEN ASM_REWRITE_TAC[] THEN BITBLAST_TAC);; + +(* a0 subtracts one extra on wrap-around. *) +let DECOMPOSE32_A0_CASES = prove( + `!x:int32. 
decompose32_a0 x = + if ival x > &8118528 + then word_sub (word_sub x (word_mul (decompose32_h x) (word 523776))) (word 1) + else word_sub x (word_mul (decompose32_h x) (word 523776))`, + GEN_TAC THEN REWRITE_TAC[decompose32_a0; GSYM IGT_BOUND] THEN + ABBREV_TAC `h = decompose32_h x` THEN + SUBGOAL_THEN `word 4294967295:int32 = word_neg(word 1)` SUBST1_TAC THENL + [CONV_TAC WORD_REDUCE_CONV; ALL_TAC] THEN + COND_CASES_TAC THEN ASM_REWRITE_TAC[] THEN CONV_TAC WORD_RULE);; + +(* On wrap-around (val x > 31*GAMMA2) the rounding quotient is exactly 16. *) +let ROUND32_SPECIAL = prove( + `!n. 8118528 < n /\ n < 8380417 ==> + (if n MOD 523776 * 2 <= 523776 then n DIV 523776 else n DIV 523776 + 1) = 16`, + REPEAT STRIP_TAC THEN + ASM_CASES_TAC `n < 8380416` THENL + [SUBGOAL_THEN `n DIV 523776 = 15` ASSUME_TAC THENL + [MATCH_MP_TAC DIV_BOUNDS_EQ THEN CONV_TAC NUM_REDUCE_CONV THEN ASM_ARITH_TAC; ALL_TAC] THEN + ASM_REWRITE_TAC[] THEN CONV_TAC NUM_REDUCE_CONV THEN + COND_CASES_TAC THENL + [MP_TAC(SPECL [`n:num`; `523776`] DIVISION) THEN ANTS_TAC THENL [ARITH_TAC; ALL_TAC] THEN + ASM_REWRITE_TAC[] THEN CONV_TAC NUM_REDUCE_CONV THEN STRIP_TAC THEN ASM_ARITH_TAC; + REWRITE_TAC[]]; + SUBGOAL_THEN `n = 8380416` SUBST_ALL_TAC THENL [ASM_ARITH_TAC; ALL_TAC] THEN + CONV_TAC NUM_REDUCE_CONV]);; + +let CMOD_ABS_BOUND_523776 = prove( + `!n. abs(mldsa_cmod n 523776) <= &261888`, + GEN_TAC THEN REWRITE_TAC[mldsa_cmod] THEN + SUBGOAL_THEN `n MOD 523776 < 523776` MP_TAC THENL + [SIMP_TAC[MOD_LT_EQ; ARITH_RULE `~(523776 = 0)`]; ALL_TAC] THEN + SPEC_TAC(`n MOD 523776`, `m:num`) THEN GEN_TAC THEN DISCH_TAC THEN + COND_CASES_TAC THEN + REWRITE_TAC[INT_ABS; INT_POS; INT_OF_NUM_LE; INT_OF_NUM_SUB; INT_SUB_LE; INT_NEG_SUB] THEN + ASM_ARITH_TAC);; + +(* a1 lane computes the high bits FST(mldsa_decompose_32(val x)). *) +let DECOMPOSE32_A1_CORRECT = prove( + `!x:int32. 
val x < 8380417 + ==> val(decompose32_a1 x) = FST(mldsa_decompose_32(val x))`, + GEN_TAC THEN DISCH_TAC THEN + REWRITE_TAC[DECOMPOSE32_A1_CASES; MLDSA_DECOMPOSE_32_EXPAND; LET_DEF; LET_END_DEF; FST] THEN + COND_CASES_TAC THENL + [REWRITE_TAC[VAL_WORD_0; FST] THEN + SUBGOAL_THEN `val(x:int32) < 2 EXP 31` ASSUME_TAC THENL [ASM_ARITH_TAC; ALL_TAC] THEN + MP_TAC(SPEC `x:int32` IVAL_EQ_VAL) THEN ASM_REWRITE_TAC[] THEN DISCH_TAC THEN + SUBGOAL_THEN `&(val(x:int32)):int > &8118528` MP_TAC THENL [ASM_MESON_TAC[]; ALL_TAC] THEN + REWRITE_TAC[INT_OF_NUM_GT; GT] THEN DISCH_TAC THEN + MP_TAC(SPEC `val(x:int32)` ROUND32_SPECIAL) THEN + ASM_REWRITE_TAC[] THEN DISCH_THEN SUBST1_TAC THEN REWRITE_TAC[FST]; + MP_TAC(SPEC `x:int32` H32_CORRECT) THEN ASM_REWRITE_TAC[] THEN DISCH_THEN SUBST1_TAC THEN + COND_CASES_TAC THENL + [SUBGOAL_THEN `val(x:int32) <= 8118528` ASSUME_TAC THENL + [SUBGOAL_THEN `val(x:int32) < 2 EXP 31` ASSUME_TAC THENL [ASM_ARITH_TAC; ALL_TAC] THEN + MP_TAC(SPEC `x:int32` IVAL_EQ_VAL) THEN ASM_REWRITE_TAC[] THEN DISCH_TAC THEN + SUBGOAL_THEN `~(&(val(x:int32)):int > &8118528)` MP_TAC THENL [ASM_MESON_TAC[]; ALL_TAC] THEN + REWRITE_TAC[INT_GT; INT_NOT_LT; INT_OF_NUM_LE]; ALL_TAC] THEN + SUBGOAL_THEN `~(val(x:int32) DIV 523776 = 16)` ASSUME_TAC THENL + [DISCH_TAC THEN MP_TAC(SPECL [`val(x:int32)`; `523776`] DIVISION) THEN + ANTS_TAC THENL [ARITH_TAC; ALL_TAC] THEN + ASM_REWRITE_TAC[] THEN CONV_TAC NUM_REDUCE_CONV THEN STRIP_TAC THEN ASM_ARITH_TAC; ALL_TAC] THEN + ASM_REWRITE_TAC[FST]; + SUBGOAL_THEN `val(x:int32) <= 8118528` ASSUME_TAC THENL + [SUBGOAL_THEN `val(x:int32) < 2 EXP 31` ASSUME_TAC THENL [ASM_ARITH_TAC; ALL_TAC] THEN + MP_TAC(SPEC `x:int32` IVAL_EQ_VAL) THEN ASM_REWRITE_TAC[] THEN DISCH_TAC THEN + SUBGOAL_THEN `~(&(val(x:int32)):int > &8118528)` MP_TAC THENL [ASM_MESON_TAC[]; ALL_TAC] THEN + REWRITE_TAC[INT_GT; INT_NOT_LT; INT_OF_NUM_LE]; ALL_TAC] THEN + SUBGOAL_THEN `~(val(x:int32) DIV 523776 + 1 = 16)` ASSUME_TAC THENL + [REWRITE_TAC[ARITH_RULE `n + 1 = 
16 <=> n = 15`] THEN DISCH_TAC THEN + MP_TAC(SPECL [`val(x:int32)`; `523776`] DIVISION) THEN + ANTS_TAC THENL [ARITH_TAC; ALL_TAC] THEN + ASM_REWRITE_TAC[] THEN CONV_TAC NUM_REDUCE_CONV THEN STRIP_TAC THEN ASM_ARITH_TAC; ALL_TAC] THEN + ASM_REWRITE_TAC[FST]]]);; + +(* a0 lane: for val x < 8380417, the signed value ival(decompose32_a0 x) equals the low bits SND(mldsa_decompose_32(val x)). *) +let DECOMPOSE32_A0_CORRECT = prove( + `!x:int32. val x < 8380417 + ==> ival(decompose32_a0 x) = SND(mldsa_decompose_32(val x))`, + GEN_TAC THEN DISCH_TAC THEN + REWRITE_TAC[DECOMPOSE32_A0_CASES; MLDSA_DECOMPOSE_32_EXPAND; LET_DEF; LET_END_DEF; SND] THEN + (* Turn the word subtraction of decompose32_h x times 523776 from x into iword of the matching integer expression. *) SUBGOAL_THEN `word_sub x (word_mul (decompose32_h x) (word 523776)) : int32 = + iword(ival x - ival(decompose32_h x) * &523776)` SUBST1_TAC THENL + [CONV_TAC WORD_RULE; ALL_TAC] THEN + (* Replace ival by val, first for x and then for decompose32_h x, both via IVAL_EQ_VAL. *) SUBGOAL_THEN `ival(x:int32) = &(val x)` SUBST1_TAC THENL + [MATCH_MP_TAC IVAL_EQ_VAL THEN ASM_ARITH_TAC; ALL_TAC] THEN + SUBGOAL_THEN `ival(decompose32_h x:int32) = &(val(decompose32_h x))` SUBST1_TAC THENL + [MATCH_MP_TAC IVAL_EQ_VAL THEN + MP_TAC(SPEC `x:int32` H32_CORRECT) THEN ASM_REWRITE_TAC[] THEN DISCH_TAC THEN + ASM_SIMP_TAC[RDIV_LT_EQ; ARITH_RULE `~(523776 = 0)`] THEN + CONV_TAC NUM_REDUCE_CONV THEN ASM_ARITH_TAC; ALL_TAC] THEN + MP_TAC(SPEC `x:int32` H32_CORRECT) THEN ASM_REWRITE_TAC[] THEN DISCH_THEN SUBST1_TAC THEN + REWRITE_TAC[INT_OF_NUM_GT] THEN + (* Abbreviate as h the quotient val x DIV 523776, incremented by 1 when twice the remainder exceeds 523776. *) ABBREV_TAC `h = (if val(x:int32) MOD 523776 * 2 <= 523776 + then val x DIV 523776 else val x DIV 523776 + 1)` THEN + (* Integer-valued form of the division identity for val x and 523776. *) SUBGOAL_THEN `&(val(x:int32)):int = + &(val x DIV 523776) * &523776 + &(val x MOD 523776)` ASSUME_TAC THENL + [MP_TAC(SPECL [`val(x:int32)`; `523776`] DIVISION) THEN + ANTS_TAC THENL [ARITH_TAC; ALL_TAC] THEN + DISCH_THEN(MP_TAC o AP_TERM `int_of_num` o CONJUNCT1) THEN + REWRITE_TAC[INT_OF_NUM_MUL; INT_OF_NUM_ADD]; ALL_TAC] THEN + (* Key step: val x - h * 523776 equals mldsa_cmod (val x) 523776. *) SUBGOAL_THEN `&(val(x:int32)) - &h * &523776 = mldsa_cmod (val x) 523776` + ASSUME_TAC THENL + [REWRITE_TAC[mldsa_cmod] THEN + FIRST_X_ASSUM(MP_TAC o SYM o check (fun th -> + 
fst(dest_cond(fst(dest_eq(concl th)))) = + `val (x:int32) MOD 523776 * 2 <= 523776`)) THEN + COND_CASES_TAC THENL + [DISCH_THEN SUBST1_TAC THEN ASM_REWRITE_TAC[] THEN INT_ARITH_TAC; + DISCH_THEN SUBST1_TAC THEN ASM_REWRITE_TAC[GSYM INT_OF_NUM_ADD; + GSYM INT_OF_NUM_MUL] THEN INT_ARITH_TAC]; ALL_TAC] THEN + ASM_REWRITE_TAC[] THEN + (* Case split on the remaining conditional (presumably the val x > 8118528 test; the CASES/EXPAND lemmas are outside this view): the first branch derives h = 16 from ROUND32_SPECIAL and handles the extra subtraction of word 1, the second refutes h = 16. *) COND_CASES_TAC THENL + [SUBGOAL_THEN `h = 16` SUBST1_TAC THENL + [MP_TAC(SPEC `val(x:int32)` ROUND32_SPECIAL) THEN + ANTS_TAC THENL [ASM_ARITH_TAC; ALL_TAC] THEN ASM_MESON_TAC[]; ALL_TAC] THEN + REWRITE_TAC[SND] THEN + SUBGOAL_THEN `word_sub (iword(mldsa_cmod (val(x:int32)) 523776)) (word 1) : int32 = + iword(mldsa_cmod (val x) 523776 - &1)` SUBST1_TAC THENL + [REWRITE_TAC[GSYM IWORD_INT_SUB; WORD_IWORD]; ALL_TAC] THEN + (* IVAL_IWORD leaves a side condition (presumably the 32-bit signed range), closed from CMOD_ABS_BOUND_523776. *) MATCH_MP_TAC(INST_TYPE [`:32`,`:N`] IVAL_IWORD) THEN + REWRITE_TAC[DIMINDEX_32] THEN CONV_TAC NUM_REDUCE_CONV THEN + MP_TAC(SPEC `val(x:int32)` CMOD_ABS_BOUND_523776) THEN INT_ARITH_TAC; + SUBGOAL_THEN `~(h = 16)` ASSUME_TAC THENL + [DISCH_TAC THEN + SUBGOAL_THEN `val(x:int32) <= 8118528` ASSUME_TAC THENL [ASM_ARITH_TAC; ALL_TAC] THEN + SUBGOAL_THEN `(if val(x:int32) MOD 523776 * 2 <= 523776 + then val x DIV 523776 else val x DIV 523776 + 1) = 16` MP_TAC THENL + [ASM_MESON_TAC[]; ALL_TAC] THEN + COND_CASES_TAC THENL + [DISCH_TAC THEN MP_TAC(SPECL [`val(x:int32)`; `523776`] DIVISION) THEN + ANTS_TAC THENL [ARITH_TAC; ALL_TAC] THEN + ASM_REWRITE_TAC[] THEN CONV_TAC NUM_REDUCE_CONV THEN STRIP_TAC THEN ASM_ARITH_TAC; + REWRITE_TAC[ARITH_RULE `n + 1 = 16 <=> n = 15`] THEN DISCH_TAC THEN + MP_TAC(SPECL [`val(x:int32)`; `523776`] DIVISION) THEN + ANTS_TAC THENL [ARITH_TAC; ALL_TAC] THEN + ASM_REWRITE_TAC[] THEN CONV_TAC NUM_REDUCE_CONV THEN STRIP_TAC THEN ASM_ARITH_TAC]; ALL_TAC] THEN + ASM_REWRITE_TAC[SND] THEN + MATCH_MP_TAC(INST_TYPE [`:32`,`:N`] IVAL_IWORD) THEN + REWRITE_TAC[DIMINDEX_32] THEN CONV_TAC NUM_REDUCE_CONV THEN + MP_TAC(SPEC `val(x:int32)` CMOD_ABS_BOUND_523776) THEN INT_ARITH_TAC]);; + +(* Range bounds on the 
lane outputs, phrased on the word-level lane functions + so the main proof can discharge the CBMC-contract bounds uniformly. *) +let DECOMPOSE32_A1_BOUND_LEMMA = prove( + `!x:int32. val x < 8380417 ==> val(decompose32_a1 x) <= 15`, + GEN_TAC THEN DISCH_TAC THEN + ASM_SIMP_TAC[DECOMPOSE32_A1_CORRECT; MLDSA_DECOMPOSE_32_A1_BOUND]);; +(* Lower bound on the signed a0 lane: at least -261888 whenever val x < 8380417. *) +let DECOMPOSE32_A0_BOUND_LO = prove( + `!x:int32. val x < 8380417 ==> --(&261888) <= ival(decompose32_a0 x)`, + GEN_TAC THEN DISCH_TAC THEN + ASM_SIMP_TAC[DECOMPOSE32_A0_CORRECT] THEN + MP_TAC(SPEC `val(x:int32)` MLDSA_DECOMPOSE_32_A0_BOUND) THEN ASM_REWRITE_TAC[] THEN INT_ARITH_TAC);; +(* Upper bound on the signed a0 lane: at most 261888 whenever val x < 8380417. *) +let DECOMPOSE32_A0_BOUND_HI = prove( + `!x:int32. val x < 8380417 ==> ival(decompose32_a0 x) <= &261888`, + GEN_TAC THEN DISCH_TAC THEN + ASM_SIMP_TAC[DECOMPOSE32_A0_CORRECT] THEN + MP_TAC(SPEC `val(x:int32)` MLDSA_DECOMPOSE_32_A0_BOUND) THEN ASM_REWRITE_TAC[] THEN INT_ARITH_TAC);; + +(* ========================================================================= *) +(* Core correctness theorem *) +(* ========================================================================= *) +(* For 256 int32 inputs x i read from a, each with val below 8380417: running from pc to pc + 2143 leaves FST of mldsa_decompose_32 in buffer a1 (unsigned view) and SND in buffer a (signed view), within the stated bounds, changing only ABI-permitted registers/flags and the two 1024-byte buffers. *) +let MLDSA_DECOMPOSE32_CORRECT = prove( + `!a1 a (x:num->int32) pc. + ALL (nonoverlapping (word pc, 2144)) + [(a1,1024); (a,1024)] /\ + nonoverlapping (a1,1024) (a,1024) /\ + aligned 32 a1 /\ aligned 32 a + ==> ensures x86 + (\s. bytes_loaded s (word pc) (BUTLAST mldsa_decompose32_tmc) /\ + read RIP s = word pc /\ + C_ARGUMENTS [a1; a] s /\ + (!i. i < 256 ==> + read(memory :> bytes32(word_add a (word(4 * i)))) s = x i) /\ + (!i. i < 256 ==> val(x i:int32) < 8380417)) + (\s. read RIP s = word(pc + 2143) /\ + (!i. i < 256 + ==> val(read(memory :> bytes32(word_add a1 (word(4*i)))) s) = + FST(mldsa_decompose_32(val(x i)))) /\ + (!i. i < 256 + ==> ival(read(memory :> bytes32(word_add a (word(4*i)))) s) = + SND(mldsa_decompose_32(val(x i)))) /\ + (!i. i < 256 + ==> val(read(memory :> bytes32(word_add a1 (word(4*i)))) s) <= 15) /\ + (!i. 
i < 256 + ==> --(&261888) <= + ival(read(memory :> bytes32(word_add a (word(4*i)))) s) /\ + ival(read(memory :> bytes32(word_add a (word(4*i)))) s) <= &261888)) + (MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [memory :> bytes(a1,1024)] ,, + MAYCHANGE [memory :> bytes(a,1024)])`, + MAP_EVERY X_GEN_TAC [`a1:int64`; `a:int64`; `x:num->int32`; `pc:num`] THEN + REWRITE_TAC[MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI; C_ARGUMENTS; ALL; + NONOVERLAPPING_CLAUSES; fst MLDSA_DECOMPOSE32_EXEC] THEN + STRIP_TAC THEN + (* Expand the bounded quantifiers over i < 256 into explicit cases (targeted through RATOR_CONV/LAND_CONV, presumably at the precondition). *) CONV_TAC(RATOR_CONV(LAND_CONV(ONCE_DEPTH_CONV + (EXPAND_CASES_CONV THENC ONCE_DEPTH_CONV NUM_MULT_CONV)))) THEN + ENSURES_INIT_TAC "s0" THEN + (* Merge the bytes32 facts about buffer a in state s0 into bytes256 reads at offsets 32*n for n = 0..31. *) MP_TAC(end_itlist CONJ + (map (fun n -> READ_MEMORY_MERGE_CONV 3 + (subst[mk_small_numeral(32*n),`n:num`] + `read (memory :> bytes256(word_add a (word n))) s0`)) + (0--31))) THEN + ASM_REWRITE_TAC[WORD_ADD_0] THEN + DISCARD_MATCHING_ASSUMPTIONS [`read (memory :> bytes32 a) s = x`] THEN + STRIP_TAC THEN + (* Step through instructions 1..399 one at a time, simplifying the SIMD lane expressions after every step. *) MAP_EVERY (fun n -> + X86_STEPS_TAC MLDSA_DECOMPOSE32_EXEC [n] THEN + SIMD_SIMPLIFY_TAC[decompose32_a1; decompose32_a0]) (1--399) THEN + ENSURES_FINAL_STATE_TAC THEN ASM_REWRITE_TAC[] THEN + RULE_ASSUM_TAC(REWRITE_RULE[WORD_NOT_JOIN_256; WORD_NOT_JOIN_128; WORD_NOT_JOIN_64]) THEN + (* Split every bytes256 memory fact about the final state s399 and simplify its lanes with the decompose32 lane definitions. *) REPEAT(FIRST_X_ASSUM(STRIP_ASSUME_TAC o + CONV_RULE(SIMD_SIMPLIFY_CONV[decompose32_h; decompose32_a1; decompose32_a0]) o + CONV_RULE(READ_MEMORY_SPLIT_CONV 3) o + check (can (term_match [] `read (memory :> bytes256 zzz) s399 = xxx`) o concl))) THEN + CONV_TAC(ONCE_DEPTH_CONV EXPAND_CASES_CONV THENC ONCE_DEPTH_CONV NUM_MULT_CONV) THEN + ASM_REWRITE_TAC[WORD_ADD_0] THEN + (* Close each remaining conjunct with whichever lane lemma matches, its hypothesis taken from the assumptions. *) REPEAT CONJ_TAC THEN + FIRST (map (fun th -> MATCH_MP_TAC th THEN FIRST_ASSUM ACCEPT_TAC) + [DECOMPOSE32_A1_CORRECT; DECOMPOSE32_A0_CORRECT; + DECOMPOSE32_A1_BOUND_LEMMA; DECOMPOSE32_A0_BOUND_LO; + DECOMPOSE32_A0_BOUND_HI]));; + +(* ========================================================================= *) +(* Subroutine form with return, bounds matching the CBMC 
contract. *) +(* This must be kept in sync with the CBMC specification in *) +(* mldsa/src/native/x86_64/src/arith_native_x86_64.h *) +(* ========================================================================= *) +(* Stated over mldsa_decompose32_tmc with the return included: RIP ends at returnaddress and RSP at stackpointer + 8. Obtained from MLDSA_DECOMPOSE32_CORRECT by X86_PROMOTE_RETURN_NOSTACK_TAC. *) +let MLDSA_DECOMPOSE32_NOIBT_SUBROUTINE_CORRECT = prove( + `!a1 a (x:num->int32) pc stackpointer returnaddress. + aligned 32 a1 /\ aligned 32 a /\ + ALL (nonoverlapping (word pc, LENGTH mldsa_decompose32_tmc)) + [(a1,1024); (a,1024)] /\ + nonoverlapping (a1,1024) (a,1024) /\ + nonoverlapping (stackpointer,8) (a1,1024) /\ + nonoverlapping (stackpointer,8) (a,1024) + ==> ensures x86 + (\s. bytes_loaded s (word pc) mldsa_decompose32_tmc /\ + read RIP s = word pc /\ + read RSP s = stackpointer /\ + read (memory :> bytes64 stackpointer) s = returnaddress /\ + C_ARGUMENTS [a1; a] s /\ + (!i. i < 256 + ==> read(memory :> bytes32(word_add a (word(4 * i)))) s = x i) /\ + (!i. i < 256 ==> val(x i:int32) < 8380417)) + (\s. read RIP s = returnaddress /\ + read RSP s = word_add stackpointer (word 8) /\ + (!i. i < 256 + ==> val(read(memory :> bytes32(word_add a1 (word(4*i)))) s) = + FST(mldsa_decompose_32(val(x i)))) /\ + (!i. i < 256 + ==> ival(read(memory :> bytes32(word_add a (word(4*i)))) s) = + SND(mldsa_decompose_32(val(x i)))) /\ + (!i. i < 256 + ==> val(read(memory :> bytes32(word_add a1 (word(4*i)))) s) <= 15) /\ + (!i. i < 256 + ==> --(&261888) <= + ival(read(memory :> bytes32(word_add a (word(4*i)))) s) /\ + ival(read(memory :> bytes32(word_add a (word(4*i)))) s) <= &261888)) + (MAYCHANGE [RSP] ,, MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [memory :> bytes(a1,1024)] ,, + MAYCHANGE [memory :> bytes(a,1024)])`, + X86_PROMOTE_RETURN_NOSTACK_TAC mldsa_decompose32_tmc MLDSA_DECOMPOSE32_CORRECT);; +(* Same contract stated over mldsa_decompose32_mc, obtained from the NOIBT theorem via ADD_IBT_RULE. *) +let MLDSA_DECOMPOSE32_SUBROUTINE_CORRECT = prove( + `!a1 a (x:num->int32) pc stackpointer returnaddress. 
+ aligned 32 a1 /\ aligned 32 a /\ + ALL (nonoverlapping (word pc, LENGTH mldsa_decompose32_mc)) + [(a1,1024); (a,1024)] /\ + nonoverlapping (a1,1024) (a,1024) /\ + nonoverlapping (stackpointer,8) (a1,1024) /\ + nonoverlapping (stackpointer,8) (a,1024) + ==> ensures x86 + (\s. bytes_loaded s (word pc) mldsa_decompose32_mc /\ + read RIP s = word pc /\ + read RSP s = stackpointer /\ + read (memory :> bytes64 stackpointer) s = returnaddress /\ + C_ARGUMENTS [a1; a] s /\ + (!i. i < 256 + ==> read(memory :> bytes32(word_add a (word(4 * i)))) s = x i) /\ + (!i. i < 256 ==> val(x i:int32) < 8380417)) + (\s. read RIP s = returnaddress /\ + read RSP s = word_add stackpointer (word 8) /\ + (!i. i < 256 + ==> val(read(memory :> bytes32(word_add a1 (word(4*i)))) s) = + FST(mldsa_decompose_32(val(x i)))) /\ + (!i. i < 256 + ==> ival(read(memory :> bytes32(word_add a (word(4*i)))) s) = + SND(mldsa_decompose_32(val(x i)))) /\ + (!i. i < 256 + ==> val(read(memory :> bytes32(word_add a1 (word(4*i)))) s) <= 15) /\ + (!i. i < 256 + ==> --(&261888) <= + ival(read(memory :> bytes32(word_add a (word(4*i)))) s) /\ + ival(read(memory :> bytes32(word_add a (word(4*i)))) s) <= &261888)) + (MAYCHANGE [RSP] ,, MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [memory :> bytes(a1,1024)] ,, + MAYCHANGE [memory :> bytes(a,1024)])`, + MATCH_ACCEPT_TAC(ADD_IBT_RULE MLDSA_DECOMPOSE32_NOIBT_SUBROUTINE_CORRECT));; + +(* ========================================================================= *) +(* Memory safety. *) +(* Decompose has no data-dependent control flow or memory accesses, so the *) +(* memory-safety property is proven directly from the correctness spec. 
*) +(* ========================================================================= *) + +needs "s2n_bignum/x86/proofs/consttime.ml";; +needs "mldsa_native/x86_64/proofs/subroutine_signatures.ml";; +(* Build the safety statement full_spec and its public_vars from the subroutine_signatures entry for mldsa_poly_decompose_32_x86, the core correctness theorem (SOME_FLAGS rewritten) and the EXEC theorem. *) +let full_spec,public_vars = mk_safety_spec + ~keep_maychanges:true + (assoc "mldsa_poly_decompose_32_x86" subroutine_signatures) + (REWRITE_RULE[SOME_FLAGS] MLDSA_DECOMPOSE32_CORRECT) + MLDSA_DECOMPOSE32_EXEC;; +(* Prove full_spec with PROVE_SAFETY_SPEC_TAC; the ABI may-change abbreviation and SOME_FLAGS are rewritten both in the goal and in the resulting theorem. *) +let MLDSA_DECOMPOSE32_SAFE = + REWRITE_RULE [MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI; SOME_FLAGS] + (time prove + (full_spec, + REWRITE_TAC[MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI; SOME_FLAGS] THEN + PROVE_SAFETY_SPEC_TAC ~public_vars:public_vars + MLDSA_DECOMPOSE32_EXEC));; +(* Subroutine-level safety over mldsa_decompose32_tmc: the newly appended events e2 are given by f_events of a1, a, pc, stackpointer and returnaddress alone, and memaccess_inbounds holds for the two listed region lists (presumably reads, then writes). *) +let MLDSA_DECOMPOSE32_NOIBT_SUBROUTINE_SAFE = time prove + (`exists f_events. + forall e a1 a pc stackpointer returnaddress. + aligned 32 a1 /\ aligned 32 a /\ + ALL (nonoverlapping (word pc, LENGTH mldsa_decompose32_tmc)) + [(a1,1024); (a,1024)] /\ + nonoverlapping (a1,1024) (a,1024) /\ + nonoverlapping (stackpointer,8) (a1,1024) /\ + nonoverlapping (stackpointer,8) (a,1024) + ==> ensures x86 + (\s. + bytes_loaded s (word pc) mldsa_decompose32_tmc /\ + read RIP s = word pc /\ + read RSP s = stackpointer /\ + read (memory :> bytes64 stackpointer) s = returnaddress /\ + C_ARGUMENTS [a1; a] s /\ + read events s = e) + (\s. read RIP s = returnaddress /\ + read RSP s = word_add stackpointer (word 8) /\ + (exists e2. + read events s = APPEND e2 e /\ + e2 = f_events a1 a pc stackpointer returnaddress /\ + memaccess_inbounds e2 [a,1024; a1,1024; a,1024; stackpointer,8] + [a1,1024; a,1024; stackpointer,8])) + (\s s'. true)`, + X86_PROMOTE_RETURN_NOSTACK_TAC mldsa_decompose32_tmc MLDSA_DECOMPOSE32_SAFE THEN + DISCHARGE_SAFETY_PROPERTY_TAC);; +(* Same safety statement over mldsa_decompose32_mc, obtained from the NOIBT theorem via ADD_IBT_RULE. *) +let MLDSA_DECOMPOSE32_SUBROUTINE_SAFE = time prove + (`exists f_events. + forall e a1 a pc stackpointer returnaddress. 
+ aligned 32 a1 /\ aligned 32 a /\ + ALL (nonoverlapping (word pc, LENGTH mldsa_decompose32_mc)) + [(a1,1024); (a,1024)] /\ + nonoverlapping (a1,1024) (a,1024) /\ + nonoverlapping (stackpointer,8) (a1,1024) /\ + nonoverlapping (stackpointer,8) (a,1024) + ==> ensures x86 + (\s. + bytes_loaded s (word pc) mldsa_decompose32_mc /\ + read RIP s = word pc /\ + read RSP s = stackpointer /\ + read (memory :> bytes64 stackpointer) s = returnaddress /\ + C_ARGUMENTS [a1; a] s /\ + read events s = e) + (\s. read RIP s = returnaddress /\ + read RSP s = word_add stackpointer (word 8) /\ + (exists e2. + read events s = APPEND e2 e /\ + e2 = f_events a1 a pc stackpointer returnaddress /\ + memaccess_inbounds e2 [a,1024; a1,1024; a,1024; stackpointer,8] + [a1,1024; a,1024; stackpointer,8])) + (\s s'. true)`, + MATCH_ACCEPT_TAC(ADD_IBT_RULE MLDSA_DECOMPOSE32_NOIBT_SUBROUTINE_SAFE));; diff --git a/proofs/hol_light/x86_64/proofs/poly_decompose_88_avx2_asm.ml b/proofs/hol_light/x86_64/proofs/poly_decompose_88_avx2_asm.ml new file mode 100644 index 000000000..e8d0e8505 --- /dev/null +++ b/proofs/hol_light/x86_64/proofs/poly_decompose_88_avx2_asm.ml @@ -0,0 +1,1300 @@ +(* + * Copyright (c) The mldsa-native project authors + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(* ========================================================================= *) +(* Functional correctness of poly_decompose_88 (x86_64 AVX2): *) +(* Decompose polynomial coefficients into (a1, a0) where *) +(* a mod^+ Q = a1*2*GAMMA2 + a0 for GAMMA2 = (Q-1)/88 = 95232 *) +(* (ML-DSA-44). *) +(* *) +(* The high-bits quotient is computed with the AVX2 mulhi/mulhrs Barrett *) +(* sequence (VPADDD/VPSRLD/VPMULHUW/VPMULHRSW), which differs from the *) +(* AArch64 SQDMULH+SRSHR path but computes the same round-half-down *) +(* quotient. 
*) +(* ========================================================================= *) + +needs "s2n_bignum/x86/proofs/base.ml";; +needs "mldsa_native/common/mldsa_specs.ml";; +needs "mldsa_native/x86_64/proofs/mldsa_utils.ml";; + + +(**** print_literal_from_elf "x86_64/mldsa/poly_decompose_88_avx2_asm.o";; + ****) + +let mldsa_decompose88_mc = define_assert_from_elf "mldsa_decompose88_mc" "x86_64/mldsa/poly_decompose_88_avx2_asm.o" +(*** BYTECODE START ***) +[ + 0xf3; 0x0f; 0x1e; 0xfa; (* ENDBR64 *) + 0xb8; 0x7f; 0x00; 0x00; 0x00; + (* MOV (% eax) (Imm32 (word 127)) *) + 0xc5; 0x79; 0x6e; 0xd0; (* VMOVD (%_% xmm10) (% eax) *) + 0xc4; 0x42; 0x7d; 0x58; 0xd2; + (* VPBROADCASTD (%_% ymm10) (%_% xmm10) *) + 0xb8; 0x0b; 0x2c; 0x00; 0x00; + (* MOV (% eax) (Imm32 (word 11275)) *) + 0xc5; 0x79; 0x6e; 0xd8; (* VMOVD (%_% xmm11) (% eax) *) + 0xc4; 0x42; 0x7d; 0x58; 0xdb; + (* VPBROADCASTD (%_% ymm11) (%_% xmm11) *) + 0xb8; 0x80; 0x00; 0x00; 0x00; + (* MOV (% eax) (Imm32 (word 128)) *) + 0xc5; 0x79; 0x6e; 0xe0; (* VMOVD (%_% xmm12) (% eax) *) + 0xc4; 0x42; 0x7d; 0x58; 0xe4; + (* VPBROADCASTD (%_% ymm12) (%_% xmm12) *) + 0xb8; 0x00; 0x6c; 0x7e; 0x00; + (* MOV (% eax) (Imm32 (word 8285184)) *) + 0xc5; 0x79; 0x6e; 0xe8; (* VMOVD (%_% xmm13) (% eax) *) + 0xc4; 0x42; 0x7d; 0x58; 0xed; + (* VPBROADCASTD (%_% ymm13) (%_% xmm13) *) + 0xb8; 0x00; 0xe8; 0x02; 0x00; + (* MOV (% eax) (Imm32 (word 190464)) *) + 0xc5; 0x79; 0x6e; 0xf0; (* VMOVD (%_% xmm14) (% eax) *) + 0xc4; 0x42; 0x7d; 0x58; 0xf6; + (* VPBROADCASTD (%_% ymm14) (%_% xmm14) *) + 0xc5; 0xfd; 0x6f; 0x06; (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,0))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* 
VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x0f; (* VMOVDQA (Memop Word256 (%% (rdi,0))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x16; (* VMOVDQA (Memop Word256 (%% (rsi,0))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x46; 0x20; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,32))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x4f; 0x20; + (* VMOVDQA (Memop Word256 (%% (rdi,32))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x56; 0x20; + (* VMOVDQA (Memop Word256 (%% (rsi,32))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x46; 0x40; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,64))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 
0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x4f; 0x40; + (* VMOVDQA (Memop Word256 (%% (rdi,64))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x56; 0x40; + (* VMOVDQA (Memop Word256 (%% (rsi,64))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x46; 0x60; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,96))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x4f; 0x60; + (* VMOVDQA (Memop Word256 (%% (rdi,96))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x56; 0x60; + (* VMOVDQA (Memop Word256 (%% (rsi,96))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,128))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 
0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,128))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,128))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,160))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,160))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,160))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0xc0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,192))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; 
+ (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0xc0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,192))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0xc0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,192))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0xe0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,224))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0xe0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,224))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0xe0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,224))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0x00; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,256))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* 
VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0x00; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,256))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0x00; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,256))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0x20; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,288))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0x20; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,288))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0x20; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,288))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0x40; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,320))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* 
VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0x40; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,320))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0x40; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,320))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0x60; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,352))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0x60; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,352))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0x60; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,352))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0x80; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,384))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD 
(%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0x80; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,384))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0x80; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,384))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0xa0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,416))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0xa0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,416))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0xa0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,416))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,448))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% 
ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,448))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,448))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0xe0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,480))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0xe0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,480))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0xe0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,480))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0x00; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop 
Word256 (%% (rsi,512))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0x00; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,512))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0x00; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,512))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0x20; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,544))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0x20; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,544))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0x20; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,544))) (%_% ymm2) *) + 
0xc5; 0xfd; 0x6f; 0x86; 0x40; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,576))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0x40; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,576))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0x40; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,576))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0x60; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,608))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0x60; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,608))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0x60; 
0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,608))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0x80; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,640))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0x80; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,640))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0x80; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,640))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0xa0; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,672))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0xa0; 0x02; 0x00; 0x00; + (* VMOVDQA 
(Memop Word256 (%% (rdi,672))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0xa0; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,672))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0xc0; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,704))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0xc0; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,704))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0xc0; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,704))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0xe0; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,736))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) 
(%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0xe0; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,736))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0xe0; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,736))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0x00; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,768))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0x00; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,768))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0x00; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,768))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0x20; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,800))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% 
ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0x20; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,800))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0x20; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,800))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0x40; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,832))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0x40; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,832))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0x40; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,832))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0x60; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,864))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) 
(%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0x60; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,864))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0x60; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,864))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0x80; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,896))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0x80; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,896))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0x80; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,896))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0xa0; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,928))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% 
ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0xa0; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,928))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0xa0; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,928))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0xc0; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,960))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) (%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0xc0; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,960))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0xc0; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,960))) (%_% ymm2) *) + 0xc5; 0xfd; 0x6f; 0x86; 0xe0; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm0) (Memop Word256 (%% (rsi,992))) *) + 0xc4; 0xc1; 0x7d; 0xfe; 0xca; + (* VPADDD (%_% ymm1) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0xf5; 0x72; 0xd1; 0x07; + (* VPSRLD (%_% ymm1) (%_% ymm1) (Imm8 (word 7)) *) + 0xc4; 0xc1; 0x75; 0xe4; 0xcb; + (* VPMULHUW (%_% ymm1) (%_% ymm1) (%_% ymm11) *) + 0xc4; 0xc2; 0x75; 0x0b; 0xcc; + (* VPMULHRSW (%_% ymm1) (%_% ymm1) (%_% ymm12) *) + 0xc4; 0xc1; 0x7d; 0x66; 0xdd; + (* VPCMPGTD (%_% ymm3) 
(%_% ymm0) (%_% ymm13) *) + 0xc4; 0xc2; 0x75; 0x40; 0xd6; + (* VPMULLD (%_% ymm2) (%_% ymm1) (%_% ymm14) *) + 0xc5; 0xfd; 0xfa; 0xd2; (* VPSUBD (%_% ymm2) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xe5; 0xdf; 0xc9; (* VPANDN (%_% ymm1) (%_% ymm3) (%_% ymm1) *) + 0xc5; 0xed; 0xfe; 0xd3; (* VPADDD (%_% ymm2) (%_% ymm2) (%_% ymm3) *) + 0xc5; 0xfd; 0x7f; 0x8f; 0xe0; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,992))) (%_% ymm1) *) + 0xc5; 0xfd; 0x7f; 0x96; 0xe0; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rsi,992))) (%_% ymm2) *) + 0xc3 (* RET *) +];; +(*** BYTECODE END ***) +(* Trimmed machine code and the execution rule built from it. The core + correctness theorem loads BUTLAST of the trimmed code and steps through it + with X86_STEPS_TAC using this rule. *) +let mldsa_decompose88_tmc = define_trimmed "mldsa_decompose88_tmc" mldsa_decompose88_mc;; +let MLDSA_DECOMPOSE88_EXEC = X86_MK_CORE_EXEC_RULE mldsa_decompose88_tmc;; + +(* ========================================================================= *) +(* Word-level lane functions matching the AVX2 instruction sequence. *) +(* ========================================================================= *) + +(* High-bits quotient h, x86 mulhi/mulhrs path (matches VPMULHUW+VPMULHRSW). 
*) +let decompose88_h = define + `decompose88_h (y:int32) : int32 = + word_join + (word_subword + (word_add + (word_ushr + (word_mul + (word_sx + (word_subword + (word_mul + (word_zx (word_subword (word_ushr (word_add y (word 127)) 7) (16,16):16 word):int32) + (word 0:int32)) (16,16):16 word):int32) + (word 0:int32)) 14) + (word 1:int32)) (1,16):16 word) + (word_subword + (word_add + (word_ushr + (word_mul + (word_sx + (word_subword + (word_mul + (word_zx (word_subword (word_ushr (word_add y (word 127)) 7) (0,16):16 word):int32) + (word 11275:int32)) (16,16):16 word):int32) + (word 128:int32)) 14) + (word 1:int32)) (1,16):16 word) :int32`;; +(* High-bits lane a1: decompose88_h y, forced to zero when y > 8285184 + in the signed comparison word_igt. *) +let decompose88_a1 = define + `decompose88_a1 (y:int32) : int32 = + word_and (word_not (if word_igt y (word 8285184) then word 4294967295 else word 0)) + (decompose88_h y)`;; +(* Low-bits lane a0: y minus decompose88_h y times 190464, then the same + comparison mask added, so one more is subtracted when y > 8285184. *) +let decompose88_a0 = define + `decompose88_a0 (y:int32) : int32 = + word_add (word_sub y (word_mul (decompose88_h y) (word 190464))) + (if word_igt y (word 8285184) then word 4294967295 else word 0)`;; + +(* The unsigned high-16 multiply (VPMULHUW) on a sub-2^16 lane, specialized + to the Barrett magic constant 11275 from the shared MULHI_LANE_GEN. *) +let MULHI_LANE = prove( + `!t:int32. val t < 65536 ==> + val(word_subword (word_mul (word_zx (word_subword t (0,16):16 word):int32) + (word 11275)) (16,16):16 word) = + (val t * 11275) DIV 65536`, + GEN_TAC THEN DISCH_TAC THEN MATCH_MP_TAC MULHI_LANE_GEN THEN + ASM_REWRITE_TAC[] THEN ARITH_TAC);; + +(* The high 16-bit lane of the VPMULHUW/VPMULHRSW chain is identically zero + (the high subword of (x+127)>>7 is multiplied by zero). 
*) +let HI16_ZERO = + let rhs = rand(concl decompose88_h) in + let hi16 = rand(rator rhs) in + prove(mk_eq(hi16, `word 0:16 word`), + SUBGOAL_THEN `word_mul (word_zx (word_subword (word_ushr (word_add (y:int32) (word 127)) 7) (16,16):16 word):int32) + (word 0):int32 = word 0` SUBST1_TAC THENL + [CONV_TAC WORD_RULE; ALL_TAC] THEN CONV_TAC WORD_REDUCE_CONV);; + +(* Numerical form of decompose88_h for val x < 8380417: the nested + mulhi/mulhrs floors written over natural numbers. *) +let H_NUM = prove( + `!x:int32. val x < 8380417 ==> + val(decompose88_h x) = + (((((val x + 127) DIV 128 * 11275) DIV 65536) * 128) DIV 16384 + 1) DIV 2`, + GEN_TAC THEN DISCH_TAC THEN + GEN_REWRITE_TAC (LAND_CONV o RAND_CONV) [decompose88_h] THEN + REWRITE_TAC[HI16_ZERO] THEN + REWRITE_TAC[VAL_WORD_JOIN; DIMINDEX_16; DIMINDEX_32; VAL_WORD_0; ADD_CLAUSES; MULT_CLAUSES] THEN + ABBREV_TAC `m:16 word = word_subword (word_mul (word_zx (word_subword (word_ushr (word_add (x:int32) (word 127)) 7) (0,16):16 word):int32) (word 11275)) (16,16)` THEN + SUBGOAL_THEN `val(m:16 word) = ((val(x:int32) + 127) DIV 128 * 11275) DIV 65536` ASSUME_TAC THENL + [EXPAND_TAC "m" THEN + MP_TAC(SPEC `word_ushr (word_add (x:int32) (word 127)) 7` MULHI_LANE) THEN + ASM_SIMP_TAC[H_T] THEN ANTS_TAC THENL + [ASM_SIMP_TAC[H_T] THEN MP_TAC(SPEC `x:int32` T_BOUND) THEN ASM_ARITH_TAC; + DISCH_THEN SUBST1_TAC THEN ASM_SIMP_TAC[H_T]]; ALL_TAC] THEN + SUBGOAL_THEN `val(m:16 word) < 11275` ASSUME_TAC THENL + [ASM_REWRITE_TAC[] THEN SIMP_TAC[RDIV_LT_EQ; ARITH_RULE `~(65536 = 0)`] THEN + MP_TAC(SPEC `x:int32` T_BOUND) THEN ASM_ARITH_TAC; ALL_TAC] THEN + REWRITE_TAC[VAL_WORD_SUBWORD; VAL_WORD_ADD; VAL_WORD_USHR; VAL_WORD_MUL; VAL_WORD; + DIMINDEX_16; DIMINDEX_32] THEN + CONV_TAC NUM_REDUCE_CONV THEN + SUBGOAL_THEN `val(word_sx (m:16 word):int32) = val m` SUBST1_TAC THENL + [MATCH_MP_TAC VAL_SX_16_32 THEN ASM_ARITH_TAC; ALL_TAC] THEN + ASM_REWRITE_TAC[] THEN + SUBGOAL_THEN `((val(x:int32) + 127) DIV 128 * 11275) DIV 65536 < 11275` ASSUME_TAC THENL + [ASM_MESON_TAC[]; ALL_TAC] THEN 
+ ABBREV_TAC `q = ((val(x:int32) + 127) DIV 128 * 11275) DIV 65536` THEN + SUBGOAL_THEN `(q * 128) MOD 4294967296 = q * 128` SUBST1_TAC THENL + [MATCH_MP_TAC MOD_LT THEN ASM_ARITH_TAC; ALL_TAC] THEN + SUBGOAL_THEN `(q * 128) DIV 16384 < 89` ASSUME_TAC THENL + [SIMP_TAC[RDIV_LT_EQ; ARITH_RULE `~(16384 = 0)`] THEN ASM_ARITH_TAC; ALL_TAC] THEN + SUBGOAL_THEN `((q * 128) DIV 16384 + 1) MOD 4294967296 = (q * 128) DIV 16384 + 1` SUBST1_TAC THENL + [MATCH_MP_TAC MOD_LT THEN ASM_ARITH_TAC; ALL_TAC] THEN + SUBGOAL_THEN `((q * 128) DIV 16384 + 1) DIV 2 < 65536` ASSUME_TAC THENL + [ASM_ARITH_TAC; ALL_TAC] THEN + ASM_SIMP_TAC[MOD_LT; ARITH_RULE `n < 65536 ==> n < 4294967296`]);; + +(* Collapse the two trailing right-shift floors into one: two arithmetic + helpers (ADD128_DIV, DIVMUL_HELP) followed by the collapse (E_COLLAPSE). *) +let ADD128_DIV = prove( + `!c1. (c1 + 128) DIV 128 = c1 DIV 128 + 1`, + GEN_TAC THEN SUBGOAL_THEN `c1 + 128 = c1 + 1 * 128` SUBST1_TAC THENL + [ARITH_TAC; SIMP_TAC[DIV_MULT_ADD; ARITH_RULE `~(128 = 0)`]]);; +(* Multiplying by 128 and then dividing by 16384 = 128*128 divides by 128. *) +let DIVMUL_HELP = prove(`!a. (a * 128) DIV 16384 = a DIV 128`, + GEN_TAC THEN REWRITE_TAC[ARITH_RULE `16384 = 128 * 128`] THEN + GEN_REWRITE_TAC (LAND_CONV o RAND_CONV) [MULT_SYM] THEN + ONCE_REWRITE_TAC[MULT_SYM] THEN + SIMP_TAC[DIV_MULT2; ARITH_RULE `~(128 = 0)`]);; +(* States that the left-hand nested form, with t = (c*11275) DIV 65536, + is the single floor (t+128) DIV 256. *) +let E_COLLAPSE = prove( + `!c. ((((c * 11275) DIV 65536) * 128) DIV 16384 + 1) DIV 2 = + ((c * 11275) DIV 65536 + 128) DIV 256`, + GEN_TAC THEN REWRITE_TAC[DIVMUL_HELP; GSYM ADD128_DIV] THEN + REWRITE_TAC[ARITH_RULE `256 = 128 * 2`; DIV_DIV]);; + +(* Barrett-style correctness of the mulhi/mulhrs quotient: for c < 65473 the + multiply-by-11275 form equals (c+743) DIV 1488. *) +let BARRETT_CORE = prove( + `!c. 
c < 65473 ==> ((c * 11275) DIV 65536 + 128) DIV 256 = (c + 743) DIV 1488`, + GEN_TAC THEN DISCH_TAC THEN + ABBREV_TAC `c1 = (c * 11275) DIV 65536` THEN + MP_TAC(SPECL [`c * 11275`; `65536`] DIVISION) THEN ANTS_TAC THENL [ARITH_TAC; ALL_TAC] THEN + ASM_REWRITE_TAC[] THEN STRIP_TAC THEN + MATCH_MP_TAC DIV_BOUNDS_EQ THEN CONV_TAC NUM_REDUCE_CONV THEN + MP_TAC(SPECL [`c + 743`; `1488`] DIVISION) THEN ANTS_TAC THENL [ARITH_TAC; ALL_TAC] THEN + ABBREV_TAC `q = (c + 743) DIV 1488` THEN + ABBREV_TAC `t = (c + 743) MOD 1488` THEN STRIP_TAC THEN + SUBGOAL_THEN `q <= 44` ASSUME_TAC THENL + [SUBGOAL_THEN `q * 1488 <= c + 743` MP_TAC THENL [ASM_ARITH_TAC; ASM_ARITH_TAC]; ALL_TAC] THEN + ASM_ARITH_TAC);; + +(* Round-half-down closed form: the quotient by 190464 is bumped by one only + when twice the remainder exceeds 190464, which is (r+95231) DIV 190464. *) +let ROUND_CLOSED = prove( + `!r. (if r MOD 190464 * 2 <= 190464 then r DIV 190464 else r DIV 190464 + 1) = + (r + 95231) DIV 190464`, + GEN_TAC THEN + MP_TAC(SPECL [`r:num`; `190464`] DIVISION) THEN ANTS_TAC THENL [ARITH_TAC; STRIP_TAC] THEN + ABBREV_TAC `q = r DIV 190464` THEN ABBREV_TAC `m = r MOD 190464` THEN + COND_CASES_TAC THENL + [SUBGOAL_THEN `(r + 95231) DIV 190464 = q` (fun th -> REWRITE_TAC[th]) THEN + MATCH_MP_TAC DIV_UNIQ THEN EXISTS_TAC `m + 95231` THEN ASM_ARITH_TAC; + SUBGOAL_THEN `95233 <= m` ASSUME_TAC THENL [ASM_ARITH_TAC; ALL_TAC] THEN + SUBGOAL_THEN `(r + 95231) DIV 190464 = q + 1` (fun th -> REWRITE_TAC[th]) THEN + MATCH_MP_TAC DIV_UNIQ THEN EXISTS_TAC `m - 95233` THEN ASM_ARITH_TAC]);; +(* Restate the quotient over (r+127) DIV 128 as a quotient over r, using + r+95231 = (r+127)+743*128 and 128*1488 = 190464. *) +let C_TO_R = prove( + `!r. (((r + 127) DIV 128) + 743) DIV 1488 = (r + 95231) DIV 190464`, + GEN_TAC THEN + SUBGOAL_THEN `(r + 127) DIV 128 + 743 = (r + 95231) DIV 128` SUBST1_TAC THENL + [SUBGOAL_THEN `r + 95231 = (r + 127) + 743 * 128` SUBST1_TAC THENL + [ARITH_TAC; SIMP_TAC[DIV_MULT_ADD; ARITH_RULE `~(128 = 0)`]]; ALL_TAC] THEN + REWRITE_TAC[DIV_DIV] THEN REWRITE_TAC[ARITH_RULE `128 * 1488 = 190464`]);; +(* Chains E_COLLAPSE, BARRETT_CORE, C_TO_R and ROUND_CLOSED: for r < 8380417 + the nested floors of H_NUM equal the round-half-down quotient by 190464. *) +let H_ROUND = prove( + `!r. 
r < 8380417 ==> + (((((r + 127) DIV 128 * 11275) DIV 65536) * 128) DIV 16384 + 1) DIV 2 = + (if r MOD 190464 * 2 <= 190464 then r DIV 190464 else r DIV 190464 + 1)`, + GEN_TAC THEN DISCH_TAC THEN + REWRITE_TAC[E_COLLAPSE] THEN + SUBGOAL_THEN `(r + 127) DIV 128 < 65473` ASSUME_TAC THENL + [SIMP_TAC[RDIV_LT_EQ; ARITH_RULE `~(128 = 0)`] THEN ASM_ARITH_TAC; ALL_TAC] THEN + ASM_SIMP_TAC[BARRETT_CORE] THEN + REWRITE_TAC[C_TO_R; ROUND_CLOSED]);; + +(* Correctness of the high-bits quotient: same round-half-down form proven for + AArch64 (h32), enabling reuse of the spec-connection lemmas below. *) +let H32_CORRECT = prove( + `!x:int32. val x < 8380417 ==> + val(decompose88_h x) = (if val x MOD 190464 * 2 <= 190464 + then val x DIV 190464 + else val x DIV 190464 + 1)`, + GEN_TAC THEN DISCH_TAC THEN ASM_SIMP_TAC[H_NUM] THEN ASM_SIMP_TAC[H_ROUND]);; + +(* The wrap-around test in word form, specialized to the threshold 8285184 + from the shared IGT_BOUND_GEN. *) +let IGT_BOUND = + GEN_ALL(MP (SPECL [`x:int32`; `8285184`] IGT_BOUND_GEN) + (ARITH_RULE `8285184 < 2147483648`));; + +(* a1 = 0 on wrap-around, else h. *) +let DECOMPOSE88_A1_CASES = prove( + `!x:int32. decompose88_a1 x = if ival x > &8285184 then word 0 else decompose88_h x`, + GEN_TAC THEN REWRITE_TAC[decompose88_a1; GSYM IGT_BOUND] THEN + ABBREV_TAC `h = decompose88_h x` THEN + COND_CASES_TAC THEN ASM_REWRITE_TAC[] THEN BITBLAST_TAC);; + +(* a0 subtracts one extra on wrap-around. *) +let DECOMPOSE88_A0_CASES = prove( + `!x:int32. 
decompose88_a0 x = + if ival x > &8285184 + then word_sub (word_sub x (word_mul (decompose88_h x) (word 190464))) (word 1) + else word_sub x (word_mul (decompose88_h x) (word 190464))`, + GEN_TAC THEN REWRITE_TAC[decompose88_a0; GSYM IGT_BOUND] THEN + ABBREV_TAC `h = decompose88_h x` THEN + SUBGOAL_THEN `word 4294967295:int32 = word_neg(word 1)` SUBST1_TAC THENL + [CONV_TAC WORD_REDUCE_CONV; ALL_TAC] THEN + COND_CASES_TAC THEN ASM_REWRITE_TAC[] THEN CONV_TAC WORD_RULE);; + +(* On wrap-around (val x > 87*GAMMA2) the rounding quotient is exactly 44. *) +let ROUND32_SPECIAL = prove( + `!n. 8285184 < n /\ n < 8380417 ==> + (if n MOD 190464 * 2 <= 190464 then n DIV 190464 else n DIV 190464 + 1) = 44`, + REPEAT STRIP_TAC THEN + ASM_CASES_TAC `n < 8380416` THENL + [SUBGOAL_THEN `n DIV 190464 = 43` ASSUME_TAC THENL + [MATCH_MP_TAC DIV_BOUNDS_EQ THEN CONV_TAC NUM_REDUCE_CONV THEN ASM_ARITH_TAC; ALL_TAC] THEN + ASM_REWRITE_TAC[] THEN CONV_TAC NUM_REDUCE_CONV THEN + COND_CASES_TAC THENL + [MP_TAC(SPECL [`n:num`; `190464`] DIVISION) THEN ANTS_TAC THENL [ARITH_TAC; ALL_TAC] THEN + ASM_REWRITE_TAC[] THEN CONV_TAC NUM_REDUCE_CONV THEN STRIP_TAC THEN ASM_ARITH_TAC; + REWRITE_TAC[]]; + SUBGOAL_THEN `n = 8380416` SUBST_ALL_TAC THENL [ASM_ARITH_TAC; ALL_TAC] THEN + CONV_TAC NUM_REDUCE_CONV]);; +(* mldsa_cmod n 190464 never exceeds 95232 in absolute value. *) +let CMOD_ABS_BOUND_190464 = prove( + `!n. abs(mldsa_cmod n 190464) <= &95232`, + GEN_TAC THEN REWRITE_TAC[mldsa_cmod] THEN + SUBGOAL_THEN `n MOD 190464 < 190464` MP_TAC THENL + [SIMP_TAC[MOD_LT_EQ; ARITH_RULE `~(190464 = 0)`]; ALL_TAC] THEN + SPEC_TAC(`n MOD 190464`, `m:num`) THEN GEN_TAC THEN DISCH_TAC THEN + COND_CASES_TAC THEN + REWRITE_TAC[INT_ABS; INT_POS; INT_OF_NUM_LE; INT_OF_NUM_SUB; INT_SUB_LE; INT_NEG_SUB] THEN + ASM_ARITH_TAC);; + +(* For val x < 8380417 the a1 lane, read unsigned, is the high bits + FST(mldsa_decompose_88(val x)). *) +let DECOMPOSE88_A1_CORRECT = prove( + `!x:int32. 
val x < 8380417 + ==> val(decompose88_a1 x) = FST(mldsa_decompose_88(val x))`, + GEN_TAC THEN DISCH_TAC THEN + REWRITE_TAC[DECOMPOSE88_A1_CASES; MLDSA_DECOMPOSE_88_EXPAND; LET_DEF; LET_END_DEF; FST] THEN + COND_CASES_TAC THENL + [REWRITE_TAC[VAL_WORD_0; FST] THEN + SUBGOAL_THEN `val(x:int32) < 2 EXP 31` ASSUME_TAC THENL [ASM_ARITH_TAC; ALL_TAC] THEN + MP_TAC(SPEC `x:int32` IVAL_EQ_VAL) THEN ASM_REWRITE_TAC[] THEN DISCH_TAC THEN + SUBGOAL_THEN `&(val(x:int32)):int > &8285184` MP_TAC THENL [ASM_MESON_TAC[]; ALL_TAC] THEN + REWRITE_TAC[INT_OF_NUM_GT; GT] THEN DISCH_TAC THEN + MP_TAC(SPEC `val(x:int32)` ROUND32_SPECIAL) THEN + ASM_REWRITE_TAC[] THEN DISCH_THEN SUBST1_TAC THEN REWRITE_TAC[FST]; + MP_TAC(SPEC `x:int32` H32_CORRECT) THEN ASM_REWRITE_TAC[] THEN DISCH_THEN SUBST1_TAC THEN + COND_CASES_TAC THENL + [SUBGOAL_THEN `val(x:int32) <= 8285184` ASSUME_TAC THENL + [SUBGOAL_THEN `val(x:int32) < 2 EXP 31` ASSUME_TAC THENL [ASM_ARITH_TAC; ALL_TAC] THEN + MP_TAC(SPEC `x:int32` IVAL_EQ_VAL) THEN ASM_REWRITE_TAC[] THEN DISCH_TAC THEN + SUBGOAL_THEN `~(&(val(x:int32)):int > &8285184)` MP_TAC THENL [ASM_MESON_TAC[]; ALL_TAC] THEN + REWRITE_TAC[INT_GT; INT_NOT_LT; INT_OF_NUM_LE]; ALL_TAC] THEN + SUBGOAL_THEN `~(val(x:int32) DIV 190464 = 44)` ASSUME_TAC THENL + [DISCH_TAC THEN MP_TAC(SPECL [`val(x:int32)`; `190464`] DIVISION) THEN + ANTS_TAC THENL [ARITH_TAC; ALL_TAC] THEN + ASM_REWRITE_TAC[] THEN CONV_TAC NUM_REDUCE_CONV THEN STRIP_TAC THEN ASM_ARITH_TAC; ALL_TAC] THEN + ASM_REWRITE_TAC[FST]; + SUBGOAL_THEN `val(x:int32) <= 8285184` ASSUME_TAC THENL + [SUBGOAL_THEN `val(x:int32) < 2 EXP 31` ASSUME_TAC THENL [ASM_ARITH_TAC; ALL_TAC] THEN + MP_TAC(SPEC `x:int32` IVAL_EQ_VAL) THEN ASM_REWRITE_TAC[] THEN DISCH_TAC THEN + SUBGOAL_THEN `~(&(val(x:int32)):int > &8285184)` MP_TAC THENL [ASM_MESON_TAC[]; ALL_TAC] THEN + REWRITE_TAC[INT_GT; INT_NOT_LT; INT_OF_NUM_LE]; ALL_TAC] THEN + SUBGOAL_THEN `~(val(x:int32) DIV 190464 + 1 = 44)` ASSUME_TAC THENL + [REWRITE_TAC[ARITH_RULE `n + 1 = 
44 <=> n = 43`] THEN DISCH_TAC THEN + MP_TAC(SPECL [`val(x:int32)`; `190464`] DIVISION) THEN + ANTS_TAC THENL [ARITH_TAC; ALL_TAC] THEN + ASM_REWRITE_TAC[] THEN CONV_TAC NUM_REDUCE_CONV THEN STRIP_TAC THEN ASM_ARITH_TAC; ALL_TAC] THEN + ASM_REWRITE_TAC[FST]]]);; + +(* For val x < 8380417 the a0 lane, read signed (ival), is the low bits + SND(mldsa_decompose_88(val x)). *) +let DECOMPOSE88_A0_CORRECT = prove( + `!x:int32. val x < 8380417 + ==> ival(decompose88_a0 x) = SND(mldsa_decompose_88(val x))`, + GEN_TAC THEN DISCH_TAC THEN + REWRITE_TAC[DECOMPOSE88_A0_CASES; MLDSA_DECOMPOSE_88_EXPAND; LET_DEF; LET_END_DEF; SND] THEN + SUBGOAL_THEN `word_sub x (word_mul (decompose88_h x) (word 190464)) : int32 = + iword(ival x - ival(decompose88_h x) * &190464)` SUBST1_TAC THENL + [CONV_TAC WORD_RULE; ALL_TAC] THEN + SUBGOAL_THEN `ival(x:int32) = &(val x)` SUBST1_TAC THENL + [MATCH_MP_TAC IVAL_EQ_VAL THEN ASM_ARITH_TAC; ALL_TAC] THEN + SUBGOAL_THEN `ival(decompose88_h x:int32) = &(val(decompose88_h x))` SUBST1_TAC THENL + [MATCH_MP_TAC IVAL_EQ_VAL THEN + MP_TAC(SPEC `x:int32` H32_CORRECT) THEN ASM_REWRITE_TAC[] THEN DISCH_TAC THEN + ASM_SIMP_TAC[RDIV_LT_EQ; ARITH_RULE `~(190464 = 0)`] THEN + CONV_TAC NUM_REDUCE_CONV THEN ASM_ARITH_TAC; ALL_TAC] THEN + MP_TAC(SPEC `x:int32` H32_CORRECT) THEN ASM_REWRITE_TAC[] THEN DISCH_THEN SUBST1_TAC THEN + REWRITE_TAC[INT_OF_NUM_GT] THEN + ABBREV_TAC `h = (if val(x:int32) MOD 190464 * 2 <= 190464 + then val x DIV 190464 else val x DIV 190464 + 1)` THEN + SUBGOAL_THEN `&(val(x:int32)):int = + &(val x DIV 190464) * &190464 + &(val x MOD 190464)` ASSUME_TAC THENL + [MP_TAC(SPECL [`val(x:int32)`; `190464`] DIVISION) THEN + ANTS_TAC THENL [ARITH_TAC; ALL_TAC] THEN + DISCH_THEN(MP_TAC o AP_TERM `int_of_num` o CONJUNCT1) THEN + REWRITE_TAC[INT_OF_NUM_MUL; INT_OF_NUM_ADD]; ALL_TAC] THEN + SUBGOAL_THEN `&(val(x:int32)) - &h * &190464 = mldsa_cmod (val x) 190464` + ASSUME_TAC THENL + [REWRITE_TAC[mldsa_cmod] THEN + FIRST_X_ASSUM(MP_TAC o SYM o check (fun th -> + 
fst(dest_cond(fst(dest_eq(concl th)))) = + `val (x:int32) MOD 190464 * 2 <= 190464`)) THEN + COND_CASES_TAC THENL + [DISCH_THEN SUBST1_TAC THEN ASM_REWRITE_TAC[] THEN INT_ARITH_TAC; + DISCH_THEN SUBST1_TAC THEN ASM_REWRITE_TAC[GSYM INT_OF_NUM_ADD; + GSYM INT_OF_NUM_MUL] THEN INT_ARITH_TAC]; ALL_TAC] THEN + ASM_REWRITE_TAC[] THEN + COND_CASES_TAC THENL + [SUBGOAL_THEN `h = 44` SUBST1_TAC THENL + [MP_TAC(SPEC `val(x:int32)` ROUND32_SPECIAL) THEN + ANTS_TAC THENL [ASM_ARITH_TAC; ALL_TAC] THEN ASM_MESON_TAC[]; ALL_TAC] THEN + REWRITE_TAC[SND] THEN + SUBGOAL_THEN `word_sub (iword(mldsa_cmod (val(x:int32)) 190464)) (word 1) : int32 = + iword(mldsa_cmod (val x) 190464 - &1)` SUBST1_TAC THENL + [REWRITE_TAC[GSYM IWORD_INT_SUB; WORD_IWORD]; ALL_TAC] THEN + MATCH_MP_TAC(INST_TYPE [`:32`,`:N`] IVAL_IWORD) THEN + REWRITE_TAC[DIMINDEX_32] THEN CONV_TAC NUM_REDUCE_CONV THEN + MP_TAC(SPEC `val(x:int32)` CMOD_ABS_BOUND_190464) THEN INT_ARITH_TAC; + SUBGOAL_THEN `~(h = 44)` ASSUME_TAC THENL + [DISCH_TAC THEN + SUBGOAL_THEN `val(x:int32) <= 8285184` ASSUME_TAC THENL [ASM_ARITH_TAC; ALL_TAC] THEN + SUBGOAL_THEN `(if val(x:int32) MOD 190464 * 2 <= 190464 + then val x DIV 190464 else val x DIV 190464 + 1) = 44` MP_TAC THENL + [ASM_MESON_TAC[]; ALL_TAC] THEN + COND_CASES_TAC THENL + [DISCH_TAC THEN MP_TAC(SPECL [`val(x:int32)`; `190464`] DIVISION) THEN + ANTS_TAC THENL [ARITH_TAC; ALL_TAC] THEN + ASM_REWRITE_TAC[] THEN CONV_TAC NUM_REDUCE_CONV THEN STRIP_TAC THEN ASM_ARITH_TAC; + REWRITE_TAC[ARITH_RULE `n + 1 = 44 <=> n = 43`] THEN DISCH_TAC THEN + MP_TAC(SPECL [`val(x:int32)`; `190464`] DIVISION) THEN + ANTS_TAC THENL [ARITH_TAC; ALL_TAC] THEN + ASM_REWRITE_TAC[] THEN CONV_TAC NUM_REDUCE_CONV THEN STRIP_TAC THEN ASM_ARITH_TAC]; ALL_TAC] THEN + ASM_REWRITE_TAC[SND] THEN + MATCH_MP_TAC(INST_TYPE [`:32`,`:N`] IVAL_IWORD) THEN + REWRITE_TAC[DIMINDEX_32] THEN CONV_TAC NUM_REDUCE_CONV THEN + MP_TAC(SPEC `val(x:int32)` CMOD_ABS_BOUND_190464) THEN INT_ARITH_TAC]);; + +(* Range bounds on the 
lane outputs, phrased on the word-level lane functions + so the main proof can discharge the CBMC-contract bounds uniformly. *) +let DECOMPOSE88_A1_BOUND_LEMMA = prove( (* High part is at most 43: MLDSA_DECOMPOSE_88_A1_BOUND transferred through DECOMPOSE88_A1_CORRECT. *) + `!x:int32. val x < 8380417 ==> val(decompose88_a1 x) <= 43`, + GEN_TAC THEN DISCH_TAC THEN + ASM_SIMP_TAC[DECOMPOSE88_A1_CORRECT; MLDSA_DECOMPOSE_88_A1_BOUND]);; + +let DECOMPOSE88_A0_BOUND_LO = prove( (* Lower bound on the signed low part, taken from MLDSA_DECOMPOSE_88_A0_BOUND after rewriting with DECOMPOSE88_A0_CORRECT. *) + `!x:int32. val x < 8380417 ==> --(&95232) <= ival(decompose88_a0 x)`, + GEN_TAC THEN DISCH_TAC THEN + ASM_SIMP_TAC[DECOMPOSE88_A0_CORRECT] THEN + MP_TAC(SPEC `val(x:int32)` MLDSA_DECOMPOSE_88_A0_BOUND) THEN ASM_REWRITE_TAC[] THEN INT_ARITH_TAC);; + +let DECOMPOSE88_A0_BOUND_HI = prove( (* Upper bound on the signed low part, by the same route as DECOMPOSE88_A0_BOUND_LO. *) + `!x:int32. val x < 8380417 ==> ival(decompose88_a0 x) <= &95232`, + GEN_TAC THEN DISCH_TAC THEN + ASM_SIMP_TAC[DECOMPOSE88_A0_CORRECT] THEN + MP_TAC(SPEC `val(x:int32)` MLDSA_DECOMPOSE_88_A0_BOUND) THEN ASM_REWRITE_TAC[] THEN INT_ARITH_TAC);; + +(* ========================================================================= *) +(* Core correctness theorem *) +(* ========================================================================= *) (* Contract: on entry the 256 32-bit words at a are x i, each with val below 8380417. On exit, word i at a1 has val equal to FST of mldsa_decompose_88 (val (x i)) and is at most 43, and word i at a has ival equal to SND of the same pair and lies between -95232 and 95232. Only the ABI-permitted registers and flags and the two 1024-byte buffers may change. *) + +let MLDSA_DECOMPOSE88_CORRECT = prove( + `!a1 a (x:num->int32) pc. + ALL (nonoverlapping (word pc, 2144)) + [(a1,1024); (a,1024)] /\ + nonoverlapping (a1,1024) (a,1024) /\ + aligned 32 a1 /\ aligned 32 a + ==> ensures x86 + (\s. bytes_loaded s (word pc) (BUTLAST mldsa_decompose88_tmc) /\ + read RIP s = word pc /\ + C_ARGUMENTS [a1; a] s /\ + (!i. i < 256 ==> + read(memory :> bytes32(word_add a (word(4 * i)))) s = x i) /\ + (!i. i < 256 ==> val(x i:int32) < 8380417)) + (\s. read RIP s = word(pc + 2143) /\ + (!i. i < 256 + ==> val(read(memory :> bytes32(word_add a1 (word(4*i)))) s) = + FST(mldsa_decompose_88(val(x i)))) /\ + (!i. i < 256 + ==> ival(read(memory :> bytes32(word_add a (word(4*i)))) s) = + SND(mldsa_decompose_88(val(x i)))) /\ + (!i. i < 256 + ==> val(read(memory :> bytes32(word_add a1 (word(4*i)))) s) <= 43) /\ + (!i. 
i < 256 + ==> --(&95232) <= + ival(read(memory :> bytes32(word_add a (word(4*i)))) s) /\ + ival(read(memory :> bytes32(word_add a (word(4*i)))) s) <= &95232)) + (MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [memory :> bytes(a1,1024)] ,, + MAYCHANGE [memory :> bytes(a,1024)])`, + MAP_EVERY X_GEN_TAC [`a1:int64`; `a:int64`; `x:num->int32`; `pc:num`] THEN + REWRITE_TAC[MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI; C_ARGUMENTS; ALL; + NONOVERLAPPING_CLAUSES; fst MLDSA_DECOMPOSE88_EXEC] THEN + STRIP_TAC THEN (* Expand the bounded quantifiers over i < 256 in the precondition into explicit instances. *) + CONV_TAC(RATOR_CONV(LAND_CONV(ONCE_DEPTH_CONV + (EXPAND_CASES_CONV THENC ONCE_DEPTH_CONV NUM_MULT_CONV)))) THEN + ENSURES_INIT_TAC "s0" THEN (* Restate the input as 32 bytes256 reads at byte offsets 32 * n for n from 0 to 31, then drop the individual bytes32 facts. *) + MP_TAC(end_itlist CONJ + (map (fun n -> READ_MEMORY_MERGE_CONV 3 + (subst[mk_small_numeral(32*n),`n:num`] + `read (memory :> bytes256(word_add a (word n))) s0`)) + (0--31))) THEN + ASM_REWRITE_TAC[WORD_ADD_0] THEN + DISCARD_MATCHING_ASSUMPTIONS [`read (memory :> bytes32 a) s = x`] THEN + STRIP_TAC THEN (* Step through instructions 1 to 399 one at a time, simplifying the SIMD terms after every step. *) + MAP_EVERY (fun n -> + X86_STEPS_TAC MLDSA_DECOMPOSE88_EXEC [n] THEN + SIMD_SIMPLIFY_TAC[decompose88_a1; decompose88_a0]) (1--399) THEN + ENSURES_FINAL_STATE_TAC THEN ASM_REWRITE_TAC[] THEN + RULE_ASSUM_TAC(REWRITE_RULE[WORD_NOT_JOIN_256; WORD_NOT_JOIN_128; WORD_NOT_JOIN_64]) THEN (* Split every bytes256 fact about the final state s399 back into 32-bit pieces. *) + REPEAT(FIRST_X_ASSUM(STRIP_ASSUME_TAC o + CONV_RULE(SIMD_SIMPLIFY_CONV[decompose88_h; decompose88_a1; decompose88_a0]) o + CONV_RULE(READ_MEMORY_SPLIT_CONV 3) o + check (can (term_match [] `read (memory :> bytes256 zzz) s399 = xxx`) o concl))) THEN + CONV_TAC(ONCE_DEPTH_CONV EXPAND_CASES_CONV THENC ONCE_DEPTH_CONV NUM_MULT_CONV) THEN + ASM_REWRITE_TAC[WORD_ADD_0] THEN + REPEAT CONJ_TAC THEN (* Each remaining conjunct is closed by one of the five lane lemmas applied to an input-range assumption. *) + FIRST (map (fun th -> MATCH_MP_TAC th THEN FIRST_ASSUM ACCEPT_TAC) + [DECOMPOSE88_A1_CORRECT; DECOMPOSE88_A0_CORRECT; + DECOMPOSE88_A1_BOUND_LEMMA; DECOMPOSE88_A0_BOUND_LO; + DECOMPOSE88_A0_BOUND_HI]));; + +(* ========================================================================= *) +(* Subroutine form with return, bounds matching the CBMC 
contract. *) +(* This must be kept in sync with the CBMC specification in *) +(* mldsa/src/native/x86_64/src/arith_native_x86_64.h *) +(* ========================================================================= *) + +let MLDSA_DECOMPOSE88_NOIBT_SUBROUTINE_CORRECT = prove( (* Callable form over the byte list mldsa_decompose88_tmc: the return address is read from the 8 bytes at stackpointer, and on exit RIP equals that address and RSP equals stackpointer + 8. The functional postconditions repeat those of MLDSA_DECOMPOSE88_CORRECT, from which this is derived by X86_PROMOTE_RETURN_NOSTACK_TAC. *) + `!a1 a (x:num->int32) pc stackpointer returnaddress. + aligned 32 a1 /\ aligned 32 a /\ + ALL (nonoverlapping (word pc, LENGTH mldsa_decompose88_tmc)) + [(a1,1024); (a,1024)] /\ + nonoverlapping (a1,1024) (a,1024) /\ + nonoverlapping (stackpointer,8) (a1,1024) /\ + nonoverlapping (stackpointer,8) (a,1024) + ==> ensures x86 + (\s. bytes_loaded s (word pc) mldsa_decompose88_tmc /\ + read RIP s = word pc /\ + read RSP s = stackpointer /\ + read (memory :> bytes64 stackpointer) s = returnaddress /\ + C_ARGUMENTS [a1; a] s /\ + (!i. i < 256 + ==> read(memory :> bytes32(word_add a (word(4 * i)))) s = x i) /\ + (!i. i < 256 ==> val(x i:int32) < 8380417)) + (\s. read RIP s = returnaddress /\ + read RSP s = word_add stackpointer (word 8) /\ + (!i. i < 256 + ==> val(read(memory :> bytes32(word_add a1 (word(4*i)))) s) = + FST(mldsa_decompose_88(val(x i)))) /\ + (!i. i < 256 + ==> ival(read(memory :> bytes32(word_add a (word(4*i)))) s) = + SND(mldsa_decompose_88(val(x i)))) /\ + (!i. i < 256 + ==> val(read(memory :> bytes32(word_add a1 (word(4*i)))) s) <= 43) /\ + (!i. i < 256 + ==> --(&95232) <= + ival(read(memory :> bytes32(word_add a (word(4*i)))) s) /\ + ival(read(memory :> bytes32(word_add a (word(4*i)))) s) <= &95232)) + (MAYCHANGE [RSP] ,, MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [memory :> bytes(a1,1024)] ,, + MAYCHANGE [memory :> bytes(a,1024)])`, + X86_PROMOTE_RETURN_NOSTACK_TAC mldsa_decompose88_tmc MLDSA_DECOMPOSE88_CORRECT);; + +let MLDSA_DECOMPOSE88_SUBROUTINE_CORRECT = prove( (* Same contract stated over mldsa_decompose88_mc instead of mldsa_decompose88_tmc, obtained from the NOIBT theorem through ADD_IBT_RULE. NOTE(review): mc is presumably the IBT-enabled encoding of the same routine; confirm against the definitions earlier in the file. *) + `!a1 a (x:num->int32) pc stackpointer returnaddress. 
+ aligned 32 a1 /\ aligned 32 a /\ + ALL (nonoverlapping (word pc, LENGTH mldsa_decompose88_mc)) + [(a1,1024); (a,1024)] /\ + nonoverlapping (a1,1024) (a,1024) /\ + nonoverlapping (stackpointer,8) (a1,1024) /\ + nonoverlapping (stackpointer,8) (a,1024) + ==> ensures x86 + (\s. bytes_loaded s (word pc) mldsa_decompose88_mc /\ + read RIP s = word pc /\ + read RSP s = stackpointer /\ + read (memory :> bytes64 stackpointer) s = returnaddress /\ + C_ARGUMENTS [a1; a] s /\ + (!i. i < 256 + ==> read(memory :> bytes32(word_add a (word(4 * i)))) s = x i) /\ + (!i. i < 256 ==> val(x i:int32) < 8380417)) + (\s. read RIP s = returnaddress /\ + read RSP s = word_add stackpointer (word 8) /\ + (!i. i < 256 + ==> val(read(memory :> bytes32(word_add a1 (word(4*i)))) s) = + FST(mldsa_decompose_88(val(x i)))) /\ + (!i. i < 256 + ==> ival(read(memory :> bytes32(word_add a (word(4*i)))) s) = + SND(mldsa_decompose_88(val(x i)))) /\ + (!i. i < 256 + ==> val(read(memory :> bytes32(word_add a1 (word(4*i)))) s) <= 43) /\ + (!i. i < 256 + ==> --(&95232) <= + ival(read(memory :> bytes32(word_add a (word(4*i)))) s) /\ + ival(read(memory :> bytes32(word_add a (word(4*i)))) s) <= &95232)) + (MAYCHANGE [RSP] ,, MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [memory :> bytes(a1,1024)] ,, + MAYCHANGE [memory :> bytes(a,1024)])`, + MATCH_ACCEPT_TAC(ADD_IBT_RULE MLDSA_DECOMPOSE88_NOIBT_SUBROUTINE_CORRECT));; + +(* ========================================================================= *) +(* Memory safety. *) +(* Decompose has no data-dependent control flow or memory accesses, so the *) +(* memory-safety property is proven directly from the correctness spec. 
*) +(* ========================================================================= *) + +needs "s2n_bignum/x86/proofs/consttime.ml";; +needs "mldsa_native/x86_64/proofs/subroutine_signatures.ml";; + +let full_spec,public_vars = mk_safety_spec (* Builds the safety goal and its public-variable list from the signature entry for mldsa_poly_decompose_88_x86, the core correctness theorem and the EXEC data. *) + ~keep_maychanges:true + (assoc "mldsa_poly_decompose_88_x86" subroutine_signatures) + (REWRITE_RULE[SOME_FLAGS] MLDSA_DECOMPOSE88_CORRECT) + MLDSA_DECOMPOSE88_EXEC;; + +let MLDSA_DECOMPOSE88_SAFE = (* Proves full_spec with PROVE_SAFETY_SPEC_TAC, then unfolds the ABI MAYCHANGE abbreviation and SOME_FLAGS in the resulting theorem. *) + REWRITE_RULE [MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI; SOME_FLAGS] + (time prove + (full_spec, + REWRITE_TAC[MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI; SOME_FLAGS] THEN + PROVE_SAFETY_SPEC_TAC ~public_vars:public_vars + MLDSA_DECOMPOSE88_EXEC));; + +let MLDSA_DECOMPOSE88_NOIBT_SUBROUTINE_SAFE = time prove (* Subroutine-level safety over mldsa_decompose88_tmc: the events e2 appended by the call are given by a single function f_events of a1, a, pc, stackpointer and returnaddress, and satisfy memaccess_inbounds for the two region lists. NOTE(review): the first list presumably bounds reads and the second writes; confirm against the definition of memaccess_inbounds. *) + (`exists f_events. + forall e a1 a pc stackpointer returnaddress. + aligned 32 a1 /\ aligned 32 a /\ + ALL (nonoverlapping (word pc, LENGTH mldsa_decompose88_tmc)) + [(a1,1024); (a,1024)] /\ + nonoverlapping (a1,1024) (a,1024) /\ + nonoverlapping (stackpointer,8) (a1,1024) /\ + nonoverlapping (stackpointer,8) (a,1024) + ==> ensures x86 + (\s. + bytes_loaded s (word pc) mldsa_decompose88_tmc /\ + read RIP s = word pc /\ + read RSP s = stackpointer /\ + read (memory :> bytes64 stackpointer) s = returnaddress /\ + C_ARGUMENTS [a1; a] s /\ + read events s = e) + (\s. read RIP s = returnaddress /\ + read RSP s = word_add stackpointer (word 8) /\ + (exists e2. + read events s = APPEND e2 e /\ + e2 = f_events a1 a pc stackpointer returnaddress /\ + memaccess_inbounds e2 [a,1024; a1,1024; a,1024; stackpointer,8] + [a1,1024; a,1024; stackpointer,8])) + (\s s'. true)`, + X86_PROMOTE_RETURN_NOSTACK_TAC mldsa_decompose88_tmc MLDSA_DECOMPOSE88_SAFE THEN + DISCHARGE_SAFETY_PROPERTY_TAC);; + +let MLDSA_DECOMPOSE88_SUBROUTINE_SAFE = time prove (* Same safety statement over mldsa_decompose88_mc, obtained from the NOIBT theorem through ADD_IBT_RULE. *) + (`exists f_events. + forall e a1 a pc stackpointer returnaddress. 
+ aligned 32 a1 /\ aligned 32 a /\ + ALL (nonoverlapping (word pc, LENGTH mldsa_decompose88_mc)) + [(a1,1024); (a,1024)] /\ + nonoverlapping (a1,1024) (a,1024) /\ + nonoverlapping (stackpointer,8) (a1,1024) /\ + nonoverlapping (stackpointer,8) (a,1024) + ==> ensures x86 + (\s. + bytes_loaded s (word pc) mldsa_decompose88_mc /\ + read RIP s = word pc /\ + read RSP s = stackpointer /\ + read (memory :> bytes64 stackpointer) s = returnaddress /\ + C_ARGUMENTS [a1; a] s /\ + read events s = e) + (\s. read RIP s = returnaddress /\ + read RSP s = word_add stackpointer (word 8) /\ + (exists e2. + read events s = APPEND e2 e /\ + e2 = f_events a1 a pc stackpointer returnaddress /\ + memaccess_inbounds e2 [a,1024; a1,1024; a,1024; stackpointer,8] + [a1,1024; a,1024; stackpointer,8])) + (\s s'. true)`, + MATCH_ACCEPT_TAC(ADD_IBT_RULE MLDSA_DECOMPOSE88_NOIBT_SUBROUTINE_SAFE));; diff --git a/proofs/hol_light/x86_64/proofs/subroutine_signatures.ml b/proofs/hol_light/x86_64/proofs/subroutine_signatures.ml index 2695535a3..07a35b90c 100644 --- a/proofs/hol_light/x86_64/proofs/subroutine_signatures.ml +++ b/proofs/hol_light/x86_64/proofs/subroutine_signatures.ml @@ -70,6 +70,40 @@ let subroutine_signatures = [ ]) ); +("mldsa_poly_decompose_32_x86", + ([(*args*) + ("a1", "int32_t[static 256]", (*is const?*)"false"); + ("a0", "int32_t[static 256]", (*is const?*)"false"); + ], + "void", + [(* input buffers *) + ("a0", "256"(* num elems *), 4(* elem bytesize *)); + ], + [(* output buffers *) + ("a1", "256"(* num elems *), 4(* elem bytesize *)); + ("a0", "256"(* num elems *), 4(* elem bytesize *)); + ], + [(* temporary buffers *) + ]) +); + +("mldsa_poly_decompose_88_x86", + ([(*args*) + ("a1", "int32_t[static 256]", (*is const?*)"false"); + ("a0", "int32_t[static 256]", (*is const?*)"false"); + ], + "void", + [(* input buffers *) + ("a0", "256"(* num elems *), 4(* elem bytesize *)); + ], + [(* output buffers *) + ("a1", "256"(* num elems *), 4(* elem bytesize *)); + ("a0", "256"(* num 
elems *), 4(* elem bytesize *)); + ], + [(* temporary buffers *) + ]) +); + ("mldsa_pointwise_x86", ([(*args*) ("a", "int32_t[static 256]", (*is const?*)"false"); diff --git a/proofs/isabelle/compress/ML-DSA_Compress.thy b/proofs/isabelle/compress/ML-DSA_Compress.thy index 89a1c595b..ef5a6e20a 100644 --- a/proofs/isabelle/compress/ML-DSA_Compress.thy +++ b/proofs/isabelle/compress/ML-DSA_Compress.thy @@ -26,8 +26,8 @@ subsection \C and AVX2 implementations\ text \ The C reference \<^file>\../../../mldsa/src/rounding.h\ and AVX2 implementations - \<^file>\../../../dev/x86_64/src/poly_decompose_32_avx2.c\ and - \<^file>\../../../dev/x86_64/src/poly_decompose_88_avx2.c\ + \<^file>\../../../dev/x86_64/src/poly_decompose_32_avx2_asm.S\ and + \<^file>\../../../dev/x86_64/src/poly_decompose_88_avx2_asm.S\ first compute \<^verbatim>\ceil(f / 128)\, then Barrett divide by \<^verbatim>\B = 2*GAMMA2 / 128\. \ diff --git a/scripts/autogen b/scripts/autogen index 3ce73146f..86344836f 100755 --- a/scripts/autogen +++ b/scripts/autogen @@ -2844,6 +2844,18 @@ def hol_light_asm_joblist(): f"-Imldsa/src/native/x86_64/src -Imldsa/src/common.h {x86_64_flags}", "x86_64", ), + ( + "poly_decompose_32_avx2_asm.S", + "dev/x86_64/src", + f"-Imldsa/src/native/x86_64/src -Imldsa/src/common.h {x86_64_flags}", + "x86_64", + ), + ( + "poly_decompose_88_avx2_asm.S", + "dev/x86_64/src", + f"-Imldsa/src/native/x86_64/src -Imldsa/src/common.h {x86_64_flags}", + "x86_64", + ), ( "polyz_unpack_17_avx2_asm.S", "dev/x86_64/src", diff --git a/test/bench/bench_components_mldsa.c b/test/bench/bench_components_mldsa.c index 948915dae..184ceb167 100644 --- a/test/bench/bench_components_mldsa.c +++ b/test/bench/bench_components_mldsa.c @@ -102,6 +102,8 @@ static int bench(void) chknorm_acc ^= mld_poly_chknorm((const mld_poly *)data0, MLDSA_GAMMA1 - MLDSA_BETA);) + BENCH("poly_decompose", mld_poly_decompose((mld_poly *)data0, &poly_out)) + return (int)chknorm_acc; }