Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .github/workflows/hol_light.yml
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,10 @@ jobs:
needs: ["mldsa_specs.ml", "mldsa_utils.ml", "subroutine_signatures.ml"]
- name: polyz_unpack_19_avx2_asm
needs: ["mldsa_specs.ml", "mldsa_utils.ml", "subroutine_signatures.ml"]
- name: poly_use_hint_32_avx2_asm
needs: ["mldsa_specs.ml", "mldsa_utils.ml", "subroutine_signatures.ml"]
- name: poly_use_hint_88_avx2_asm
needs: ["mldsa_specs.ml", "mldsa_utils.ml", "subroutine_signatures.ml"]
- name: ntt_avx2_asm
needs: ["mldsa_specs.ml", "mldsa_utils.ml", "mldsa_zetas.ml", "subroutine_signatures.ml"]
- name: intt_avx2_asm
Expand Down
10 changes: 6 additions & 4 deletions BIBLIOGRAPHY.md
Original file line number Diff line number Diff line change
Expand Up @@ -274,8 +274,8 @@ source code and documentation.
- [dev/x86_64/src/poly_chknorm_avx2_asm.S](dev/x86_64/src/poly_chknorm_avx2_asm.S)
- [dev/x86_64/src/poly_decompose_32_avx2.c](dev/x86_64/src/poly_decompose_32_avx2.c)
- [dev/x86_64/src/poly_decompose_88_avx2.c](dev/x86_64/src/poly_decompose_88_avx2.c)
- [dev/x86_64/src/poly_use_hint_32_avx2.c](dev/x86_64/src/poly_use_hint_32_avx2.c)
- [dev/x86_64/src/poly_use_hint_88_avx2.c](dev/x86_64/src/poly_use_hint_88_avx2.c)
- [dev/x86_64/src/poly_use_hint_32_avx2_asm.S](dev/x86_64/src/poly_use_hint_32_avx2_asm.S)
- [dev/x86_64/src/poly_use_hint_88_avx2_asm.S](dev/x86_64/src/poly_use_hint_88_avx2_asm.S)
- [dev/x86_64/src/polyz_unpack_17_avx2_asm.S](dev/x86_64/src/polyz_unpack_17_avx2_asm.S)
- [dev/x86_64/src/polyz_unpack_19_avx2_asm.S](dev/x86_64/src/polyz_unpack_19_avx2_asm.S)
- [dev/x86_64/src/rej_uniform_avx2.c](dev/x86_64/src/rej_uniform_avx2.c)
Expand All @@ -292,8 +292,8 @@ source code and documentation.
- [mldsa/src/native/x86_64/src/poly_chknorm_avx2_asm.S](mldsa/src/native/x86_64/src/poly_chknorm_avx2_asm.S)
- [mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c](mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c)
- [mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c](mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c)
- [mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c](mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c)
- [mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c](mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c)
- [mldsa/src/native/x86_64/src/poly_use_hint_32_avx2_asm.S](mldsa/src/native/x86_64/src/poly_use_hint_32_avx2_asm.S)
- [mldsa/src/native/x86_64/src/poly_use_hint_88_avx2_asm.S](mldsa/src/native/x86_64/src/poly_use_hint_88_avx2_asm.S)
- [mldsa/src/native/x86_64/src/polyz_unpack_17_avx2_asm.S](mldsa/src/native/x86_64/src/polyz_unpack_17_avx2_asm.S)
- [mldsa/src/native/x86_64/src/polyz_unpack_19_avx2_asm.S](mldsa/src/native/x86_64/src/polyz_unpack_19_avx2_asm.S)
- [mldsa/src/native/x86_64/src/rej_uniform_avx2.c](mldsa/src/native/x86_64/src/rej_uniform_avx2.c)
Expand All @@ -308,6 +308,8 @@ source code and documentation.
- [proofs/hol_light/x86_64/mldsa/pointwise_avx2_asm.S](proofs/hol_light/x86_64/mldsa/pointwise_avx2_asm.S)
- [proofs/hol_light/x86_64/mldsa/poly_caddq_avx2_asm.S](proofs/hol_light/x86_64/mldsa/poly_caddq_avx2_asm.S)
- [proofs/hol_light/x86_64/mldsa/poly_chknorm_avx2_asm.S](proofs/hol_light/x86_64/mldsa/poly_chknorm_avx2_asm.S)
- [proofs/hol_light/x86_64/mldsa/poly_use_hint_32_avx2_asm.S](proofs/hol_light/x86_64/mldsa/poly_use_hint_32_avx2_asm.S)
- [proofs/hol_light/x86_64/mldsa/poly_use_hint_88_avx2_asm.S](proofs/hol_light/x86_64/mldsa/poly_use_hint_88_avx2_asm.S)
- [proofs/hol_light/x86_64/mldsa/polyz_unpack_17_avx2_asm.S](proofs/hol_light/x86_64/mldsa/polyz_unpack_17_avx2_asm.S)
- [proofs/hol_light/x86_64/mldsa/polyz_unpack_19_avx2_asm.S](proofs/hol_light/x86_64/mldsa/polyz_unpack_19_avx2_asm.S)

Expand Down
4 changes: 2 additions & 2 deletions dev/x86_64/meta.h
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ static MLD_INLINE int mld_poly_use_hint_32_native(int32_t *a, const int32_t *h)
{
return MLD_NATIVE_FUNC_FALLBACK;
}
mld_poly_use_hint_32_avx2(a, h);
mld_poly_use_hint_32_avx2_asm(a, h);
return MLD_NATIVE_FUNC_SUCCESS;
}
#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
Expand All @@ -209,7 +209,7 @@ static MLD_INLINE int mld_poly_use_hint_88_native(int32_t *a, const int32_t *h)
{
return MLD_NATIVE_FUNC_FALLBACK;
}
mld_poly_use_hint_88_avx2(a, h);
mld_poly_use_hint_88_avx2_asm(a, h);
return MLD_NATIVE_FUNC_SUCCESS;
}
#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44 \
Expand Down
30 changes: 26 additions & 4 deletions dev/x86_64/src/arith_native_x86_64.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,11 +115,33 @@ __contract__(
);

#if !defined(MLD_CONFIG_NO_VERIFY_API)
#define mld_poly_use_hint_32_avx2 MLD_NAMESPACE(mld_poly_use_hint_32_avx2)
void mld_poly_use_hint_32_avx2(int32_t *a, const int32_t *h);
#define mld_poly_use_hint_32_avx2_asm MLD_NAMESPACE(poly_use_hint_32_avx2_asm)
MLD_SYSV_ABI
void mld_poly_use_hint_32_avx2_asm(int32_t *a, const int32_t *h)
/* This must be kept in sync with the HOL-Light specification
* in proofs/hol_light/x86_64/proofs/poly_use_hint_32_avx2_asm.ml */
__contract__(
requires(memory_no_alias(a, sizeof(int32_t) * MLDSA_N))
requires(memory_no_alias(h, sizeof(int32_t) * MLDSA_N))
requires(array_bound(a, 0, MLDSA_N, 0, MLDSA_Q))
requires(array_bound(h, 0, MLDSA_N, 0, 2))
assigns(memory_slice(a, sizeof(int32_t) * MLDSA_N))
ensures(array_bound(a, 0, MLDSA_N, 0, (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2)))
);

#define mld_poly_use_hint_88_avx2 MLD_NAMESPACE(mld_poly_use_hint_88_avx2)
void mld_poly_use_hint_88_avx2(int32_t *a, const int32_t *h);
#define mld_poly_use_hint_88_avx2_asm MLD_NAMESPACE(poly_use_hint_88_avx2_asm)
MLD_SYSV_ABI
void mld_poly_use_hint_88_avx2_asm(int32_t *a, const int32_t *h)
/* This must be kept in sync with the HOL-Light specification
* in proofs/hol_light/x86_64/proofs/poly_use_hint_88_avx2_asm.ml */
__contract__(
requires(memory_no_alias(a, sizeof(int32_t) * MLDSA_N))
requires(memory_no_alias(h, sizeof(int32_t) * MLDSA_N))
requires(array_bound(a, 0, MLDSA_N, 0, MLDSA_Q))
requires(array_bound(h, 0, MLDSA_N, 0, 2))
assigns(memory_slice(a, sizeof(int32_t) * MLDSA_N))
ensures(array_bound(a, 0, MLDSA_N, 0, (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2)))
);
#endif /* !MLD_CONFIG_NO_VERIFY_API */

#define mld_poly_chknorm_avx2_asm MLD_NAMESPACE(poly_chknorm_avx2_asm)
Expand Down
103 changes: 0 additions & 103 deletions dev/x86_64/src/poly_use_hint_32_avx2.c

This file was deleted.

148 changes: 148 additions & 0 deletions dev/x86_64/src/poly_use_hint_32_avx2_asm.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
/*
* Copyright (c) The mldsa-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/

/* References
* ==========
*
* - [REF_AVX2]
* CRYSTALS-Dilithium optimized AVX2 implementation
* Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
* https://github.com/pq-crystals/dilithium/tree/master/avx2
*/

/*
* This file is derived from the public domain
* AVX2 Dilithium implementation @[REF_AVX2].
*/


/*************************************************
* Name: mld_poly_use_hint_32_avx2_asm
*
* Description: Use hint polynomial to correct the high bits of a polynomial.
* Variant for parameter sets ML-DSA-65 and ML-DSA-87
* (GAMMA2 = (Q-1)/32). Operates in place.
*
* Arguments: - int32_t *a: pointer to input/output polynomial; the
* corrected high bits are written back here
* - const int32_t *h: pointer to input hint polynomial
**************************************************/

#include "../../../common.h"

#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
!defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
(defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
(MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87))

/* simpasm: header-end */

/* Reference:
* - @[REF_AVX2] calls poly_decompose to compute all a1, a0 before the loop.
* - Our implementation of decompose() is slightly different from that in
* @[REF_AVX2]. See poly_decompose_32_avx2.c for more information.
*/

// a aliased with a0
.macro decompose32_avx2 a1, a, temp1, temp2, temp3
// Compute a1 = round-(a / 523776) ≈ round(a * 1074791425 /
// 2^49), where round-() denotes "round half down". This is
// exact for 0 <= a < Q. Note that half is rounded down since
// 1074791425 / 2^49 ≲ 1 / 523776.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These comments seem to describe the AArch64 code, not the x86_64 code. Can you please adjust them to describe what this code is doing?

vpaddd \a, %ymm5, \temp1
vpsrld $7, \temp1, \temp1
vpmulhuw %ymm8, \temp1, \temp1
vpmulhrsw %ymm7, \temp1, \temp1

// If a1 = 16, i.e. a > 31*GAMMA2, proceed as if a' = a - Q was
// given instead. (For a = 31*GAMMA2 + 1 thus a' = -GAMMA2, we
// still round it to 0 like other "wrapped around" cases.)

// Check for wrap-around
vpcmpgtd %ymm4, \a, \temp2
vpandn \temp1, \temp2, \a1

// Compute remainder a0 = a - a1 * 2 * GAMMA2 = a - a1 * 523776
vpslld $10, \temp1, \temp3
vpsubd \temp1, \temp3, \temp1
vpslld $9, \temp1, \temp1
vpsubd \temp1, \a, \a

// If wrap-around is required, adjust a0 by -1
vpaddd \temp2, \a, \a
.endm

/* Reference: The reference avx2 implementation checks a0 >= 0, which is
* different from the specification and the reference C implementation. We
* follow the specification and check a0 > 0.
*/
Comment on lines +77 to +80

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is no longer true since 3 weeks ago: pq-crystals/dilithium@bba9534. We should reference that commit here


// a aliased with delta
.macro use_hint32_avx2 b, a, h, a1, temp1, temp2, temp3
decompose32_avx2 \a1, \a, \temp1, \temp2, \temp3

// delta = (a0 <= 0) ? -1 : 1
vpcmpgtd %ymm6, \a, \a
vpandn \h, \a, \a
vpslld $1, \a, \a
vpsubd \a, \h, \h

// b = (b + delta * h) % 16
vpaddd \a1, \h, \b
vpand %ymm3, \b, \b
.endm

.text
.balign 16
.global MLD_ASM_NAMESPACE(poly_use_hint_32_avx2_asm)
MLD_ASM_FN_SYMBOL(poly_use_hint_32_avx2_asm)

// Initialize constants
movl $127, %ecx

/* check-magic: 1025 == floor(2^22 / 4092) */
movl $1025, %r8d
vmovd %r8d, %xmm8
vpbroadcastd %xmm8, %ymm8

xorl %eax, %eax
vpxor %xmm6, %xmm6, %xmm6
vmovd %ecx, %xmm5

/* 31 * ((Q-1) / 32) == 31 * GAMMA2, wrap-around threshold */
movl $8118528, %ecx

/* round(x * 2^9 / 2^15) => round(x / 2^6), for f1 = round(f1'' / 2^6) */
movl $512, %r9d
vmovd %r9d, %xmm7
vpbroadcastd %xmm7, %ymm7

vmovd %ecx, %xmm4
movl $15, %ecx
vpbroadcastd %xmm5, %ymm5
vmovd %ecx, %xmm3
vpbroadcastd %xmm4, %ymm4
vpbroadcastd %xmm3, %ymm3


poly_use_hint_32_avx2_asm_loop:
vmovdqa (%rdi), %ymm0
vmovdqa (%rsi), %ymm2

use_hint32_avx2 %ymm2, %ymm0, %ymm2, %ymm9, %ymm1, %ymm11, %ymm10

vmovdqa %ymm2, (%rdi)
addq $32, %rdi
addq $32, %rsi
addq $32, %rax
cmpq $1024, %rax
jne poly_use_hint_32_avx2_asm_loop
ret

/* simpasm: footer-start */

#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
&& (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
65 || MLD_CONFIG_PARAMETER_SET == 87) */
Loading
Loading