Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions dev/aarch64_opt/src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -95,10 +95,10 @@ poly_chknorm_aarch64_asm.S: ../../aarch64_clean/src/poly_chknorm_aarch64_asm.S
cp $< $@

poly_decompose_32_aarch64_asm.S: ../../aarch64_clean/src/poly_decompose_32_aarch64_asm.S
cp $< $@
slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $@ -l poly_decompose_32_loop $(SLOTHY_FLAGS) $(RESERVE_ALL_FLAG)

poly_decompose_88_aarch64_asm.S: ../../aarch64_clean/src/poly_decompose_88_aarch64_asm.S
cp $< $@
slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $@ -l poly_decompose_88_loop $(SLOTHY_FLAGS) $(RESERVE_ALL_FLAG)

poly_use_hint_32_aarch64_asm.S: ../../aarch64_clean/src/poly_use_hint_32_aarch64_asm.S
cp $< $@
Expand Down
326 changes: 303 additions & 23 deletions dev/aarch64_opt/src/poly_decompose_32_aarch64_asm.S

Large diffs are not rendered by default.

326 changes: 303 additions & 23 deletions dev/aarch64_opt/src/poly_decompose_88_aarch64_asm.S

Large diffs are not rendered by default.

148 changes: 110 additions & 38 deletions mldsa/src/native/aarch64/src/poly_decompose_32_aarch64_asm.S
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@
*/
#include "../../../common.h"

#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_NO_SIGN_API) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
(defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87))
#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_NO_SIGN_API) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87))

/*
* WARNING: This file is auto-derived from the mldsa-native source file
Expand All @@ -31,46 +30,119 @@ MLD_ASM_FN_SYMBOL(poly_decompose_32_aarch64_asm)
movk w11, #0x4010, lsl #16
dup v23.4s, w11
mov x3, #0x10 // =16
ldr q0, [x1, #0x10]
ldr q17, [x1, #0x20]
ldr q2, [x1, #0x30]
ldr q26, [x1, #0x40]
ldr q16, [x1]
ldr q18, [x1, #0x60]
cmgt v31.4s, v17.4s, v21.4s
sqdmulh v3.4s, v17.4s, v23.4s
cmgt v4.4s, v2.4s, v21.4s
cmgt v5.4s, v16.4s, v21.4s
sqdmulh v19.4s, v16.4s, v23.4s
sqdmulh v7.4s, v2.4s, v23.4s
srshr v3.4s, v3.4s, #0x12
sqdmulh v24.4s, v26.4s, v23.4s
srshr v27.4s, v19.4s, #0x12
mls v17.4s, v3.4s, v22.4s
srshr v29.4s, v7.4s, #0x12
bic v6.16b, v3.16b, v31.16b
bic v1.16b, v27.16b, v5.16b
mls v16.4s, v27.4s, v22.4s
str q6, [x0, #0x20]
str q1, [x0], #0x40
mls v2.4s, v29.4s, v22.4s
add v6.4s, v17.4s, v31.4s
sqdmulh v3.4s, v18.4s, v23.4s
str q6, [x1, #0x20]
add v20.4s, v16.4s, v5.4s
str q20, [x1], #0x40
add v17.4s, v2.4s, v4.4s
sub x3, x3, #0x2

Lpoly_decompose_32_loop:
ldr q0, [x1]
ldr q1, [x1, #0x10]
ldr q2, [x1, #0x20]
ldr q3, [x1, #0x30]
sqdmulh v5.4s, v1.4s, v23.4s
srshr v5.4s, v5.4s, #0x12
cmgt v24.4s, v1.4s, v21.4s
mls v1.4s, v5.4s, v22.4s
bic v5.16b, v5.16b, v24.16b
add v1.4s, v1.4s, v24.4s
sqdmulh v6.4s, v2.4s, v23.4s
srshr v6.4s, v6.4s, #0x12
cmgt v24.4s, v2.4s, v21.4s
mls v2.4s, v6.4s, v22.4s
bic v6.16b, v6.16b, v24.16b
add v2.4s, v2.4s, v24.4s
sqdmulh v7.4s, v3.4s, v23.4s
srshr v7.4s, v7.4s, #0x12
cmgt v24.4s, v3.4s, v21.4s
mls v3.4s, v7.4s, v22.4s
bic v7.16b, v7.16b, v24.16b
add v3.4s, v3.4s, v24.4s
sqdmulh v4.4s, v0.4s, v23.4s
srshr v4.4s, v4.4s, #0x12
cmgt v24.4s, v0.4s, v21.4s
mls v0.4s, v4.4s, v22.4s
bic v4.16b, v4.16b, v24.16b
add v0.4s, v0.4s, v24.4s
str q5, [x0, #0x10]
str q6, [x0, #0x20]
str q7, [x0, #0x30]
str q4, [x0], #0x40
str q1, [x1, #0x10]
str q2, [x1, #0x20]
str q3, [x1, #0x30]
str q0, [x1], #0x40
cmgt v1.4s, v0.4s, v21.4s
ldr q5, [x1, #0x30]
cmgt v6.4s, v18.4s, v21.4s
sqdmulh v20.4s, v0.4s, v23.4s
stur q17, [x1, #-0x10]
srshr v3.4s, v3.4s, #0x12
bic v7.16b, v29.16b, v4.16b
cmgt v2.4s, v26.4s, v21.4s
sqdmulh v16.4s, v5.4s, v23.4s
srshr v28.4s, v24.4s, #0x12
stur q7, [x0, #-0x10]
cmgt v4.4s, v5.4s, v21.4s
srshr v31.4s, v20.4s, #0x12
mls v18.4s, v3.4s, v22.4s
bic v3.16b, v3.16b, v6.16b
mls v26.4s, v28.4s, v22.4s
bic v19.16b, v28.16b, v2.16b
str q3, [x0, #0x20]
srshr v29.4s, v16.4s, #0x12
str q19, [x0], #0x40
mls v0.4s, v31.4s, v22.4s
bic v30.16b, v31.16b, v1.16b
add v3.4s, v18.4s, v6.4s
ldr q18, [x1, #0x60]
mls v5.4s, v29.4s, v22.4s
stur q30, [x0, #-0x70]
add v27.4s, v26.4s, v2.4s
ldr q26, [x1, #0x40]
add v25.4s, v0.4s, v1.4s
ldr q0, [x1, #0x10]
str q3, [x1, #0x20]
sqdmulh v3.4s, v18.4s, v23.4s
stur q25, [x1, #-0x30]
str q27, [x1], #0x40
add v17.4s, v5.4s, v4.4s
sqdmulh v24.4s, v26.4s, v23.4s
subs x3, x3, #0x1
b.ne Lpoly_decompose_32_loop
ldr q16, [x1, #0x30]
ldr q5, [x1, #0x10]
sqdmulh v28.4s, v0.4s, v23.4s
cmgt v19.4s, v26.4s, v21.4s
cmgt v2.4s, v0.4s, v21.4s
stur q17, [x1, #-0x10]
srshr v31.4s, v3.4s, #0x12
sqdmulh v1.4s, v5.4s, v23.4s
cmgt v6.4s, v16.4s, v21.4s
srshr v24.4s, v24.4s, #0x12
srshr v25.4s, v28.4s, #0x12
sqdmulh v20.4s, v16.4s, v23.4s
cmgt v17.4s, v18.4s, v21.4s
mls v18.4s, v31.4s, v22.4s
cmgt v27.4s, v5.4s, v21.4s
srshr v30.4s, v1.4s, #0x12
mls v0.4s, v25.4s, v22.4s
bic v28.16b, v25.16b, v2.16b
srshr v7.4s, v20.4s, #0x12
bic v25.16b, v24.16b, v19.16b
mls v26.4s, v24.4s, v22.4s
bic v20.16b, v30.16b, v27.16b
stur q28, [x0, #-0x30]
add v3.4s, v18.4s, v17.4s
mls v5.4s, v30.4s, v22.4s
str q20, [x0, #0x10]
bic v28.16b, v7.16b, v6.16b
mls v16.4s, v7.4s, v22.4s
str q3, [x1, #0x20]
str q28, [x0, #0x30]
add v3.4s, v0.4s, v2.4s
bic v7.16b, v29.16b, v4.16b
str q25, [x0], #0x40
stur q3, [x1, #-0x30]
add v24.4s, v5.4s, v27.4s
stur q7, [x0, #-0x50]
add v19.4s, v26.4s, v19.4s
add v25.4s, v16.4s, v6.4s
bic v6.16b, v31.16b, v17.16b
str q24, [x1, #0x10]
str q19, [x1], #0x40
stur q6, [x0, #-0x20]
stur q25, [x1, #-0x10]
ret
.cfi_endproc

Expand Down
148 changes: 110 additions & 38 deletions mldsa/src/native/aarch64/src/poly_decompose_88_aarch64_asm.S
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@
*/
#include "../../../common.h"

#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_NO_SIGN_API) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
(defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44)
#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_NO_SIGN_API) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44)

/*
* WARNING: This file is auto-derived from the mldsa-native source file
Expand All @@ -31,46 +30,119 @@ MLD_ASM_FN_SYMBOL(poly_decompose_88_aarch64_asm)
movk w11, #0x5816, lsl #16
dup v23.4s, w11
mov x3, #0x10 // =16
ldr q0, [x1, #0x10]
ldr q17, [x1, #0x20]
ldr q2, [x1, #0x30]
ldr q26, [x1, #0x40]
ldr q16, [x1]
ldr q18, [x1, #0x60]
cmgt v31.4s, v17.4s, v21.4s
sqdmulh v3.4s, v17.4s, v23.4s
cmgt v4.4s, v2.4s, v21.4s
cmgt v5.4s, v16.4s, v21.4s
sqdmulh v19.4s, v16.4s, v23.4s
sqdmulh v7.4s, v2.4s, v23.4s
srshr v3.4s, v3.4s, #0x11
sqdmulh v24.4s, v26.4s, v23.4s
srshr v27.4s, v19.4s, #0x11
mls v17.4s, v3.4s, v22.4s
srshr v29.4s, v7.4s, #0x11
bic v6.16b, v3.16b, v31.16b
bic v1.16b, v27.16b, v5.16b
mls v16.4s, v27.4s, v22.4s
str q6, [x0, #0x20]
str q1, [x0], #0x40
mls v2.4s, v29.4s, v22.4s
add v6.4s, v17.4s, v31.4s
sqdmulh v3.4s, v18.4s, v23.4s
str q6, [x1, #0x20]
add v20.4s, v16.4s, v5.4s
str q20, [x1], #0x40
add v17.4s, v2.4s, v4.4s
sub x3, x3, #0x2

Lpoly_decompose_88_loop:
ldr q0, [x1]
ldr q1, [x1, #0x10]
ldr q2, [x1, #0x20]
ldr q3, [x1, #0x30]
sqdmulh v5.4s, v1.4s, v23.4s
srshr v5.4s, v5.4s, #0x11
cmgt v24.4s, v1.4s, v21.4s
mls v1.4s, v5.4s, v22.4s
bic v5.16b, v5.16b, v24.16b
add v1.4s, v1.4s, v24.4s
sqdmulh v6.4s, v2.4s, v23.4s
srshr v6.4s, v6.4s, #0x11
cmgt v24.4s, v2.4s, v21.4s
mls v2.4s, v6.4s, v22.4s
bic v6.16b, v6.16b, v24.16b
add v2.4s, v2.4s, v24.4s
sqdmulh v7.4s, v3.4s, v23.4s
srshr v7.4s, v7.4s, #0x11
cmgt v24.4s, v3.4s, v21.4s
mls v3.4s, v7.4s, v22.4s
bic v7.16b, v7.16b, v24.16b
add v3.4s, v3.4s, v24.4s
sqdmulh v4.4s, v0.4s, v23.4s
srshr v4.4s, v4.4s, #0x11
cmgt v24.4s, v0.4s, v21.4s
mls v0.4s, v4.4s, v22.4s
bic v4.16b, v4.16b, v24.16b
add v0.4s, v0.4s, v24.4s
str q5, [x0, #0x10]
str q6, [x0, #0x20]
str q7, [x0, #0x30]
str q4, [x0], #0x40
str q1, [x1, #0x10]
str q2, [x1, #0x20]
str q3, [x1, #0x30]
str q0, [x1], #0x40
cmgt v1.4s, v0.4s, v21.4s
ldr q5, [x1, #0x30]
cmgt v6.4s, v18.4s, v21.4s
sqdmulh v20.4s, v0.4s, v23.4s
stur q17, [x1, #-0x10]
srshr v3.4s, v3.4s, #0x11
bic v7.16b, v29.16b, v4.16b
cmgt v2.4s, v26.4s, v21.4s
sqdmulh v16.4s, v5.4s, v23.4s
srshr v28.4s, v24.4s, #0x11
stur q7, [x0, #-0x10]
cmgt v4.4s, v5.4s, v21.4s
srshr v31.4s, v20.4s, #0x11
mls v18.4s, v3.4s, v22.4s
bic v3.16b, v3.16b, v6.16b
mls v26.4s, v28.4s, v22.4s
bic v19.16b, v28.16b, v2.16b
str q3, [x0, #0x20]
srshr v29.4s, v16.4s, #0x11
str q19, [x0], #0x40
mls v0.4s, v31.4s, v22.4s
bic v30.16b, v31.16b, v1.16b
add v3.4s, v18.4s, v6.4s
ldr q18, [x1, #0x60]
mls v5.4s, v29.4s, v22.4s
stur q30, [x0, #-0x70]
add v27.4s, v26.4s, v2.4s
ldr q26, [x1, #0x40]
add v25.4s, v0.4s, v1.4s
ldr q0, [x1, #0x10]
str q3, [x1, #0x20]
sqdmulh v3.4s, v18.4s, v23.4s
stur q25, [x1, #-0x30]
str q27, [x1], #0x40
add v17.4s, v5.4s, v4.4s
sqdmulh v24.4s, v26.4s, v23.4s
subs x3, x3, #0x1
b.ne Lpoly_decompose_88_loop
ldr q16, [x1, #0x30]
ldr q5, [x1, #0x10]
sqdmulh v28.4s, v0.4s, v23.4s
cmgt v19.4s, v26.4s, v21.4s
cmgt v2.4s, v0.4s, v21.4s
stur q17, [x1, #-0x10]
srshr v31.4s, v3.4s, #0x11
sqdmulh v1.4s, v5.4s, v23.4s
cmgt v6.4s, v16.4s, v21.4s
srshr v24.4s, v24.4s, #0x11
srshr v25.4s, v28.4s, #0x11
sqdmulh v20.4s, v16.4s, v23.4s
cmgt v17.4s, v18.4s, v21.4s
mls v18.4s, v31.4s, v22.4s
cmgt v27.4s, v5.4s, v21.4s
srshr v30.4s, v1.4s, #0x11
mls v0.4s, v25.4s, v22.4s
bic v28.16b, v25.16b, v2.16b
srshr v7.4s, v20.4s, #0x11
bic v25.16b, v24.16b, v19.16b
mls v26.4s, v24.4s, v22.4s
bic v20.16b, v30.16b, v27.16b
stur q28, [x0, #-0x30]
add v3.4s, v18.4s, v17.4s
mls v5.4s, v30.4s, v22.4s
str q20, [x0, #0x10]
bic v28.16b, v7.16b, v6.16b
mls v16.4s, v7.4s, v22.4s
str q3, [x1, #0x20]
str q28, [x0, #0x30]
add v3.4s, v0.4s, v2.4s
bic v7.16b, v29.16b, v4.16b
str q25, [x0], #0x40
stur q3, [x1, #-0x30]
add v24.4s, v5.4s, v27.4s
stur q7, [x0, #-0x50]
add v19.4s, v26.4s, v19.4s
add v25.4s, v16.4s, v6.4s
bic v6.16b, v31.16b, v17.16b
str q24, [x1, #0x10]
str q19, [x1], #0x40
stur q6, [x0, #-0x20]
stur q25, [x1, #-0x10]
ret
.cfi_endproc

Expand Down
Loading
Loading