diff --git a/dev/aarch64_opt/src/Makefile b/dev/aarch64_opt/src/Makefile index 6b15af633..59b1302c2 100644 --- a/dev/aarch64_opt/src/Makefile +++ b/dev/aarch64_opt/src/Makefile @@ -95,10 +95,10 @@ poly_chknorm_aarch64_asm.S: ../../aarch64_clean/src/poly_chknorm_aarch64_asm.S cp $< $@ poly_decompose_32_aarch64_asm.S: ../../aarch64_clean/src/poly_decompose_32_aarch64_asm.S - cp $< $@ + slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $@ -l poly_decompose_32_loop $(SLOTHY_FLAGS) $(RESERVE_ALL_FLAG) poly_decompose_88_aarch64_asm.S: ../../aarch64_clean/src/poly_decompose_88_aarch64_asm.S - cp $< $@ + slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $@ -l poly_decompose_88_loop $(SLOTHY_FLAGS) $(RESERVE_ALL_FLAG) poly_use_hint_32_aarch64_asm.S: ../../aarch64_clean/src/poly_use_hint_32_aarch64_asm.S cp $< $@ diff --git a/dev/aarch64_opt/src/poly_decompose_32_aarch64_asm.S b/dev/aarch64_opt/src/poly_decompose_32_aarch64_asm.S index 1096e3eae..5b3df8fbf 100644 --- a/dev/aarch64_opt/src/poly_decompose_32_aarch64_asm.S +++ b/dev/aarch64_opt/src/poly_decompose_32_aarch64_asm.S @@ -4,18 +4,46 @@ */ #include "../../../common.h" -#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_NO_SIGN_API) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ - (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)) +#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_NO_SIGN_API) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)) /* simpasm: header-end */ // a aliased with a0 .macro decompose32 a1, a, temp // range: 0 <= a <= Q-1 = 32*GAMMA2 + /* check-magic: 523776 == 2 * intdiv(MLDSA_Q - 1, 32) */ + /* check-magic: 1074791425 == floor(2**49 / 523776) */ + /* check-magic: 575897802350002176 == 1 / (1 / 523776 - 1074791425 / 2^49) */ // Compute a1 = round-(a / (2*GAMMA2)) = round-(a / 523776) ≈ // round(a * 1074791425 / 2^49), where round-() denotes "round half - // down". This is exact for 0 <= a < Q. Note that half is rounded down - // since 1074791425 / 2^49 ≲ 1 / 523776. + // down". This is exact for 0 <= a < Q. We'll prove this in the + // following paragraphs, in which we denote 2*GAMMA2 as B to avoid + // clutter. + + // Consider the (signed) error a * (1 / B - 1074791425 / 2^49) between + // a / B and the (under-)approximation a * 1074791425 / 2^49. Because + // eps := 1 / B - 1074791425 / 2^49 is 1 / 575897802350002176 ≈ + // 2^(-58.99) < 2^(-58), we have 0 <= a * eps < 2^23 * 2^(-58) = + // 1 / 2^35 < 1 / 2^19 < 1 / B (note that a is non-negative). + + // On the other hand, 1 / B is the spacing between the integral + // multiples of 1 / B, which includes all rounding boundaries n + 0.5 + // (since B is even). Hence, if a / B is not of the form n + 0.5, then + // it is at least 1 / B away from the nearest rounding boundary, so + // moving from a / B to a * 1074791425 / 2^49 does not affect the + // rounding result, no matter the type of rounding used in either side. + // In particular, we have round-(a / B) = round(a * 1074791425 / 2^49) + // as claimed. + + // As for the remaining case where a / B _is_ of the form n + 0.5, + // because a * 1074791425 / 2^49 is slightly but strictly below a / B = + // n + 0.5 (note that a and thus the error a * eps cannot be 0 here), it + // is always rounded down to n. More precisely, we have round-(a / B) = + // round(a * 1074791425 / 2^49), where the round-down on the LHS is + // essential, and on the RHS the type of rounding again does not matter. + // This concludes the proof. + + // See proofs/isabelle/compress for a formalization of this argument. sqdmulh \a1\().4s, \a\().4s, barrett_const.4s srshr \a1\().4s, \a1\().4s, #18 // range: 0 <= a1 <= 16 @@ -30,7 +58,7 @@ // Compute remainder a0 mls \a\().4s, \a1\().4s, gamma2_2x.4s // range: -GAMMA2 < a0 <= GAMMA2 - // + // This holds since a1 = round-(a / (2*GAMMA2)) was computed exactly. // If wrap-around is required, set a1 = 0 and a0 -= 1 @@ -75,28 +103,280 @@ MLD_ASM_FN_SYMBOL(poly_decompose_32_aarch64_asm) mov count, #(64/4) + // Instructions: 29 + // Expected cycles: 23 + // Expected IPC: 1.26 + // + // Cycle bound: 23.0 + // IPC bound: 1.26 + // + // Wall time: 0.58s + // User time: 0.58s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr q0, [x1, #16] // *............................. + ldr q17, [x1, #32] // *............................. + ldr q2, [x1, #48] // .*............................ + ldr q26, [x1, #64] // .*............................ + ldr q16, [x1, #0] // ..*........................... + ldr q18, [x1, #96] // ..*........................... + cmgt v31.4S, v17.4S, v21.4S // ....*......................... + sqdmulh v3.4S, v17.4S, v23.4S // ....*......................... + cmgt v4.4S, v2.4S, v21.4S // .....*........................ + cmgt v5.4S, v16.4S, v21.4S // ......*....................... + sqdmulh v19.4S, v16.4S, v23.4S // ......*....................... + sqdmulh v7.4S, v2.4S, v23.4S // ........*..................... + srshr v3.4S, v3.4S, #18 // .........*.................... + sqdmulh v24.4S, v26.4S, v23.4S // ..........*................... + srshr v27.4S, v19.4S, #18 // ...........*.................. + mls v17.4S, v3.4S, v22.4S // .............*................ + srshr v29.4S, v7.4S, #18 // .............*................ + bic v6.16B, v3.16B, v31.16B // ..............*............... + bic v1.16B, v27.16B, v5.16B // ...............*.............. + mls v16.4S, v27.4S, v22.4S // ...............*.............. + str q6, [x0, #32] // ................*............. + str q1, [x0], #4*16 // .................*............ + mls v2.4S, v29.4S, v22.4S // .................*............ + add v6.4S, v17.4S, v31.4S // ..................*........... + sqdmulh v3.4S, v18.4S, v23.4S // ...................*.......... + str q6, [x1, #32] // ....................*......... + add v20.4S, v16.4S, v5.4S // ....................*......... + str q20, [x1], #4*16 // ......................*....... + add v17.4S, v2.4S, v4.4S // ......................*....... + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q18, [x1, #32] // *.............................. + // ldr q26, [x1, #0] // ..*............................ + // sqdmulh v3.4S, v18.4S, v23.4S // ....*.......................... + // sqdmulh v24.4S, v26.4S, v23.4S // ......*........................ + // ldr q5, [x1, #48] // .*............................. + // cmgt v6.4S, v18.4S, v21.4S // ....*.......................... + // srshr v3.4S, v3.4S, #18 // .........*..................... + // cmgt v2.4S, v26.4S, v21.4S // ......*........................ + // sqdmulh v16.4S, v5.4S, v23.4S // ........*...................... + // srshr v28.4S, v24.4S, #18 // ...........*................... + // cmgt v4.4S, v5.4S, v21.4S // .....*......................... + // mls v18.4S, v3.4S, v22.4S // .............*................. + // bic v3.16B, v3.16B, v6.16B // ..............*................ + // mls v26.4S, v28.4S, v22.4S // ...............*............... + // bic v19.16B, v28.16B, v2.16B // ...............*............... + // str q3, [x0, #32] // ................*.............. + // srshr v29.4S, v16.4S, #18 // .............*................. + // str q19, [x0], #4*16 // .................*............. + // add v3.4S, v18.4S, v6.4S // ..................*............ + // ldr q18, [x1, #96] // ..*............................ + // mls v5.4S, v29.4S, v22.4S // .................*............. + // add v27.4S, v26.4S, v2.4S // ....................*.......... + // ldr q26, [x1, #64] // .*............................. + // ldr q0, [x1, #16] // *.............................. + // str q3, [x1, #32] // ....................*.......... + // sqdmulh v3.4S, v18.4S, v23.4S // ...................*........... + // str q27, [x1], #4*16 // ......................*........ + // add v17.4S, v5.4S, v4.4S // ......................*........ + // sqdmulh v24.4S, v26.4S, v23.4S // ..........*.................... + + sub count, count, #2 poly_decompose_32_loop: - ldr q0, [a0_ptr, #0*16] - ldr q1, [a0_ptr, #1*16] - ldr q2, [a0_ptr, #2*16] - ldr q3, [a0_ptr, #3*16] - - decompose32 v5, v1, v24 - decompose32 v6, v2, v24 - decompose32 v7, v3, v24 - decompose32 v4, v0, v24 - - str q5, [a1_ptr, #1*16] - str q6, [a1_ptr, #2*16] - str q7, [a1_ptr, #3*16] - str q4, [a1_ptr], #4*16 - str q1, [a0_ptr, #1*16] - str q2, [a0_ptr, #2*16] - str q3, [a0_ptr, #3*16] - str q0, [a0_ptr], #4*16 + // Instructions: 36 + // Expected cycles: 19 + // Expected IPC: 1.89 + // + // Cycle bound: 19.0 + // IPC bound: 1.89 + // + // Wall time: 30.48s + // User time: 30.48s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + cmgt v1.4S, v0.4S, v21.4S // l............................. + ldr q5, [x1, #48] // *............................. + cmgt v6.4S, v18.4S, v21.4S // .*............................ + sqdmulh v20.4S, v0.4S, v23.4S // .l............................ + str q17, [x1, #-16] // ..l........................... + srshr v3.4S, v3.4S, #18 // ..*........................... + bic v7.16B, v29.16B, v4.16B // ...l.......................... + cmgt v2.4S, v26.4S, v21.4S // ...*.......................... + sqdmulh v16.4S, v5.4S, v23.4S // ....*......................... + srshr v28.4S, v24.4S, #18 // ....*......................... + str q7, [x0, #-16] // .....l........................ + cmgt v4.4S, v5.4S, v21.4S // .....*........................ + srshr v31.4S, v20.4S, #18 // ......l....................... + mls v18.4S, v3.4S, v22.4S // ......*....................... + bic v3.16B, v3.16B, v6.16B // .......*...................... + mls v26.4S, v28.4S, v22.4S // ........*..................... + bic v19.16B, v28.16B, v2.16B // ........*..................... + str q3, [x0, #32] // .........*.................... + srshr v29.4S, v16.4S, #18 // .........*.................... + str q19, [x0], #4*16 // ..........*................... + mls v0.4S, v31.4S, v22.4S // ..........l................... + bic v30.16B, v31.16B, v1.16B // ...........l.................. + add v3.4S, v18.4S, v6.4S // ............*................. + ldr q18, [x1, #96] // ............e................. + mls v5.4S, v29.4S, v22.4S // .............*................ + str q30, [x0, #-112] // .............l................ + add v27.4S, v26.4S, v2.4S // ..............*............... + ldr q26, [x1, #64] // ..............e............... + add v25.4S, v0.4S, v1.4S // ...............l.............. + ldr q0, [x1, #16] // ...............*.............. + str q3, [x1, #32] // ................*............. + sqdmulh v3.4S, v18.4S, v23.4S // ................e............. + str q25, [x1, #-48] // .................l............ + str q27, [x1], #4*16 // .................*............ + add v17.4S, v5.4S, v4.4S // ..................*........... + sqdmulh v24.4S, v26.4S, v23.4S // ..................e........... + + // ------------ cycle (expected) -------------> + // 0 25 + // |------------------------|------------------ + // ldr q0, [x1, #0*16] // ..e....'.............~....'.............~... + // ldr q1, [x1, #1*16] // ...~...'..............*...'..............~.. + // ldr q2, [x1, #2*16] // e......'...........~......'...........~..... + // ldr q3, [x1, #3*16] // .......*..................~................. + // sqdmulh v5.4s, v1.4s, v23.4s // .......'~.................'l................ + // srshr v5.4s, v5.4s, #18 // .......'.....~............'.....l........... + // cmgt v24.4s, v1.4s, v21.4s // .......~..................l................. + // mls v1.4s, v5.4s, v22.4s // .......'.........~........'.........l....... + // bic v5.16b, v5.16b, v24.16b // .......'..........~.......'..........l...... + // add v1.4s, v1.4s, v24.4s // ...~...'..............~...'..............l.. + // sqdmulh v6.4s, v2.4s, v23.4s // ....e..'...............~..'...............~. + // srshr v6.4s, v6.4s, #18 // .......'.*................'.~............... + // cmgt v24.4s, v2.4s, v21.4s // .......'*.................'~................ + // mls v2.4s, v6.4s, v22.4s // .......'.....*............'.....~........... + // bic v6.16b, v6.16b, v24.16b // .......'......*...........'......~.......... + // add v2.4s, v2.4s, v24.4s // ~......'...........*......'...........~..... + // sqdmulh v7.4s, v3.4s, v23.4s // .......'...*..............'...~............. + // srshr v7.4s, v7.4s, #18 // .......'........*.........'........~........ + // cmgt v24.4s, v3.4s, v21.4s // .......'....*.............'....~............ + // mls v3.4s, v7.4s, v22.4s // .~.....'............*.....'............~.... + // bic v7.16b, v7.16b, v24.16b // .......'..~...............'..l.............. + // add v3.4s, v3.4s, v24.4s // ......~'.................*'................. + // sqdmulh v4.4s, v0.4s, v23.4s // ......e'.................~'................. + // srshr v4.4s, v4.4s, #18 // .......'...*..............'...~............. + // cmgt v24.4s, v0.4s, v21.4s // .......'..*...............'..~.............. + // mls v0.4s, v4.4s, v22.4s // .......'.......*..........'.......~......... + // bic v4.16b, v4.16b, v24.16b // .......'.......*..........'.......~......... + // add v0.4s, v0.4s, v24.4s // ..~....'.............*....'.............~... + // str q5, [x0, #1*16] // .~.....'............~.....'............l.... + // str q6, [x0, #2*16] // .......'........*.........'........~........ + // str q7, [x0, #3*16] // .......'....~.............'....l............ + // str q4, [x0], #4*16 // .......'.........*........'.........~....... + // str q1, [x1, #1*16] // .....~.'................~.'................l + // str q2, [x1, #2*16] // ....~..'...............*..'...............~. + // str q3, [x1, #3*16] // .......'.~................'.l............... + // str q0, [x1], #4*16 // .....~.'................*.'................. subs count, count, #1 bne poly_decompose_32_loop + // Instructions: 43 + // Expected cycles: 24 + // Expected IPC: 1.79 + // + // Cycle bound: 24.0 + // IPC bound: 1.79 + // + // Wall time: 2.41s + // User time: 2.41s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr q16, [x1, #48] // *............................. + ldr q5, [x1, #16] // *............................. + sqdmulh v28.4S, v0.4S, v23.4S // .*............................ + cmgt v19.4S, v26.4S, v21.4S // .*............................ + cmgt v2.4S, v0.4S, v21.4S // ..*........................... + str q17, [x1, #-16] // ..*........................... + srshr v31.4S, v3.4S, #18 // ...*.......................... + sqdmulh v1.4S, v5.4S, v23.4S // ....*......................... + cmgt v6.4S, v16.4S, v21.4S // ....*......................... + srshr v24.4S, v24.4S, #18 // .....*........................ + srshr v25.4S, v28.4S, #18 // ......*....................... + sqdmulh v20.4S, v16.4S, v23.4S // ......*....................... + cmgt v17.4S, v18.4S, v21.4S // .......*...................... + mls v18.4S, v31.4S, v22.4S // ........*..................... + cmgt v27.4S, v5.4S, v21.4S // ........*..................... + srshr v30.4S, v1.4S, #18 // .........*.................... + mls v0.4S, v25.4S, v22.4S // ..........*................... + bic v28.16B, v25.16B, v2.16B // ..........*................... + srshr v7.4S, v20.4S, #18 // ...........*.................. + bic v25.16B, v24.16B, v19.16B // ............*................. + mls v26.4S, v24.4S, v22.4S // ............*................. + bic v20.16B, v30.16B, v27.16B // .............*................ + str q28, [x0, #-48] // .............*................ + add v3.4S, v18.4S, v17.4S // ..............*............... + mls v5.4S, v30.4S, v22.4S // ..............*............... + str q20, [x0, #16] // ...............*.............. + bic v28.16B, v7.16B, v6.16B // ...............*.............. + mls v16.4S, v7.4S, v22.4S // ................*............. + str q3, [x1, #32] // ................*............. + str q28, [x0, #48] // .................*............ + add v3.4S, v0.4S, v2.4S // .................*............ + bic v7.16B, v29.16B, v4.16B // ..................*........... + str q25, [x0], #4*16 // ..................*........... + str q3, [x1, #-48] // ...................*.......... + add v24.4S, v5.4S, v27.4S // ...................*.......... + str q7, [x0, #-80] // ....................*......... + add v19.4S, v26.4S, v19.4S // ....................*......... + add v25.4S, v16.4S, v6.4S // .....................*........ + bic v6.16B, v31.16B, v17.16B // .....................*........ + str q24, [x1, #16] // ......................*....... + str q19, [x1], #4*16 // ......................*....... + str q6, [x0, #-32] // .......................*...... + str q25, [x1, #-16] // .......................*...... + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // cmgt v1.4S, v0.4S, v21.4S // ..*............................ + // ldr q5, [x1, #48] // *.............................. + // cmgt v6.4S, v18.4S, v21.4S // .......*....................... + // sqdmulh v20.4S, v0.4S, v23.4S // .*............................. + // str q17, [x1, #-16] // ..*............................ + // srshr v3.4S, v3.4S, #18 // ...*........................... + // bic v7.16B, v29.16B, v4.16B // ..................*............ + // cmgt v2.4S, v26.4S, v21.4S // .*............................. + // sqdmulh v16.4S, v5.4S, v23.4S // ......*........................ + // srshr v28.4S, v24.4S, #18 // .....*......................... + // str q7, [x0, #-16] // ....................*.......... + // cmgt v4.4S, v5.4S, v21.4S // ....*.......................... + // srshr v31.4S, v20.4S, #18 // ......*........................ + // mls v18.4S, v3.4S, v22.4S // ........*...................... + // bic v3.16B, v3.16B, v6.16B // .....................*......... + // mls v26.4S, v28.4S, v22.4S // ............*.................. + // bic v19.16B, v28.16B, v2.16B // ............*.................. + // str q3, [x0, #32] // .......................*....... + // srshr v29.4S, v16.4S, #18 // ...........*................... + // str q19, [x0], #4*16 // ..................*............ + // mls v0.4S, v31.4S, v22.4S // ..........*.................... + // bic v30.16B, v31.16B, v1.16B // ..........*.................... + // add v3.4S, v18.4S, v6.4S // ..............*................ + // mls v5.4S, v29.4S, v22.4S // ................*.............. + // str q30, [x0, #-112] // .............*................. + // add v27.4S, v26.4S, v2.4S // ....................*.......... + // add v25.4S, v0.4S, v1.4S // .................*............. + // ldr q0, [x1, #16] // *.............................. + // str q3, [x1, #32] // ................*.............. + // str q25, [x1, #-48] // ...................*........... + // str q27, [x1], #4*16 // ......................*........ + // add v17.4S, v5.4S, v4.4S // .....................*......... + // cmgt v1.4S, v0.4S, v21.4S // ........*...................... + // sqdmulh v20.4S, v0.4S, v23.4S // ....*.......................... + // str q17, [x1, #-16] // .......................*....... + // bic v7.16B, v29.16B, v4.16B // ...............*............... + // str q7, [x0, #-16] // .................*............. + // srshr v31.4S, v20.4S, #18 // .........*..................... + // mls v0.4S, v31.4S, v22.4S // ..............*................ + // bic v30.16B, v31.16B, v1.16B // .............*................. + // str q30, [x0, #-48] // ...............*............... + // add v25.4S, v0.4S, v1.4S // ...................*........... + // str q25, [x1, #-48] // ......................*........ + ret diff --git a/dev/aarch64_opt/src/poly_decompose_88_aarch64_asm.S b/dev/aarch64_opt/src/poly_decompose_88_aarch64_asm.S index 94568eaaa..d074f8b7b 100644 --- a/dev/aarch64_opt/src/poly_decompose_88_aarch64_asm.S +++ b/dev/aarch64_opt/src/poly_decompose_88_aarch64_asm.S @@ -4,18 +4,46 @@ */ #include "../../../common.h" -#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_NO_SIGN_API) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ - (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44) +#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_NO_SIGN_API) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44) /* simpasm: header-end */ // a aliased with a0 .macro decompose88 a1, a, temp // range: 0 <= a <= Q-1 = 88*GAMMA2 + /* check-magic: 190464 == 2 * intdiv(MLDSA_Q - 1, 88) */ + /* check-magic: 1477838209 == floor(2**48 / 190464) */ + /* check-magic: 26177172834091008 == 35 / (1 / 190464 - 1477838209 / 2^48) */ // Compute a1 = round-(a / (2*GAMMA2)) = round-(a / 190464) ≈ // round(a * 1477838209 / 2^48), where round-() denotes "round half - // down". This is exact for 0 <= a < Q. Note that half is rounded down - // since 1477838209 / 2^48 ≲ 1 / 190464. + // down". This is exact for 0 <= a < Q. We'll prove this in the + // following paragraphs, in which we denote 2*GAMMA2 as B to avoid + // clutter. + + // Consider the (signed) error a * (1 / B - 1477838209 / 2^48) between + // a / B and the (under-)approximation a * 1477838209 / 2^48. Because + // eps := 1 / B - 1477838209 / 2^48 is 35 / 26177172834091008 ≈ + // 2^(-49.41) < 2^(-49), we have 0 <= a * eps < 2^23 * 2^(-49) = + // 1 / 2^26 < 1 / 2^18 < 1 / B (note that a is non-negative). + + // On the other hand, 1 / B is the spacing between the integral + // multiples of 1 / B, which includes all rounding boundaries n + 0.5 + // (since B is even). Hence, if a / B is not of the form n + 0.5, then + // it is at least 1 / B away from the nearest rounding boundary, so + // moving from a / B to a * 1477838209 / 2^48 does not affect the + // rounding result, no matter the type of rounding used in either side. + // In particular, we have round-(a / B) = round(a * 1477838209 / 2^48) + // as claimed. + + // As for the remaining case where a / B _is_ of the form n + 0.5, + // because a * 1477838209 / 2^48 is slightly but strictly below a / B = + // n + 0.5 (note that a and thus the error a * eps cannot be 0 here), it + // is always rounded down to n. More precisely, we have round-(a / B) = + // round(a * 1477838209 / 2^48), where the round-down on the LHS is + // essential, and on the RHS the type of rounding again does not matter. + // This concludes the proof. + + // See proofs/isabelle/compress for a formalization of this argument. sqdmulh \a1\().4s, \a\().4s, barrett_const.4s srshr \a1\().4s, \a1\().4s, #17 // range: 0 <= a1 <= 44 @@ -30,7 +58,7 @@ // Compute remainder a0 mls \a\().4s, \a1\().4s, gamma2_2x.4s // range: -GAMMA2 < a0 <= GAMMA2 - // + // This holds since a1 = round-(a / (2*GAMMA2)) was computed exactly. // If wrap-around is required, set a1 = 0 and a0 -= 1 @@ -73,28 +101,280 @@ MLD_ASM_FN_SYMBOL(poly_decompose_88_aarch64_asm) dup barrett_const.4s, w11 mov count, #(64/4) + // Instructions: 29 + // Expected cycles: 23 + // Expected IPC: 1.26 + // + // Cycle bound: 23.0 + // IPC bound: 1.26 + // + // Wall time: 0.58s + // User time: 0.58s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr q0, [x1, #16] // *............................. + ldr q17, [x1, #32] // *............................. + ldr q2, [x1, #48] // .*............................ + ldr q26, [x1, #64] // .*............................ + ldr q16, [x1, #0] // ..*........................... + ldr q18, [x1, #96] // ..*........................... + cmgt v31.4S, v17.4S, v21.4S // ....*......................... + sqdmulh v3.4S, v17.4S, v23.4S // ....*......................... + cmgt v4.4S, v2.4S, v21.4S // .....*........................ + cmgt v5.4S, v16.4S, v21.4S // ......*....................... + sqdmulh v19.4S, v16.4S, v23.4S // ......*....................... + sqdmulh v7.4S, v2.4S, v23.4S // ........*..................... + srshr v3.4S, v3.4S, #17 // .........*.................... + sqdmulh v24.4S, v26.4S, v23.4S // ..........*................... + srshr v27.4S, v19.4S, #17 // ...........*.................. + mls v17.4S, v3.4S, v22.4S // .............*................ + srshr v29.4S, v7.4S, #17 // .............*................ + bic v6.16B, v3.16B, v31.16B // ..............*............... + bic v1.16B, v27.16B, v5.16B // ...............*.............. + mls v16.4S, v27.4S, v22.4S // ...............*.............. + str q6, [x0, #32] // ................*............. + str q1, [x0], #4*16 // .................*............ + mls v2.4S, v29.4S, v22.4S // .................*............ + add v6.4S, v17.4S, v31.4S // ..................*........... + sqdmulh v3.4S, v18.4S, v23.4S // ...................*.......... + str q6, [x1, #32] // ....................*......... + add v20.4S, v16.4S, v5.4S // ....................*......... + str q20, [x1], #4*16 // ......................*....... + add v17.4S, v2.4S, v4.4S // ......................*....... + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q18, [x1, #32] // *.............................. + // ldr q26, [x1, #0] // ..*............................ + // sqdmulh v3.4S, v18.4S, v23.4S // ....*.......................... + // sqdmulh v24.4S, v26.4S, v23.4S // ......*........................ + // ldr q5, [x1, #48] // .*............................. + // cmgt v6.4S, v18.4S, v21.4S // ....*.......................... + // srshr v3.4S, v3.4S, #17 // .........*..................... + // cmgt v2.4S, v26.4S, v21.4S // ......*........................ + // sqdmulh v16.4S, v5.4S, v23.4S // ........*...................... + // srshr v28.4S, v24.4S, #17 // ...........*................... + // cmgt v4.4S, v5.4S, v21.4S // .....*......................... + // mls v18.4S, v3.4S, v22.4S // .............*................. + // bic v3.16B, v3.16B, v6.16B // ..............*................ + // mls v26.4S, v28.4S, v22.4S // ...............*............... + // bic v19.16B, v28.16B, v2.16B // ...............*............... + // str q3, [x0, #32] // ................*.............. + // srshr v29.4S, v16.4S, #17 // .............*................. + // str q19, [x0], #4*16 // .................*............. + // add v3.4S, v18.4S, v6.4S // ..................*............ + // ldr q18, [x1, #96] // ..*............................ + // mls v5.4S, v29.4S, v22.4S // .................*............. + // add v27.4S, v26.4S, v2.4S // ....................*.......... + // ldr q26, [x1, #64] // .*............................. + // ldr q0, [x1, #16] // *.............................. + // str q3, [x1, #32] // ....................*.......... + // sqdmulh v3.4S, v18.4S, v23.4S // ...................*........... + // str q27, [x1], #4*16 // ......................*........ + // add v17.4S, v5.4S, v4.4S // ......................*........ + // sqdmulh v24.4S, v26.4S, v23.4S // ..........*.................... + + sub count, count, #2 poly_decompose_88_loop: - ldr q0, [a0_ptr, #0*16] - ldr q1, [a0_ptr, #1*16] - ldr q2, [a0_ptr, #2*16] - ldr q3, [a0_ptr, #3*16] - - decompose88 v5, v1, v24 - decompose88 v6, v2, v24 - decompose88 v7, v3, v24 - decompose88 v4, v0, v24 - - str q5, [a1_ptr, #1*16] - str q6, [a1_ptr, #2*16] - str q7, [a1_ptr, #3*16] - str q4, [a1_ptr], #4*16 - str q1, [a0_ptr, #1*16] - str q2, [a0_ptr, #2*16] - str q3, [a0_ptr, #3*16] - str q0, [a0_ptr], #4*16 + // Instructions: 36 + // Expected cycles: 19 + // Expected IPC: 1.89 + // + // Cycle bound: 19.0 + // IPC bound: 1.89 + // + // Wall time: 31.64s + // User time: 31.64s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + cmgt v1.4S, v0.4S, v21.4S // l............................. + ldr q5, [x1, #48] // *............................. + cmgt v6.4S, v18.4S, v21.4S // .*............................ + sqdmulh v20.4S, v0.4S, v23.4S // .l............................ + str q17, [x1, #-16] // ..l........................... + srshr v3.4S, v3.4S, #17 // ..*........................... + bic v7.16B, v29.16B, v4.16B // ...l.......................... + cmgt v2.4S, v26.4S, v21.4S // ...*.......................... + sqdmulh v16.4S, v5.4S, v23.4S // ....*......................... + srshr v28.4S, v24.4S, #17 // ....*......................... + str q7, [x0, #-16] // .....l........................ + cmgt v4.4S, v5.4S, v21.4S // .....*........................ + srshr v31.4S, v20.4S, #17 // ......l....................... + mls v18.4S, v3.4S, v22.4S // ......*....................... + bic v3.16B, v3.16B, v6.16B // .......*...................... + mls v26.4S, v28.4S, v22.4S // ........*..................... + bic v19.16B, v28.16B, v2.16B // ........*..................... + str q3, [x0, #32] // .........*.................... + srshr v29.4S, v16.4S, #17 // .........*.................... + str q19, [x0], #4*16 // ..........*................... + mls v0.4S, v31.4S, v22.4S // ..........l................... + bic v30.16B, v31.16B, v1.16B // ...........l.................. + add v3.4S, v18.4S, v6.4S // ............*................. + ldr q18, [x1, #96] // ............e................. + mls v5.4S, v29.4S, v22.4S // .............*................ + str q30, [x0, #-112] // .............l................ + add v27.4S, v26.4S, v2.4S // ..............*............... + ldr q26, [x1, #64] // ..............e............... + add v25.4S, v0.4S, v1.4S // ...............l.............. + ldr q0, [x1, #16] // ...............*.............. + str q3, [x1, #32] // ................*............. + sqdmulh v3.4S, v18.4S, v23.4S // ................e............. + str q25, [x1, #-48] // .................l............ + str q27, [x1], #4*16 // .................*............ + add v17.4S, v5.4S, v4.4S // ..................*........... + sqdmulh v24.4S, v26.4S, v23.4S // ..................e........... + + // ------------ cycle (expected) -------------> + // 0 25 + // |------------------------|------------------ + // ldr q0, [x1, #0*16] // ..e....'.............~....'.............~... + // ldr q1, [x1, #1*16] // ...~...'..............*...'..............~.. + // ldr q2, [x1, #2*16] // e......'...........~......'...........~..... + // ldr q3, [x1, #3*16] // .......*..................~................. + // sqdmulh v5.4s, v1.4s, v23.4s // .......'~.................'l................ + // srshr v5.4s, v5.4s, #17 // .......'.....~............'.....l........... + // cmgt v24.4s, v1.4s, v21.4s // .......~..................l................. + // mls v1.4s, v5.4s, v22.4s // .......'.........~........'.........l....... + // bic v5.16b, v5.16b, v24.16b // .......'..........~.......'..........l...... + // add v1.4s, v1.4s, v24.4s // ...~...'..............~...'..............l.. + // sqdmulh v6.4s, v2.4s, v23.4s // ....e..'...............~..'...............~. + // srshr v6.4s, v6.4s, #17 // .......'.*................'.~............... + // cmgt v24.4s, v2.4s, v21.4s // .......'*.................'~................ + // mls v2.4s, v6.4s, v22.4s // .......'.....*............'.....~........... + // bic v6.16b, v6.16b, v24.16b // .......'......*...........'......~.......... + // add v2.4s, v2.4s, v24.4s // ~......'...........*......'...........~..... + // sqdmulh v7.4s, v3.4s, v23.4s // .......'...*..............'...~............. + // srshr v7.4s, v7.4s, #17 // .......'........*.........'........~........ + // cmgt v24.4s, v3.4s, v21.4s // .......'....*.............'....~............ + // mls v3.4s, v7.4s, v22.4s // .~.....'............*.....'............~.... + // bic v7.16b, v7.16b, v24.16b // .......'..~...............'..l.............. + // add v3.4s, v3.4s, v24.4s // ......~'.................*'................. + // sqdmulh v4.4s, v0.4s, v23.4s // ......e'.................~'................. + // srshr v4.4s, v4.4s, #17 // .......'...*..............'...~............. + // cmgt v24.4s, v0.4s, v21.4s // .......'..*...............'..~.............. + // mls v0.4s, v4.4s, v22.4s // .......'.......*..........'.......~......... + // bic v4.16b, v4.16b, v24.16b // .......'.......*..........'.......~......... + // add v0.4s, v0.4s, v24.4s // ..~....'.............*....'.............~... + // str q5, [x0, #1*16] // .~.....'............~.....'............l.... + // str q6, [x0, #2*16] // .......'........*.........'........~........ + // str q7, [x0, #3*16] // .......'....~.............'....l............ + // str q4, [x0], #4*16 // .......'.........*........'.........~....... + // str q1, [x1, #1*16] // .....~.'................~.'................l + // str q2, [x1, #2*16] // ....~..'...............*..'...............~. + // str q3, [x1, #3*16] // .......'.~................'.l............... + // str q0, [x1], #4*16 // .....~.'................*.'................. subs count, count, #1 bne poly_decompose_88_loop + // Instructions: 43 + // Expected cycles: 24 + // Expected IPC: 1.79 + // + // Cycle bound: 24.0 + // IPC bound: 1.79 + // + // Wall time: 2.42s + // User time: 2.42s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr q16, [x1, #48] // *............................. + ldr q5, [x1, #16] // *............................. + sqdmulh v28.4S, v0.4S, v23.4S // .*............................ + cmgt v19.4S, v26.4S, v21.4S // .*............................ + cmgt v2.4S, v0.4S, v21.4S // ..*........................... + str q17, [x1, #-16] // ..*........................... + srshr v31.4S, v3.4S, #17 // ...*.......................... + sqdmulh v1.4S, v5.4S, v23.4S // ....*......................... + cmgt v6.4S, v16.4S, v21.4S // ....*......................... + srshr v24.4S, v24.4S, #17 // .....*........................ + srshr v25.4S, v28.4S, #17 // ......*....................... + sqdmulh v20.4S, v16.4S, v23.4S // ......*....................... + cmgt v17.4S, v18.4S, v21.4S // .......*...................... + mls v18.4S, v31.4S, v22.4S // ........*..................... + cmgt v27.4S, v5.4S, v21.4S // ........*..................... + srshr v30.4S, v1.4S, #17 // .........*.................... + mls v0.4S, v25.4S, v22.4S // ..........*................... + bic v28.16B, v25.16B, v2.16B // ..........*................... + srshr v7.4S, v20.4S, #17 // ...........*.................. + bic v25.16B, v24.16B, v19.16B // ............*................. + mls v26.4S, v24.4S, v22.4S // ............*................. + bic v20.16B, v30.16B, v27.16B // .............*................ + str q28, [x0, #-48] // .............*................ + add v3.4S, v18.4S, v17.4S // ..............*............... + mls v5.4S, v30.4S, v22.4S // ..............*............... + str q20, [x0, #16] // ...............*.............. + bic v28.16B, v7.16B, v6.16B // ...............*.............. + mls v16.4S, v7.4S, v22.4S // ................*............. + str q3, [x1, #32] // ................*............. + str q28, [x0, #48] // .................*............ + add v3.4S, v0.4S, v2.4S // .................*............ + bic v7.16B, v29.16B, v4.16B // ..................*........... + str q25, [x0], #4*16 // ..................*........... + str q3, [x1, #-48] // ...................*.......... + add v24.4S, v5.4S, v27.4S // ...................*.......... + str q7, [x0, #-80] // ....................*......... + add v19.4S, v26.4S, v19.4S // ....................*......... + add v25.4S, v16.4S, v6.4S // .....................*........ + bic v6.16B, v31.16B, v17.16B // .....................*........ + str q24, [x1, #16] // ......................*....... + str q19, [x1], #4*16 // ......................*....... + str q6, [x0, #-32] // .......................*...... + str q25, [x1, #-16] // .......................*...... + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // cmgt v1.4S, v0.4S, v21.4S // ..*............................ + // ldr q5, [x1, #48] // *.............................. + // cmgt v6.4S, v18.4S, v21.4S // .......*....................... + // sqdmulh v20.4S, v0.4S, v23.4S // .*............................. + // str q17, [x1, #-16] // ..*............................ + // srshr v3.4S, v3.4S, #17 // ...*........................... + // bic v7.16B, v29.16B, v4.16B // ..................*............ + // cmgt v2.4S, v26.4S, v21.4S // .*............................. + // sqdmulh v16.4S, v5.4S, v23.4S // ......*........................ + // srshr v28.4S, v24.4S, #17 // .....*......................... + // str q7, [x0, #-16] // ....................*.......... + // cmgt v4.4S, v5.4S, v21.4S // ....*.......................... + // srshr v31.4S, v20.4S, #17 // ......*........................ + // mls v18.4S, v3.4S, v22.4S // ........*...................... + // bic v3.16B, v3.16B, v6.16B // .....................*......... + // mls v26.4S, v28.4S, v22.4S // ............*.................. + // bic v19.16B, v28.16B, v2.16B // ............*.................. + // str q3, [x0, #32] // .......................*....... + // srshr v29.4S, v16.4S, #17 // ...........*................... + // str q19, [x0], #4*16 // ..................*............ + // mls v0.4S, v31.4S, v22.4S // ..........*.................... + // bic v30.16B, v31.16B, v1.16B // ..........*.................... + // add v3.4S, v18.4S, v6.4S // ..............*................ + // mls v5.4S, v29.4S, v22.4S // ................*.............. + // str q30, [x0, #-112] // .............*................. + // add v27.4S, v26.4S, v2.4S // ....................*.......... + // add v25.4S, v0.4S, v1.4S // .................*............. + // ldr q0, [x1, #16] // *.............................. + // str q3, [x1, #32] // ................*.............. + // str q25, [x1, #-48] // ...................*........... + // str q27, [x1], #4*16 // ......................*........ + // add v17.4S, v5.4S, v4.4S // .....................*......... + // cmgt v1.4S, v0.4S, v21.4S // ........*...................... + // sqdmulh v20.4S, v0.4S, v23.4S // ....*.......................... + // str q17, [x1, #-16] // .......................*....... + // bic v7.16B, v29.16B, v4.16B // ...............*............... + // str q7, [x0, #-16] // .................*............. + // srshr v31.4S, v20.4S, #17 // .........*..................... + // mls v0.4S, v31.4S, v22.4S // ..............*................ + // bic v30.16B, v31.16B, v1.16B // .............*................. + // str q30, [x0, #-48] // ...............*............... + // add v25.4S, v0.4S, v1.4S // ...................*........... + // str q25, [x1, #-48] // ......................*........ + ret diff --git a/mldsa/src/native/aarch64/src/poly_decompose_32_aarch64_asm.S b/mldsa/src/native/aarch64/src/poly_decompose_32_aarch64_asm.S index f2bdc5069..29d107bc0 100644 --- a/mldsa/src/native/aarch64/src/poly_decompose_32_aarch64_asm.S +++ b/mldsa/src/native/aarch64/src/poly_decompose_32_aarch64_asm.S @@ -4,8 +4,7 @@ */ #include "../../../common.h" -#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_NO_SIGN_API) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ - (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)) +#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_NO_SIGN_API) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)) /* * WARNING: This file is auto-derived from the mldsa-native source file @@ -31,46 +30,119 @@ MLD_ASM_FN_SYMBOL(poly_decompose_32_aarch64_asm) movk w11, #0x4010, lsl #16 dup v23.4s, w11 mov x3, #0x10 // =16 + ldr q0, [x1, #0x10] + ldr q17, [x1, #0x20] + ldr q2, [x1, #0x30] + ldr q26, [x1, #0x40] + ldr q16, [x1] + ldr q18, [x1, #0x60] + cmgt v31.4s, v17.4s, v21.4s + sqdmulh v3.4s, v17.4s, v23.4s + cmgt v4.4s, v2.4s, v21.4s + cmgt v5.4s, v16.4s, v21.4s + sqdmulh v19.4s, v16.4s, v23.4s + sqdmulh v7.4s, v2.4s, v23.4s + srshr v3.4s, v3.4s, #0x12 + sqdmulh v24.4s, v26.4s, v23.4s + srshr v27.4s, v19.4s, #0x12 + mls v17.4s, v3.4s, v22.4s + srshr v29.4s, v7.4s, #0x12 + bic v6.16b, v3.16b, v31.16b + bic v1.16b, v27.16b, v5.16b + mls v16.4s, v27.4s, v22.4s + str q6, [x0, #0x20] + str q1, [x0], #0x40 + mls v2.4s, v29.4s, v22.4s + add v6.4s, v17.4s, v31.4s + sqdmulh v3.4s, v18.4s, v23.4s + str q6, [x1, #0x20] + add v20.4s, v16.4s, v5.4s + str q20, [x1], #0x40 + add v17.4s, v2.4s, v4.4s + sub x3, x3, #0x2 Lpoly_decompose_32_loop: - ldr q0, [x1] - ldr q1, [x1, #0x10] - ldr q2, [x1, #0x20] - ldr q3, [x1, #0x30] - sqdmulh v5.4s, v1.4s, v23.4s - srshr v5.4s, v5.4s, #0x12 - cmgt v24.4s, v1.4s, v21.4s - mls v1.4s, v5.4s, v22.4s - bic v5.16b, v5.16b, v24.16b - add v1.4s, v1.4s, v24.4s - sqdmulh v6.4s, v2.4s, v23.4s - srshr v6.4s, v6.4s, #0x12 - cmgt v24.4s, v2.4s, v21.4s - mls v2.4s, v6.4s, v22.4s - bic v6.16b, v6.16b, v24.16b - add v2.4s, v2.4s, v24.4s - sqdmulh v7.4s, v3.4s, v23.4s - srshr v7.4s, v7.4s, #0x12 - cmgt v24.4s, v3.4s, v21.4s - mls v3.4s, v7.4s, v22.4s - bic v7.16b, v7.16b, v24.16b - add v3.4s, v3.4s, v24.4s - sqdmulh v4.4s, v0.4s, v23.4s - srshr v4.4s, v4.4s, #0x12 - cmgt v24.4s, v0.4s, v21.4s - mls v0.4s, v4.4s, v22.4s - bic v4.16b, v4.16b, v24.16b - add v0.4s, v0.4s, v24.4s - str q5, [x0, #0x10] - str q6, [x0, #0x20] - str q7, [x0, #0x30] - str q4, [x0], #0x40 - str q1, [x1, #0x10] - str q2, [x1, #0x20] - str q3, [x1, #0x30] - str q0, [x1], #0x40 + cmgt v1.4s, v0.4s, v21.4s + ldr q5, [x1, #0x30] + cmgt v6.4s, v18.4s, v21.4s + sqdmulh v20.4s, v0.4s, v23.4s + stur q17, [x1, #-0x10] + srshr v3.4s, v3.4s, #0x12 + bic v7.16b, v29.16b, v4.16b + cmgt v2.4s, v26.4s, v21.4s + sqdmulh v16.4s, v5.4s, v23.4s + srshr v28.4s, v24.4s, #0x12 + stur q7, [x0, #-0x10] + cmgt v4.4s, v5.4s, v21.4s + srshr v31.4s, v20.4s, #0x12 + mls v18.4s, v3.4s, v22.4s + bic v3.16b, v3.16b, v6.16b + mls v26.4s, v28.4s, v22.4s + bic v19.16b, v28.16b, v2.16b + str q3, [x0, #0x20] + srshr v29.4s, v16.4s, #0x12 + str q19, [x0], #0x40 + mls v0.4s, v31.4s, v22.4s + bic v30.16b, v31.16b, v1.16b + add v3.4s, v18.4s, v6.4s + ldr q18, [x1, #0x60] + mls v5.4s, v29.4s, v22.4s + stur q30, [x0, #-0x70] + add v27.4s, v26.4s, v2.4s + ldr q26, [x1, #0x40] + add v25.4s, v0.4s, v1.4s + ldr q0, [x1, #0x10] + str q3, [x1, #0x20] + sqdmulh v3.4s, v18.4s, v23.4s + stur q25, [x1, #-0x30] + str q27, [x1], #0x40 + add v17.4s, v5.4s, v4.4s + sqdmulh v24.4s, v26.4s, v23.4s subs x3, x3, #0x1 b.ne Lpoly_decompose_32_loop + ldr q16, [x1, #0x30] + ldr q5, [x1, #0x10] + sqdmulh v28.4s, v0.4s, v23.4s + cmgt v19.4s, v26.4s, v21.4s + cmgt v2.4s, v0.4s, v21.4s + stur q17, [x1, #-0x10] + srshr v31.4s, v3.4s, #0x12 + sqdmulh v1.4s, v5.4s, v23.4s + cmgt v6.4s, v16.4s, v21.4s + srshr v24.4s, v24.4s, #0x12 + srshr v25.4s, v28.4s, #0x12 + sqdmulh v20.4s, v16.4s, v23.4s + cmgt v17.4s, v18.4s, v21.4s + mls v18.4s, v31.4s, v22.4s + cmgt v27.4s, v5.4s, v21.4s + srshr v30.4s, v1.4s, #0x12 + mls v0.4s, v25.4s, v22.4s + bic v28.16b, v25.16b, v2.16b + srshr v7.4s, v20.4s, #0x12 + bic v25.16b, v24.16b, v19.16b + mls v26.4s, v24.4s, v22.4s + bic v20.16b, v30.16b, v27.16b + stur q28, [x0, #-0x30] + add v3.4s, v18.4s, v17.4s + mls v5.4s, v30.4s, v22.4s + str q20, [x0, #0x10] + bic v28.16b, v7.16b, v6.16b + mls v16.4s, v7.4s, v22.4s + str q3, [x1, #0x20] + str q28, [x0, #0x30] + add v3.4s, v0.4s, v2.4s + bic v7.16b, v29.16b, v4.16b + str q25, [x0], #0x40 + stur q3, [x1, #-0x30] + add v24.4s, v5.4s, v27.4s + stur q7, [x0, #-0x50] + add v19.4s, v26.4s, v19.4s + add v25.4s, v16.4s, v6.4s + bic v6.16b, v31.16b, v17.16b + str q24, [x1, #0x10] + str q19, [x1], #0x40 + stur q6, [x0, #-0x20] + stur q25, [x1, #-0x10] ret .cfi_endproc diff --git a/mldsa/src/native/aarch64/src/poly_decompose_88_aarch64_asm.S b/mldsa/src/native/aarch64/src/poly_decompose_88_aarch64_asm.S index 100f27780..c463c4080 100644 --- a/mldsa/src/native/aarch64/src/poly_decompose_88_aarch64_asm.S +++ b/mldsa/src/native/aarch64/src/poly_decompose_88_aarch64_asm.S @@ -4,8 +4,7 @@ */ #include "../../../common.h" -#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_NO_SIGN_API) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ - (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44) +#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_NO_SIGN_API) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44) /* * WARNING: This file is auto-derived from the mldsa-native source file @@ -31,46 +30,119 @@ MLD_ASM_FN_SYMBOL(poly_decompose_88_aarch64_asm) movk w11, #0x5816, lsl #16 dup v23.4s, w11 mov x3, #0x10 // =16 + ldr q0, [x1, #0x10] + ldr q17, [x1, #0x20] + ldr q2, [x1, #0x30] + ldr q26, [x1, #0x40] + ldr q16, [x1] + ldr q18, [x1, #0x60] + cmgt v31.4s, v17.4s, v21.4s + sqdmulh v3.4s, v17.4s, v23.4s + cmgt v4.4s, v2.4s, v21.4s + cmgt v5.4s, v16.4s, v21.4s + sqdmulh v19.4s, v16.4s, v23.4s + sqdmulh v7.4s, v2.4s, v23.4s + srshr v3.4s, v3.4s, #0x11 + sqdmulh v24.4s, v26.4s, v23.4s + srshr v27.4s, v19.4s, #0x11 + mls v17.4s, v3.4s, v22.4s + srshr v29.4s, v7.4s, #0x11 + bic v6.16b, v3.16b, v31.16b + bic v1.16b, v27.16b, v5.16b + mls v16.4s, v27.4s, v22.4s + str q6, [x0, #0x20] + str q1, [x0], #0x40 + mls v2.4s, v29.4s, v22.4s + add v6.4s, v17.4s, v31.4s + sqdmulh v3.4s, v18.4s, v23.4s + str q6, [x1, #0x20] + add v20.4s, v16.4s, v5.4s + str q20, [x1], #0x40 + add v17.4s, v2.4s, v4.4s + sub x3, x3, #0x2 Lpoly_decompose_88_loop: - ldr q0, [x1] - ldr q1, [x1, #0x10] - ldr q2, [x1, #0x20] - ldr q3, [x1, #0x30] - sqdmulh v5.4s, v1.4s, v23.4s - srshr v5.4s, v5.4s, #0x11 - cmgt v24.4s, v1.4s, v21.4s - mls v1.4s, v5.4s, v22.4s - bic v5.16b, v5.16b, v24.16b - add v1.4s, v1.4s, v24.4s - sqdmulh v6.4s, v2.4s, v23.4s - srshr v6.4s, v6.4s, #0x11 - cmgt v24.4s, v2.4s, v21.4s - mls v2.4s, v6.4s, v22.4s - bic v6.16b, v6.16b, v24.16b - add v2.4s, v2.4s, v24.4s - sqdmulh v7.4s, v3.4s, v23.4s - srshr v7.4s, v7.4s, #0x11 - cmgt v24.4s, v3.4s, v21.4s - mls v3.4s, v7.4s, v22.4s - bic v7.16b, v7.16b, v24.16b - add v3.4s, v3.4s, v24.4s - sqdmulh v4.4s, v0.4s, v23.4s - srshr v4.4s, v4.4s, #0x11 - cmgt v24.4s, v0.4s, v21.4s - mls v0.4s, v4.4s, v22.4s - bic v4.16b, v4.16b, v24.16b - add v0.4s, v0.4s, v24.4s - str q5, [x0, #0x10] - str q6, [x0, #0x20] - str q7, [x0, #0x30] - str q4, [x0], #0x40 - str q1, [x1, #0x10] - str q2, [x1, #0x20] - str q3, [x1, #0x30] - str q0, [x1], #0x40 + cmgt v1.4s, v0.4s, v21.4s + ldr q5, [x1, #0x30] + cmgt v6.4s, v18.4s, v21.4s + sqdmulh v20.4s, v0.4s, v23.4s + stur q17, [x1, #-0x10] + srshr v3.4s, v3.4s, #0x11 + bic v7.16b, v29.16b, v4.16b + cmgt v2.4s, v26.4s, v21.4s + sqdmulh v16.4s, v5.4s, v23.4s + srshr v28.4s, v24.4s, #0x11 + stur q7, [x0, #-0x10] + cmgt v4.4s, v5.4s, v21.4s + srshr v31.4s, v20.4s, #0x11 + mls v18.4s, v3.4s, v22.4s + bic v3.16b, v3.16b, v6.16b + mls v26.4s, v28.4s, v22.4s + bic v19.16b, v28.16b, v2.16b + str q3, [x0, #0x20] + srshr v29.4s, v16.4s, #0x11 + str q19, [x0], #0x40 + mls v0.4s, v31.4s, v22.4s + bic v30.16b, v31.16b, v1.16b + add v3.4s, v18.4s, v6.4s + ldr q18, [x1, #0x60] + mls v5.4s, v29.4s, v22.4s + stur q30, [x0, #-0x70] + add v27.4s, v26.4s, v2.4s + ldr q26, [x1, #0x40] + add v25.4s, v0.4s, v1.4s + ldr q0, [x1, #0x10] + str q3, [x1, #0x20] + sqdmulh v3.4s, v18.4s, v23.4s + stur q25, [x1, #-0x30] + str q27, [x1], #0x40 + add v17.4s, v5.4s, v4.4s + sqdmulh v24.4s, v26.4s, v23.4s subs x3, x3, #0x1 b.ne Lpoly_decompose_88_loop + ldr q16, [x1, #0x30] + ldr q5, [x1, #0x10] + sqdmulh v28.4s, v0.4s, v23.4s + cmgt v19.4s, v26.4s, v21.4s + cmgt v2.4s, v0.4s, v21.4s + stur q17, [x1, #-0x10] + srshr v31.4s, v3.4s, #0x11 + sqdmulh v1.4s, v5.4s, v23.4s + cmgt v6.4s, v16.4s, v21.4s + srshr v24.4s, v24.4s, #0x11 + srshr v25.4s, v28.4s, #0x11 + sqdmulh v20.4s, v16.4s, v23.4s + cmgt v17.4s, v18.4s, v21.4s + mls v18.4s, v31.4s, v22.4s + cmgt v27.4s, v5.4s, v21.4s + srshr v30.4s, v1.4s, #0x11 + mls v0.4s, v25.4s, v22.4s + bic v28.16b, v25.16b, v2.16b + srshr v7.4s, v20.4s, #0x11 + bic v25.16b, v24.16b, v19.16b + mls v26.4s, v24.4s, v22.4s + bic v20.16b, v30.16b, v27.16b + stur q28, [x0, #-0x30] + add v3.4s, v18.4s, v17.4s + mls v5.4s, v30.4s, v22.4s + str q20, [x0, #0x10] + bic v28.16b, v7.16b, v6.16b + mls v16.4s, v7.4s, v22.4s + str q3, [x1, #0x20] + str q28, [x0, #0x30] + add v3.4s, v0.4s, v2.4s + bic v7.16b, v29.16b, v4.16b + str q25, [x0], #0x40 + stur q3, [x1, #-0x30] + add v24.4s, v5.4s, v27.4s + stur q7, [x0, #-0x50] + add v19.4s, v26.4s, v19.4s + add v25.4s, v16.4s, v6.4s + bic v6.16b, v31.16b, v17.16b + str q24, [x1, #0x10] + str q19, [x1], #0x40 + stur q6, [x0, #-0x20] + stur q25, [x1, #-0x10] ret .cfi_endproc diff --git a/proofs/hol_light/aarch64/mldsa/poly_decompose_32_aarch64_asm.S b/proofs/hol_light/aarch64/mldsa/poly_decompose_32_aarch64_asm.S index b0ad0492e..c1fb22a4f 100644 --- a/proofs/hol_light/aarch64/mldsa/poly_decompose_32_aarch64_asm.S +++ b/proofs/hol_light/aarch64/mldsa/poly_decompose_32_aarch64_asm.S @@ -33,46 +33,119 @@ mld_poly_decompose_32_aarch64_asm: movk w11, #0x4010, lsl #16 dup v23.4s, w11 mov x3, #0x10 // =16 + ldr q0, [x1, #0x10] + ldr q17, [x1, #0x20] + ldr q2, [x1, #0x30] + ldr q26, [x1, #0x40] + ldr q16, [x1] + ldr q18, [x1, #0x60] + cmgt v31.4s, v17.4s, v21.4s + sqdmulh v3.4s, v17.4s, v23.4s + cmgt v4.4s, v2.4s, v21.4s + cmgt v5.4s, v16.4s, v21.4s + sqdmulh v19.4s, v16.4s, v23.4s + sqdmulh v7.4s, v2.4s, v23.4s + srshr v3.4s, v3.4s, #0x12 + sqdmulh v24.4s, v26.4s, v23.4s + srshr v27.4s, v19.4s, #0x12 + mls v17.4s, v3.4s, v22.4s + srshr v29.4s, v7.4s, #0x12 + bic v6.16b, v3.16b, v31.16b + bic v1.16b, v27.16b, v5.16b + mls v16.4s, v27.4s, v22.4s + str q6, [x0, #0x20] + str q1, [x0], #0x40 + mls v2.4s, v29.4s, v22.4s + add v6.4s, v17.4s, v31.4s + sqdmulh v3.4s, v18.4s, v23.4s + str q6, [x1, #0x20] + add v20.4s, v16.4s, v5.4s + str q20, [x1], #0x40 + add v17.4s, v2.4s, v4.4s + sub x3, x3, #0x2 Lpoly_decompose_32_loop: - ldr q0, [x1] - ldr q1, [x1, #0x10] - ldr q2, [x1, #0x20] - ldr q3, [x1, #0x30] - sqdmulh v5.4s, v1.4s, v23.4s - srshr v5.4s, v5.4s, #0x12 - cmgt v24.4s, v1.4s, v21.4s - mls v1.4s, v5.4s, v22.4s - bic v5.16b, v5.16b, v24.16b - add v1.4s, v1.4s, v24.4s - sqdmulh v6.4s, v2.4s, v23.4s - srshr v6.4s, v6.4s, #0x12 - cmgt v24.4s, v2.4s, v21.4s - mls v2.4s, v6.4s, v22.4s - bic v6.16b, v6.16b, v24.16b - add v2.4s, v2.4s, v24.4s - sqdmulh v7.4s, v3.4s, v23.4s - srshr v7.4s, v7.4s, #0x12 - cmgt v24.4s, v3.4s, v21.4s - mls v3.4s, v7.4s, v22.4s - bic v7.16b, v7.16b, v24.16b - add v3.4s, v3.4s, v24.4s - sqdmulh v4.4s, v0.4s, v23.4s - srshr v4.4s, v4.4s, #0x12 - cmgt v24.4s, v0.4s, v21.4s - mls v0.4s, v4.4s, v22.4s - bic v4.16b, v4.16b, v24.16b - add v0.4s, v0.4s, v24.4s - str q5, [x0, #0x10] - str q6, [x0, #0x20] - str q7, [x0, #0x30] - str q4, [x0], #0x40 - str q1, [x1, #0x10] - str q2, [x1, #0x20] - str q3, [x1, #0x30] - str q0, [x1], #0x40 + cmgt v1.4s, v0.4s, v21.4s + ldr q5, [x1, #0x30] + cmgt v6.4s, v18.4s, v21.4s + sqdmulh v20.4s, v0.4s, v23.4s + stur q17, [x1, #-0x10] + srshr v3.4s, v3.4s, #0x12 + bic v7.16b, v29.16b, v4.16b + cmgt v2.4s, v26.4s, v21.4s + sqdmulh v16.4s, v5.4s, v23.4s + srshr v28.4s, v24.4s, #0x12 + stur q7, [x0, #-0x10] + cmgt v4.4s, v5.4s, v21.4s + srshr v31.4s, v20.4s, #0x12 + mls v18.4s, v3.4s, v22.4s + bic v3.16b, v3.16b, v6.16b + mls v26.4s, v28.4s, v22.4s + bic v19.16b, v28.16b, v2.16b + str q3, [x0, #0x20] + srshr v29.4s, v16.4s, #0x12 + str q19, [x0], #0x40 + mls v0.4s, v31.4s, v22.4s + bic v30.16b, v31.16b, v1.16b + add v3.4s, v18.4s, v6.4s + ldr q18, [x1, #0x60] + mls v5.4s, v29.4s, v22.4s + stur q30, [x0, #-0x70] + add v27.4s, v26.4s, v2.4s + ldr q26, [x1, #0x40] + add v25.4s, v0.4s, v1.4s + ldr q0, [x1, #0x10] + str q3, [x1, #0x20] + sqdmulh v3.4s, v18.4s, v23.4s + stur q25, [x1, #-0x30] + str q27, [x1], #0x40 + add v17.4s, v5.4s, v4.4s + sqdmulh v24.4s, v26.4s, v23.4s subs x3, x3, #0x1 b.ne Lpoly_decompose_32_loop + ldr q16, [x1, #0x30] + ldr q5, [x1, #0x10] + sqdmulh v28.4s, v0.4s, v23.4s + cmgt v19.4s, v26.4s, v21.4s + cmgt v2.4s, v0.4s, v21.4s + stur q17, [x1, #-0x10] + srshr v31.4s, v3.4s, #0x12 + sqdmulh v1.4s, v5.4s, v23.4s + cmgt v6.4s, v16.4s, v21.4s + srshr v24.4s, v24.4s, #0x12 + srshr v25.4s, v28.4s, #0x12 + sqdmulh v20.4s, v16.4s, v23.4s + cmgt v17.4s, v18.4s, v21.4s + mls v18.4s, v31.4s, v22.4s + cmgt v27.4s, v5.4s, v21.4s + srshr v30.4s, v1.4s, #0x12 + mls v0.4s, v25.4s, v22.4s + bic v28.16b, v25.16b, v2.16b + srshr v7.4s, v20.4s, #0x12 + bic v25.16b, v24.16b, v19.16b + mls v26.4s, v24.4s, v22.4s + bic v20.16b, v30.16b, v27.16b + stur q28, [x0, #-0x30] + add v3.4s, v18.4s, v17.4s + mls v5.4s, v30.4s, v22.4s + str q20, [x0, #0x10] + bic v28.16b, v7.16b, v6.16b + mls v16.4s, v7.4s, v22.4s + str q3, [x1, #0x20] + str q28, [x0, #0x30] + add v3.4s, v0.4s, v2.4s + bic v7.16b, v29.16b, v4.16b + str q25, [x0], #0x40 + stur q3, [x1, #-0x30] + add v24.4s, v5.4s, v27.4s + stur q7, [x0, #-0x50] + add v19.4s, v26.4s, v19.4s + add v25.4s, v16.4s, v6.4s + bic v6.16b, v31.16b, v17.16b + str q24, [x1, #0x10] + str q19, [x1], #0x40 + stur q6, [x0, #-0x20] + stur q25, [x1, #-0x10] ret .cfi_endproc diff --git a/proofs/hol_light/aarch64/mldsa/poly_decompose_88_aarch64_asm.S b/proofs/hol_light/aarch64/mldsa/poly_decompose_88_aarch64_asm.S index 58a494c8e..d413d84ea 100644 --- a/proofs/hol_light/aarch64/mldsa/poly_decompose_88_aarch64_asm.S +++ b/proofs/hol_light/aarch64/mldsa/poly_decompose_88_aarch64_asm.S @@ -33,46 +33,119 @@ mld_poly_decompose_88_aarch64_asm: movk w11, #0x5816, lsl #16 dup v23.4s, w11 mov x3, #0x10 // =16 + ldr q0, [x1, #0x10] + ldr q17, [x1, #0x20] + ldr q2, [x1, #0x30] + ldr q26, [x1, #0x40] + ldr q16, [x1] + ldr q18, [x1, #0x60] + cmgt v31.4s, v17.4s, v21.4s + sqdmulh v3.4s, v17.4s, v23.4s + cmgt v4.4s, v2.4s, v21.4s + cmgt v5.4s, v16.4s, v21.4s + sqdmulh v19.4s, v16.4s, v23.4s + sqdmulh v7.4s, v2.4s, v23.4s + srshr v3.4s, v3.4s, #0x11 + sqdmulh v24.4s, v26.4s, v23.4s + srshr v27.4s, v19.4s, #0x11 + mls v17.4s, v3.4s, v22.4s + srshr v29.4s, v7.4s, #0x11 + bic v6.16b, v3.16b, v31.16b + bic v1.16b, v27.16b, v5.16b + mls v16.4s, v27.4s, v22.4s + str q6, [x0, #0x20] + str q1, [x0], #0x40 + mls v2.4s, v29.4s, v22.4s + add v6.4s, v17.4s, v31.4s + sqdmulh v3.4s, v18.4s, v23.4s + str q6, [x1, #0x20] + add v20.4s, v16.4s, v5.4s + str q20, [x1], #0x40 + add v17.4s, v2.4s, v4.4s + sub x3, x3, #0x2 Lpoly_decompose_88_loop: - ldr q0, [x1] - ldr q1, [x1, #0x10] - ldr q2, [x1, #0x20] - ldr q3, [x1, #0x30] - sqdmulh v5.4s, v1.4s, v23.4s - srshr v5.4s, v5.4s, #0x11 - cmgt v24.4s, v1.4s, v21.4s - mls v1.4s, v5.4s, v22.4s - bic v5.16b, v5.16b, v24.16b - add v1.4s, v1.4s, v24.4s - sqdmulh v6.4s, v2.4s, v23.4s - srshr v6.4s, v6.4s, #0x11 - cmgt v24.4s, v2.4s, v21.4s - mls v2.4s, v6.4s, v22.4s - bic v6.16b, v6.16b, v24.16b - add v2.4s, v2.4s, v24.4s - sqdmulh v7.4s, v3.4s, v23.4s - srshr v7.4s, v7.4s, #0x11 - cmgt v24.4s, v3.4s, v21.4s - mls v3.4s, v7.4s, v22.4s - bic v7.16b, v7.16b, v24.16b - add v3.4s, v3.4s, v24.4s - sqdmulh v4.4s, v0.4s, v23.4s - srshr v4.4s, v4.4s, #0x11 - cmgt v24.4s, v0.4s, v21.4s - mls v0.4s, v4.4s, v22.4s - bic v4.16b, v4.16b, v24.16b - add v0.4s, v0.4s, v24.4s - str q5, [x0, #0x10] - str q6, [x0, #0x20] - str q7, [x0, #0x30] - str q4, [x0], #0x40 - str q1, [x1, #0x10] - str q2, [x1, #0x20] - str q3, [x1, #0x30] - str q0, [x1], #0x40 + cmgt v1.4s, v0.4s, v21.4s + ldr q5, [x1, #0x30] + cmgt v6.4s, v18.4s, v21.4s + sqdmulh v20.4s, v0.4s, v23.4s + stur q17, [x1, #-0x10] + srshr v3.4s, v3.4s, #0x11 + bic v7.16b, v29.16b, v4.16b + cmgt v2.4s, v26.4s, v21.4s + sqdmulh v16.4s, v5.4s, v23.4s + srshr v28.4s, v24.4s, #0x11 + stur q7, [x0, #-0x10] + cmgt v4.4s, v5.4s, v21.4s + srshr v31.4s, v20.4s, #0x11 + mls v18.4s, v3.4s, v22.4s + bic v3.16b, v3.16b, v6.16b + mls v26.4s, v28.4s, v22.4s + bic v19.16b, v28.16b, v2.16b + str q3, [x0, #0x20] + srshr v29.4s, v16.4s, #0x11 + str q19, [x0], #0x40 + mls v0.4s, v31.4s, v22.4s + bic v30.16b, v31.16b, v1.16b + add v3.4s, v18.4s, v6.4s + ldr q18, [x1, #0x60] + mls v5.4s, v29.4s, v22.4s + stur q30, [x0, #-0x70] + add v27.4s, v26.4s, v2.4s + ldr q26, [x1, #0x40] + add v25.4s, v0.4s, v1.4s + ldr q0, [x1, #0x10] + str q3, [x1, #0x20] + sqdmulh v3.4s, v18.4s, v23.4s + stur q25, [x1, #-0x30] + str q27, [x1], #0x40 + add v17.4s, v5.4s, v4.4s + sqdmulh v24.4s, v26.4s, v23.4s subs x3, x3, #0x1 b.ne Lpoly_decompose_88_loop + ldr q16, [x1, #0x30] + ldr q5, [x1, #0x10] + sqdmulh v28.4s, v0.4s, v23.4s + cmgt v19.4s, v26.4s, v21.4s + cmgt v2.4s, v0.4s, v21.4s + stur q17, [x1, #-0x10] + srshr v31.4s, v3.4s, #0x11 + sqdmulh v1.4s, v5.4s, v23.4s + cmgt v6.4s, v16.4s, v21.4s + srshr v24.4s, v24.4s, #0x11 + srshr v25.4s, v28.4s, #0x11 + sqdmulh v20.4s, v16.4s, v23.4s + cmgt v17.4s, v18.4s, v21.4s + mls v18.4s, v31.4s, v22.4s + cmgt v27.4s, v5.4s, v21.4s + srshr v30.4s, v1.4s, #0x11 + mls v0.4s, v25.4s, v22.4s + bic v28.16b, v25.16b, v2.16b + srshr v7.4s, v20.4s, #0x11 + bic v25.16b, v24.16b, v19.16b + mls v26.4s, v24.4s, v22.4s + bic v20.16b, v30.16b, v27.16b + stur q28, [x0, #-0x30] + add v3.4s, v18.4s, v17.4s + mls v5.4s, v30.4s, v22.4s + str q20, [x0, #0x10] + bic v28.16b, v7.16b, v6.16b + mls v16.4s, v7.4s, v22.4s + str q3, [x1, #0x20] + str q28, [x0, #0x30] + add v3.4s, v0.4s, v2.4s + bic v7.16b, v29.16b, v4.16b + str q25, [x0], #0x40 + stur q3, [x1, #-0x30] + add v24.4s, v5.4s, v27.4s + stur q7, [x0, #-0x50] + add v19.4s, v26.4s, v19.4s + add v25.4s, v16.4s, v6.4s + bic v6.16b, v31.16b, v17.16b + str q24, [x1, #0x10] + str q19, [x1], #0x40 + stur q6, [x0, #-0x20] + stur q25, [x1, #-0x10] ret .cfi_endproc diff --git a/proofs/hol_light/aarch64/proofs/poly_decompose_32_aarch64_asm.ml b/proofs/hol_light/aarch64/proofs/poly_decompose_32_aarch64_asm.ml index f62b972f9..53c816a1b 100644 --- a/proofs/hol_light/aarch64/proofs/poly_decompose_32_aarch64_asm.ml +++ b/proofs/hol_light/aarch64/proofs/poly_decompose_32_aarch64_asm.ml @@ -32,44 +32,117 @@ let poly_decompose_32_aarch64_asm_mc = define_assert_from_elf "poly_decompose_32 0x72a8020b; (* arm_MOVK W11 (word 16400) 16 *) 0x4e040d77; (* arm_DUP_GEN Q23 X11 32 128 *) 0xd2800203; (* arm_MOV X3 (rvalue (word 16)) *) - 0x3dc00020; (* arm_LDR Q0 X1 (Immediate_Offset (word 0)) *) - 0x3dc00421; (* arm_LDR Q1 X1 (Immediate_Offset (word 16)) *) - 0x3dc00822; (* arm_LDR Q2 X1 (Immediate_Offset (word 32)) *) - 0x3dc00c23; (* arm_LDR Q3 X1 (Immediate_Offset (word 48)) *) - 0x4eb7b425; (* arm_SQDMULH_VEC Q5 Q1 Q23 32 128 *) - 0x4f2e24a5; (* arm_SRSHR_VEC Q5 Q5 18 32 128 *) - 0x4eb53438; (* arm_CMGT_VEC Q24 Q1 Q21 32 128 *) - 0x6eb694a1; (* arm_MLS_VEC Q1 Q5 Q22 32 128 *) - 0x4e781ca5; (* arm_BIC_VEC Q5 Q5 Q24 128 *) - 0x4eb88421; (* arm_ADD_VEC Q1 Q1 Q24 32 128 *) - 0x4eb7b446; (* arm_SQDMULH_VEC Q6 Q2 Q23 32 128 *) - 0x4f2e24c6; (* arm_SRSHR_VEC Q6 Q6 18 32 128 *) - 0x4eb53458; (* arm_CMGT_VEC Q24 Q2 Q21 32 128 *) - 0x6eb694c2; (* arm_MLS_VEC Q2 Q6 Q22 32 128 *) - 0x4e781cc6; (* arm_BIC_VEC Q6 Q6 Q24 128 *) - 0x4eb88442; (* arm_ADD_VEC Q2 Q2 Q24 32 128 *) - 0x4eb7b467; (* arm_SQDMULH_VEC Q7 Q3 Q23 32 128 *) - 0x4f2e24e7; (* arm_SRSHR_VEC Q7 Q7 18 32 128 *) - 0x4eb53478; (* arm_CMGT_VEC Q24 Q3 Q21 32 128 *) - 0x6eb694e3; (* arm_MLS_VEC Q3 Q7 Q22 32 128 *) - 0x4e781ce7; (* arm_BIC_VEC Q7 Q7 Q24 128 *) - 0x4eb88463; (* arm_ADD_VEC Q3 Q3 Q24 32 128 *) - 0x4eb7b404; (* arm_SQDMULH_VEC Q4 Q0 Q23 32 128 *) - 0x4f2e2484; (* arm_SRSHR_VEC Q4 Q4 18 32 128 *) - 0x4eb53418; (* arm_CMGT_VEC Q24 Q0 Q21 32 128 *) - 0x6eb69480; (* arm_MLS_VEC Q0 Q4 Q22 32 128 *) - 0x4e781c84; (* arm_BIC_VEC Q4 Q4 Q24 128 *) - 0x4eb88400; (* arm_ADD_VEC Q0 Q0 Q24 32 128 *) - 0x3d800405; (* arm_STR Q5 X0 (Immediate_Offset (word 16)) *) + 0x3dc00420; (* arm_LDR Q0 X1 (Immediate_Offset (word 16)) *) + 0x3dc00831; (* arm_LDR Q17 X1 (Immediate_Offset (word 32)) *) + 0x3dc00c22; (* arm_LDR Q2 X1 (Immediate_Offset (word 48)) *) + 0x3dc0103a; (* arm_LDR Q26 X1 (Immediate_Offset (word 64)) *) + 0x3dc00030; (* arm_LDR Q16 X1 (Immediate_Offset (word 0)) *) + 0x3dc01832; (* arm_LDR Q18 X1 (Immediate_Offset (word 96)) *) + 0x4eb5363f; (* arm_CMGT_VEC Q31 Q17 Q21 32 128 *) + 0x4eb7b623; (* arm_SQDMULH_VEC Q3 Q17 Q23 32 128 *) + 0x4eb53444; (* arm_CMGT_VEC Q4 Q2 Q21 32 128 *) + 0x4eb53605; (* arm_CMGT_VEC Q5 Q16 Q21 32 128 *) + 0x4eb7b613; (* arm_SQDMULH_VEC Q19 Q16 Q23 32 128 *) + 0x4eb7b447; (* arm_SQDMULH_VEC Q7 Q2 Q23 32 128 *) + 0x4f2e2463; (* arm_SRSHR_VEC Q3 Q3 18 32 128 *) + 0x4eb7b758; (* arm_SQDMULH_VEC Q24 Q26 Q23 32 128 *) + 0x4f2e267b; (* arm_SRSHR_VEC Q27 Q19 18 32 128 *) + 0x6eb69471; (* arm_MLS_VEC Q17 Q3 Q22 32 128 *) + 0x4f2e24fd; (* arm_SRSHR_VEC Q29 Q7 18 32 128 *) + 0x4e7f1c66; (* arm_BIC_VEC Q6 Q3 Q31 128 *) + 0x4e651f61; (* arm_BIC_VEC Q1 Q27 Q5 128 *) + 0x6eb69770; (* arm_MLS_VEC Q16 Q27 Q22 32 128 *) 0x3d800806; (* arm_STR Q6 X0 (Immediate_Offset (word 32)) *) - 0x3d800c07; (* arm_STR Q7 X0 (Immediate_Offset (word 48)) *) - 0x3c840404; (* arm_STR Q4 X0 (Postimmediate_Offset (word 64)) *) - 0x3d800421; (* arm_STR Q1 X1 (Immediate_Offset (word 16)) *) - 0x3d800822; (* arm_STR Q2 X1 (Immediate_Offset (word 32)) *) - 0x3d800c23; (* arm_STR Q3 X1 (Immediate_Offset (word 48)) *) - 0x3c840420; (* arm_STR Q0 X1 (Postimmediate_Offset (word 64)) *) + 0x3c840401; (* arm_STR Q1 X0 (Postimmediate_Offset (word 64)) *) + 0x6eb697a2; (* arm_MLS_VEC Q2 Q29 Q22 32 128 *) + 0x4ebf8626; (* arm_ADD_VEC Q6 Q17 Q31 32 128 *) + 0x4eb7b643; (* arm_SQDMULH_VEC Q3 Q18 Q23 32 128 *) + 0x3d800826; (* arm_STR Q6 X1 (Immediate_Offset (word 32)) *) + 0x4ea58614; (* arm_ADD_VEC Q20 Q16 Q5 32 128 *) + 0x3c840434; (* arm_STR Q20 X1 (Postimmediate_Offset (word 64)) *) + 0x4ea48451; (* arm_ADD_VEC Q17 Q2 Q4 32 128 *) + 0xd1000863; (* arm_SUB X3 X3 (rvalue (word 2)) *) + 0x4eb53401; (* arm_CMGT_VEC Q1 Q0 Q21 32 128 *) + 0x3dc00c25; (* arm_LDR Q5 X1 (Immediate_Offset (word 48)) *) + 0x4eb53646; (* arm_CMGT_VEC Q6 Q18 Q21 32 128 *) + 0x4eb7b414; (* arm_SQDMULH_VEC Q20 Q0 Q23 32 128 *) + 0x3c9f0031; (* arm_STR Q17 X1 (Immediate_Offset (word 18446744073709551600)) *) + 0x4f2e2463; (* arm_SRSHR_VEC Q3 Q3 18 32 128 *) + 0x4e641fa7; (* arm_BIC_VEC Q7 Q29 Q4 128 *) + 0x4eb53742; (* arm_CMGT_VEC Q2 Q26 Q21 32 128 *) + 0x4eb7b4b0; (* arm_SQDMULH_VEC Q16 Q5 Q23 32 128 *) + 0x4f2e271c; (* arm_SRSHR_VEC Q28 Q24 18 32 128 *) + 0x3c9f0007; (* arm_STR Q7 X0 (Immediate_Offset (word 18446744073709551600)) *) + 0x4eb534a4; (* arm_CMGT_VEC Q4 Q5 Q21 32 128 *) + 0x4f2e269f; (* arm_SRSHR_VEC Q31 Q20 18 32 128 *) + 0x6eb69472; (* arm_MLS_VEC Q18 Q3 Q22 32 128 *) + 0x4e661c63; (* arm_BIC_VEC Q3 Q3 Q6 128 *) + 0x6eb6979a; (* arm_MLS_VEC Q26 Q28 Q22 32 128 *) + 0x4e621f93; (* arm_BIC_VEC Q19 Q28 Q2 128 *) + 0x3d800803; (* arm_STR Q3 X0 (Immediate_Offset (word 32)) *) + 0x4f2e261d; (* arm_SRSHR_VEC Q29 Q16 18 32 128 *) + 0x3c840413; (* arm_STR Q19 X0 (Postimmediate_Offset (word 64)) *) + 0x6eb697e0; (* arm_MLS_VEC Q0 Q31 Q22 32 128 *) + 0x4e611ffe; (* arm_BIC_VEC Q30 Q31 Q1 128 *) + 0x4ea68643; (* arm_ADD_VEC Q3 Q18 Q6 32 128 *) + 0x3dc01832; (* arm_LDR Q18 X1 (Immediate_Offset (word 96)) *) + 0x6eb697a5; (* arm_MLS_VEC Q5 Q29 Q22 32 128 *) + 0x3c99001e; (* arm_STR Q30 X0 (Immediate_Offset (word 18446744073709551504)) *) + 0x4ea2875b; (* arm_ADD_VEC Q27 Q26 Q2 32 128 *) + 0x3dc0103a; (* arm_LDR Q26 X1 (Immediate_Offset (word 64)) *) + 0x4ea18419; (* arm_ADD_VEC Q25 Q0 Q1 32 128 *) + 0x3dc00420; (* arm_LDR Q0 X1 (Immediate_Offset (word 16)) *) + 0x3d800823; (* arm_STR Q3 X1 (Immediate_Offset (word 32)) *) + 0x4eb7b643; (* arm_SQDMULH_VEC Q3 Q18 Q23 32 128 *) + 0x3c9d0039; (* arm_STR Q25 X1 (Immediate_Offset (word 18446744073709551568)) *) + 0x3c84043b; (* arm_STR Q27 X1 (Postimmediate_Offset (word 64)) *) + 0x4ea484b1; (* arm_ADD_VEC Q17 Q5 Q4 32 128 *) + 0x4eb7b758; (* arm_SQDMULH_VEC Q24 Q26 Q23 32 128 *) 0xf1000463; (* arm_SUBS X3 X3 (rvalue (word 1)) *) 0x54fffb61; (* arm_BNE (word 2097004) *) + 0x3dc00c30; (* arm_LDR Q16 X1 (Immediate_Offset (word 48)) *) + 0x3dc00425; (* arm_LDR Q5 X1 (Immediate_Offset (word 16)) *) + 0x4eb7b41c; (* arm_SQDMULH_VEC Q28 Q0 Q23 32 128 *) + 0x4eb53753; (* arm_CMGT_VEC Q19 Q26 Q21 32 128 *) + 0x4eb53402; (* arm_CMGT_VEC Q2 Q0 Q21 32 128 *) + 0x3c9f0031; (* arm_STR Q17 X1 (Immediate_Offset (word 18446744073709551600)) *) + 0x4f2e247f; (* arm_SRSHR_VEC Q31 Q3 18 32 128 *) + 0x4eb7b4a1; (* arm_SQDMULH_VEC Q1 Q5 Q23 32 128 *) + 0x4eb53606; (* arm_CMGT_VEC Q6 Q16 Q21 32 128 *) + 0x4f2e2718; (* arm_SRSHR_VEC Q24 Q24 18 32 128 *) + 0x4f2e2799; (* arm_SRSHR_VEC Q25 Q28 18 32 128 *) + 0x4eb7b614; (* arm_SQDMULH_VEC Q20 Q16 Q23 32 128 *) + 0x4eb53651; (* arm_CMGT_VEC Q17 Q18 Q21 32 128 *) + 0x6eb697f2; (* arm_MLS_VEC Q18 Q31 Q22 32 128 *) + 0x4eb534bb; (* arm_CMGT_VEC Q27 Q5 Q21 32 128 *) + 0x4f2e243e; (* arm_SRSHR_VEC Q30 Q1 18 32 128 *) + 0x6eb69720; (* arm_MLS_VEC Q0 Q25 Q22 32 128 *) + 0x4e621f3c; (* arm_BIC_VEC Q28 Q25 Q2 128 *) + 0x4f2e2687; (* arm_SRSHR_VEC Q7 Q20 18 32 128 *) + 0x4e731f19; (* arm_BIC_VEC Q25 Q24 Q19 128 *) + 0x6eb6971a; (* arm_MLS_VEC Q26 Q24 Q22 32 128 *) + 0x4e7b1fd4; (* arm_BIC_VEC Q20 Q30 Q27 128 *) + 0x3c9d001c; (* arm_STR Q28 X0 (Immediate_Offset (word 18446744073709551568)) *) + 0x4eb18643; (* arm_ADD_VEC Q3 Q18 Q17 32 128 *) + 0x6eb697c5; (* arm_MLS_VEC Q5 Q30 Q22 32 128 *) + 0x3d800414; (* arm_STR Q20 X0 (Immediate_Offset (word 16)) *) + 0x4e661cfc; (* arm_BIC_VEC Q28 Q7 Q6 128 *) + 0x6eb694f0; (* arm_MLS_VEC Q16 Q7 Q22 32 128 *) + 0x3d800823; (* arm_STR Q3 X1 (Immediate_Offset (word 32)) *) + 0x3d800c1c; (* arm_STR Q28 X0 (Immediate_Offset (word 48)) *) + 0x4ea28403; (* arm_ADD_VEC Q3 Q0 Q2 32 128 *) + 0x4e641fa7; (* arm_BIC_VEC Q7 Q29 Q4 128 *) + 0x3c840419; (* arm_STR Q25 X0 (Postimmediate_Offset (word 64)) *) + 0x3c9d0023; (* arm_STR Q3 X1 (Immediate_Offset (word 18446744073709551568)) *) + 0x4ebb84b8; (* arm_ADD_VEC Q24 Q5 Q27 32 128 *) + 0x3c9b0007; (* arm_STR Q7 X0 (Immediate_Offset (word 18446744073709551536)) *) + 0x4eb38753; (* arm_ADD_VEC Q19 Q26 Q19 32 128 *) + 0x4ea68619; (* arm_ADD_VEC Q25 Q16 Q6 32 128 *) + 0x4e711fe6; (* arm_BIC_VEC Q6 Q31 Q17 128 *) + 0x3d800438; (* arm_STR Q24 X1 (Immediate_Offset (word 16)) *) + 0x3c840433; (* arm_STR Q19 X1 (Postimmediate_Offset (word 64)) *) + 0x3c9e0006; (* arm_STR Q6 X0 (Immediate_Offset (word 18446744073709551584)) *) + 0x3c9f0039; (* arm_STR Q25 X1 (Immediate_Offset (word 18446744073709551600)) *) 0xd65f03c0 (* arm_RET X30 *) ];; (*** BYTECODE END ***) diff --git a/proofs/hol_light/aarch64/proofs/poly_decompose_88_aarch64_asm.ml b/proofs/hol_light/aarch64/proofs/poly_decompose_88_aarch64_asm.ml index b5e319b5c..1a31923ae 100644 --- a/proofs/hol_light/aarch64/proofs/poly_decompose_88_aarch64_asm.ml +++ b/proofs/hol_light/aarch64/proofs/poly_decompose_88_aarch64_asm.ml @@ -32,44 +32,117 @@ let poly_decompose_88_aarch64_asm_mc = define_assert_from_elf "poly_decompose_88 0x72ab02cb; (* arm_MOVK W11 (word 22550) 16 *) 0x4e040d77; (* arm_DUP_GEN Q23 X11 32 128 *) 0xd2800203; (* arm_MOV X3 (rvalue (word 16)) *) - 0x3dc00020; (* arm_LDR Q0 X1 (Immediate_Offset (word 0)) *) - 0x3dc00421; (* arm_LDR Q1 X1 (Immediate_Offset (word 16)) *) - 0x3dc00822; (* arm_LDR Q2 X1 (Immediate_Offset (word 32)) *) - 0x3dc00c23; (* arm_LDR Q3 X1 (Immediate_Offset (word 48)) *) - 0x4eb7b425; (* arm_SQDMULH_VEC Q5 Q1 Q23 32 128 *) - 0x4f2f24a5; (* arm_SRSHR_VEC Q5 Q5 17 32 128 *) - 0x4eb53438; (* arm_CMGT_VEC Q24 Q1 Q21 32 128 *) - 0x6eb694a1; (* arm_MLS_VEC Q1 Q5 Q22 32 128 *) - 0x4e781ca5; (* arm_BIC_VEC Q5 Q5 Q24 128 *) - 0x4eb88421; (* arm_ADD_VEC Q1 Q1 Q24 32 128 *) - 0x4eb7b446; (* arm_SQDMULH_VEC Q6 Q2 Q23 32 128 *) - 0x4f2f24c6; (* arm_SRSHR_VEC Q6 Q6 17 32 128 *) - 0x4eb53458; (* arm_CMGT_VEC Q24 Q2 Q21 32 128 *) - 0x6eb694c2; (* arm_MLS_VEC Q2 Q6 Q22 32 128 *) - 0x4e781cc6; (* arm_BIC_VEC Q6 Q6 Q24 128 *) - 0x4eb88442; (* arm_ADD_VEC Q2 Q2 Q24 32 128 *) - 0x4eb7b467; (* arm_SQDMULH_VEC Q7 Q3 Q23 32 128 *) - 0x4f2f24e7; (* arm_SRSHR_VEC Q7 Q7 17 32 128 *) - 0x4eb53478; (* arm_CMGT_VEC Q24 Q3 Q21 32 128 *) - 0x6eb694e3; (* arm_MLS_VEC Q3 Q7 Q22 32 128 *) - 0x4e781ce7; (* arm_BIC_VEC Q7 Q7 Q24 128 *) - 0x4eb88463; (* arm_ADD_VEC Q3 Q3 Q24 32 128 *) - 0x4eb7b404; (* arm_SQDMULH_VEC Q4 Q0 Q23 32 128 *) - 0x4f2f2484; (* arm_SRSHR_VEC Q4 Q4 17 32 128 *) - 0x4eb53418; (* arm_CMGT_VEC Q24 Q0 Q21 32 128 *) - 0x6eb69480; (* arm_MLS_VEC Q0 Q4 Q22 32 128 *) - 0x4e781c84; (* arm_BIC_VEC Q4 Q4 Q24 128 *) - 0x4eb88400; (* arm_ADD_VEC Q0 Q0 Q24 32 128 *) - 0x3d800405; (* arm_STR Q5 X0 (Immediate_Offset (word 16)) *) + 0x3dc00420; (* arm_LDR Q0 X1 (Immediate_Offset (word 16)) *) + 0x3dc00831; (* arm_LDR Q17 X1 (Immediate_Offset (word 32)) *) + 0x3dc00c22; (* arm_LDR Q2 X1 (Immediate_Offset (word 48)) *) + 0x3dc0103a; (* arm_LDR Q26 X1 (Immediate_Offset (word 64)) *) + 0x3dc00030; (* arm_LDR Q16 X1 (Immediate_Offset (word 0)) *) + 0x3dc01832; (* arm_LDR Q18 X1 (Immediate_Offset (word 96)) *) + 0x4eb5363f; (* arm_CMGT_VEC Q31 Q17 Q21 32 128 *) + 0x4eb7b623; (* arm_SQDMULH_VEC Q3 Q17 Q23 32 128 *) + 0x4eb53444; (* arm_CMGT_VEC Q4 Q2 Q21 32 128 *) + 0x4eb53605; (* arm_CMGT_VEC Q5 Q16 Q21 32 128 *) + 0x4eb7b613; (* arm_SQDMULH_VEC Q19 Q16 Q23 32 128 *) + 0x4eb7b447; (* arm_SQDMULH_VEC Q7 Q2 Q23 32 128 *) + 0x4f2f2463; (* arm_SRSHR_VEC Q3 Q3 17 32 128 *) + 0x4eb7b758; (* arm_SQDMULH_VEC Q24 Q26 Q23 32 128 *) + 0x4f2f267b; (* arm_SRSHR_VEC Q27 Q19 17 32 128 *) + 0x6eb69471; (* arm_MLS_VEC Q17 Q3 Q22 32 128 *) + 0x4f2f24fd; (* arm_SRSHR_VEC Q29 Q7 17 32 128 *) + 0x4e7f1c66; (* arm_BIC_VEC Q6 Q3 Q31 128 *) + 0x4e651f61; (* arm_BIC_VEC Q1 Q27 Q5 128 *) + 0x6eb69770; (* arm_MLS_VEC Q16 Q27 Q22 32 128 *) 0x3d800806; (* arm_STR Q6 X0 (Immediate_Offset (word 32)) *) - 0x3d800c07; (* arm_STR Q7 X0 (Immediate_Offset (word 48)) *) - 0x3c840404; (* arm_STR Q4 X0 (Postimmediate_Offset (word 64)) *) - 0x3d800421; (* arm_STR Q1 X1 (Immediate_Offset (word 16)) *) - 0x3d800822; (* arm_STR Q2 X1 (Immediate_Offset (word 32)) *) - 0x3d800c23; (* arm_STR Q3 X1 (Immediate_Offset (word 48)) *) - 0x3c840420; (* arm_STR Q0 X1 (Postimmediate_Offset (word 64)) *) + 0x3c840401; (* arm_STR Q1 X0 (Postimmediate_Offset (word 64)) *) + 0x6eb697a2; (* arm_MLS_VEC Q2 Q29 Q22 32 128 *) + 0x4ebf8626; (* arm_ADD_VEC Q6 Q17 Q31 32 128 *) + 0x4eb7b643; (* arm_SQDMULH_VEC Q3 Q18 Q23 32 128 *) + 0x3d800826; (* arm_STR Q6 X1 (Immediate_Offset (word 32)) *) + 0x4ea58614; (* arm_ADD_VEC Q20 Q16 Q5 32 128 *) + 0x3c840434; (* arm_STR Q20 X1 (Postimmediate_Offset (word 64)) *) + 0x4ea48451; (* arm_ADD_VEC Q17 Q2 Q4 32 128 *) + 0xd1000863; (* arm_SUB X3 X3 (rvalue (word 2)) *) + 0x4eb53401; (* arm_CMGT_VEC Q1 Q0 Q21 32 128 *) + 0x3dc00c25; (* arm_LDR Q5 X1 (Immediate_Offset (word 48)) *) + 0x4eb53646; (* arm_CMGT_VEC Q6 Q18 Q21 32 128 *) + 0x4eb7b414; (* arm_SQDMULH_VEC Q20 Q0 Q23 32 128 *) + 0x3c9f0031; (* arm_STR Q17 X1 (Immediate_Offset (word 18446744073709551600)) *) + 0x4f2f2463; (* arm_SRSHR_VEC Q3 Q3 17 32 128 *) + 0x4e641fa7; (* arm_BIC_VEC Q7 Q29 Q4 128 *) + 0x4eb53742; (* arm_CMGT_VEC Q2 Q26 Q21 32 128 *) + 0x4eb7b4b0; (* arm_SQDMULH_VEC Q16 Q5 Q23 32 128 *) + 0x4f2f271c; (* arm_SRSHR_VEC Q28 Q24 17 32 128 *) + 0x3c9f0007; (* arm_STR Q7 X0 (Immediate_Offset (word 18446744073709551600)) *) + 0x4eb534a4; (* arm_CMGT_VEC Q4 Q5 Q21 32 128 *) + 0x4f2f269f; (* arm_SRSHR_VEC Q31 Q20 17 32 128 *) + 0x6eb69472; (* arm_MLS_VEC Q18 Q3 Q22 32 128 *) + 0x4e661c63; (* arm_BIC_VEC Q3 Q3 Q6 128 *) + 0x6eb6979a; (* arm_MLS_VEC Q26 Q28 Q22 32 128 *) + 0x4e621f93; (* arm_BIC_VEC Q19 Q28 Q2 128 *) + 0x3d800803; (* arm_STR Q3 X0 (Immediate_Offset (word 32)) *) + 0x4f2f261d; (* arm_SRSHR_VEC Q29 Q16 17 32 128 *) + 0x3c840413; (* arm_STR Q19 X0 (Postimmediate_Offset (word 64)) *) + 0x6eb697e0; (* arm_MLS_VEC Q0 Q31 Q22 32 128 *) + 0x4e611ffe; (* arm_BIC_VEC Q30 Q31 Q1 128 *) + 0x4ea68643; (* arm_ADD_VEC Q3 Q18 Q6 32 128 *) + 0x3dc01832; (* arm_LDR Q18 X1 (Immediate_Offset (word 96)) *) + 0x6eb697a5; (* arm_MLS_VEC Q5 Q29 Q22 32 128 *) + 0x3c99001e; (* arm_STR Q30 X0 (Immediate_Offset (word 18446744073709551504)) *) + 0x4ea2875b; (* arm_ADD_VEC Q27 Q26 Q2 32 128 *) + 0x3dc0103a; (* arm_LDR Q26 X1 (Immediate_Offset (word 64)) *) + 0x4ea18419; (* arm_ADD_VEC Q25 Q0 Q1 32 128 *) + 0x3dc00420; (* arm_LDR Q0 X1 (Immediate_Offset (word 16)) *) + 0x3d800823; (* arm_STR Q3 X1 (Immediate_Offset (word 32)) *) + 0x4eb7b643; (* arm_SQDMULH_VEC Q3 Q18 Q23 32 128 *) + 0x3c9d0039; (* arm_STR Q25 X1 (Immediate_Offset (word 18446744073709551568)) *) + 0x3c84043b; (* arm_STR Q27 X1 (Postimmediate_Offset (word 64)) *) + 0x4ea484b1; (* arm_ADD_VEC Q17 Q5 Q4 32 128 *) + 0x4eb7b758; (* arm_SQDMULH_VEC Q24 Q26 Q23 32 128 *) 0xf1000463; (* arm_SUBS X3 X3 (rvalue (word 1)) *) 0x54fffb61; (* arm_BNE (word 2097004) *) + 0x3dc00c30; (* arm_LDR Q16 X1 (Immediate_Offset (word 48)) *) + 0x3dc00425; (* arm_LDR Q5 X1 (Immediate_Offset (word 16)) *) + 0x4eb7b41c; (* arm_SQDMULH_VEC Q28 Q0 Q23 32 128 *) + 0x4eb53753; (* arm_CMGT_VEC Q19 Q26 Q21 32 128 *) + 0x4eb53402; (* arm_CMGT_VEC Q2 Q0 Q21 32 128 *) + 0x3c9f0031; (* arm_STR Q17 X1 (Immediate_Offset (word 18446744073709551600)) *) + 0x4f2f247f; (* arm_SRSHR_VEC Q31 Q3 17 32 128 *) + 0x4eb7b4a1; (* arm_SQDMULH_VEC Q1 Q5 Q23 32 128 *) + 0x4eb53606; (* arm_CMGT_VEC Q6 Q16 Q21 32 128 *) + 0x4f2f2718; (* arm_SRSHR_VEC Q24 Q24 17 32 128 *) + 0x4f2f2799; (* arm_SRSHR_VEC Q25 Q28 17 32 128 *) + 0x4eb7b614; (* arm_SQDMULH_VEC Q20 Q16 Q23 32 128 *) + 0x4eb53651; (* arm_CMGT_VEC Q17 Q18 Q21 32 128 *) + 0x6eb697f2; (* arm_MLS_VEC Q18 Q31 Q22 32 128 *) + 0x4eb534bb; (* arm_CMGT_VEC Q27 Q5 Q21 32 128 *) + 0x4f2f243e; (* arm_SRSHR_VEC Q30 Q1 17 32 128 *) + 0x6eb69720; (* arm_MLS_VEC Q0 Q25 Q22 32 128 *) + 0x4e621f3c; (* arm_BIC_VEC Q28 Q25 Q2 128 *) + 0x4f2f2687; (* arm_SRSHR_VEC Q7 Q20 17 32 128 *) + 0x4e731f19; (* arm_BIC_VEC Q25 Q24 Q19 128 *) + 0x6eb6971a; (* arm_MLS_VEC Q26 Q24 Q22 32 128 *) + 0x4e7b1fd4; (* arm_BIC_VEC Q20 Q30 Q27 128 *) + 0x3c9d001c; (* arm_STR Q28 X0 (Immediate_Offset (word 18446744073709551568)) *) + 0x4eb18643; (* arm_ADD_VEC Q3 Q18 Q17 32 128 *) + 0x6eb697c5; (* arm_MLS_VEC Q5 Q30 Q22 32 128 *) + 0x3d800414; (* arm_STR Q20 X0 (Immediate_Offset (word 16)) *) + 0x4e661cfc; (* arm_BIC_VEC Q28 Q7 Q6 128 *) + 0x6eb694f0; (* arm_MLS_VEC Q16 Q7 Q22 32 128 *) + 0x3d800823; (* arm_STR Q3 X1 (Immediate_Offset (word 32)) *) + 0x3d800c1c; (* arm_STR Q28 X0 (Immediate_Offset (word 48)) *) + 0x4ea28403; (* arm_ADD_VEC Q3 Q0 Q2 32 128 *) + 0x4e641fa7; (* arm_BIC_VEC Q7 Q29 Q4 128 *) + 0x3c840419; (* arm_STR Q25 X0 (Postimmediate_Offset (word 64)) *) + 0x3c9d0023; (* arm_STR Q3 X1 (Immediate_Offset (word 18446744073709551568)) *) + 0x4ebb84b8; (* arm_ADD_VEC Q24 Q5 Q27 32 128 *) + 0x3c9b0007; (* arm_STR Q7 X0 (Immediate_Offset (word 18446744073709551536)) *) + 0x4eb38753; (* arm_ADD_VEC Q19 Q26 Q19 32 128 *) + 0x4ea68619; (* arm_ADD_VEC Q25 Q16 Q6 32 128 *) + 0x4e711fe6; (* arm_BIC_VEC Q6 Q31 Q17 128 *) + 0x3d800438; (* arm_STR Q24 X1 (Immediate_Offset (word 16)) *) + 0x3c840433; (* arm_STR Q19 X1 (Postimmediate_Offset (word 64)) *) + 0x3c9e0006; (* arm_STR Q6 X0 (Immediate_Offset (word 18446744073709551584)) *) + 0x3c9f0039; (* arm_STR Q25 X1 (Immediate_Offset (word 18446744073709551600)) *) 0xd65f03c0 (* arm_RET X30 *) ];; (*** BYTECODE END ***)