diff --git a/.github/workflows/symbol-prefixes.yml b/.github/workflows/symbol-prefixes.yml
index 33142162ccf..fd767ca0092 100644
--- a/.github/workflows/symbol-prefixes.yml
+++ b/.github/workflows/symbol-prefixes.yml
@@ -49,7 +49,7 @@ jobs:
{
if (($7 !~ /^[0-9]+$/) ||
($8 ~ /^(wc_|wolf|WOLF|__pfx|fe_|sp_[a-zA-Z090-0_]*[0-9])/) ||
- ($8 ~ /(_avx[12]|_AVX[12]|_sse[12]|_SSE[12]|_aesni|_AESNI|_bmi2|_x64$)/))
+ ($8 ~ /(_avx[12]|_AVX[12]|_sse[12]|_SSE[12]|_aesni|_AESNI|_vaes|_VAES|_avx512|_AVX512|_bmi2|_x64$)/))
{
next;
}
diff --git a/.wolfssl_known_macro_extras b/.wolfssl_known_macro_extras
index 101ebf2fa88..c08a8580379 100644
--- a/.wolfssl_known_macro_extras
+++ b/.wolfssl_known_macro_extras
@@ -381,6 +381,7 @@ NO_AES_DECRYPT
NO_ARDUINO_DEFAULT
NO_ASM
NO_ASN_OLD_TYPE_NAMES
+NO_AVX512_SUPPORT
NO_CAMELLIA_CBC
NO_CERT
NO_CERT_IN_TICKET
@@ -459,6 +460,7 @@ NO_STDIO_FGETS_REMAP
NO_STM32_HMAC
NO_TKERNEL_MEM_POOL
NO_TLSX_PSKKEM_PLAIN_ANNOUNCE
+NO_VAES_SUPPORT
NO_VERIFY_OID
NO_WC_DHGENERATEPUBLIC
NO_WC_SHE_GETUID
diff --git a/linuxkm/Kbuild b/linuxkm/Kbuild
index fe3f823942f..831a45c76a4 100644
--- a/linuxkm/Kbuild
+++ b/linuxkm/Kbuild
@@ -200,6 +200,8 @@ $(obj)/wolfcrypt/src/aes_gcm_asm.o: asflags-y := $(WOLFSSL_ASFLAGS) $(ASFLAGS_FP
$(obj)/wolfcrypt/src/aes_gcm_asm.o: OBJECT_FILES_NON_STANDARD := y
$(obj)/wolfcrypt/src/aes_xts_asm.o: asflags-y := $(WOLFSSL_ASFLAGS) $(ASFLAGS_FPU_DISABLE_SIMD_ENABLE)
$(obj)/wolfcrypt/src/aes_xts_asm.o: OBJECT_FILES_NON_STANDARD := y
+$(obj)/wolfcrypt/src/aes_x86_64_asm.o: asflags-y := $(WOLFSSL_ASFLAGS) $(ASFLAGS_FPU_DISABLE_SIMD_ENABLE)
+$(obj)/wolfcrypt/src/aes_x86_64_asm.o: OBJECT_FILES_NON_STANDARD := y
$(obj)/wolfcrypt/src/sp_x86_64_asm.o: asflags-y := $(WOLFSSL_ASFLAGS) $(ASFLAGS_FPU_DISABLE_SIMD_ENABLE)
$(obj)/wolfcrypt/src/sp_x86_64_asm.o: OBJECT_FILES_NON_STANDARD := y
$(obj)/wolfcrypt/src/sha256_asm.o: asflags-y := $(WOLFSSL_ASFLAGS) $(ASFLAGS_FPU_DISABLE_SIMD_ENABLE)
diff --git a/src/include.am b/src/include.am
index 4b80e149bac..2e904706f83 100644
--- a/src/include.am
+++ b/src/include.am
@@ -109,6 +109,7 @@ endif
if BUILD_AESNI
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_asm.S
+src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_x86_64_asm.S
if BUILD_X86_ASM
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_gcm_x86_asm.S
else
@@ -259,6 +260,7 @@ endif BUILD_PPC64_ASM
if BUILD_AESNI
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_asm.S
+src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_x86_64_asm.S
if BUILD_X86_ASM
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_gcm_x86_asm.S
else
@@ -532,6 +534,7 @@ endif BUILD_PPC64_ASM
if BUILD_AESNI
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_asm.S
+src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_x86_64_asm.S
if BUILD_X86_ASM
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_gcm_x86_asm.S
else
@@ -867,6 +870,7 @@ endif BUILD_AES
if BUILD_AESNI
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_asm.S
+src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_x86_64_asm.S
if BUILD_X86_ASM
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_gcm_x86_asm.S
else
@@ -1708,6 +1712,7 @@ endif
if !BUILD_FIPS_V2_PLUS
if BUILD_AESNI
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_asm.S
+src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_x86_64_asm.S
if BUILD_X86_ASM
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_gcm_x86_asm.S
else
diff --git a/wolfcrypt/benchmark/benchmark.c b/wolfcrypt/benchmark/benchmark.c
index 214d873bc2f..a58fd300a3a 100644
--- a/wolfcrypt/benchmark/benchmark.c
+++ b/wolfcrypt/benchmark/benchmark.c
@@ -4794,6 +4794,8 @@ static void print_cpu_features(void)
if (IS_INTEL_MOVBE(cpuid_flags)) printf(" movbe");
if (IS_INTEL_BMI1(cpuid_flags)) printf(" bmi1");
if (IS_INTEL_SHA(cpuid_flags)) printf(" sha");
+ if (IS_INTEL_VAES(cpuid_flags)) printf(" vaes");
+ if (IS_INTEL_AVX512(cpuid_flags)) printf(" avx512");
#endif
#ifdef __aarch64__
printf("Aarch64 -");
diff --git a/wolfcrypt/src/aes.c b/wolfcrypt/src/aes.c
index 6806acbc965..8a630217b9d 100644
--- a/wolfcrypt/src/aes.c
+++ b/wolfcrypt/src/aes.c
@@ -809,6 +809,234 @@ block cipher mechanism that uses n-bit binary string parameter key with 128-bits
unsigned char* key_schedule)
XASM_LINK("AES_256_Key_Expansion_AESNI");
+#ifdef WOLFSSL_X86_64_BUILD
+    /* Wide ECB / CBC / CTR variants for x86_64. They share the AES-NI key
+     * schedule declared above and are selected at runtime from intel_flags.
+     * AES_CBC_decrypt_AESNI is the single max-width path (the by4/by6/by8
+     * variants are only used by the 32-bit x86 build). */
+    #if defined(USE_INTEL_SPEEDUP)
+        #ifndef HAVE_INTEL_AVX1
+            #define HAVE_INTEL_AVX1
+        #endif
+        #if !defined(NO_AVX2_SUPPORT) && !defined(HAVE_INTEL_AVX2)
+            #define HAVE_INTEL_AVX2
+        #endif
+        /* VAES and AVX512 are only enabled together with AVX2: the GCM
+         * prototypes for both sit inside the HAVE_INTEL_AVX2 section, so
+         * enabling them alone would reference undeclared functions. */
+        #ifdef HAVE_INTEL_AVX2
+            #if !defined(NO_VAES_SUPPORT) && !defined(HAVE_INTEL_VAES)
+                #define HAVE_INTEL_VAES
+            #endif
+            #if !defined(NO_AVX512_SUPPORT) && !defined(HAVE_INTEL_AVX512)
+                #define HAVE_INTEL_AVX512
+            #endif
+        #endif /* HAVE_INTEL_AVX2 */
+    #endif
+
+    /* Plain AES-NI bulk routines; the dispatchers fall back to these. */
+    void AES_CTR_encrypt_AESNI(const unsigned char* in, unsigned char* out,
+        unsigned long length, const unsigned char* KS, int nr,
+        unsigned char* ctr) XASM_LINK("AES_CTR_encrypt_AESNI");
+    #ifdef HAVE_AES_DECRYPT
+    void AES_CBC_decrypt_AESNI(const unsigned char* in, unsigned char* out,
+        unsigned char* ivec, unsigned long length, const unsigned char* KS,
+        int nr) XASM_LINK("AES_CBC_decrypt_AESNI");
+    #endif
+
+    /* Declares the ECB, CBC and CTR encrypt prototypes of one variant. */
+    #define AES_DECL_VARIANT(suff)                                          \
+        void AES_ECB_encrypt_##suff(const unsigned char* in,                \
+            unsigned char* out, unsigned long length,                       \
+            const unsigned char* KS, int nr)                                \
+            XASM_LINK("AES_ECB_encrypt_" #suff);                            \
+        void AES_CBC_encrypt_##suff(const unsigned char* in,                \
+            unsigned char* out, unsigned char* ivec, unsigned long length,  \
+            const unsigned char* KS, int nr)                                \
+            XASM_LINK("AES_CBC_encrypt_" #suff);                            \
+        void AES_CTR_encrypt_##suff(const unsigned char* in,                \
+            unsigned char* out, unsigned long length,                       \
+            const unsigned char* KS, int nr, unsigned char* ctr)            \
+            XASM_LINK("AES_CTR_encrypt_" #suff)
+    /* Declares the ECB and CBC decrypt prototypes of one variant. */
+    #ifdef HAVE_AES_DECRYPT
+    #define AES_DECL_VARIANT_DEC(suff)                                      \
+        void AES_ECB_decrypt_##suff(const unsigned char* in,                \
+            unsigned char* out, unsigned long length,                       \
+            const unsigned char* KS, int nr)                                \
+            XASM_LINK("AES_ECB_decrypt_" #suff);                            \
+        void AES_CBC_decrypt_##suff(const unsigned char* in,                \
+            unsigned char* out, unsigned char* ivec,                        \
+            unsigned long length, const unsigned char* KS, int nr)          \
+            XASM_LINK("AES_CBC_decrypt_" #suff)
+    #else
+    #define AES_DECL_VARIANT_DEC(suff) /* no decrypt */
+    #endif
+
+    #ifdef HAVE_INTEL_AVX1
+    AES_DECL_VARIANT(avx1);
+    AES_DECL_VARIANT_DEC(avx1);
+    #endif
+    #ifdef HAVE_INTEL_VAES
+    AES_DECL_VARIANT(vaes);
+    AES_DECL_VARIANT_DEC(vaes);
+    #endif
+    #ifdef HAVE_INTEL_AVX512
+    AES_DECL_VARIANT(avx512);
+    AES_DECL_VARIANT_DEC(avx512);
+    #endif
+
+    /* Runtime dispatchers: each helper below calls the widest implementation
+     * that intel_flags reports, trying AVX512+VAES, then AVX2+VAES, then
+     * AVX1, and finally the plain AES-NI routine. Callers must already be
+     * inside a VECTOR_REGISTERS_PUSH / SAVE_VECTOR_REGISTERS region (all
+     * bulk AES-NI call sites are). */
+    /* ECB encrypt: in/out buffers, sz bytes, key schedule, nr rounds. */
+    static WC_INLINE void AesEcbEncryptBlocks(const unsigned char* in,
+        unsigned char* out, word32 sz, const unsigned char* key, int nr)
+    {
+    #ifdef HAVE_INTEL_AVX512
+        if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+            AES_ECB_encrypt_avx512(in, out, sz, key, nr);
+        }
+        else
+    #endif
+    #ifdef HAVE_INTEL_VAES
+        if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+            AES_ECB_encrypt_vaes(in, out, sz, key, nr);
+        }
+        else
+    #endif
+    #ifdef HAVE_INTEL_AVX1
+        if (IS_INTEL_AVX1(intel_flags)) {
+            AES_ECB_encrypt_avx1(in, out, sz, key, nr);
+        }
+        else
+    #endif
+        {
+            AES_ECB_encrypt_AESNI(in, out, sz, key, nr);
+        }
+    }
+
+    #ifdef HAVE_AES_DECRYPT
+    /* ECB decrypt counterpart of AesEcbEncryptBlocks; same arguments. */
+    static WC_INLINE void AesEcbDecryptBlocks(const unsigned char* in,
+        unsigned char* out, word32 sz, const unsigned char* key, int nr)
+    {
+    #ifdef HAVE_INTEL_AVX512
+        if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+            AES_ECB_decrypt_avx512(in, out, sz, key, nr);
+        }
+        else
+    #endif
+    #ifdef HAVE_INTEL_VAES
+        if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+            AES_ECB_decrypt_vaes(in, out, sz, key, nr);
+        }
+        else
+    #endif
+    #ifdef HAVE_INTEL_AVX1
+        if (IS_INTEL_AVX1(intel_flags)) {
+            AES_ECB_decrypt_avx1(in, out, sz, key, nr);
+        }
+        else
+    #endif
+        {
+            AES_ECB_decrypt_AESNI(in, out, sz, key, nr);
+        }
+    }
+    #endif
+
+    #ifdef HAVE_AES_CBC
+    /* CBC encrypt: as the ECB helper, plus the IV buffer passed as iv. */
+    static WC_INLINE void AesCbcEncryptBlocks(const unsigned char* in,
+        unsigned char* out, unsigned char* iv, word32 sz,
+        const unsigned char* key, int nr)
+    {
+    #ifdef HAVE_INTEL_AVX512
+        if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+            AES_CBC_encrypt_avx512(in, out, iv, sz, key, nr);
+        }
+        else
+    #endif
+    #ifdef HAVE_INTEL_VAES
+        if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+            AES_CBC_encrypt_vaes(in, out, iv, sz, key, nr);
+        }
+        else
+    #endif
+    #ifdef HAVE_INTEL_AVX1
+        if (IS_INTEL_AVX1(intel_flags)) {
+            AES_CBC_encrypt_avx1(in, out, iv, sz, key, nr);
+        }
+        else
+    #endif
+        {
+            AES_CBC_encrypt_AESNI(in, out, iv, sz, key, nr);
+        }
+    }
+    #endif /* HAVE_AES_CBC */
+
+    #if defined(HAVE_AES_CBC) && defined(HAVE_AES_DECRYPT)
+    /* CBC decrypt counterpart. Guarded by HAVE_AES_CBC like the encrypt
+     * helper so the static function is not left unused without CBC. */
+    static WC_INLINE void AesCbcDecryptBlocks(const unsigned char* in,
+        unsigned char* out, unsigned char* iv, word32 sz,
+        const unsigned char* key, int nr)
+    {
+    #ifdef HAVE_INTEL_AVX512
+        if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+            AES_CBC_decrypt_avx512(in, out, iv, sz, key, nr);
+        }
+        else
+    #endif
+    #ifdef HAVE_INTEL_VAES
+        if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+            AES_CBC_decrypt_vaes(in, out, iv, sz, key, nr);
+        }
+        else
+    #endif
+    #ifdef HAVE_INTEL_AVX1
+        if (IS_INTEL_AVX1(intel_flags)) {
+            AES_CBC_decrypt_avx1(in, out, iv, sz, key, nr);
+        }
+        else
+    #endif
+        {
+            AES_CBC_decrypt_AESNI(in, out, iv, sz, key, nr);
+        }
+    }
+    #endif /* HAVE_AES_CBC && HAVE_AES_DECRYPT */
+
+    /* CTR encrypt: as the ECB helper, plus the counter block passed as ctr. */
+    static WC_INLINE void AesCtrEncryptBlocks(const unsigned char* in,
+        unsigned char* out, word32 sz, const unsigned char* key, int nr,
+        unsigned char* ctr)
+    {
+    #ifdef HAVE_INTEL_AVX512
+        if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+            AES_CTR_encrypt_avx512(in, out, sz, key, nr, ctr);
+        }
+        else
+    #endif
+    #ifdef HAVE_INTEL_VAES
+        if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+            AES_CTR_encrypt_vaes(in, out, sz, key, nr, ctr);
+        }
+        else
+    #endif
+    #ifdef HAVE_INTEL_AVX1
+        if (IS_INTEL_AVX1(intel_flags)) {
+            AES_CTR_encrypt_avx1(in, out, sz, key, nr, ctr);
+        }
+        else
+    #endif
+        {
+            AES_CTR_encrypt_AESNI(in, out, sz, key, nr, ctr);
+        }
+    }
+#endif /* WOLFSSL_X86_64_BUILD */
+
static WARN_UNUSED_RESULT int AES_set_encrypt_key_AESNI(
const unsigned char *userKey, const int bits, Aes* aes)
@@ -6858,8 +7070,13 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz)
else {
tmp_align = tmp + (AESNI_ALIGN - ((wc_ptr_t)tmp % AESNI_ALIGN));
XMEMCPY(tmp_align, in, sz);
+ #ifdef WOLFSSL_X86_64_BUILD
+ AesCbcEncryptBlocks(tmp_align, tmp_align, (byte*)aes->reg, sz,
+ (byte*)aes->key, (int)aes->rounds);
+ #else
AES_CBC_encrypt_AESNI(tmp_align, tmp_align, (byte*)aes->reg, sz,
(byte*)aes->key, (int)aes->rounds);
+ #endif
/* store iv for next call */
XMEMCPY(aes->reg, tmp_align + sz - WC_AES_BLOCK_SIZE, WC_AES_BLOCK_SIZE);
@@ -6873,8 +7090,13 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz)
ret = BAD_ALIGN_E;
#endif
} else {
+ #ifdef WOLFSSL_X86_64_BUILD
+ AesCbcEncryptBlocks(in, out, (byte*)aes->reg, sz, (byte*)aes->key,
+ (int)aes->rounds);
+ #else
AES_CBC_encrypt_AESNI(in, out, (byte*)aes->reg, sz, (byte*)aes->key,
(int)aes->rounds);
+ #endif
/* store iv for next call */
XMEMCPY(aes->reg, out + sz - WC_AES_BLOCK_SIZE, WC_AES_BLOCK_SIZE);
@@ -7056,7 +7278,10 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz)
/* if input and output same will overwrite input iv */
XMEMCPY(aes->tmp, in + sz - WC_AES_BLOCK_SIZE, WC_AES_BLOCK_SIZE);
- #if defined(WOLFSSL_AESNI_BY4) || defined(WOLFSSL_X86_BUILD)
+ #if defined(WOLFSSL_X86_64_BUILD)
+ AesCbcDecryptBlocks(in, out, (byte*)aes->reg, sz, (byte*)aes->key,
+ (int)aes->rounds);
+ #elif defined(WOLFSSL_AESNI_BY4) || defined(WOLFSSL_X86_BUILD)
AES_CBC_decrypt_AESNI_by4(in, out, (byte*)aes->reg, sz, (byte*)aes->key,
aes->rounds);
#elif defined(WOLFSSL_AESNI_BY6)
@@ -7563,6 +7788,19 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz)
#else
VECTOR_REGISTERS_PUSH;
+ #if defined(WOLFSSL_AESNI) && defined(WOLFSSL_X86_64_BUILD)
+ if (aes->use_aesni && sz >= WC_AES_BLOCK_SIZE) {
+ word32 ctrBlocks = sz / WC_AES_BLOCK_SIZE;
+ word32 ctrBytes = ctrBlocks * WC_AES_BLOCK_SIZE;
+ AesCtrEncryptBlocks(in, out, ctrBytes, (byte*)aes->key,
+ (int)aes->rounds, (byte*)aes->reg);
+ in += ctrBytes;
+ out += ctrBytes;
+ sz -= ctrBytes;
+ aes->left = 0;
+ }
+ #endif
+
#if defined(HAVE_AES_ECB) && !defined(WOLFSSL_PIC32MZ_CRYPT) && \
!defined(XTRANSFORM_AESCTRBLOCK)
if (in != out && sz >= WC_AES_BLOCK_SIZE) {
@@ -7910,7 +8148,19 @@ void GenerateM0(Gcm* gcm)
#if defined(WOLFSSL_AESNI) && defined(USE_INTEL_SPEEDUP)
#define HAVE_INTEL_AVX1
- #define HAVE_INTEL_AVX2
+    #ifndef NO_AVX2_SUPPORT
+        #define HAVE_INTEL_AVX2
+    #endif
+    /* The VAES/AVX512 GCM prototypes sit inside the HAVE_INTEL_AVX2
+     * section, so these variants are only enabled together with AVX2. */
+    #if defined(WOLFSSL_X86_64_BUILD) && defined(HAVE_INTEL_AVX2)
+        #ifndef NO_VAES_SUPPORT
+            #define HAVE_INTEL_VAES
+        #endif
+        #ifndef NO_AVX512_SUPPORT
+            #define HAVE_INTEL_AVX512
+        #endif
+    #endif
#endif
#if defined(WOLFSSL_AESNI) && defined(GCM_TABLE_4BIT) && \
@@ -8128,6 +8376,24 @@ void AES_GCM_encrypt_avx2(const unsigned char *in, unsigned char *out,
word32 tbytes, const unsigned char* key,
int nr)
XASM_LINK("AES_GCM_encrypt_avx2");
+#ifdef HAVE_INTEL_AVX512
+void AES_GCM_encrypt_avx512(const unsigned char *in, unsigned char *out,
+ const unsigned char* addt, const unsigned char* ivec,
+ unsigned char *tag, word32 nbytes,
+ word32 abytes, word32 ibytes,
+ word32 tbytes, const unsigned char* key,
+ int nr)
+ XASM_LINK("AES_GCM_encrypt_avx512");
+#endif
+#ifdef HAVE_INTEL_VAES
+void AES_GCM_encrypt_vaes(const unsigned char *in, unsigned char *out,
+ const unsigned char* addt, const unsigned char* ivec,
+ unsigned char *tag, word32 nbytes,
+ word32 abytes, word32 ibytes,
+ word32 tbytes, const unsigned char* key,
+ int nr)
+ XASM_LINK("AES_GCM_encrypt_vaes");
+#endif
#endif /* HAVE_INTEL_AVX2 */
#endif /* HAVE_INTEL_AVX1 */
@@ -8152,6 +8418,22 @@ void AES_GCM_decrypt_avx2(const unsigned char *in, unsigned char *out,
word32 abytes, word32 ibytes, word32 tbytes,
const unsigned char* key, int nr, int* res)
XASM_LINK("AES_GCM_decrypt_avx2");
+#ifdef HAVE_INTEL_AVX512
+void AES_GCM_decrypt_avx512(const unsigned char *in, unsigned char *out,
+ const unsigned char* addt, const unsigned char* ivec,
+ const unsigned char *tag, word32 nbytes,
+ word32 abytes, word32 ibytes, word32 tbytes,
+ const unsigned char* key, int nr, int* res)
+ XASM_LINK("AES_GCM_decrypt_avx512");
+#endif
+#ifdef HAVE_INTEL_VAES
+void AES_GCM_decrypt_vaes(const unsigned char *in, unsigned char *out,
+ const unsigned char* addt, const unsigned char* ivec,
+ const unsigned char *tag, word32 nbytes,
+ word32 abytes, word32 ibytes, word32 tbytes,
+ const unsigned char* key, int nr, int* res)
+ XASM_LINK("AES_GCM_decrypt_vaes");
+#endif
#endif /* HAVE_INTEL_AVX2 */
#endif /* HAVE_INTEL_AVX1 */
#endif /* HAVE_AES_DECRYPT */
@@ -10535,6 +10817,22 @@ int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
#else
#ifdef WOLFSSL_AESNI
if (aes->use_aesni) {
+#ifdef HAVE_INTEL_AVX512
+ if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_GCM_encrypt_avx512(in, out, authIn, iv, authTag, sz, authInSz, ivSz,
+ authTagSz, (const byte*)aes->key, (int)aes->rounds);
+ ret = 0;
+ }
+ else
+#endif
+#ifdef HAVE_INTEL_VAES
+ if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_GCM_encrypt_vaes(in, out, authIn, iv, authTag, sz, authInSz, ivSz,
+ authTagSz, (const byte*)aes->key, (int)aes->rounds);
+ ret = 0;
+ }
+ else
+#endif
#ifdef HAVE_INTEL_AVX2
if (IS_INTEL_AVX2(intel_flags)) {
AES_GCM_encrypt_avx2(in, out, authIn, iv, authTag, sz, authInSz, ivSz,
@@ -11293,6 +11591,28 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
#else
#ifdef WOLFSSL_AESNI
if (aes->use_aesni) {
+#ifdef HAVE_INTEL_AVX512
+ if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_GCM_decrypt_avx512(in, out, authIn, iv, authTag, sz, authInSz, ivSz,
+ authTagSz, (byte*)aes->key, (int)aes->rounds, &res);
+ if (res == 0)
+ ret = AES_GCM_AUTH_E;
+ else
+ ret = 0;
+ }
+ else
+#endif
+#ifdef HAVE_INTEL_VAES
+ if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_GCM_decrypt_vaes(in, out, authIn, iv, authTag, sz, authInSz, ivSz,
+ authTagSz, (byte*)aes->key, (int)aes->rounds, &res);
+ if (res == 0)
+ ret = AES_GCM_AUTH_E;
+ else
+ ret = 0;
+ }
+ else
+#endif
#ifdef HAVE_INTEL_AVX2
if (IS_INTEL_AVX2(intel_flags)) {
AES_GCM_decrypt_avx2(in, out, authIn, iv, authTag, sz, authInSz, ivSz,
@@ -11513,19 +11833,73 @@ static WARN_UNUSED_RESULT int AesGcmFinal_C(
extern void AES_GCM_init_avx2(const unsigned char* key, int nr,
const unsigned char* ivec, unsigned int ibytes, unsigned char* h,
unsigned char* counter, unsigned char* initCtr);
+#ifdef HAVE_INTEL_AVX512
+extern void AES_GCM_init_avx512(const unsigned char* key, int nr,
+ const unsigned char* ivec, unsigned int ibytes, unsigned char* h,
+ unsigned char* counter, unsigned char* initCtr);
+#endif
+#ifdef HAVE_INTEL_VAES
+extern void AES_GCM_init_vaes(const unsigned char* key, int nr,
+ const unsigned char* ivec, unsigned int ibytes, unsigned char* h,
+ unsigned char* counter, unsigned char* initCtr);
+#endif
extern void AES_GCM_aad_update_avx2(const unsigned char* addt,
unsigned int abytes, unsigned char* tag, unsigned char* h);
+#ifdef HAVE_INTEL_AVX512
+extern void AES_GCM_aad_update_avx512(const unsigned char* addt,
+ unsigned int abytes, unsigned char* tag, unsigned char* h);
+#endif
+#ifdef HAVE_INTEL_VAES
+extern void AES_GCM_aad_update_vaes(const unsigned char* addt,
+ unsigned int abytes, unsigned char* tag, unsigned char* h);
+#endif
extern void AES_GCM_encrypt_block_avx2(const unsigned char* key, int nr,
unsigned char* out, const unsigned char* in, unsigned char* counter);
+#ifdef HAVE_INTEL_AVX512
+extern void AES_GCM_encrypt_block_avx512(const unsigned char* key, int nr,
+ unsigned char* out, const unsigned char* in, unsigned char* counter);
+#endif
+#ifdef HAVE_INTEL_VAES
+extern void AES_GCM_encrypt_block_vaes(const unsigned char* key, int nr,
+ unsigned char* out, const unsigned char* in, unsigned char* counter);
+#endif
extern void AES_GCM_ghash_block_avx2(const unsigned char* data,
unsigned char* tag, unsigned char* h);
+#ifdef HAVE_INTEL_AVX512
+extern void AES_GCM_ghash_block_avx512(const unsigned char* data,
+ unsigned char* tag, unsigned char* h);
+#endif
+#ifdef HAVE_INTEL_VAES
+extern void AES_GCM_ghash_block_vaes(const unsigned char* data,
+ unsigned char* tag, unsigned char* h);
+#endif
extern void AES_GCM_encrypt_update_avx2(const unsigned char* key, int nr,
unsigned char* out, const unsigned char* in, unsigned int nbytes,
unsigned char* tag, unsigned char* h, unsigned char* counter);
+#ifdef HAVE_INTEL_AVX512
+extern void AES_GCM_encrypt_update_avx512(const unsigned char* key, int nr,
+ unsigned char* out, const unsigned char* in, unsigned int nbytes,
+ unsigned char* tag, unsigned char* h, unsigned char* counter);
+#endif
+#ifdef HAVE_INTEL_VAES
+extern void AES_GCM_encrypt_update_vaes(const unsigned char* key, int nr,
+ unsigned char* out, const unsigned char* in, unsigned int nbytes,
+ unsigned char* tag, unsigned char* h, unsigned char* counter);
+#endif
extern void AES_GCM_encrypt_final_avx2(unsigned char* tag,
unsigned char* authTag, unsigned int tbytes, unsigned int nbytes,
unsigned int abytes, unsigned char* h, unsigned char* initCtr);
+#ifdef HAVE_INTEL_AVX512
+extern void AES_GCM_encrypt_final_avx512(unsigned char* tag,
+ unsigned char* authTag, unsigned int tbytes, unsigned int nbytes,
+ unsigned int abytes, unsigned char* h, unsigned char* initCtr);
+#endif
+#ifdef HAVE_INTEL_VAES
+extern void AES_GCM_encrypt_final_vaes(unsigned char* tag,
+ unsigned char* authTag, unsigned int tbytes, unsigned int nbytes,
+ unsigned int abytes, unsigned char* h, unsigned char* initCtr);
+#endif
#endif
#ifdef HAVE_INTEL_AVX1
extern void AES_GCM_init_avx1(const unsigned char* key, int nr,
@@ -11587,6 +11961,20 @@ static WARN_UNUSED_RESULT int AesGcmInit_aesni(
aes->aOver = 0;
aes->cOver = 0;
+#ifdef HAVE_INTEL_AVX512
+ if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_GCM_init_avx512((byte*)aes->key, (int)aes->rounds, iv, ivSz,
+ aes->gcm.H, AES_COUNTER(aes), AES_INITCTR(aes));
+ }
+ else
+#endif
+#ifdef HAVE_INTEL_VAES
+ if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_GCM_init_vaes((byte*)aes->key, (int)aes->rounds, iv, ivSz,
+ aes->gcm.H, AES_COUNTER(aes), AES_INITCTR(aes));
+ }
+ else
+#endif
#ifdef HAVE_INTEL_AVX2
if (IS_INTEL_AVX2(intel_flags)) {
AES_GCM_init_avx2((byte*)aes->key, (int)aes->rounds, iv, ivSz,
@@ -11641,6 +12029,20 @@ static WARN_UNUSED_RESULT int AesGcmAadUpdate_aesni(
aes->aOver = (byte)(aes->aOver + sz);
if (aes->aOver == WC_AES_BLOCK_SIZE) {
/* We have filled up the block and can process. */
+#ifdef HAVE_INTEL_AVX512
+ if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_GCM_ghash_block_avx512(AES_LASTGBLOCK(aes), AES_TAG(aes),
+ aes->gcm.H);
+ }
+ else
+#endif
+#ifdef HAVE_INTEL_VAES
+ if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_GCM_ghash_block_vaes(AES_LASTGBLOCK(aes), AES_TAG(aes),
+ aes->gcm.H);
+ }
+ else
+#endif
#ifdef HAVE_INTEL_AVX2
if (IS_INTEL_AVX2(intel_flags)) {
AES_GCM_ghash_block_avx2(AES_LASTGBLOCK(aes), AES_TAG(aes),
@@ -11672,6 +12074,20 @@ static WARN_UNUSED_RESULT int AesGcmAadUpdate_aesni(
partial = aSz % WC_AES_BLOCK_SIZE;
if (blocks > 0) {
/* GHASH full blocks now. */
+#ifdef HAVE_INTEL_AVX512
+ if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_GCM_aad_update_avx512(a, blocks * WC_AES_BLOCK_SIZE,
+ AES_TAG(aes), aes->gcm.H);
+ }
+ else
+#endif
+#ifdef HAVE_INTEL_VAES
+ if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_GCM_aad_update_vaes(a, blocks * WC_AES_BLOCK_SIZE,
+ AES_TAG(aes), aes->gcm.H);
+ }
+ else
+#endif
#ifdef HAVE_INTEL_AVX2
if (IS_INTEL_AVX2(intel_flags)) {
AES_GCM_aad_update_avx2(a, blocks * WC_AES_BLOCK_SIZE,
@@ -11705,6 +12121,20 @@ static WARN_UNUSED_RESULT int AesGcmAadUpdate_aesni(
XMEMSET(AES_LASTGBLOCK(aes) + aes->aOver, 0,
(size_t)WC_AES_BLOCK_SIZE - aes->aOver);
/* GHASH last AAD block. */
+#ifdef HAVE_INTEL_AVX512
+ if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_GCM_ghash_block_avx512(AES_LASTGBLOCK(aes), AES_TAG(aes),
+ aes->gcm.H);
+ }
+ else
+#endif
+#ifdef HAVE_INTEL_VAES
+ if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_GCM_ghash_block_vaes(AES_LASTGBLOCK(aes), AES_TAG(aes),
+ aes->gcm.H);
+ }
+ else
+#endif
#ifdef HAVE_INTEL_AVX2
if (IS_INTEL_AVX2(intel_flags)) {
AES_GCM_ghash_block_avx2(AES_LASTGBLOCK(aes), AES_TAG(aes),
@@ -11772,6 +12202,20 @@ static WARN_UNUSED_RESULT int AesGcmEncryptUpdate_aesni(
aes->cOver = (byte)(aes->cOver + sz);
if (aes->cOver == WC_AES_BLOCK_SIZE) {
/* We have filled up the block and can process. */
+#ifdef HAVE_INTEL_AVX512
+ if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_GCM_ghash_block_avx512(AES_LASTGBLOCK(aes), AES_TAG(aes),
+ aes->gcm.H);
+ }
+ else
+#endif
+#ifdef HAVE_INTEL_VAES
+ if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_GCM_ghash_block_vaes(AES_LASTGBLOCK(aes), AES_TAG(aes),
+ aes->gcm.H);
+ }
+ else
+#endif
#ifdef HAVE_INTEL_AVX2
if (IS_INTEL_AVX2(intel_flags)) {
AES_GCM_ghash_block_avx2(AES_LASTGBLOCK(aes), AES_TAG(aes),
@@ -11804,6 +12248,22 @@ static WARN_UNUSED_RESULT int AesGcmEncryptUpdate_aesni(
partial = cSz % WC_AES_BLOCK_SIZE;
if (blocks > 0) {
/* Encrypt and GHASH full blocks now. */
+#ifdef HAVE_INTEL_AVX512
+ if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_GCM_encrypt_update_avx512((byte*)aes->key, (int)aes->rounds,
+ c, p, blocks * WC_AES_BLOCK_SIZE, AES_TAG(aes), aes->gcm.H,
+ AES_COUNTER(aes));
+ }
+ else
+#endif
+#ifdef HAVE_INTEL_VAES
+ if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_GCM_encrypt_update_vaes((byte*)aes->key, (int)aes->rounds,
+ c, p, blocks * WC_AES_BLOCK_SIZE, AES_TAG(aes), aes->gcm.H,
+ AES_COUNTER(aes));
+ }
+ else
+#endif
#ifdef HAVE_INTEL_AVX2
if (IS_INTEL_AVX2(intel_flags)) {
AES_GCM_encrypt_update_avx2((byte*)aes->key, (int)aes->rounds,
@@ -11832,6 +12292,20 @@ static WARN_UNUSED_RESULT int AesGcmEncryptUpdate_aesni(
if (partial != 0) {
/* Encrypt the counter - XOR in zeros as proxy for plaintext. */
XMEMSET(AES_LASTGBLOCK(aes), 0, WC_AES_BLOCK_SIZE);
+#ifdef HAVE_INTEL_AVX512
+ if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_GCM_encrypt_block_avx512((byte*)aes->key, (int)aes->rounds,
+ AES_LASTGBLOCK(aes), AES_LASTGBLOCK(aes), AES_COUNTER(aes));
+ }
+ else
+#endif
+#ifdef HAVE_INTEL_VAES
+ if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_GCM_encrypt_block_vaes((byte*)aes->key, (int)aes->rounds,
+ AES_LASTGBLOCK(aes), AES_LASTGBLOCK(aes), AES_COUNTER(aes));
+ }
+ else
+#endif
#ifdef HAVE_INTEL_AVX2
if (IS_INTEL_AVX2(intel_flags)) {
AES_GCM_encrypt_block_avx2((byte*)aes->key, (int)aes->rounds,
@@ -11887,6 +12361,20 @@ static WARN_UNUSED_RESULT int AesGcmEncryptFinal_aesni(
/* Fill the rest of the block with zeros. */
XMEMSET(AES_LASTGBLOCK(aes) + over, 0, (size_t)WC_AES_BLOCK_SIZE - over);
/* GHASH last cipher block. */
+#ifdef HAVE_INTEL_AVX512
+ if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_GCM_ghash_block_avx512(AES_LASTGBLOCK(aes), AES_TAG(aes),
+ aes->gcm.H);
+ }
+ else
+#endif
+#ifdef HAVE_INTEL_VAES
+ if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_GCM_ghash_block_vaes(AES_LASTGBLOCK(aes), AES_TAG(aes),
+ aes->gcm.H);
+ }
+ else
+#endif
#ifdef HAVE_INTEL_AVX2
if (IS_INTEL_AVX2(intel_flags)) {
AES_GCM_ghash_block_avx2(AES_LASTGBLOCK(aes), AES_TAG(aes),
@@ -11907,6 +12395,20 @@ static WARN_UNUSED_RESULT int AesGcmEncryptFinal_aesni(
}
}
/* Calculate the authentication tag. */
+#ifdef HAVE_INTEL_AVX512
+ if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_GCM_encrypt_final_avx512(AES_TAG(aes), authTag, authTagSz, aes->cSz,
+ aes->aSz, aes->gcm.H, AES_INITCTR(aes));
+ }
+ else
+#endif
+#ifdef HAVE_INTEL_VAES
+ if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_GCM_encrypt_final_vaes(AES_TAG(aes), authTag, authTagSz, aes->cSz,
+ aes->aSz, aes->gcm.H, AES_INITCTR(aes));
+ }
+ else
+#endif
#ifdef HAVE_INTEL_AVX2
if (IS_INTEL_AVX2(intel_flags)) {
AES_GCM_encrypt_final_avx2(AES_TAG(aes), authTag, authTagSz, aes->cSz,
@@ -11940,9 +12442,29 @@ static WARN_UNUSED_RESULT int AesGcmEncryptFinal_aesni(
extern void AES_GCM_decrypt_update_avx2(const unsigned char* key, int nr,
unsigned char* out, const unsigned char* in, unsigned int nbytes,
unsigned char* tag, unsigned char* h, unsigned char* counter);
+#ifdef HAVE_INTEL_AVX512
+extern void AES_GCM_decrypt_update_avx512(const unsigned char* key, int nr,
+ unsigned char* out, const unsigned char* in, unsigned int nbytes,
+ unsigned char* tag, unsigned char* h, unsigned char* counter);
+#endif
+#ifdef HAVE_INTEL_VAES
+extern void AES_GCM_decrypt_update_vaes(const unsigned char* key, int nr,
+ unsigned char* out, const unsigned char* in, unsigned int nbytes,
+ unsigned char* tag, unsigned char* h, unsigned char* counter);
+#endif
extern void AES_GCM_decrypt_final_avx2(unsigned char* tag,
const unsigned char* authTag, unsigned int tbytes, unsigned int nbytes,
unsigned int abytes, unsigned char* h, unsigned char* initCtr, int* res);
+#ifdef HAVE_INTEL_AVX512
+extern void AES_GCM_decrypt_final_avx512(unsigned char* tag,
+ const unsigned char* authTag, unsigned int tbytes, unsigned int nbytes,
+ unsigned int abytes, unsigned char* h, unsigned char* initCtr, int* res);
+#endif
+#ifdef HAVE_INTEL_VAES
+extern void AES_GCM_decrypt_final_vaes(unsigned char* tag,
+ const unsigned char* authTag, unsigned int tbytes, unsigned int nbytes,
+ unsigned int abytes, unsigned char* h, unsigned char* initCtr, int* res);
+#endif
#endif
#ifdef HAVE_INTEL_AVX1
extern void AES_GCM_decrypt_update_avx1(const unsigned char* key, int nr,
@@ -12005,6 +12527,20 @@ static WARN_UNUSED_RESULT int AesGcmDecryptUpdate_aesni(
aes->cOver = (byte)(aes->cOver + sz);
if (aes->cOver == WC_AES_BLOCK_SIZE) {
/* We have filled up the block and can process. */
+#ifdef HAVE_INTEL_AVX512
+ if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_GCM_ghash_block_avx512(AES_LASTBLOCK(aes), AES_TAG(aes),
+ aes->gcm.H);
+ }
+ else
+#endif
+#ifdef HAVE_INTEL_VAES
+ if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_GCM_ghash_block_vaes(AES_LASTBLOCK(aes), AES_TAG(aes),
+ aes->gcm.H);
+ }
+ else
+#endif
#ifdef HAVE_INTEL_AVX2
if (IS_INTEL_AVX2(intel_flags)) {
AES_GCM_ghash_block_avx2(AES_LASTBLOCK(aes), AES_TAG(aes),
@@ -12037,6 +12573,22 @@ static WARN_UNUSED_RESULT int AesGcmDecryptUpdate_aesni(
partial = cSz % WC_AES_BLOCK_SIZE;
if (blocks > 0) {
/* Decrypt and GHASH full blocks now. */
+#ifdef HAVE_INTEL_AVX512
+ if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_GCM_decrypt_update_avx512((byte*)aes->key, (int)aes->rounds,
+ p, c, blocks * WC_AES_BLOCK_SIZE, AES_TAG(aes), aes->gcm.H,
+ AES_COUNTER(aes));
+ }
+ else
+#endif
+#ifdef HAVE_INTEL_VAES
+ if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_GCM_decrypt_update_vaes((byte*)aes->key, (int)aes->rounds,
+ p, c, blocks * WC_AES_BLOCK_SIZE, AES_TAG(aes), aes->gcm.H,
+ AES_COUNTER(aes));
+ }
+ else
+#endif
#ifdef HAVE_INTEL_AVX2
if (IS_INTEL_AVX2(intel_flags)) {
AES_GCM_decrypt_update_avx2((byte*)aes->key, (int)aes->rounds,
@@ -12065,6 +12617,20 @@ static WARN_UNUSED_RESULT int AesGcmDecryptUpdate_aesni(
if (partial != 0) {
/* Encrypt the counter - XOR in zeros as proxy for cipher text. */
XMEMSET(AES_LASTGBLOCK(aes), 0, WC_AES_BLOCK_SIZE);
+#ifdef HAVE_INTEL_AVX512
+ if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_GCM_encrypt_block_avx512((byte*)aes->key, (int)aes->rounds,
+ AES_LASTGBLOCK(aes), AES_LASTGBLOCK(aes), AES_COUNTER(aes));
+ }
+ else
+#endif
+#ifdef HAVE_INTEL_VAES
+ if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_GCM_encrypt_block_vaes((byte*)aes->key, (int)aes->rounds,
+ AES_LASTGBLOCK(aes), AES_LASTGBLOCK(aes), AES_COUNTER(aes));
+ }
+ else
+#endif
#ifdef HAVE_INTEL_AVX2
if (IS_INTEL_AVX2(intel_flags)) {
AES_GCM_encrypt_block_avx2((byte*)aes->key, (int)aes->rounds,
@@ -12127,6 +12693,18 @@ static WARN_UNUSED_RESULT int AesGcmDecryptFinal_aesni(
/* Zeroize the unused part of the block. */
XMEMSET(lastBlock + over, 0, (size_t)WC_AES_BLOCK_SIZE - over);
/* Hash the last block of cipher text. */
+#ifdef HAVE_INTEL_AVX512
+ if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_GCM_ghash_block_avx512(lastBlock, AES_TAG(aes), aes->gcm.H);
+ }
+ else
+#endif
+#ifdef HAVE_INTEL_VAES
+ if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_GCM_ghash_block_vaes(lastBlock, AES_TAG(aes), aes->gcm.H);
+ }
+ else
+#endif
#ifdef HAVE_INTEL_AVX2
if (IS_INTEL_AVX2(intel_flags)) {
AES_GCM_ghash_block_avx2(lastBlock, AES_TAG(aes), aes->gcm.H);
@@ -12144,6 +12722,20 @@ static WARN_UNUSED_RESULT int AesGcmDecryptFinal_aesni(
}
}
/* Calculate and compare the authentication tag. */
+#ifdef HAVE_INTEL_AVX512
+ if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_GCM_decrypt_final_avx512(AES_TAG(aes), authTag, authTagSz, aes->cSz,
+ aes->aSz, aes->gcm.H, AES_INITCTR(aes), &res);
+ }
+ else
+#endif
+#ifdef HAVE_INTEL_VAES
+ if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_GCM_decrypt_final_vaes(AES_TAG(aes), authTag, authTagSz, aes->cSz,
+ aes->aSz, aes->gcm.H, AES_INITCTR(aes), &res);
+ }
+ else
+#endif
#ifdef HAVE_INTEL_AVX2
if (IS_INTEL_AVX2(intel_flags)) {
AES_GCM_decrypt_final_avx2(AES_TAG(aes), authTag, authTagSz, aes->cSz,
@@ -14537,7 +15129,11 @@ static WARN_UNUSED_RESULT int _AesEcbEncrypt(
#else
#ifdef WOLFSSL_AESNI
if (aes->use_aesni) {
+ #ifdef WOLFSSL_X86_64_BUILD
+ AesEcbEncryptBlocks(in, out, sz, (byte*)aes->key, (int)aes->rounds);
+ #else
AES_ECB_encrypt_AESNI(in, out, sz, (byte*)aes->key, (int)aes->rounds);
+ #endif
}
else
#endif
@@ -14632,7 +15228,11 @@ static WARN_UNUSED_RESULT int _AesEcbDecrypt(
#else
#ifdef WOLFSSL_AESNI
if (aes->use_aesni) {
+ #ifdef WOLFSSL_X86_64_BUILD
+ AesEcbDecryptBlocks(in, out, sz, (byte*)aes->key, (int)aes->rounds);
+ #else
AES_ECB_decrypt_AESNI(in, out, sz, (byte*)aes->key, (int)aes->rounds);
+ #endif
}
else
#endif
@@ -15797,6 +16397,37 @@ void AES_XTS_encrypt_update_avx1(const unsigned char *in, unsigned char *out, wo
XASM_LINK("AES_XTS_encrypt_update_avx1");
#endif
#endif /* HAVE_INTEL_AVX1 */
+#ifdef HAVE_INTEL_VAES /* used when IS_INTEL_AVX2 && IS_INTEL_VAES */
+void AES_XTS_encrypt_vaes(const unsigned char *in, unsigned char *out,
+ word32 sz, const unsigned char* i,
+ const unsigned char* key, const unsigned char* key2,
+ int nr)
+ XASM_LINK("AES_XTS_encrypt_vaes");
+#ifdef WOLFSSL_AESXTS_STREAM
+void AES_XTS_init_vaes(unsigned char* i, const unsigned char* tweak_key,
+ int tweak_nr)
+ XASM_LINK("AES_XTS_init_vaes");
+void AES_XTS_encrypt_update_vaes(const unsigned char *in, unsigned char *out, word32 sz,
+ const unsigned char* key, unsigned char *i, int nr)
+ XASM_LINK("AES_XTS_encrypt_update_vaes");
+#endif /* WOLFSSL_AESXTS_STREAM */
+#endif /* HAVE_INTEL_VAES */
+#ifdef HAVE_INTEL_AVX512 /* used when IS_INTEL_AVX512 && IS_INTEL_VAES */
+void AES_XTS_encrypt_avx512(const unsigned char *in, unsigned char *out,
+ word32 sz, const unsigned char* i,
+ const unsigned char* key, const unsigned char* key2,
+ int nr)
+ XASM_LINK("AES_XTS_encrypt_avx512");
+#ifdef WOLFSSL_AESXTS_STREAM
+void AES_XTS_init_avx512(unsigned char* i, const unsigned char* tweak_key,
+ int tweak_nr)
+ XASM_LINK("AES_XTS_init_avx512");
+void AES_XTS_encrypt_update_avx512(const unsigned char *in, unsigned char *out, word32 sz,
+ const unsigned char* key, unsigned char *i, int nr)
+ XASM_LINK("AES_XTS_encrypt_update_avx512");
+#endif /* WOLFSSL_AESXTS_STREAM */
+#endif /* HAVE_INTEL_AVX512 */
+
#ifdef HAVE_AES_DECRYPT
void AES_XTS_decrypt_aesni(const unsigned char *in, unsigned char *out, word32 sz,
@@ -15820,6 +16451,30 @@ void AES_XTS_decrypt_update_avx1(const unsigned char *in, unsigned char *out, wo
XASM_LINK("AES_XTS_decrypt_update_avx1");
#endif
#endif /* HAVE_INTEL_AVX1 */
+#ifdef HAVE_INTEL_VAES /* used when IS_INTEL_AVX2 && IS_INTEL_VAES */
+void AES_XTS_decrypt_vaes(const unsigned char *in, unsigned char *out,
+ word32 sz, const unsigned char* i,
+ const unsigned char* key, const unsigned char* key2,
+ int nr)
+ XASM_LINK("AES_XTS_decrypt_vaes");
+#ifdef WOLFSSL_AESXTS_STREAM
+void AES_XTS_decrypt_update_vaes(const unsigned char *in, unsigned char *out, word32 sz,
+ const unsigned char* key, unsigned char *i, int nr)
+ XASM_LINK("AES_XTS_decrypt_update_vaes");
+#endif /* WOLFSSL_AESXTS_STREAM */
+#endif /* HAVE_INTEL_VAES */
+#ifdef HAVE_INTEL_AVX512 /* used when IS_INTEL_AVX512 && IS_INTEL_VAES */
+void AES_XTS_decrypt_avx512(const unsigned char *in, unsigned char *out,
+ word32 sz, const unsigned char* i,
+ const unsigned char* key, const unsigned char* key2,
+ int nr)
+ XASM_LINK("AES_XTS_decrypt_avx512");
+#ifdef WOLFSSL_AESXTS_STREAM
+void AES_XTS_decrypt_update_avx512(const unsigned char *in, unsigned char *out, word32 sz,
+ const unsigned char* key, unsigned char *i, int nr)
+ XASM_LINK("AES_XTS_decrypt_update_avx512");
+#endif /* WOLFSSL_AESXTS_STREAM */
+#endif /* HAVE_INTEL_AVX512 */
#endif /* HAVE_AES_DECRYPT */
#endif /* WOLFSSL_AESNI */
@@ -16078,6 +16733,26 @@ int wc_AesXtsEncrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz,
#elif defined(WOLFSSL_AESNI)
if (aes->use_aesni) {
SAVE_VECTOR_REGISTERS(return _svr_ret;);
+#if defined(HAVE_INTEL_AVX512)
+ if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_XTS_encrypt_avx512(in, out, sz, i,
+ (const byte*)aes->key,
+ (const byte*)xaes->tweak.key,
+ (int)aes->rounds);
+ ret = 0;
+ }
+ else
+#endif
+#if defined(HAVE_INTEL_VAES)
+ if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_XTS_encrypt_vaes(in, out, sz, i,
+ (const byte*)aes->key,
+ (const byte*)xaes->tweak.key,
+ (int)aes->rounds);
+ ret = 0;
+ }
+ else
+#endif
#if defined(HAVE_INTEL_AVX1)
if (IS_INTEL_AVX1(intel_flags)) {
AES_XTS_encrypt_avx1(in, out, sz, i,
@@ -16180,6 +16855,24 @@ int wc_AesXtsEncryptInit(XtsAes* xaes, const byte* i, word32 iSz,
#ifdef WOLFSSL_AESNI
if (aes->use_aesni) {
SAVE_VECTOR_REGISTERS(return _svr_ret;);
+#if defined(HAVE_INTEL_AVX512)
+ if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_XTS_init_avx512(stream->tweak_block,
+ (const byte*)xaes->tweak.key,
+ (int)xaes->tweak.rounds);
+ ret = 0;
+ }
+ else
+#endif
+#if defined(HAVE_INTEL_VAES)
+ if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_XTS_init_vaes(stream->tweak_block,
+ (const byte*)xaes->tweak.key,
+ (int)xaes->tweak.rounds);
+ ret = 0;
+ }
+ else
+#endif
#if defined(HAVE_INTEL_AVX1)
if (IS_INTEL_AVX1(intel_flags)) {
AES_XTS_init_avx1(stream->tweak_block,
@@ -16275,6 +16968,26 @@ static int AesXtsEncryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s
#ifdef WOLFSSL_AESNI
if (aes->use_aesni) {
SAVE_VECTOR_REGISTERS(return _svr_ret;);
+#if defined(HAVE_INTEL_AVX512)
+ if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_XTS_encrypt_update_avx512(in, out, sz,
+ (const byte*)aes->key,
+ stream->tweak_block,
+ (int)aes->rounds);
+ ret = 0;
+ }
+ else
+#endif
+#if defined(HAVE_INTEL_VAES)
+ if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_XTS_encrypt_update_vaes(in, out, sz,
+ (const byte*)aes->key,
+ stream->tweak_block,
+ (int)aes->rounds);
+ ret = 0;
+ }
+ else
+#endif
#if defined(HAVE_INTEL_AVX1)
if (IS_INTEL_AVX1(intel_flags)) {
AES_XTS_encrypt_update_avx1(in, out, sz,
@@ -16559,6 +17272,26 @@ int wc_AesXtsDecrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz,
#elif defined(WOLFSSL_AESNI)
if (aes->use_aesni) {
SAVE_VECTOR_REGISTERS(return _svr_ret;);
+#if defined(HAVE_INTEL_AVX512)
+ if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_XTS_decrypt_avx512(in, out, sz, i,
+ (const byte*)aes->key,
+ (const byte*)xaes->tweak.key,
+ (int)aes->rounds);
+ ret = 0;
+ }
+ else
+#endif
+#if defined(HAVE_INTEL_VAES)
+ if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_XTS_decrypt_vaes(in, out, sz, i,
+ (const byte*)aes->key,
+ (const byte*)xaes->tweak.key,
+ (int)aes->rounds);
+ ret = 0;
+ }
+ else
+#endif
#if defined(HAVE_INTEL_AVX1)
if (IS_INTEL_AVX1(intel_flags)) {
AES_XTS_decrypt_avx1(in, out, sz, i,
@@ -16664,6 +17397,24 @@ int wc_AesXtsDecryptInit(XtsAes* xaes, const byte* i, word32 iSz,
#ifdef WOLFSSL_AESNI
if (aes->use_aesni) {
SAVE_VECTOR_REGISTERS(return _svr_ret;);
+#if defined(HAVE_INTEL_AVX512)
+ if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_XTS_init_avx512(stream->tweak_block,
+ (const byte*)xaes->tweak.key,
+ (int)xaes->tweak.rounds);
+ ret = 0;
+ }
+ else
+#endif
+#if defined(HAVE_INTEL_VAES)
+ if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_XTS_init_vaes(stream->tweak_block,
+ (const byte*)xaes->tweak.key,
+ (int)xaes->tweak.rounds);
+ ret = 0;
+ }
+ else
+#endif
#if defined(HAVE_INTEL_AVX1)
if (IS_INTEL_AVX1(intel_flags)) {
AES_XTS_init_avx1(stream->tweak_block,
@@ -16751,6 +17502,26 @@ static int AesXtsDecryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s
#ifdef WOLFSSL_AESNI
if (aes->use_aesni) {
SAVE_VECTOR_REGISTERS(return _svr_ret;);
+#if defined(HAVE_INTEL_AVX512)
+ if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_XTS_decrypt_update_avx512(in, out, sz,
+ (const byte*)aes->key,
+ stream->tweak_block,
+ (int)aes->rounds);
+ ret = 0;
+ }
+ else
+#endif
+#if defined(HAVE_INTEL_VAES)
+ if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) {
+ AES_XTS_decrypt_update_vaes(in, out, sz,
+ (const byte*)aes->key,
+ stream->tweak_block,
+ (int)aes->rounds);
+ ret = 0;
+ }
+ else
+#endif
#if defined(HAVE_INTEL_AVX1)
if (IS_INTEL_AVX1(intel_flags)) {
AES_XTS_decrypt_update_avx1(in, out, sz,
diff --git a/wolfcrypt/src/aes_asm.S b/wolfcrypt/src/aes_asm.S
index 0371ca8cb22..d4131676542 100644
--- a/wolfcrypt/src/aes_asm.S
+++ b/wolfcrypt/src/aes_asm.S
@@ -46,1314 +46,7 @@
#endif /* WOLFSSL_USER_SETTINGS_ASM */
#endif /* WOLFSSL_USER_SETTINGS */
-#ifdef WOLFSSL_X86_64_BUILD
-
-/*
-AES_CBC_encrypt_AESNI (const unsigned char *in,
- unsigned char *out,
- unsigned char ivec[16],
- unsigned long length,
- const unsigned char *KS,
- int nr)
-*/
-#ifndef __APPLE__
-.globl AES_CBC_encrypt_AESNI
-AES_CBC_encrypt_AESNI:
-#else
-.globl _AES_CBC_encrypt_AESNI
-_AES_CBC_encrypt_AESNI:
-#endif
-# parameter 1: %rdi
-# parameter 2: %rsi
-# parameter 3: %rdx
-# parameter 4: %rcx
-# parameter 5: %r8
-# parameter 6: %r9d
-movq %rcx, %r10
-shrq $4, %rcx
-shlq $60, %r10
-je NO_PARTS
-addq $1, %rcx
-NO_PARTS:
-subq $16, %rsi
-movdqa (%rdx), %xmm1
-LOOP:
-pxor (%rdi), %xmm1
-pxor (%r8), %xmm1
-addq $16,%rsi
-addq $16,%rdi
-cmpl $12, %r9d
-aesenc 16(%r8),%xmm1
-aesenc 32(%r8),%xmm1
-aesenc 48(%r8),%xmm1
-aesenc 64(%r8),%xmm1
-aesenc 80(%r8),%xmm1
-aesenc 96(%r8),%xmm1
-aesenc 112(%r8),%xmm1
-aesenc 128(%r8),%xmm1
-aesenc 144(%r8),%xmm1
-movdqa 160(%r8),%xmm2
-jb LAST
-cmpl $14, %r9d
-
-aesenc 160(%r8),%xmm1
-aesenc 176(%r8),%xmm1
-movdqa 192(%r8),%xmm2
-jb LAST
-aesenc 192(%r8),%xmm1
-aesenc 208(%r8),%xmm1
-movdqa 224(%r8),%xmm2
-LAST:
-decq %rcx
-aesenclast %xmm2,%xmm1
-movdqu %xmm1,(%rsi)
-jne LOOP
-ret
-
-
-#if defined(WOLFSSL_AESNI_BY4)
-
-/*
-AES_CBC_decrypt_AESNI_by4 (const unsigned char *in,
- unsigned char *out,
- unsigned char ivec[16],
- unsigned long length,
- const unsigned char *KS,
- int nr)
-*/
-#ifndef __APPLE__
-.globl AES_CBC_decrypt_AESNI_by4
-AES_CBC_decrypt_AESNI_by4:
-#else
-.globl _AES_CBC_decrypt_AESNI_by4
-_AES_CBC_decrypt_AESNI_by4:
-#endif
-# parameter 1: %rdi
-# parameter 2: %rsi
-# parameter 3: %rdx
-# parameter 4: %rcx
-# parameter 5: %r8
-# parameter 6: %r9d
-
- movq %rcx, %r10
- shrq $4, %rcx
- shlq $60, %r10
- je DNO_PARTS_4
- addq $1, %rcx
-DNO_PARTS_4:
- movq %rcx, %r10
- shlq $62, %r10
- shrq $62, %r10
- shrq $2, %rcx
- movdqu (%rdx),%xmm5
- je DREMAINDER_4
- subq $64, %rsi
-DLOOP_4:
- movdqu (%rdi), %xmm1
- movdqu 16(%rdi), %xmm2
- movdqu 32(%rdi), %xmm3
- movdqu 48(%rdi), %xmm4
- movdqa %xmm1, %xmm6
- movdqa %xmm2, %xmm7
- movdqa %xmm3, %xmm8
- movdqa %xmm4, %xmm15
- movdqa (%r8), %xmm9
- movdqa 16(%r8), %xmm10
- movdqa 32(%r8), %xmm11
- movdqa 48(%r8), %xmm12
- pxor %xmm9, %xmm1
- pxor %xmm9, %xmm2
- pxor %xmm9, %xmm3
- pxor %xmm9, %xmm4
- aesdec %xmm10, %xmm1
- aesdec %xmm10, %xmm2
- aesdec %xmm10, %xmm3
- aesdec %xmm10, %xmm4
- aesdec %xmm11, %xmm1
- aesdec %xmm11, %xmm2
- aesdec %xmm11, %xmm3
- aesdec %xmm11, %xmm4
- aesdec %xmm12, %xmm1
- aesdec %xmm12, %xmm2
- aesdec %xmm12, %xmm3
- aesdec %xmm12, %xmm4
- movdqa 64(%r8), %xmm9
- movdqa 80(%r8), %xmm10
- movdqa 96(%r8), %xmm11
- movdqa 112(%r8), %xmm12
- aesdec %xmm9, %xmm1
- aesdec %xmm9, %xmm2
- aesdec %xmm9, %xmm3
- aesdec %xmm9, %xmm4
- aesdec %xmm10, %xmm1
- aesdec %xmm10, %xmm2
- aesdec %xmm10, %xmm3
- aesdec %xmm10, %xmm4
- aesdec %xmm11, %xmm1
- aesdec %xmm11, %xmm2
- aesdec %xmm11, %xmm3
- aesdec %xmm11, %xmm4
- aesdec %xmm12, %xmm1
- aesdec %xmm12, %xmm2
- aesdec %xmm12, %xmm3
- aesdec %xmm12, %xmm4
- movdqa 128(%r8), %xmm9
- movdqa 144(%r8), %xmm10
- movdqa 160(%r8), %xmm11
- cmpl $12, %r9d
- aesdec %xmm9, %xmm1
- aesdec %xmm9, %xmm2
- aesdec %xmm9, %xmm3
- aesdec %xmm9, %xmm4
- aesdec %xmm10, %xmm1
- aesdec %xmm10, %xmm2
- aesdec %xmm10, %xmm3
- aesdec %xmm10, %xmm4
- jb DLAST_4
- movdqa 160(%r8), %xmm9
- movdqa 176(%r8), %xmm10
- movdqa 192(%r8), %xmm11
- cmpl $14, %r9d
- aesdec %xmm9, %xmm1
- aesdec %xmm9, %xmm2
- aesdec %xmm9, %xmm3
- aesdec %xmm9, %xmm4
- aesdec %xmm10, %xmm1
- aesdec %xmm10, %xmm2
- aesdec %xmm10, %xmm3
- aesdec %xmm10, %xmm4
- jb DLAST_4
- movdqa 192(%r8), %xmm9
- movdqa 208(%r8), %xmm10
- movdqa 224(%r8), %xmm11
- aesdec %xmm9, %xmm1
- aesdec %xmm9, %xmm2
- aesdec %xmm9, %xmm3
- aesdec %xmm9, %xmm4
- aesdec %xmm10, %xmm1
- aesdec %xmm10, %xmm2
- aesdec %xmm10, %xmm3
- aesdec %xmm10, %xmm4
-DLAST_4:
- addq $64, %rdi
- addq $64, %rsi
- decq %rcx
- aesdeclast %xmm11, %xmm1
- aesdeclast %xmm11, %xmm2
- aesdeclast %xmm11, %xmm3
- aesdeclast %xmm11, %xmm4
- pxor %xmm5, %xmm1
- pxor %xmm6, %xmm2
- pxor %xmm7, %xmm3
- pxor %xmm8, %xmm4
- movdqu %xmm1, (%rsi)
- movdqu %xmm2, 16(%rsi)
- movdqu %xmm3, 32(%rsi)
- movdqu %xmm4, 48(%rsi)
- movdqa %xmm15,%xmm5
- jne DLOOP_4
- addq $64, %rsi
-DREMAINDER_4:
- cmpq $0, %r10
- je DEND_4
-DLOOP_4_2:
- movdqu (%rdi), %xmm1
- movdqa %xmm1, %xmm15
- addq $16, %rdi
- pxor (%r8), %xmm1
- movdqu 160(%r8), %xmm2
- cmpl $12, %r9d
- aesdec 16(%r8), %xmm1
- aesdec 32(%r8), %xmm1
- aesdec 48(%r8), %xmm1
- aesdec 64(%r8), %xmm1
- aesdec 80(%r8), %xmm1
- aesdec 96(%r8), %xmm1
- aesdec 112(%r8), %xmm1
- aesdec 128(%r8), %xmm1
- aesdec 144(%r8), %xmm1
- jb DLAST_4_2
- movdqu 192(%r8), %xmm2
- cmpl $14, %r9d
- aesdec 160(%r8), %xmm1
- aesdec 176(%r8), %xmm1
- jb DLAST_4_2
- movdqu 224(%r8), %xmm2
- aesdec 192(%r8), %xmm1
- aesdec 208(%r8), %xmm1
-DLAST_4_2:
- aesdeclast %xmm2, %xmm1
- pxor %xmm5, %xmm1
- movdqa %xmm15, %xmm5
- movdqu %xmm1, (%rsi)
- addq $16, %rsi
- decq %r10
- jne DLOOP_4_2
-DEND_4:
- ret
-
-#elif defined(WOLFSSL_AESNI_BY6)
-
-/*
-AES_CBC_decrypt_AESNI_by6 (const unsigned char *in,
- unsigned char *out,
- unsigned char ivec[16],
- unsigned long length,
- const unsigned char *KS,
- int nr)
-*/
-#ifndef __APPLE__
-.globl AES_CBC_decrypt_AESNI_by6
-AES_CBC_decrypt_AESNI_by6:
-#else
-.globl _AES_CBC_decrypt_AESNI_by6
-_AES_CBC_decrypt_AESNI_by6:
-#endif
-# parameter 1: %rdi - in
-# parameter 2: %rsi - out
-# parameter 3: %rdx - ivec
-# parameter 4: %rcx - length
-# parameter 5: %r8 - KS
-# parameter 6: %r9d - nr
-
- movq %rcx, %r10
- shrq $4, %rcx
- shlq $60, %r10
- je DNO_PARTS_6
- addq $1, %rcx
-DNO_PARTS_6:
- movq %rax, %r12
- movq %rdx, %r13
- movq %rbx, %r14
- movq $0, %rdx
- movq %rcx, %rax
- movq $6, %rbx
- div %rbx
- movq %rax, %rcx
- movq %rdx, %r10
- movq %r12, %rax
- movq %r13, %rdx
- movq %r14, %rbx
- cmpq $0, %rcx
- movdqu (%rdx), %xmm7
- je DREMAINDER_6
- subq $96, %rsi
-DLOOP_6:
- movdqu (%rdi), %xmm1
- movdqu 16(%rdi), %xmm2
- movdqu 32(%rdi), %xmm3
- movdqu 48(%rdi), %xmm4
- movdqu 64(%rdi), %xmm5
- movdqu 80(%rdi), %xmm6
- movdqa (%r8), %xmm8
- movdqa 16(%r8), %xmm9
- movdqa 32(%r8), %xmm10
- movdqa 48(%r8), %xmm11
- pxor %xmm8, %xmm1
- pxor %xmm8, %xmm2
- pxor %xmm8, %xmm3
- pxor %xmm8, %xmm4
- pxor %xmm8, %xmm5
- pxor %xmm8, %xmm6
- aesdec %xmm9, %xmm1
- aesdec %xmm9, %xmm2
- aesdec %xmm9, %xmm3
- aesdec %xmm9, %xmm4
- aesdec %xmm9, %xmm5
- aesdec %xmm9, %xmm6
- aesdec %xmm10, %xmm1
- aesdec %xmm10, %xmm2
- aesdec %xmm10, %xmm3
- aesdec %xmm10, %xmm4
- aesdec %xmm10, %xmm5
- aesdec %xmm10, %xmm6
- aesdec %xmm11, %xmm1
- aesdec %xmm11, %xmm2
- aesdec %xmm11, %xmm3
- aesdec %xmm11, %xmm4
- aesdec %xmm11, %xmm5
- aesdec %xmm11, %xmm6
- movdqa 64(%r8), %xmm8
- movdqa 80(%r8), %xmm9
- movdqa 96(%r8), %xmm10
- movdqa 112(%r8), %xmm11
- aesdec %xmm8, %xmm1
- aesdec %xmm8, %xmm2
- aesdec %xmm8, %xmm3
- aesdec %xmm8, %xmm4
- aesdec %xmm8, %xmm5
- aesdec %xmm8, %xmm6
- aesdec %xmm9, %xmm1
- aesdec %xmm9, %xmm2
- aesdec %xmm9, %xmm3
- aesdec %xmm9, %xmm4
- aesdec %xmm9, %xmm5
- aesdec %xmm9, %xmm6
- aesdec %xmm10, %xmm1
- aesdec %xmm10, %xmm2
- aesdec %xmm10, %xmm3
- aesdec %xmm10, %xmm4
- aesdec %xmm10, %xmm5
- aesdec %xmm10, %xmm6
- aesdec %xmm11, %xmm1
- aesdec %xmm11, %xmm2
- aesdec %xmm11, %xmm3
- aesdec %xmm11, %xmm4
- aesdec %xmm11, %xmm5
- aesdec %xmm11, %xmm6
- movdqa 128(%r8), %xmm8
- movdqa 144(%r8), %xmm9
- movdqa 160(%r8), %xmm10
- cmpl $12, %r9d
- aesdec %xmm8, %xmm1
- aesdec %xmm8, %xmm2
- aesdec %xmm8, %xmm3
- aesdec %xmm8, %xmm4
- aesdec %xmm8, %xmm5
- aesdec %xmm8, %xmm6
- aesdec %xmm9, %xmm1
- aesdec %xmm9, %xmm2
- aesdec %xmm9, %xmm3
- aesdec %xmm9, %xmm4
- aesdec %xmm9, %xmm5
- aesdec %xmm9, %xmm6
- jb DLAST_6
- movdqa 160(%r8), %xmm8
- movdqa 176(%r8), %xmm9
- movdqa 192(%r8), %xmm10
- cmpl $14, %r9d
- aesdec %xmm8, %xmm1
- aesdec %xmm8, %xmm2
- aesdec %xmm8, %xmm3
- aesdec %xmm8, %xmm4
- aesdec %xmm8, %xmm5
- aesdec %xmm8, %xmm6
- aesdec %xmm9, %xmm1
- aesdec %xmm9, %xmm2
- aesdec %xmm9, %xmm3
- aesdec %xmm9, %xmm4
- aesdec %xmm9, %xmm5
- aesdec %xmm9, %xmm6
- jb DLAST_6
- movdqa 192(%r8), %xmm8
- movdqa 208(%r8), %xmm9
- movdqa 224(%r8), %xmm10
- aesdec %xmm8, %xmm1
- aesdec %xmm8, %xmm2
- aesdec %xmm8, %xmm3
- aesdec %xmm8, %xmm4
- aesdec %xmm8, %xmm5
- aesdec %xmm8, %xmm6
- aesdec %xmm9, %xmm1
- aesdec %xmm9, %xmm2
- aesdec %xmm9, %xmm3
- aesdec %xmm9, %xmm4
- aesdec %xmm9, %xmm5
- aesdec %xmm9, %xmm6
-DLAST_6:
- addq $96, %rsi
- aesdeclast %xmm10, %xmm1
- aesdeclast %xmm10, %xmm2
- aesdeclast %xmm10, %xmm3
- aesdeclast %xmm10, %xmm4
- aesdeclast %xmm10, %xmm5
- aesdeclast %xmm10, %xmm6
- movdqu (%rdi), %xmm8
- movdqu 16(%rdi), %xmm9
- movdqu 32(%rdi), %xmm10
- movdqu 48(%rdi), %xmm11
- movdqu 64(%rdi), %xmm12
- movdqu 80(%rdi), %xmm13
- pxor %xmm7, %xmm1
- pxor %xmm8, %xmm2
- pxor %xmm9, %xmm3
- pxor %xmm10, %xmm4
- pxor %xmm11, %xmm5
- pxor %xmm12, %xmm6
- movdqu %xmm13, %xmm7
- movdqu %xmm1, (%rsi)
- movdqu %xmm2, 16(%rsi)
- movdqu %xmm3, 32(%rsi)
- movdqu %xmm4, 48(%rsi)
- movdqu %xmm5, 64(%rsi)
- movdqu %xmm6, 80(%rsi)
- addq $96, %rdi
- decq %rcx
- jne DLOOP_6
- addq $96, %rsi
-DREMAINDER_6:
- cmpq $0, %r10
- je DEND_6
-DLOOP_6_2:
- movdqu (%rdi), %xmm1
- movdqa %xmm1, %xmm10
- addq $16, %rdi
- pxor (%r8), %xmm1
- movdqu 160(%r8), %xmm2
- cmpl $12, %r9d
- aesdec 16(%r8), %xmm1
- aesdec 32(%r8), %xmm1
- aesdec 48(%r8), %xmm1
- aesdec 64(%r8), %xmm1
- aesdec 80(%r8), %xmm1
- aesdec 96(%r8), %xmm1
- aesdec 112(%r8), %xmm1
- aesdec 128(%r8), %xmm1
- aesdec 144(%r8), %xmm1
- jb DLAST_6_2
- movdqu 192(%r8), %xmm2
- cmpl $14, %r9d
- aesdec 160(%r8), %xmm1
- aesdec 176(%r8), %xmm1
- jb DLAST_6_2
- movdqu 224(%r8), %xmm2
- aesdec 192(%r8), %xmm1
- aesdec 208(%r8), %xmm1
-DLAST_6_2:
- aesdeclast %xmm2, %xmm1
- pxor %xmm7, %xmm1
- movdqa %xmm10, %xmm7
- movdqu %xmm1, (%rsi)
- addq $16, %rsi
- decq %r10
- jne DLOOP_6_2
-DEND_6:
- ret
-
-#else /* WOLFSSL_AESNI_BYx */
-
-/*
-AES_CBC_decrypt_AESNI_by8 (const unsigned char *in,
- unsigned char *out,
- unsigned char ivec[16],
- unsigned long length,
- const unsigned char *KS,
- int nr)
-*/
-#ifndef __APPLE__
-.globl AES_CBC_decrypt_AESNI_by8
-AES_CBC_decrypt_AESNI_by8:
-#else
-.globl _AES_CBC_decrypt_AESNI_by8
-_AES_CBC_decrypt_AESNI_by8:
-#endif
-# parameter 1: %rdi - in
-# parameter 2: %rsi - out
-# parameter 3: %rdx - ivec
-# parameter 4: %rcx - length
-# parameter 5: %r8 - KS
-# parameter 6: %r9d - nr
-
- movq %rcx, %r10
- shrq $4, %rcx
- shlq $60, %r10
- je DNO_PARTS_8
- addq $1, %rcx
-DNO_PARTS_8:
- movq %rcx, %r10
- shlq $61, %r10
- shrq $61, %r10
- shrq $3, %rcx
- movdqu (%rdx), %xmm9
- je DREMAINDER_8
- subq $128, %rsi
-DLOOP_8:
- movdqu (%rdi), %xmm1
- movdqu 16(%rdi), %xmm2
- movdqu 32(%rdi), %xmm3
- movdqu 48(%rdi), %xmm4
- movdqu 64(%rdi), %xmm5
- movdqu 80(%rdi), %xmm6
- movdqu 96(%rdi), %xmm7
- movdqu 112(%rdi), %xmm8
- movdqa (%r8), %xmm10
- movdqa 16(%r8), %xmm11
- movdqa 32(%r8), %xmm12
- movdqa 48(%r8), %xmm13
- pxor %xmm10, %xmm1
- pxor %xmm10, %xmm2
- pxor %xmm10, %xmm3
- pxor %xmm10, %xmm4
- pxor %xmm10, %xmm5
- pxor %xmm10, %xmm6
- pxor %xmm10, %xmm7
- pxor %xmm10, %xmm8
- aesdec %xmm11, %xmm1
- aesdec %xmm11, %xmm2
- aesdec %xmm11, %xmm3
- aesdec %xmm11, %xmm4
- aesdec %xmm11, %xmm5
- aesdec %xmm11, %xmm6
- aesdec %xmm11, %xmm7
- aesdec %xmm11, %xmm8
- aesdec %xmm12, %xmm1
- aesdec %xmm12, %xmm2
- aesdec %xmm12, %xmm3
- aesdec %xmm12, %xmm4
- aesdec %xmm12, %xmm5
- aesdec %xmm12, %xmm6
- aesdec %xmm12, %xmm7
- aesdec %xmm12, %xmm8
- aesdec %xmm13, %xmm1
- aesdec %xmm13, %xmm2
- aesdec %xmm13, %xmm3
- aesdec %xmm13, %xmm4
- aesdec %xmm13, %xmm5
- aesdec %xmm13, %xmm6
- aesdec %xmm13, %xmm7
- aesdec %xmm13, %xmm8
- movdqa 64(%r8), %xmm10
- movdqa 80(%r8), %xmm11
- movdqa 96(%r8), %xmm12
- movdqa 112(%r8), %xmm13
- aesdec %xmm10, %xmm1
- aesdec %xmm10, %xmm2
- aesdec %xmm10, %xmm3
- aesdec %xmm10, %xmm4
- aesdec %xmm10, %xmm5
- aesdec %xmm10, %xmm6
- aesdec %xmm10, %xmm7
- aesdec %xmm10, %xmm8
- aesdec %xmm11, %xmm1
- aesdec %xmm11, %xmm2
- aesdec %xmm11, %xmm3
- aesdec %xmm11, %xmm4
- aesdec %xmm11, %xmm5
- aesdec %xmm11, %xmm6
- aesdec %xmm11, %xmm7
- aesdec %xmm11, %xmm8
- aesdec %xmm12, %xmm1
- aesdec %xmm12, %xmm2
- aesdec %xmm12, %xmm3
- aesdec %xmm12, %xmm4
- aesdec %xmm12, %xmm5
- aesdec %xmm12, %xmm6
- aesdec %xmm12, %xmm7
- aesdec %xmm12, %xmm8
- aesdec %xmm13, %xmm1
- aesdec %xmm13, %xmm2
- aesdec %xmm13, %xmm3
- aesdec %xmm13, %xmm4
- aesdec %xmm13, %xmm5
- aesdec %xmm13, %xmm6
- aesdec %xmm13, %xmm7
- aesdec %xmm13, %xmm8
- movdqa 128(%r8), %xmm10
- movdqa 144(%r8), %xmm11
- movdqa 160(%r8), %xmm12
- cmpl $12, %r9d
- aesdec %xmm10, %xmm1
- aesdec %xmm10, %xmm2
- aesdec %xmm10, %xmm3
- aesdec %xmm10, %xmm4
- aesdec %xmm10, %xmm5
- aesdec %xmm10, %xmm6
- aesdec %xmm10, %xmm7
- aesdec %xmm10, %xmm8
- aesdec %xmm11, %xmm1
- aesdec %xmm11, %xmm2
- aesdec %xmm11, %xmm3
- aesdec %xmm11, %xmm4
- aesdec %xmm11, %xmm5
- aesdec %xmm11, %xmm6
- aesdec %xmm11, %xmm7
- aesdec %xmm11, %xmm8
- jb DLAST_8
- movdqa 160(%r8), %xmm10
- movdqa 176(%r8), %xmm11
- movdqa 192(%r8), %xmm12
- cmpl $14, %r9d
- aesdec %xmm10, %xmm1
- aesdec %xmm10, %xmm2
- aesdec %xmm10, %xmm3
- aesdec %xmm10, %xmm4
- aesdec %xmm10, %xmm5
- aesdec %xmm10, %xmm6
- aesdec %xmm10, %xmm7
- aesdec %xmm10, %xmm8
- aesdec %xmm11, %xmm1
- aesdec %xmm11, %xmm2
- aesdec %xmm11, %xmm3
- aesdec %xmm11, %xmm4
- aesdec %xmm11, %xmm5
- aesdec %xmm11, %xmm6
- aesdec %xmm11, %xmm7
- aesdec %xmm11, %xmm8
- jb DLAST_8
- movdqa 192(%r8), %xmm10
- movdqa 208(%r8), %xmm11
- movdqa 224(%r8), %xmm12
- aesdec %xmm10, %xmm1
- aesdec %xmm10, %xmm2
- aesdec %xmm10, %xmm3
- aesdec %xmm10, %xmm4
- aesdec %xmm10, %xmm5
- aesdec %xmm10, %xmm6
- aesdec %xmm10, %xmm7
- aesdec %xmm10, %xmm8
- aesdec %xmm11, %xmm1
- aesdec %xmm11, %xmm2
- aesdec %xmm11, %xmm3
- aesdec %xmm11, %xmm4
- aesdec %xmm11, %xmm5
- aesdec %xmm11, %xmm6
- aesdec %xmm11, %xmm7
- aesdec %xmm11, %xmm8
-DLAST_8:
- addq $128, %rsi
- aesdeclast %xmm12, %xmm1
- aesdeclast %xmm12, %xmm2
- aesdeclast %xmm12, %xmm3
- aesdeclast %xmm12, %xmm4
- aesdeclast %xmm12, %xmm5
- aesdeclast %xmm12, %xmm6
- aesdeclast %xmm12, %xmm7
- aesdeclast %xmm12, %xmm8
- movdqu (%rdi), %xmm10
- movdqu 16(%rdi), %xmm11
- movdqu 32(%rdi), %xmm12
- movdqu 48(%rdi), %xmm13
- pxor %xmm9, %xmm1
- pxor %xmm10, %xmm2
- pxor %xmm11, %xmm3
- pxor %xmm12, %xmm4
- pxor %xmm13, %xmm5
- movdqu 64(%rdi), %xmm10
- movdqu 80(%rdi), %xmm11
- movdqu 96(%rdi), %xmm12
- movdqu 112(%rdi), %xmm9
- pxor %xmm10, %xmm6
- pxor %xmm11, %xmm7
- pxor %xmm12, %xmm8
- movdqu %xmm1, (%rsi)
- movdqu %xmm2, 16(%rsi)
- movdqu %xmm3, 32(%rsi)
- movdqu %xmm4, 48(%rsi)
- movdqu %xmm5, 64(%rsi)
- movdqu %xmm6, 80(%rsi)
- movdqu %xmm7, 96(%rsi)
- movdqu %xmm8, 112(%rsi)
- addq $128, %rdi
- decq %rcx
- jne DLOOP_8
- addq $128, %rsi
-DREMAINDER_8:
- cmpq $0, %r10
- je DEND_8
-DLOOP_8_2:
- movdqu (%rdi), %xmm1
- movdqa %xmm1, %xmm10
- addq $16, %rdi
- pxor (%r8), %xmm1
- movdqu 160(%r8), %xmm2
- cmpl $12, %r9d
- aesdec 16(%r8), %xmm1
- aesdec 32(%r8), %xmm1
- aesdec 48(%r8), %xmm1
- aesdec 64(%r8), %xmm1
- aesdec 80(%r8), %xmm1
- aesdec 96(%r8), %xmm1
- aesdec 112(%r8), %xmm1
- aesdec 128(%r8), %xmm1
- aesdec 144(%r8), %xmm1
- jb DLAST_8_2
- movdqu 192(%r8), %xmm2
- cmpl $14, %r9d
- aesdec 160(%r8), %xmm1
- aesdec 176(%r8), %xmm1
- jb DLAST_8_2
- movdqu 224(%r8), %xmm2
- aesdec 192(%r8), %xmm1
- aesdec 208(%r8), %xmm1
-DLAST_8_2:
- aesdeclast %xmm2, %xmm1
- pxor %xmm9, %xmm1
- movdqa %xmm10, %xmm9
- movdqu %xmm1, (%rsi)
- addq $16, %rsi
- decq %r10
- jne DLOOP_8_2
-DEND_8:
- ret
-
-#endif /* WOLFSSL_AESNI_BYx */
-
-
-/*
-AES_ECB_encrypt_AESNI (const unsigned char *in,
- unsigned char *out,
- unsigned long length,
- const unsigned char *KS,
- int nr)
-*/
-#ifndef __APPLE__
-.globl AES_ECB_encrypt_AESNI
-AES_ECB_encrypt_AESNI:
-#else
-.globl _AES_ECB_encrypt_AESNI
-_AES_ECB_encrypt_AESNI:
-#endif
-# parameter 1: %rdi
-# parameter 2: %rsi
-# parameter 3: %rdx
-# parameter 4: %rcx
-# parameter 5: %r8d
- movq %rdx, %r10
- shrq $4, %rdx
- shlq $60, %r10
- je EECB_NO_PARTS_4
- addq $1, %rdx
-EECB_NO_PARTS_4:
- movq %rdx, %r10
- shlq $62, %r10
- shrq $62, %r10
- shrq $2, %rdx
- je EECB_REMAINDER_4
- subq $64, %rsi
-EECB_LOOP_4:
- movdqu (%rdi), %xmm1
- movdqu 16(%rdi), %xmm2
- movdqu 32(%rdi), %xmm3
- movdqu 48(%rdi), %xmm4
- movdqa (%rcx), %xmm9
- movdqa 16(%rcx), %xmm10
- movdqa 32(%rcx), %xmm11
- movdqa 48(%rcx), %xmm12
- pxor %xmm9, %xmm1
- pxor %xmm9, %xmm2
- pxor %xmm9, %xmm3
- pxor %xmm9, %xmm4
- aesenc %xmm10, %xmm1
- aesenc %xmm10, %xmm2
- aesenc %xmm10, %xmm3
- aesenc %xmm10, %xmm4
- aesenc %xmm11, %xmm1
- aesenc %xmm11, %xmm2
- aesenc %xmm11, %xmm3
- aesenc %xmm11, %xmm4
- aesenc %xmm12, %xmm1
- aesenc %xmm12, %xmm2
- aesenc %xmm12, %xmm3
- aesenc %xmm12, %xmm4
- movdqa 64(%rcx), %xmm9
- movdqa 80(%rcx), %xmm10
- movdqa 96(%rcx), %xmm11
- movdqa 112(%rcx), %xmm12
- aesenc %xmm9, %xmm1
- aesenc %xmm9, %xmm2
- aesenc %xmm9, %xmm3
- aesenc %xmm9, %xmm4
- aesenc %xmm10, %xmm1
- aesenc %xmm10, %xmm2
- aesenc %xmm10, %xmm3
- aesenc %xmm10, %xmm4
- aesenc %xmm11, %xmm1
- aesenc %xmm11, %xmm2
- aesenc %xmm11, %xmm3
- aesenc %xmm11, %xmm4
- aesenc %xmm12, %xmm1
- aesenc %xmm12, %xmm2
- aesenc %xmm12, %xmm3
- aesenc %xmm12, %xmm4
- movdqa 128(%rcx), %xmm9
- movdqa 144(%rcx), %xmm10
- movdqa 160(%rcx), %xmm11
- cmpl $12, %r8d
- aesenc %xmm9, %xmm1
- aesenc %xmm9, %xmm2
- aesenc %xmm9, %xmm3
- aesenc %xmm9, %xmm4
- aesenc %xmm10, %xmm1
- aesenc %xmm10, %xmm2
- aesenc %xmm10, %xmm3
- aesenc %xmm10, %xmm4
- jb EECB_LAST_4
- movdqa 160(%rcx), %xmm9
- movdqa 176(%rcx), %xmm10
- movdqa 192(%rcx), %xmm11
- cmpl $14, %r8d
- aesenc %xmm9, %xmm1
- aesenc %xmm9, %xmm2
- aesenc %xmm9, %xmm3
- aesenc %xmm9, %xmm4
- aesenc %xmm10, %xmm1
- aesenc %xmm10, %xmm2
- aesenc %xmm10, %xmm3
- aesenc %xmm10, %xmm4
- jb EECB_LAST_4
- movdqa 192(%rcx), %xmm9
- movdqa 208(%rcx), %xmm10
- movdqa 224(%rcx), %xmm11
- aesenc %xmm9, %xmm1
- aesenc %xmm9, %xmm2
- aesenc %xmm9, %xmm3
- aesenc %xmm9, %xmm4
- aesenc %xmm10, %xmm1
- aesenc %xmm10, %xmm2
- aesenc %xmm10, %xmm3
- aesenc %xmm10, %xmm4
-EECB_LAST_4:
- addq $64, %rdi
- addq $64, %rsi
- decq %rdx
- aesenclast %xmm11, %xmm1
- aesenclast %xmm11, %xmm2
- aesenclast %xmm11, %xmm3
- aesenclast %xmm11, %xmm4
- movdqu %xmm1, (%rsi)
- movdqu %xmm2, 16(%rsi)
- movdqu %xmm3, 32(%rsi)
- movdqu %xmm4, 48(%rsi)
- jne EECB_LOOP_4
- addq $64, %rsi
-EECB_REMAINDER_4:
- cmpq $0, %r10
- je EECB_END_4
-EECB_LOOP_4_2:
- movdqu (%rdi), %xmm1
- addq $16, %rdi
- pxor (%rcx), %xmm1
- movdqu 160(%rcx), %xmm2
- aesenc 16(%rcx), %xmm1
- aesenc 32(%rcx), %xmm1
- aesenc 48(%rcx), %xmm1
- aesenc 64(%rcx), %xmm1
- aesenc 80(%rcx), %xmm1
- aesenc 96(%rcx), %xmm1
- aesenc 112(%rcx), %xmm1
- aesenc 128(%rcx), %xmm1
- aesenc 144(%rcx), %xmm1
- cmpl $12, %r8d
- jb EECB_LAST_4_2
- movdqu 192(%rcx), %xmm2
- aesenc 160(%rcx), %xmm1
- aesenc 176(%rcx), %xmm1
- cmpl $14, %r8d
- jb EECB_LAST_4_2
- movdqu 224(%rcx), %xmm2
- aesenc 192(%rcx), %xmm1
- aesenc 208(%rcx), %xmm1
-EECB_LAST_4_2:
- aesenclast %xmm2, %xmm1
- movdqu %xmm1, (%rsi)
- addq $16, %rsi
- decq %r10
- jne EECB_LOOP_4_2
-EECB_END_4:
- ret
-
-
-/*
-AES_ECB_decrypt_AESNI (const unsigned char *in,
- unsigned char *out,
- unsigned long length,
- const unsigned char *KS,
- int nr)
-*/
-#ifndef __APPLE__
-.globl AES_ECB_decrypt_AESNI
-AES_ECB_decrypt_AESNI:
-#else
-.globl _AES_ECB_decrypt_AESNI
-_AES_ECB_decrypt_AESNI:
-#endif
-# parameter 1: %rdi
-# parameter 2: %rsi
-# parameter 3: %rdx
-# parameter 4: %rcx
-# parameter 5: %r8d
-
- movq %rdx, %r10
- shrq $4, %rdx
- shlq $60, %r10
- je DECB_NO_PARTS_4
- addq $1, %rdx
-DECB_NO_PARTS_4:
- movq %rdx, %r10
- shlq $62, %r10
- shrq $62, %r10
- shrq $2, %rdx
- je DECB_REMAINDER_4
- subq $64, %rsi
-DECB_LOOP_4:
- movdqu (%rdi), %xmm1
- movdqu 16(%rdi), %xmm2
- movdqu 32(%rdi), %xmm3
- movdqu 48(%rdi), %xmm4
- movdqa (%rcx), %xmm9
- movdqa 16(%rcx), %xmm10
- movdqa 32(%rcx), %xmm11
- movdqa 48(%rcx), %xmm12
- pxor %xmm9, %xmm1
- pxor %xmm9, %xmm2
- pxor %xmm9, %xmm3
- pxor %xmm9, %xmm4
- aesdec %xmm10, %xmm1
- aesdec %xmm10, %xmm2
- aesdec %xmm10, %xmm3
- aesdec %xmm10, %xmm4
- aesdec %xmm11, %xmm1
- aesdec %xmm11, %xmm2
- aesdec %xmm11, %xmm3
- aesdec %xmm11, %xmm4
- aesdec %xmm12, %xmm1
- aesdec %xmm12, %xmm2
- aesdec %xmm12, %xmm3
- aesdec %xmm12, %xmm4
- movdqa 64(%rcx), %xmm9
- movdqa 80(%rcx), %xmm10
- movdqa 96(%rcx), %xmm11
- movdqa 112(%rcx), %xmm12
- aesdec %xmm9, %xmm1
- aesdec %xmm9, %xmm2
- aesdec %xmm9, %xmm3
- aesdec %xmm9, %xmm4
- aesdec %xmm10, %xmm1
- aesdec %xmm10, %xmm2
- aesdec %xmm10, %xmm3
- aesdec %xmm10, %xmm4
- aesdec %xmm11, %xmm1
- aesdec %xmm11, %xmm2
- aesdec %xmm11, %xmm3
- aesdec %xmm11, %xmm4
- aesdec %xmm12, %xmm1
- aesdec %xmm12, %xmm2
- aesdec %xmm12, %xmm3
- aesdec %xmm12, %xmm4
- movdqa 128(%rcx), %xmm9
- movdqa 144(%rcx), %xmm10
- movdqa 160(%rcx), %xmm11
- cmpl $12, %r8d
- aesdec %xmm9, %xmm1
- aesdec %xmm9, %xmm2
- aesdec %xmm9, %xmm3
- aesdec %xmm9, %xmm4
- aesdec %xmm10, %xmm1
- aesdec %xmm10, %xmm2
- aesdec %xmm10, %xmm3
- aesdec %xmm10, %xmm4
- jb DECB_LAST_4
- movdqa 160(%rcx), %xmm9
- movdqa 176(%rcx), %xmm10
- movdqa 192(%rcx), %xmm11
- cmpl $14, %r8d
- aesdec %xmm9, %xmm1
- aesdec %xmm9, %xmm2
- aesdec %xmm9, %xmm3
- aesdec %xmm9, %xmm4
- aesdec %xmm10, %xmm1
- aesdec %xmm10, %xmm2
- aesdec %xmm10, %xmm3
- aesdec %xmm10, %xmm4
- jb DECB_LAST_4
- movdqa 192(%rcx), %xmm9
- movdqa 208(%rcx), %xmm10
- movdqa 224(%rcx), %xmm11
- aesdec %xmm9, %xmm1
- aesdec %xmm9, %xmm2
- aesdec %xmm9, %xmm3
- aesdec %xmm9, %xmm4
- aesdec %xmm10, %xmm1
- aesdec %xmm10, %xmm2
- aesdec %xmm10, %xmm3
- aesdec %xmm10, %xmm4
-DECB_LAST_4:
- addq $64, %rdi
- addq $64, %rsi
- decq %rdx
- aesdeclast %xmm11, %xmm1
- aesdeclast %xmm11, %xmm2
- aesdeclast %xmm11, %xmm3
- aesdeclast %xmm11, %xmm4
- movdqu %xmm1, (%rsi)
- movdqu %xmm2, 16(%rsi)
- movdqu %xmm3, 32(%rsi)
- movdqu %xmm4, 48(%rsi)
- jne DECB_LOOP_4
- addq $64, %rsi
-DECB_REMAINDER_4:
- cmpq $0, %r10
- je DECB_END_4
-DECB_LOOP_4_2:
- movdqu (%rdi), %xmm1
- addq $16, %rdi
- pxor (%rcx), %xmm1
- movdqu 160(%rcx), %xmm2
- cmpl $12, %r8d
- aesdec 16(%rcx), %xmm1
- aesdec 32(%rcx), %xmm1
- aesdec 48(%rcx), %xmm1
- aesdec 64(%rcx), %xmm1
- aesdec 80(%rcx), %xmm1
- aesdec 96(%rcx), %xmm1
- aesdec 112(%rcx), %xmm1
- aesdec 128(%rcx), %xmm1
- aesdec 144(%rcx), %xmm1
- jb DECB_LAST_4_2
- cmpl $14, %r8d
- movdqu 192(%rcx), %xmm2
- aesdec 160(%rcx), %xmm1
- aesdec 176(%rcx), %xmm1
- jb DECB_LAST_4_2
- movdqu 224(%rcx), %xmm2
- aesdec 192(%rcx), %xmm1
- aesdec 208(%rcx), %xmm1
-DECB_LAST_4_2:
- aesdeclast %xmm2, %xmm1
- movdqu %xmm1, (%rsi)
- addq $16, %rsi
- decq %r10
- jne DECB_LOOP_4_2
-DECB_END_4:
- ret
-
-
-
-
-/*
-void AES_128_Key_Expansion_AESNI(const unsigned char* userkey,
- unsigned char* key_schedule);
-*/
-#ifndef __APPLE__
-.globl AES_128_Key_Expansion_AESNI
-.align 16,0x90
-AES_128_Key_Expansion_AESNI:
-#else
-.globl _AES_128_Key_Expansion_AESNI
-.p2align 4
-_AES_128_Key_Expansion_AESNI:
-#endif
-# parameter 1: %rdi
-# parameter 2: %rsi
-
-movdqu (%rdi), %xmm1
-movdqa %xmm1, (%rsi)
-
-
-ASSISTS:
-aeskeygenassist $1, %xmm1, %xmm2
-call PREPARE_ROUNDKEY_128
-movdqa %xmm1, 16(%rsi)
-aeskeygenassist $2, %xmm1, %xmm2
-call PREPARE_ROUNDKEY_128
-movdqa %xmm1, 32(%rsi)
-aeskeygenassist $4, %xmm1, %xmm2
-call PREPARE_ROUNDKEY_128
-movdqa %xmm1, 48(%rsi)
-aeskeygenassist $8, %xmm1, %xmm2
-call PREPARE_ROUNDKEY_128
-movdqa %xmm1, 64(%rsi)
-aeskeygenassist $16, %xmm1, %xmm2
-call PREPARE_ROUNDKEY_128
-movdqa %xmm1, 80(%rsi)
-aeskeygenassist $32, %xmm1, %xmm2
-call PREPARE_ROUNDKEY_128
-movdqa %xmm1, 96(%rsi)
-aeskeygenassist $64, %xmm1, %xmm2
-call PREPARE_ROUNDKEY_128
-movdqa %xmm1, 112(%rsi)
-aeskeygenassist $0x80, %xmm1, %xmm2
-call PREPARE_ROUNDKEY_128
-movdqa %xmm1, 128(%rsi)
-aeskeygenassist $0x1b, %xmm1, %xmm2
-call PREPARE_ROUNDKEY_128
-movdqa %xmm1, 144(%rsi)
-aeskeygenassist $0x36, %xmm1, %xmm2
-call PREPARE_ROUNDKEY_128
-movdqa %xmm1, 160(%rsi)
-ret
-
-PREPARE_ROUNDKEY_128:
-pshufd $255, %xmm2, %xmm2
-movdqa %xmm1, %xmm3
-pslldq $4, %xmm3
-pxor %xmm3, %xmm1
-pslldq $4, %xmm3
-pxor %xmm3, %xmm1
-pslldq $4, %xmm3
-pxor %xmm3, %xmm1
-pxor %xmm2, %xmm1
-ret
-
-
-/*
-void AES_192_Key_Expansion_AESNI (const unsigned char *userkey,
- unsigned char *key)
-*/
-#ifndef __APPLE__
-.globl AES_192_Key_Expansion_AESNI
-AES_192_Key_Expansion_AESNI:
-#else
-.globl _AES_192_Key_Expansion_AESNI
-_AES_192_Key_Expansion_AESNI:
-#endif
-# parameter 1: %rdi
-# parameter 2: %rsi
-
-movdqu (%rdi), %xmm1
-movq 16(%rdi), %xmm3
-movdqa %xmm1, (%rsi)
-movdqa %xmm3, %xmm5
-
-aeskeygenassist $0x1, %xmm3, %xmm2
-call PREPARE_ROUNDKEY_192
-shufpd $0, %xmm1, %xmm5
-movdqa %xmm5, 16(%rsi)
-movdqa %xmm1, %xmm6
-shufpd $1, %xmm3, %xmm6
-movdqa %xmm6, 32(%rsi)
-
-aeskeygenassist $0x2, %xmm3, %xmm2
-call PREPARE_ROUNDKEY_192
-movdqa %xmm1, 48(%rsi)
-movdqa %xmm3, %xmm5
-
-aeskeygenassist $0x4, %xmm3, %xmm2
-call PREPARE_ROUNDKEY_192
-shufpd $0, %xmm1, %xmm5
-movdqa %xmm5, 64(%rsi)
-movdqa %xmm1, %xmm6
-shufpd $1, %xmm3, %xmm6
-movdqa %xmm6, 80(%rsi)
-
-aeskeygenassist $0x8, %xmm3, %xmm2
-call PREPARE_ROUNDKEY_192
-movdqa %xmm1, 96(%rsi)
-movdqa %xmm3, %xmm5
-
-aeskeygenassist $0x10, %xmm3, %xmm2
-call PREPARE_ROUNDKEY_192
-shufpd $0, %xmm1, %xmm5
-movdqa %xmm5, 112(%rsi)
-movdqa %xmm1, %xmm6
-shufpd $1, %xmm3, %xmm6
-movdqa %xmm6, 128(%rsi)
-
-aeskeygenassist $0x20, %xmm3, %xmm2
-call PREPARE_ROUNDKEY_192
-movdqa %xmm1, 144(%rsi)
-movdqa %xmm3, %xmm5
-
-aeskeygenassist $0x40, %xmm3, %xmm2
-call PREPARE_ROUNDKEY_192
-shufpd $0, %xmm1, %xmm5
-movdqa %xmm5, 160(%rsi)
-movdqa %xmm1, %xmm6
-shufpd $1, %xmm3, %xmm6
-movdqa %xmm6, 176(%rsi)
-
-aeskeygenassist $0x80, %xmm3, %xmm2
-call PREPARE_ROUNDKEY_192
-movdqa %xmm1, 192(%rsi)
-movdqa %xmm3, 208(%rsi)
-ret
-
-PREPARE_ROUNDKEY_192:
-pshufd $0x55, %xmm2, %xmm2
-movdqu %xmm1, %xmm4
-pslldq $4, %xmm4
-pxor %xmm4, %xmm1
-
-pslldq $4, %xmm4
-pxor %xmm4, %xmm1
-pslldq $4, %xmm4
-pxor %xmm4, %xmm1
-pxor %xmm2, %xmm1
-pshufd $0xff, %xmm1, %xmm2
-movdqu %xmm3, %xmm4
-pslldq $4, %xmm4
-pxor %xmm4, %xmm3
-pxor %xmm2, %xmm3
-ret
-
-
-/*
-void AES_256_Key_Expansion_AESNI (const unsigned char *userkey,
- unsigned char *key)
-*/
-#ifndef __APPLE__
-.globl AES_256_Key_Expansion_AESNI
-AES_256_Key_Expansion_AESNI:
-#else
-.globl _AES_256_Key_Expansion_AESNI
-_AES_256_Key_Expansion_AESNI:
-#endif
-# parameter 1: %rdi
-# parameter 2: %rsi
-
-movdqu (%rdi), %xmm1
-movdqu 16(%rdi), %xmm3
-movdqa %xmm1, (%rsi)
-movdqa %xmm3, 16(%rsi)
-
-aeskeygenassist $0x1, %xmm3, %xmm2
-call MAKE_RK256_a
-movdqa %xmm1, 32(%rsi)
-aeskeygenassist $0x0, %xmm1, %xmm2
-call MAKE_RK256_b
-movdqa %xmm3, 48(%rsi)
-aeskeygenassist $0x2, %xmm3, %xmm2
-call MAKE_RK256_a
-movdqa %xmm1, 64(%rsi)
-aeskeygenassist $0x0, %xmm1, %xmm2
-call MAKE_RK256_b
-movdqa %xmm3, 80(%rsi)
-aeskeygenassist $0x4, %xmm3, %xmm2
-call MAKE_RK256_a
-movdqa %xmm1, 96(%rsi)
-aeskeygenassist $0x0, %xmm1, %xmm2
-call MAKE_RK256_b
-movdqa %xmm3, 112(%rsi)
-aeskeygenassist $0x8, %xmm3, %xmm2
-call MAKE_RK256_a
-movdqa %xmm1, 128(%rsi)
-aeskeygenassist $0x0, %xmm1, %xmm2
-call MAKE_RK256_b
-movdqa %xmm3, 144(%rsi)
-aeskeygenassist $0x10, %xmm3, %xmm2
-call MAKE_RK256_a
-movdqa %xmm1, 160(%rsi)
-aeskeygenassist $0x0, %xmm1, %xmm2
-call MAKE_RK256_b
-movdqa %xmm3, 176(%rsi)
-aeskeygenassist $0x20, %xmm3, %xmm2
-call MAKE_RK256_a
-movdqa %xmm1, 192(%rsi)
-
-aeskeygenassist $0x0, %xmm1, %xmm2
-call MAKE_RK256_b
-movdqa %xmm3, 208(%rsi)
-aeskeygenassist $0x40, %xmm3, %xmm2
-call MAKE_RK256_a
-movdqa %xmm1, 224(%rsi)
-
-ret
-
-MAKE_RK256_a:
-pshufd $0xff, %xmm2, %xmm2
-movdqa %xmm1, %xmm4
-pslldq $4, %xmm4
-pxor %xmm4, %xmm1
-pslldq $4, %xmm4
-pxor %xmm4, %xmm1
-pslldq $4, %xmm4
-pxor %xmm4, %xmm1
-pxor %xmm2, %xmm1
-ret
-
-MAKE_RK256_b:
-pshufd $0xaa, %xmm2, %xmm2
-movdqa %xmm3, %xmm4
-pslldq $4, %xmm4
-pxor %xmm4, %xmm3
-pslldq $4, %xmm4
-pxor %xmm4, %xmm3
-pslldq $4, %xmm4
-pxor %xmm4, %xmm3
-pxor %xmm2, %xmm3
-ret
-
-#elif defined WOLFSSL_X86_BUILD
+#if defined WOLFSSL_X86_BUILD
/*
AES_CBC_encrypt_AESNI (const unsigned char *in,
@@ -2238,7 +931,7 @@ MAKE_RK256_b:
pxor %xmm2, %xmm3
ret
-#endif /* WOLFSSL_X86_64_BUILD */
+#endif /* WOLFSSL_X86_BUILD */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
diff --git a/wolfcrypt/src/aes_asm.asm b/wolfcrypt/src/aes_asm.asm
index 4b5e1250d52..cb06a54ac52 100644
--- a/wolfcrypt/src/aes_asm.asm
+++ b/wolfcrypt/src/aes_asm.asm
@@ -1,1531 +1,54 @@
-; /* aes_asm.asm
-; *
-; * Copyright (C) 2006-2026 wolfSSL Inc.
-; *
-; * This file is part of wolfSSL.
-; *
-; * wolfSSL is free software; you can redistribute it and/or modify
-; * it under the terms of the GNU General Public License as published by
-; * the Free Software Foundation; either version 3 of the License, or
-; * (at your option) any later version.
-; *
-; * wolfSSL is distributed in the hope that it will be useful,
-; * but WITHOUT ANY WARRANTY; without even the implied warranty of
-; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-; * GNU General Public License for more details.
-; *
-; * You should have received a copy of the GNU General Public License
-; * along with this program; if not, write to the Free Software
-; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
-; */
+; /* aes_asm.asm
+; *
+; * Copyright (C) 2006-2026 wolfSSL Inc.
+; *
+; * This file is part of wolfSSL.
+; *
+; * wolfSSL is free software; you can redistribute it and/or modify
+; * it under the terms of the GNU General Public License as published by
+; * the Free Software Foundation; either version 3 of the License, or
+; * (at your option) any later version.
+; *
+; * wolfSSL is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; * GNU General Public License for more details.
+; *
+; * You should have received a copy of the GNU General Public License
+; * along with this program; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+; */
-;
-;
-; /* See Intel Advanced Encryption Standard (AES) Instructions Set White Paper
-; * by Israel, Intel Mobility Group Development Center, Israel Shay Gueron
-; */
-;
-; /* This file is in intel asm syntax, see .s for at&t syntax */
-;
-
-
-fips_version = 0
-IFDEF HAVE_FIPS
- fips_version = 1
- IFDEF HAVE_FIPS_VERSION
- fips_version = HAVE_FIPS_VERSION
- ENDIF
-ENDIF
-
-IF fips_version GE 2
- fipsAb SEGMENT ALIAS(".fipsA$b") 'CODE'
-ELSE
- _text SEGMENT
-ENDIF
-
-
-; /*
-; AES_CBC_encrypt_AESNI[const ,unsigned char*in
-; unsigned ,char*out
-; unsigned ,char ivec+16
-; unsigned ,long length
-; const ,unsigned char*KS
-; int nr]
-; */
-AES_CBC_encrypt_AESNI PROC
-;# parameter 1: rdi
-;# parameter 2: rsi
-;# parameter 3: rdx
-;# parameter 4: rcx
-;# parameter 5: r8
-;# parameter 6: r9d
-
-; save rdi and rsi to rax and r11, restore before ret
- mov rax,rdi
- mov r11,rsi
-
-; convert to what we had for att&t convention
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
- mov rcx,r9
- mov r8,[rsp+40]
- mov r9d,[rsp+48]
-
- mov r10,rcx
- shr rcx,4
- shl r10,60
- je NO_PARTS
- add rcx,1
-NO_PARTS:
- sub rsi,16
- movdqa xmm1,[rdx]
-LOOP_1:
- pxor xmm1,[rdi]
- pxor xmm1,[r8]
- add rsi,16
- add rdi,16
- cmp r9d,12
- aesenc xmm1,16[r8]
- aesenc xmm1,32[r8]
- aesenc xmm1,48[r8]
- aesenc xmm1,64[r8]
- aesenc xmm1,80[r8]
- aesenc xmm1,96[r8]
- aesenc xmm1,112[r8]
- aesenc xmm1,128[r8]
- aesenc xmm1,144[r8]
- movdqa xmm2,160[r8]
- jb LAST
- cmp r9d,14
-
- aesenc xmm1,160[r8]
- aesenc xmm1,176[r8]
- movdqa xmm2,192[r8]
- jb LAST
- aesenc xmm1,192[r8]
- aesenc xmm1,208[r8]
- movdqa xmm2,224[r8]
-LAST:
- dec rcx
- aesenclast xmm1,xmm2
- movdqu [rsi],xmm1
- jne LOOP_1
- ; restore non volatile rdi,rsi
- mov rdi,rax
- mov rsi,r11
- ret
-AES_CBC_encrypt_AESNI ENDP
-
-
-; void AES_CBC_decrypt_AESNI_by4(const unsigned char* in,
-; unsigned char* out,
-; unsigned char ivec[16],
-; unsigned long length,
-; const unsigned char* KS,
-; int nr)
-AES_CBC_decrypt_AESNI_by4 PROC
-; parameter 1: rdi
-; parameter 2: rsi
-; parameter 3: rdx
-; parameter 4: rcx
-; parameter 5: r8
-; parameter 6: r9d
-
- ; save rdi and rsi to rax and r11, restore before ret
- mov rax, rdi
- mov r11, rsi
- ; convert to what we had for att&t convention
- mov rdi, rcx
- mov rsi, rdx
- mov rdx, r8
- mov rcx,r9
- mov r8, [rsp+40]
- mov r9d, [rsp+48]
- ; on microsoft xmm6-xmm15 are non volatile,
- ; let's save on stack and restore at end
- sub rsp, 8+8*16 ; 8 = align stack , 8 xmm6-12,15 16 bytes each
- movdqa [rsp+0], xmm6
- movdqa [rsp+16], xmm7
- movdqa [rsp+32], xmm8
- movdqa [rsp+48], xmm9
- movdqa [rsp+64], xmm10
- movdqa [rsp+80], xmm11
- movdqa [rsp+96], xmm12
- movdqa [rsp+112], xmm15
- ; back to our original code, more or less
- mov r10, rcx
- shr rcx, 4
- shl r10, 60
- je DNO_PARTS_4
- add rcx, 1
-DNO_PARTS_4:
- mov r10, rcx
- shl r10, 62
- shr r10, 62
- shr rcx, 2
- movdqu xmm5, [rdx]
- je DREMAINDER_4
- sub rsi, 64
-DLOOP_4:
- movdqu xmm1, [rdi]
- movdqu xmm2, 16[rdi]
- movdqu xmm3, 32[rdi]
- movdqu xmm4, 48[rdi]
- movdqa xmm6, xmm1
- movdqa xmm7, xmm2
- movdqa xmm8, xmm3
- movdqa xmm15, xmm4
- movdqa xmm9, [r8]
- movdqa xmm10, 16[r8]
- movdqa xmm11, 32[r8]
- movdqa xmm12, 48[r8]
- pxor xmm1, xmm9
- pxor xmm2, xmm9
- pxor xmm3, xmm9
- pxor xmm4, xmm9
- aesdec xmm1, xmm10
- aesdec xmm2, xmm10
- aesdec xmm3, xmm10
- aesdec xmm4, xmm10
- aesdec xmm1, xmm11
- aesdec xmm2, xmm11
- aesdec xmm3, xmm11
- aesdec xmm4, xmm11
- aesdec xmm1, xmm12
- aesdec xmm2, xmm12
- aesdec xmm3, xmm12
- aesdec xmm4, xmm12
- movdqa xmm9, 64[r8]
- movdqa xmm10, 80[r8]
- movdqa xmm11, 96[r8]
- movdqa xmm12, 112[r8]
- aesdec xmm1, xmm9
- aesdec xmm2, xmm9
- aesdec xmm3, xmm9
- aesdec xmm4, xmm9
- aesdec xmm1, xmm10
- aesdec xmm2, xmm10
- aesdec xmm3, xmm10
- aesdec xmm4, xmm10
- aesdec xmm1, xmm11
- aesdec xmm2, xmm11
- aesdec xmm3, xmm11
- aesdec xmm4, xmm11
- aesdec xmm1, xmm12
- aesdec xmm2, xmm12
- aesdec xmm3, xmm12
- aesdec xmm4, xmm12
- movdqa xmm9, 128[r8]
- movdqa xmm10, 144[r8]
- movdqa xmm11, 160[r8]
- cmp r9d, 12
- aesdec xmm1, xmm9
- aesdec xmm2, xmm9
- aesdec xmm3, xmm9
- aesdec xmm4, xmm9
- aesdec xmm1, xmm10
- aesdec xmm2, xmm10
- aesdec xmm3, xmm10
- aesdec xmm4, xmm10
- jb DLAST_4
- movdqa xmm9, 160[r8]
- movdqa xmm10, 176[r8]
- movdqa xmm11, 192[r8]
- cmp r9d, 14
- aesdec xmm1, xmm9
- aesdec xmm2, xmm9
- aesdec xmm3, xmm9
- aesdec xmm4, xmm9
- aesdec xmm1, xmm10
- aesdec xmm2, xmm10
- aesdec xmm3, xmm10
- aesdec xmm4, xmm10
- jb DLAST_4
- movdqa xmm9, 192[r8]
- movdqa xmm10, 208[r8]
- movdqa xmm11, 224[r8]
- aesdec xmm1, xmm9
- aesdec xmm2, xmm9
- aesdec xmm3, xmm9
- aesdec xmm4, xmm9
- aesdec xmm1, xmm10
- aesdec xmm2, xmm10
- aesdec xmm3, xmm10
- aesdec xmm4, xmm10
-DLAST_4:
- add rdi, 64
- add rsi, 64
- dec rcx
- aesdeclast xmm1, xmm11
- aesdeclast xmm2, xmm11
- aesdeclast xmm3, xmm11
- aesdeclast xmm4, xmm11
- pxor xmm1, xmm5
- pxor xmm2, xmm6
- pxor xmm3, xmm7
- pxor xmm4, xmm8
- movdqu [rsi], xmm1
- movdqu 16[rsi], xmm2
- movdqu 32[rsi], xmm3
- movdqu 48[rsi], xmm4
- movdqa xmm5, xmm15
- jne DLOOP_4
- add rsi, 64
-DREMAINDER_4:
- cmp r10, 0
- je DEND_4
-DLOOP_4_2:
- movdqu xmm1, [rdi]
- movdqa xmm15, xmm1
- add rdi, 16
- pxor xmm1, [r8]
- movdqu xmm2, 160[r8]
- cmp r9d, 12
- aesdec xmm1, 16[r8]
- aesdec xmm1, 32[r8]
- aesdec xmm1, 48[r8]
- aesdec xmm1, 64[r8]
- aesdec xmm1, 80[r8]
- aesdec xmm1, 96[r8]
- aesdec xmm1, 112[r8]
- aesdec xmm1, 128[r8]
- aesdec xmm1, 144[r8]
- jb DLAST_4_2
- movdqu xmm2, 192[r8]
- cmp r9d, 14
- aesdec xmm1, 160[r8]
- aesdec xmm1, 176[r8]
- jb DLAST_4_2
- movdqu xmm2, 224[r8]
- aesdec xmm1, 192[r8]
- aesdec xmm1, 208[r8]
-DLAST_4_2:
- aesdeclast xmm1, xmm2
- pxor xmm1, xmm5
- movdqa xmm5, xmm15
- movdqu [rsi], xmm1
- add rsi, 16
- dec r10
- jne DLOOP_4_2
-DEND_4:
- ; restore non volatile rdi,rsi
- mov rdi, rax
- mov rsi, r11
- ; restore non volatile xmms from stack
- movdqa xmm6, [rsp+0]
- movdqa xmm7, [rsp+16]
- movdqa xmm8, [rsp+32]
- movdqa xmm9, [rsp+48]
- movdqa xmm10, [rsp+64]
- movdqa xmm11, [rsp+80]
- movdqa xmm12, [rsp+96]
- movdqa xmm15, [rsp+112]
- add rsp, 8+8*16 ; 8 = align stack , 8 xmm6-12,15 16 bytes each
- ret
-AES_CBC_decrypt_AESNI_by4 ENDP
-
-
-; void AES_CBC_decrypt_AESNI_by6(const unsigned char *in,
-; unsigned char *out,
-; unsigned char ivec[16],
-; unsigned long length,
-; const unsigned char *KS,
-; int nr)
-AES_CBC_decrypt_AESNI_by6 PROC
-; parameter 1: rdi - in
-; parameter 2: rsi - out
-; parameter 3: rdx - ivec
-; parameter 4: rcx - length
-; parameter 5: r8 - KS
-; parameter 6: r9d - nr
-
- ; save rdi and rsi to rax and r11, restore before ret
- mov rax, rdi
- mov r11, rsi
- ; convert to what we had for att&t convention
- mov rdi, rcx
- mov rsi, rdx
- mov rdx, r8
- mov rcx, r9
- mov r8, [rsp+40]
- mov r9d, [rsp+48]
- ; on microsoft xmm6-xmm15 are non volatile,
- ; let's save on stack and restore at end
- sub rsp, 8+9*16 ; 8 = align stack , 9 xmm6-14 16 bytes each
- movdqa [rsp+0], xmm6
- movdqa [rsp+16], xmm7
- movdqa [rsp+32], xmm8
- movdqa [rsp+48], xmm9
- movdqa [rsp+64], xmm10
- movdqa [rsp+80], xmm11
- movdqa [rsp+96], xmm12
- movdqa [rsp+112], xmm13
- movdqa [rsp+128], xmm14
- ; back to our original code, more or less
- mov r10, rcx
- shr rcx, 4
- shl r10, 60
- je DNO_PARTS_6
- add rcx, 1
-DNO_PARTS_6:
- mov r12, rax
- mov r13, rdx
- mov r14, rbx
- mov rdx, 0
- mov rax, rcx
- mov rbx, 6
- div rbx
- mov rcx, rax
- mov r10, rdx
- mov rax, r12
- mov rdx, r13
- mov rbx, r14
- cmp rcx, 0
- movdqu xmm7, [rdx]
- je DREMAINDER_6
- sub rsi, 96
-DLOOP_6:
- movdqu xmm1, [rdi]
- movdqu xmm2, 16[rdi]
- movdqu xmm3, 32[rdi]
- movdqu xmm4, 48[rdi]
- movdqu xmm5, 64[rdi]
- movdqu xmm6, 80[rdi]
- movdqa xmm8, [r8]
- movdqa xmm9, 16[r8]
- movdqa xmm10, 32[r8]
- movdqa xmm11, 48[r8]
- pxor xmm1, xmm8
- pxor xmm2, xmm8
- pxor xmm3, xmm8
- pxor xmm4, xmm8
- pxor xmm5, xmm8
- pxor xmm6, xmm8
- aesdec xmm1, xmm9
- aesdec xmm2, xmm9
- aesdec xmm3, xmm9
- aesdec xmm4, xmm9
- aesdec xmm5, xmm9
- aesdec xmm6, xmm9
- aesdec xmm1, xmm10
- aesdec xmm2, xmm10
- aesdec xmm3, xmm10
- aesdec xmm4, xmm10
- aesdec xmm5, xmm10
- aesdec xmm6, xmm10
- aesdec xmm1, xmm11
- aesdec xmm2, xmm11
- aesdec xmm3, xmm11
- aesdec xmm4, xmm11
- aesdec xmm5, xmm11
- aesdec xmm6, xmm11
- movdqa xmm8, 64[r8]
- movdqa xmm9, 80[r8]
- movdqa xmm10, 96[r8]
- movdqa xmm11, 112[r8]
- aesdec xmm1, xmm8
- aesdec xmm2, xmm8
- aesdec xmm3, xmm8
- aesdec xmm4, xmm8
- aesdec xmm5, xmm8
- aesdec xmm6, xmm8
- aesdec xmm1, xmm9
- aesdec xmm2, xmm9
- aesdec xmm3, xmm9
- aesdec xmm4, xmm9
- aesdec xmm5, xmm9
- aesdec xmm6, xmm9
- aesdec xmm1, xmm10
- aesdec xmm2, xmm10
- aesdec xmm3, xmm10
- aesdec xmm4, xmm10
- aesdec xmm5, xmm10
- aesdec xmm6, xmm10
- aesdec xmm1, xmm11
- aesdec xmm2, xmm11
- aesdec xmm3, xmm11
- aesdec xmm4, xmm11
- aesdec xmm5, xmm11
- aesdec xmm6, xmm11
- movdqa xmm8, 128[r8]
- movdqa xmm9, 144[r8]
- movdqa xmm10, 160[r8]
- cmp r9d, 12
- aesdec xmm1, xmm8
- aesdec xmm2, xmm8
- aesdec xmm3, xmm8
- aesdec xmm4, xmm8
- aesdec xmm5, xmm8
- aesdec xmm6, xmm8
- aesdec xmm1, xmm9
- aesdec xmm2, xmm9
- aesdec xmm3, xmm9
- aesdec xmm4, xmm9
- aesdec xmm5, xmm9
- aesdec xmm6, xmm9
- jb DLAST_6
- movdqa xmm8, 160[r8]
- movdqa xmm9, 176[r8]
- movdqa xmm10, 192[r8]
- cmp r9d, 14
- aesdec xmm1, xmm8
- aesdec xmm2, xmm8
- aesdec xmm3, xmm8
- aesdec xmm4, xmm8
- aesdec xmm5, xmm8
- aesdec xmm6, xmm8
- aesdec xmm1, xmm9
- aesdec xmm2, xmm9
- aesdec xmm3, xmm9
- aesdec xmm4, xmm9
- aesdec xmm5, xmm9
- aesdec xmm6, xmm9
- jb DLAST_6
- movdqa xmm8, 192[r8]
- movdqa xmm9, 208[r8]
- movdqa xmm10, 224[r8]
- aesdec xmm1, xmm8
- aesdec xmm2, xmm8
- aesdec xmm3, xmm8
- aesdec xmm4, xmm8
- aesdec xmm5, xmm8
- aesdec xmm6, xmm8
- aesdec xmm1, xmm9
- aesdec xmm2, xmm9
- aesdec xmm3, xmm9
- aesdec xmm4, xmm9
- aesdec xmm5, xmm9
- aesdec xmm6, xmm9
-DLAST_6:
- add rsi, 96
- aesdeclast xmm1, xmm10
- aesdeclast xmm2, xmm10
- aesdeclast xmm3, xmm10
- aesdeclast xmm4, xmm10
- aesdeclast xmm5, xmm10
- aesdeclast xmm6, xmm10
- movdqu xmm8, [rdi]
- movdqu xmm9, 16[rdi]
- movdqu xmm10, 32[rdi]
- movdqu xmm11, 48[rdi]
- movdqu xmm12, 64[rdi]
- movdqu xmm13, 80[rdi]
- pxor xmm1, xmm7
- pxor xmm2, xmm8
- pxor xmm3, xmm9
- pxor xmm4, xmm10
- pxor xmm5, xmm11
- pxor xmm6, xmm12
- movdqu xmm7, xmm13
- movdqu [rsi], xmm1
- movdqu 16[rsi], xmm2
- movdqu 32[rsi], xmm3
- movdqu 48[rsi], xmm4
- movdqu 64[rsi], xmm5
- movdqu 80[rsi], xmm6
- add rdi, 96
- dec rcx
- jne DLOOP_6
- add rsi, 96
-DREMAINDER_6:
- cmp r10, 0
- je DEND_6
-DLOOP_6_2:
- movdqu xmm1, [rdi]
- movdqa xmm10, xmm1
- add rdi, 16
- pxor xmm1, [r8]
- movdqu xmm2, 160[r8]
- cmp r9d, 12
- aesdec xmm1, 16[r8]
- aesdec xmm1, 32[r8]
- aesdec xmm1, 48[r8]
- aesdec xmm1, 64[r8]
- aesdec xmm1, 80[r8]
- aesdec xmm1, 96[r8]
- aesdec xmm1, 112[r8]
- aesdec xmm1, 128[r8]
- aesdec xmm1, 144[r8]
- jb DLAST_6_2
- movdqu xmm2, 192[r8]
- cmp r9d, 14
- aesdec xmm1, 160[r8]
- aesdec xmm1, 176[r8]
- jb DLAST_6_2
- movdqu xmm2, 224[r8]
- aesdec xmm1, 192[r8]
- aesdec xmm1, 208[r8]
-DLAST_6_2:
- aesdeclast xmm1, xmm2
- pxor xmm1, xmm7
- movdqa xmm7, xmm10
- movdqu [rsi], xmm1
- add rsi, 16
- dec r10
- jne DLOOP_6_2
-DEND_6:
- ; restore non volatile rdi,rsi
- mov rdi, rax
- mov rsi, r11
- ; restore non volatile xmms from stack
- movdqa xmm6, [rsp+0]
- movdqa xmm7, [rsp+16]
- movdqa xmm8, [rsp+32]
- movdqa xmm9, [rsp+48]
- movdqa xmm10, [rsp+64]
- movdqa xmm11, [rsp+80]
- movdqa xmm12, [rsp+96]
- movdqa xmm13, [rsp+112]
- movdqa xmm14, [rsp+128]
- add rsp, 8+9*16 ; 8 = align stack , 9 xmm6-14 16 bytes each
- ret
-AES_CBC_decrypt_AESNI_by6 ENDP
-
-
-; void AES_CBC_decrypt_AESNI_by8(const unsigned char *in,
-; unsigned char *out,
-; unsigned char ivec[16],
-; unsigned long length,
-; const unsigned char *KS,
-; int nr)
-AES_CBC_decrypt_AESNI_by8 PROC
-; parameter 1: rdi - in
-; parameter 2: rsi - out
-; parameter 3: rdx - ivec
-; parameter 4: rcx - length
-; parameter 5: r8 - KS
-; parameter 6: r9d - nr
-
- ; save rdi and rsi to rax and r11, restore before ret
- mov rax, rdi
- mov r11, rsi
- ; convert to what we had for att&t convention
- mov rdi, rcx
- mov rsi, rdx
- mov rdx, r8
- mov rcx,r9
- mov r8, [rsp+40]
- mov r9d, [rsp+48]
- ; on microsoft xmm6-xmm15 are non volatile,
- ; let's save on stack and restore at end
- sub rsp, 8+8*16 ; 8 = align stack , 8 xmm6-13 16 bytes each
- movdqa [rsp+0], xmm6
- movdqa [rsp+16], xmm7
- movdqa [rsp+32], xmm8
- movdqa [rsp+48], xmm9
- movdqa [rsp+64], xmm10
- movdqa [rsp+80], xmm11
- movdqa [rsp+96], xmm12
- movdqa [rsp+112], xmm13
- ; back to our original code, more or less
- mov r10, rcx
- shr rcx, 4
- shl r10, 60
- je DNO_PARTS_8
- add rcx, 1
-DNO_PARTS_8:
- mov r10, rcx
- shl r10, 61
- shr r10, 61
- shr rcx, 3
- movdqu xmm9, [rdx]
- je DREMAINDER_8
- sub rsi, 128
-DLOOP_8:
- movdqu xmm1, [rdi]
- movdqu xmm2, 16[rdi]
- movdqu xmm3, 32[rdi]
- movdqu xmm4, 48[rdi]
- movdqu xmm5, 64[rdi]
- movdqu xmm6, 80[rdi]
- movdqu xmm7, 96[rdi]
- movdqu xmm8, 112[rdi]
- movdqa xmm10, [r8]
- movdqa xmm11, 16[r8]
- movdqa xmm12, 32[r8]
- movdqa xmm13, 48[r8]
- pxor xmm1, xmm10
- pxor xmm2, xmm10
- pxor xmm3, xmm10
- pxor xmm4, xmm10
- pxor xmm5, xmm10
- pxor xmm6, xmm10
- pxor xmm7, xmm10
- pxor xmm8, xmm10
- aesdec xmm1, xmm11
- aesdec xmm2, xmm11
- aesdec xmm3, xmm11
- aesdec xmm4, xmm11
- aesdec xmm5, xmm11
- aesdec xmm6, xmm11
- aesdec xmm7, xmm11
- aesdec xmm8, xmm11
- aesdec xmm1, xmm12
- aesdec xmm2, xmm12
- aesdec xmm3, xmm12
- aesdec xmm4, xmm12
- aesdec xmm5, xmm12
- aesdec xmm6, xmm12
- aesdec xmm7, xmm12
- aesdec xmm8, xmm12
- aesdec xmm1, xmm13
- aesdec xmm2, xmm13
- aesdec xmm3, xmm13
- aesdec xmm4, xmm13
- aesdec xmm5, xmm13
- aesdec xmm6, xmm13
- aesdec xmm7, xmm13
- aesdec xmm8, xmm13
- movdqa xmm10, 64[r8]
- movdqa xmm11, 80[r8]
- movdqa xmm12, 96[r8]
- movdqa xmm13, 112[r8]
- aesdec xmm1, xmm10
- aesdec xmm2, xmm10
- aesdec xmm3, xmm10
- aesdec xmm4, xmm10
- aesdec xmm5, xmm10
- aesdec xmm6, xmm10
- aesdec xmm7, xmm10
- aesdec xmm8, xmm10
- aesdec xmm1, xmm11
- aesdec xmm2, xmm11
- aesdec xmm3, xmm11
- aesdec xmm4, xmm11
- aesdec xmm5, xmm11
- aesdec xmm6, xmm11
- aesdec xmm7, xmm11
- aesdec xmm8, xmm11
- aesdec xmm1, xmm12
- aesdec xmm2, xmm12
- aesdec xmm3, xmm12
- aesdec xmm4, xmm12
- aesdec xmm5, xmm12
- aesdec xmm6, xmm12
- aesdec xmm7, xmm12
- aesdec xmm8, xmm12
- aesdec xmm1, xmm13
- aesdec xmm2, xmm13
- aesdec xmm3, xmm13
- aesdec xmm4, xmm13
- aesdec xmm5, xmm13
- aesdec xmm6, xmm13
- aesdec xmm7, xmm13
- aesdec xmm8, xmm13
- movdqa xmm10, 128[r8]
- movdqa xmm11, 144[r8]
- movdqa xmm12, 160[r8]
- cmp r9d, 12
- aesdec xmm1, xmm10
- aesdec xmm2, xmm10
- aesdec xmm3, xmm10
- aesdec xmm4, xmm10
- aesdec xmm5, xmm10
- aesdec xmm6, xmm10
- aesdec xmm7, xmm10
- aesdec xmm8, xmm10
- aesdec xmm1, xmm11
- aesdec xmm2, xmm11
- aesdec xmm3, xmm11
- aesdec xmm4, xmm11
- aesdec xmm5, xmm11
- aesdec xmm6, xmm11
- aesdec xmm7, xmm11
- aesdec xmm8, xmm11
- jb DLAST_8
- movdqa xmm10, 160[r8]
- movdqa xmm11, 176[r8]
- movdqa xmm12, 192[r8]
- cmp r9d, 14
- aesdec xmm1, xmm10
- aesdec xmm2, xmm10
- aesdec xmm3, xmm10
- aesdec xmm4, xmm10
- aesdec xmm5, xmm10
- aesdec xmm6, xmm10
- aesdec xmm7, xmm10
- aesdec xmm8, xmm10
- aesdec xmm1, xmm11
- aesdec xmm2, xmm11
- aesdec xmm3, xmm11
- aesdec xmm4, xmm11
- aesdec xmm5, xmm11
- aesdec xmm6, xmm11
- aesdec xmm7, xmm11
- aesdec xmm8, xmm11
- jb DLAST_8
- movdqa xmm10, 192[r8]
- movdqa xmm11, 208[r8]
- movdqa xmm12, 224[r8]
- aesdec xmm1, xmm10
- aesdec xmm2, xmm10
- aesdec xmm3, xmm10
- aesdec xmm4, xmm10
- aesdec xmm5, xmm10
- aesdec xmm6, xmm10
- aesdec xmm7, xmm10
- aesdec xmm8, xmm10
- aesdec xmm1, xmm11
- aesdec xmm2, xmm11
- aesdec xmm3, xmm11
- aesdec xmm4, xmm11
- aesdec xmm5, xmm11
- aesdec xmm6, xmm11
- aesdec xmm7, xmm11
- aesdec xmm8, xmm11
-DLAST_8:
- add rsi, 128
- aesdeclast xmm1, xmm12
- aesdeclast xmm2, xmm12
- aesdeclast xmm3, xmm12
- aesdeclast xmm4, xmm12
- aesdeclast xmm5, xmm12
- aesdeclast xmm6, xmm12
- aesdeclast xmm7, xmm12
- aesdeclast xmm8, xmm12
- movdqu xmm10, [rdi]
- movdqu xmm11, 16[rdi]
- movdqu xmm12, 32[rdi]
- movdqu xmm13, 48[rdi]
- pxor xmm1, xmm9
- pxor xmm2, xmm10
- pxor xmm3, xmm11
- pxor xmm4, xmm12
- pxor xmm5, xmm13
- movdqu xmm10, 64[rdi]
- movdqu xmm11, 80[rdi]
- movdqu xmm12, 96[rdi]
- movdqu xmm9, 112[rdi]
- pxor xmm6, xmm10
- pxor xmm7, xmm11
- pxor xmm8, xmm12
- movdqu [rsi], xmm1
- movdqu 16[rsi], xmm2
- movdqu 32[rsi], xmm3
- movdqu 48[rsi], xmm4
- movdqu 64[rsi], xmm5
- movdqu 80[rsi], xmm6
- movdqu 96[rsi], xmm7
- movdqu 112[rsi], xmm8
- add rdi, 128
- dec rcx
- jne DLOOP_8
- add rsi, 128
-DREMAINDER_8:
- cmp r10, 0
- je DEND_8
-DLOOP_8_2:
- movdqu xmm1, [rdi]
- movdqa xmm10, xmm1
- add rdi, 16
- pxor xmm1, [r8]
- movdqu xmm2, 160[r8]
- cmp r9d, 12
- aesdec xmm1, 16[r8]
- aesdec xmm1, 32[r8]
- aesdec xmm1, 48[r8]
- aesdec xmm1, 64[r8]
- aesdec xmm1, 80[r8]
- aesdec xmm1, 96[r8]
- aesdec xmm1, 112[r8]
- aesdec xmm1, 128[r8]
- aesdec xmm1, 144[r8]
- jb DLAST_8_2
- movdqu xmm2, 192[r8]
- cmp r9d, 14
- aesdec xmm1, 160[r8]
- aesdec xmm1, 176[r8]
- jb DLAST_8_2
- movdqu xmm2, 224[r8]
- aesdec xmm1, 192[r8]
- aesdec xmm1, 208[r8]
-DLAST_8_2:
- aesdeclast xmm1, xmm2
- pxor xmm1, xmm9
- movdqa xmm9, xmm10
- movdqu [rsi], xmm1
- add rsi, 16
- dec r10
- jne DLOOP_8_2
-DEND_8:
- ; restore non volatile rdi,rsi
- mov rdi, rax
- mov rsi, r11
- ; restore non volatile xmms from stack
- movdqa xmm6, [rsp+0]
- movdqa xmm7, [rsp+16]
- movdqa xmm8, [rsp+32]
- movdqa xmm9, [rsp+48]
- movdqa xmm10, [rsp+64]
- movdqa xmm11, [rsp+80]
- movdqa xmm12, [rsp+96]
- movdqa xmm13, [rsp+112]
- add rsp, 8+8*16 ; 8 = align stack , 8 xmm6-13 16 bytes each
- ret
-AES_CBC_decrypt_AESNI_by8 ENDP
-
-
-; /*
-; AES_ECB_encrypt_AESNI[const ,unsigned char*in
-; unsigned ,char*out
-; unsigned ,long length
-; const ,unsigned char*KS
-; int nr]
-; */
-; . globl AES_ECB_encrypt_AESNI
-AES_ECB_encrypt_AESNI PROC
-;# parameter 1: rdi
-;# parameter 2: rsi
-;# parameter 3: rdx
-;# parameter 4: rcx
-;# parameter 5: r8d
-
-; save rdi and rsi to rax and r11, restore before ret
- mov rax,rdi
- mov r11,rsi
-
-; convert to what we had for att&t convention
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
- mov rcx,r9
- mov r8d,[rsp+40]
-
-; on microsoft xmm6-xmm15 are non volaitle, let's save on stack and restore at end
- sub rsp,8+4*16 ; 8 = align stack , 4 xmm9-12, 16 bytes each
- movdqa [rsp+0], xmm9
- movdqa [rsp+16], xmm10
- movdqa [rsp+32], xmm11
- movdqa [rsp+48], xmm12
-
-
- mov r10,rdx
- shr rdx,4
- shl r10,60
- je EECB_NO_PARTS_4
- add rdx,1
-EECB_NO_PARTS_4:
- mov r10,rdx
- shl r10,62
- shr r10,62
- shr rdx,2
- je EECB_REMAINDER_4
- sub rsi,64
-EECB_LOOP_4:
- movdqu xmm1,[rdi]
- movdqu xmm2,16[rdi]
- movdqu xmm3,32[rdi]
- movdqu xmm4,48[rdi]
- movdqa xmm9,[rcx]
- movdqa xmm10,16[rcx]
- movdqa xmm11,32[rcx]
- movdqa xmm12,48[rcx]
- pxor xmm1,xmm9
- pxor xmm2,xmm9
- pxor xmm3,xmm9
- pxor xmm4,xmm9
- aesenc xmm1,xmm10
- aesenc xmm2,xmm10
- aesenc xmm3,xmm10
- aesenc xmm4,xmm10
- aesenc xmm1,xmm11
- aesenc xmm2,xmm11
- aesenc xmm3,xmm11
- aesenc xmm4,xmm11
- aesenc xmm1,xmm12
- aesenc xmm2,xmm12
- aesenc xmm3,xmm12
- aesenc xmm4,xmm12
- movdqa xmm9,64[rcx]
- movdqa xmm10,80[rcx]
- movdqa xmm11,96[rcx]
- movdqa xmm12,112[rcx]
- aesenc xmm1,xmm9
- aesenc xmm2,xmm9
- aesenc xmm3,xmm9
- aesenc xmm4,xmm9
- aesenc xmm1,xmm10
- aesenc xmm2,xmm10
- aesenc xmm3,xmm10
- aesenc xmm4,xmm10
- aesenc xmm1,xmm11
- aesenc xmm2,xmm11
- aesenc xmm3,xmm11
- aesenc xmm4,xmm11
- aesenc xmm1,xmm12
- aesenc xmm2,xmm12
- aesenc xmm3,xmm12
- aesenc xmm4,xmm12
- movdqa xmm9,128[rcx]
- movdqa xmm10,144[rcx]
- movdqa xmm11,160[rcx]
- cmp r8d,12
- aesenc xmm1,xmm9
- aesenc xmm2,xmm9
- aesenc xmm3,xmm9
- aesenc xmm4,xmm9
- aesenc xmm1,xmm10
- aesenc xmm2,xmm10
- aesenc xmm3,xmm10
- aesenc xmm4,xmm10
- jb EECB_LAST_4
- movdqa xmm9,160[rcx]
- movdqa xmm10,176[rcx]
- movdqa xmm11,192[rcx]
- cmp r8d,14
- aesenc xmm1,xmm9
- aesenc xmm2,xmm9
- aesenc xmm3,xmm9
- aesenc xmm4,xmm9
- aesenc xmm1,xmm10
- aesenc xmm2,xmm10
- aesenc xmm3,xmm10
- aesenc xmm4,xmm10
- jb EECB_LAST_4
- movdqa xmm9,192[rcx]
- movdqa xmm10,208[rcx]
- movdqa xmm11,224[rcx]
- aesenc xmm1,xmm9
- aesenc xmm2,xmm9
- aesenc xmm3,xmm9
- aesenc xmm4,xmm9
- aesenc xmm1,xmm10
- aesenc xmm2,xmm10
- aesenc xmm3,xmm10
- aesenc xmm4,xmm10
-EECB_LAST_4:
- add rdi,64
- add rsi,64
- dec rdx
- aesenclast xmm1,xmm11
- aesenclast xmm2,xmm11
- aesenclast xmm3,xmm11
- aesenclast xmm4,xmm11
- movdqu [rsi],xmm1
- movdqu 16[rsi],xmm2
- movdqu 32[rsi],xmm3
- movdqu 48[rsi],xmm4
- jne EECB_LOOP_4
- add rsi,64
-EECB_REMAINDER_4:
- cmp r10,0
- je EECB_END_4
-EECB_LOOP_4_2:
- movdqu xmm1,[rdi]
- add rdi,16
- pxor xmm1,[rcx]
- movdqu xmm2,160[rcx]
- aesenc xmm1,16[rcx]
- aesenc xmm1,32[rcx]
- aesenc xmm1,48[rcx]
- aesenc xmm1,64[rcx]
- aesenc xmm1,80[rcx]
- aesenc xmm1,96[rcx]
- aesenc xmm1,112[rcx]
- aesenc xmm1,128[rcx]
- aesenc xmm1,144[rcx]
- cmp r8d,12
- jb EECB_LAST_4_2
- movdqu xmm2,192[rcx]
- aesenc xmm1,160[rcx]
- aesenc xmm1,176[rcx]
- cmp r8d,14
- jb EECB_LAST_4_2
- movdqu xmm2,224[rcx]
- aesenc xmm1,192[rcx]
- aesenc xmm1,208[rcx]
-EECB_LAST_4_2:
- aesenclast xmm1,xmm2
- movdqu [rsi],xmm1
- add rsi,16
- dec r10
- jne EECB_LOOP_4_2
-EECB_END_4:
- ; restore non volatile rdi,rsi
- mov rdi,rax
- mov rsi,r11
- ; restore non volatile xmms from stack
- movdqa xmm9, [rsp+0]
- movdqa xmm10, [rsp+16]
- movdqa xmm11, [rsp+32]
- movdqa xmm12, [rsp+48]
- add rsp,8+4*16 ; 8 = align stack , 4 xmm9-12 16 bytes each
- ret
-AES_ECB_encrypt_AESNI ENDP
-
-; /*
-; AES_ECB_decrypt_AESNI[const ,unsigned char*in
-; unsigned ,char*out
-; unsigned ,long length
-; const ,unsigned char*KS
-; int nr]
-; */
-; . globl AES_ECB_decrypt_AESNI
-AES_ECB_decrypt_AESNI PROC
-;# parameter 1: rdi
-;# parameter 2: rsi
-;# parameter 3: rdx
-;# parameter 4: rcx
-;# parameter 5: r8d
-
-; save rdi and rsi to rax and r11, restore before ret
- mov rax,rdi
- mov r11,rsi
-
-; convert to what we had for att&t convention
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
- mov rcx,r9
- mov r8d,[rsp+40]
-
-; on microsoft xmm6-xmm15 are non volaitle, let's save on stack and restore at end
- sub rsp,8+4*16 ; 8 = align stack , 4 xmm9-12, 16 bytes each
- movdqa [rsp+0], xmm9
- movdqa [rsp+16], xmm10
- movdqa [rsp+32], xmm11
- movdqa [rsp+48], xmm12
-
- mov r10,rdx
- shr rdx,4
- shl r10,60
- je DECB_NO_PARTS_4
- add rdx,1
-DECB_NO_PARTS_4:
- mov r10,rdx
- shl r10,62
- shr r10,62
- shr rdx,2
- je DECB_REMAINDER_4
- sub rsi,64
-DECB_LOOP_4:
- movdqu xmm1,[rdi]
- movdqu xmm2,16[rdi]
- movdqu xmm3,32[rdi]
- movdqu xmm4,48[rdi]
- movdqa xmm9,[rcx]
- movdqa xmm10,16[rcx]
- movdqa xmm11,32[rcx]
- movdqa xmm12,48[rcx]
- pxor xmm1,xmm9
- pxor xmm2,xmm9
- pxor xmm3,xmm9
- pxor xmm4,xmm9
- aesdec xmm1,xmm10
- aesdec xmm2,xmm10
- aesdec xmm3,xmm10
- aesdec xmm4,xmm10
- aesdec xmm1,xmm11
- aesdec xmm2,xmm11
- aesdec xmm3,xmm11
- aesdec xmm4,xmm11
- aesdec xmm1,xmm12
- aesdec xmm2,xmm12
- aesdec xmm3,xmm12
- aesdec xmm4,xmm12
- movdqa xmm9,64[rcx]
- movdqa xmm10,80[rcx]
- movdqa xmm11,96[rcx]
- movdqa xmm12,112[rcx]
- aesdec xmm1,xmm9
- aesdec xmm2,xmm9
- aesdec xmm3,xmm9
- aesdec xmm4,xmm9
- aesdec xmm1,xmm10
- aesdec xmm2,xmm10
- aesdec xmm3,xmm10
- aesdec xmm4,xmm10
- aesdec xmm1,xmm11
- aesdec xmm2,xmm11
- aesdec xmm3,xmm11
- aesdec xmm4,xmm11
- aesdec xmm1,xmm12
- aesdec xmm2,xmm12
- aesdec xmm3,xmm12
- aesdec xmm4,xmm12
- movdqa xmm9,128[rcx]
- movdqa xmm10,144[rcx]
- movdqa xmm11,160[rcx]
- cmp r8d,12
- aesdec xmm1,xmm9
- aesdec xmm2,xmm9
- aesdec xmm3,xmm9
- aesdec xmm4,xmm9
- aesdec xmm1,xmm10
- aesdec xmm2,xmm10
- aesdec xmm3,xmm10
- aesdec xmm4,xmm10
- jb DECB_LAST_4
- movdqa xmm9,160[rcx]
- movdqa xmm10,176[rcx]
- movdqa xmm11,192[rcx]
- cmp r8d,14
- aesdec xmm1,xmm9
- aesdec xmm2,xmm9
- aesdec xmm3,xmm9
- aesdec xmm4,xmm9
- aesdec xmm1,xmm10
- aesdec xmm2,xmm10
- aesdec xmm3,xmm10
- aesdec xmm4,xmm10
- jb DECB_LAST_4
- movdqa xmm9,192[rcx]
- movdqa xmm10,208[rcx]
- movdqa xmm11,224[rcx]
- aesdec xmm1,xmm9
- aesdec xmm2,xmm9
- aesdec xmm3,xmm9
- aesdec xmm4,xmm9
- aesdec xmm1,xmm10
- aesdec xmm2,xmm10
- aesdec xmm3,xmm10
- aesdec xmm4,xmm10
-DECB_LAST_4:
- add rdi,64
- add rsi,64
- dec rdx
- aesdeclast xmm1,xmm11
- aesdeclast xmm2,xmm11
- aesdeclast xmm3,xmm11
- aesdeclast xmm4,xmm11
- movdqu [rsi],xmm1
- movdqu 16[rsi],xmm2
- movdqu 32[rsi],xmm3
- movdqu 48[rsi],xmm4
- jne DECB_LOOP_4
- add rsi,64
-DECB_REMAINDER_4:
- cmp r10,0
- je DECB_END_4
-DECB_LOOP_4_2:
- movdqu xmm1,[rdi]
- add rdi,16
- pxor xmm1,[rcx]
- movdqu xmm2,160[rcx]
- cmp r8d,12
- aesdec xmm1,16[rcx]
- aesdec xmm1,32[rcx]
- aesdec xmm1,48[rcx]
- aesdec xmm1,64[rcx]
- aesdec xmm1,80[rcx]
- aesdec xmm1,96[rcx]
- aesdec xmm1,112[rcx]
- aesdec xmm1,128[rcx]
- aesdec xmm1,144[rcx]
- jb DECB_LAST_4_2
- cmp r8d,14
- movdqu xmm2,192[rcx]
- aesdec xmm1,160[rcx]
- aesdec xmm1,176[rcx]
- jb DECB_LAST_4_2
- movdqu xmm2,224[rcx]
- aesdec xmm1,192[rcx]
- aesdec xmm1,208[rcx]
-DECB_LAST_4_2:
- aesdeclast xmm1,xmm2
- movdqu [rsi],xmm1
- add rsi,16
- dec r10
- jne DECB_LOOP_4_2
-DECB_END_4:
- ; restore non volatile rdi,rsi
- mov rdi,rax
- mov rsi,r11
- ; restore non volatile xmms from stack
- movdqa xmm9, [rsp+0]
- movdqa xmm10, [rsp+16]
- movdqa xmm11, [rsp+32]
- movdqa xmm12, [rsp+48]
- add rsp,8+4*16 ; 8 = align stack , 4 xmm9-12 16 bytes each
- ret
-AES_ECB_decrypt_AESNI ENDP
-
-
-
-; /*
-; void ,AES_128_Key_Expansion_AESNI[const unsigned char*userkey
-; unsigned char*key_schedule]/
-; */
-; . align 16,0x90
-; . globl AES_128_Key_Expansion_AESNI
-AES_128_Key_Expansion_AESNI PROC
-;# parameter 1: rdi
-;# parameter 2: rsi
-
-; save rdi and rsi to rax and r11, restore before ret
- mov rax,rdi
- mov r11,rsi
-
-; convert to what we had for att&t convention
- mov rdi,rcx
- mov rsi,rdx
-
- mov dword ptr 240[rsi],10
-
- movdqu xmm1,[rdi]
- movdqa [rsi],xmm1
-
-
-ASSISTS:
- aeskeygenassist xmm2,xmm1,1
- call PREPARE_ROUNDKEY_128
- movdqa 16[rsi],xmm1
-
- aeskeygenassist xmm2,xmm1,2
- call PREPARE_ROUNDKEY_128
- movdqa 32[rsi],xmm1
-
- aeskeygenassist xmm2,xmm1,4
- call PREPARE_ROUNDKEY_128
- movdqa 48[rsi],xmm1
-
- aeskeygenassist xmm2,xmm1,8
- call PREPARE_ROUNDKEY_128
- movdqa 64[rsi],xmm1
-
- aeskeygenassist xmm2,xmm1,16
- call PREPARE_ROUNDKEY_128
- movdqa 80[rsi],xmm1
-
- aeskeygenassist xmm2,xmm1,32
- call PREPARE_ROUNDKEY_128
- movdqa 96[rsi],xmm1
-
- aeskeygenassist xmm2,xmm1,64
- call PREPARE_ROUNDKEY_128
- movdqa 112[rsi],xmm1
- aeskeygenassist xmm2,xmm1,80h
- call PREPARE_ROUNDKEY_128
- movdqa 128[rsi],xmm1
- aeskeygenassist xmm2,xmm1,1bh
- call PREPARE_ROUNDKEY_128
- movdqa 144[rsi],xmm1
- aeskeygenassist xmm2,xmm1,36h
- call PREPARE_ROUNDKEY_128
- movdqa 160[rsi],xmm1
- ; restore non volatile rdi,rsi
- mov rdi,rax
- mov rsi,r11
- ret
-
-PREPARE_ROUNDKEY_128:
- pshufd xmm2,xmm2,255
- movdqa xmm3,xmm1
- pslldq xmm3,4
- pxor xmm1,xmm3
- pslldq xmm3,4
- pxor xmm1,xmm3
- pslldq xmm3,4
- pxor xmm1,xmm3
- pxor xmm1,xmm2
- ret
-AES_128_Key_Expansion_AESNI ENDP
-
-; /*
-; void ,AES_192_Key_Expansion_AESNI[const unsigned char*userkey
-; unsigned char*key]
-; */
-; . globl AES_192_Key_Expansion_AESNI
-AES_192_Key_Expansion_AESNI PROC
-;# parameter 1: rdi
-;# parameter 2: rsi
-
-; save rdi and rsi to rax and r11, restore before ret
- mov rax,rdi
- mov r11,rsi
-
-; convert to what we had for att&t convention
- mov rdi,rcx
- mov rsi,rdx
-
-; on microsoft xmm6-xmm15 are non volaitle, let's save on stack and restore at end
- sub rsp,8+1*16 ; 8 = align stack , 1 xmm6, 16 bytes each
- movdqa [rsp+0], xmm6
-
- movdqu xmm1,[rdi]
- movq xmm3,qword ptr 16[rdi]
- movdqa [rsi],xmm1
- movdqa xmm5,xmm3
-
- aeskeygenassist xmm2,xmm3,1h
- call PREPARE_ROUNDKEY_192
- shufpd xmm5,xmm1,0
- movdqa 16[rsi],xmm5
- movdqa xmm6,xmm1
- shufpd xmm6,xmm3,1
- movdqa 32[rsi],xmm6
-
- aeskeygenassist xmm2,xmm3,2h
- call PREPARE_ROUNDKEY_192
- movdqa 48[rsi],xmm1
- movdqa xmm5,xmm3
-
- aeskeygenassist xmm2,xmm3,4h
- call PREPARE_ROUNDKEY_192
- shufpd xmm5,xmm1,0
- movdqa 64[rsi],xmm5
- movdqa xmm6,xmm1
- shufpd xmm6,xmm3,1
- movdqa 80[rsi],xmm6
-
- aeskeygenassist xmm2,xmm3,8h
- call PREPARE_ROUNDKEY_192
- movdqa 96[rsi],xmm1
- movdqa xmm5,xmm3
-
- aeskeygenassist xmm2,xmm3,10h
- call PREPARE_ROUNDKEY_192
- shufpd xmm5,xmm1,0
- movdqa 112[rsi],xmm5
- movdqa xmm6,xmm1
- shufpd xmm6,xmm3,1
- movdqa 128[rsi],xmm6
-
- aeskeygenassist xmm2,xmm3,20h
- call PREPARE_ROUNDKEY_192
- movdqa 144[rsi],xmm1
- movdqa xmm5,xmm3
-
- aeskeygenassist xmm2,xmm3,40h
- call PREPARE_ROUNDKEY_192
- shufpd xmm5,xmm1,0
- movdqa 160[rsi],xmm5
- movdqa xmm6,xmm1
- shufpd xmm6,xmm3,1
- movdqa 176[rsi],xmm6
-
- aeskeygenassist xmm2,xmm3,80h
- call PREPARE_ROUNDKEY_192
- movdqa 192[rsi],xmm1
- movdqa 208[rsi],xmm3
- ; restore non volatile rdi,rsi
- mov rdi,rax
- mov rsi,r11
-; restore non volatile xmms from stack
- movdqa xmm6, [rsp+0]
- add rsp,8+1*16 ; 8 = align stack , 1 xmm6 16 bytes each
- ret
-
-PREPARE_ROUNDKEY_192:
- pshufd xmm2,xmm2,55h
- movdqu xmm4,xmm1
- pslldq xmm4,4
- pxor xmm1,xmm4
-
- pslldq xmm4,4
- pxor xmm1,xmm4
- pslldq xmm4,4
- pxor xmm1,xmm4
- pxor xmm1,xmm2
- pshufd xmm2,xmm1,0ffh
- movdqu xmm4,xmm3
- pslldq xmm4,4
- pxor xmm3,xmm4
- pxor xmm3,xmm2
- ret
-AES_192_Key_Expansion_AESNI ENDP
-
-; /*
-; void ,AES_256_Key_Expansion_AESNI[const unsigned char*userkey
-; unsigned char*key]
-; */
-; . globl AES_256_Key_Expansion_AESNI
-AES_256_Key_Expansion_AESNI PROC
-;# parameter 1: rdi
-;# parameter 2: rsi
-
-; save rdi and rsi to rax and r11, restore before ret
- mov rax,rdi
- mov r11,rsi
-
-; convert to what we had for att&t convention
- mov rdi,rcx
- mov rsi,rdx
-
- movdqu xmm1,[rdi]
- movdqu xmm3,16[rdi]
- movdqa [rsi],xmm1
- movdqa 16[rsi],xmm3
-
- aeskeygenassist xmm2,xmm3,1h
- call MAKE_RK256_a
- movdqa 32[rsi],xmm1
- aeskeygenassist xmm2,xmm1,0h
- call MAKE_RK256_b
- movdqa 48[rsi],xmm3
- aeskeygenassist xmm2,xmm3,2h
- call MAKE_RK256_a
- movdqa 64[rsi],xmm1
- aeskeygenassist xmm2,xmm1,0h
- call MAKE_RK256_b
- movdqa 80[rsi],xmm3
- aeskeygenassist xmm2,xmm3,4h
- call MAKE_RK256_a
- movdqa 96[rsi],xmm1
- aeskeygenassist xmm2,xmm1,0h
- call MAKE_RK256_b
- movdqa 112[rsi],xmm3
- aeskeygenassist xmm2,xmm3,8h
- call MAKE_RK256_a
- movdqa 128[rsi],xmm1
- aeskeygenassist xmm2,xmm1,0h
- call MAKE_RK256_b
- movdqa 144[rsi],xmm3
- aeskeygenassist xmm2,xmm3,10h
- call MAKE_RK256_a
- movdqa 160[rsi],xmm1
- aeskeygenassist xmm2,xmm1,0h
- call MAKE_RK256_b
- movdqa 176[rsi],xmm3
- aeskeygenassist xmm2,xmm3,20h
- call MAKE_RK256_a
- movdqa 192[rsi],xmm1
-
- aeskeygenassist xmm2,xmm1,0h
- call MAKE_RK256_b
- movdqa 208[rsi],xmm3
- aeskeygenassist xmm2,xmm3,40h
- call MAKE_RK256_a
- movdqa 224[rsi],xmm1
-
- ; restore non volatile rdi,rsi
- mov rdi,rax
- mov rsi,r11
- ret
-AES_256_Key_Expansion_AESNI ENDP
-
-MAKE_RK256_a:
- pshufd xmm2,xmm2,0ffh
- movdqa xmm4,xmm1
- pslldq xmm4,4
- pxor xmm1,xmm4
- pslldq xmm4,4
- pxor xmm1,xmm4
- pslldq xmm4,4
- pxor xmm1,xmm4
- pxor xmm1,xmm2
- ret
-
-MAKE_RK256_b:
- pshufd xmm2,xmm2,0aah
- movdqa xmm4,xmm3
- pslldq xmm4,4
- pxor xmm3,xmm4
- pslldq xmm4,4
- pxor xmm3,xmm4
- pslldq xmm4,4
- pxor xmm3,xmm4
- pxor xmm3,xmm2
- ret
-
-
-IF fips_version GE 2
- fipsAb ENDS
-ELSE
- _text ENDS
-ENDIF
-
-END
+;
+;
+; /* See Intel Advanced Encryption Standard (AES) Instructions Set White Paper
+; * by Shay Gueron, Intel Mobility Group, Israel Development Center, Israel
+; */
+;
+; /* This file is in Intel asm syntax, see .s for AT&T syntax */
+;
+
+
+fips_version = 0
+IFDEF HAVE_FIPS
+ fips_version = 1
+ IFDEF HAVE_FIPS_VERSION
+ fips_version = HAVE_FIPS_VERSION
+ ENDIF
+ENDIF
+
+IF fips_version GE 2
+ fipsAb SEGMENT ALIAS(".fipsA$b") 'CODE'
+ELSE
+ _text SEGMENT
+ENDIF
+
+IF fips_version GE 2
+ fipsAb ENDS
+ELSE
+ _text ENDS
+ENDIF
+
+END
diff --git a/wolfcrypt/src/aes_gcm_asm.S b/wolfcrypt/src/aes_gcm_asm.S
index e75f2c9b942..e82445fca15 100644
--- a/wolfcrypt/src/aes_gcm_asm.S
+++ b/wolfcrypt/src/aes_gcm_asm.S
@@ -46,6 +46,16 @@
#define HAVE_INTEL_AVX2
#endif /* HAVE_INTEL_AVX2 */
#endif /* NO_AVX2_SUPPORT */
+#ifndef NO_VAES_SUPPORT
+#ifndef HAVE_INTEL_VAES
+#define HAVE_INTEL_VAES
+#endif /* HAVE_INTEL_VAES */
+#endif /* NO_VAES_SUPPORT */
+#ifndef NO_AVX512_SUPPORT
+#ifndef HAVE_INTEL_AVX512
+#define HAVE_INTEL_AVX512
+#endif /* HAVE_INTEL_AVX512 */
+#endif /* NO_AVX512_SUPPORT */
#ifdef WOLFSSL_X86_64_BUILD
#ifndef __APPLE__
@@ -194,10 +204,10 @@ _GCM_generate_m0_aesni:
por %xmm5, %xmm1
por %xmm6, %xmm2
por %xmm7, %xmm3
- vpshufb %xmm9, %xmm0, %xmm0
- vpshufb %xmm9, %xmm1, %xmm1
- vpshufb %xmm9, %xmm2, %xmm2
- vpshufb %xmm9, %xmm3, %xmm3
+ pshufb %xmm9, %xmm0
+ pshufb %xmm9, %xmm1
+ pshufb %xmm9, %xmm2
+ pshufb %xmm9, %xmm3
movdqu %xmm0, 256(%rsi)
movdqu %xmm1, 272(%rsi)
movdqu %xmm2, 288(%rsi)
@@ -230,10 +240,10 @@ _GCM_generate_m0_aesni:
por %xmm5, %xmm1
por %xmm6, %xmm2
por %xmm7, %xmm3
- vpshufb %xmm9, %xmm0, %xmm0
- vpshufb %xmm9, %xmm1, %xmm1
- vpshufb %xmm9, %xmm2, %xmm2
- vpshufb %xmm9, %xmm3, %xmm3
+ pshufb %xmm9, %xmm0
+ pshufb %xmm9, %xmm1
+ pshufb %xmm9, %xmm2
+ pshufb %xmm9, %xmm3
movdqu %xmm0, 320(%rsi)
movdqu %xmm1, 336(%rsi)
movdqu %xmm2, 352(%rsi)
@@ -266,10 +276,10 @@ _GCM_generate_m0_aesni:
por %xmm5, %xmm1
por %xmm6, %xmm2
por %xmm7, %xmm3
- vpshufb %xmm9, %xmm0, %xmm0
- vpshufb %xmm9, %xmm1, %xmm1
- vpshufb %xmm9, %xmm2, %xmm2
- vpshufb %xmm9, %xmm3, %xmm3
+ pshufb %xmm9, %xmm0
+ pshufb %xmm9, %xmm1
+ pshufb %xmm9, %xmm2
+ pshufb %xmm9, %xmm3
movdqu %xmm0, 384(%rsi)
movdqu %xmm1, 400(%rsi)
movdqu %xmm2, 416(%rsi)
@@ -302,10 +312,10 @@ _GCM_generate_m0_aesni:
por %xmm5, %xmm1
por %xmm6, %xmm2
por %xmm7, %xmm3
- vpshufb %xmm9, %xmm0, %xmm0
- vpshufb %xmm9, %xmm1, %xmm1
- vpshufb %xmm9, %xmm2, %xmm2
- vpshufb %xmm9, %xmm3, %xmm3
+ pshufb %xmm9, %xmm0
+ pshufb %xmm9, %xmm1
+ pshufb %xmm9, %xmm2
+ pshufb %xmm9, %xmm3
movdqu %xmm0, 448(%rsi)
movdqu %xmm1, 464(%rsi)
movdqu %xmm2, 480(%rsi)
@@ -16577,6 +16587,14213 @@ L_AES_GCM_decrypt_final_avx2_cmp_tag_done:
#endif /* __APPLE__ */
#endif /* WOLFSSL_AESGCM_STREAM */
#endif /* HAVE_INTEL_AVX2 */
+#ifdef HAVE_INTEL_VAES
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 32
+#else
+.p2align 5
+#endif /* __APPLE__ */
+L_vaes_aes_gcm_inc_y0: /* 32-byte aligned, two 128-bit lanes: upper quadwords 0 and 1, lower quadwords 0 */
+.quad 0x0000000000000000,0x0000000000000000
+.quad 0x0000000000000000,0x0000000000000001
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 32
+#else
+.p2align 5
+#endif /* __APPLE__ */
+L_vaes_aes_gcm_inc_y1: /* same layout as inc_y0: upper quadwords 2 and 3 */
+.quad 0x0000000000000000,0x0000000000000002
+.quad 0x0000000000000000,0x0000000000000003
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 32
+#else
+.p2align 5
+#endif /* __APPLE__ */
+L_vaes_aes_gcm_inc_y2: /* same layout as inc_y0: upper quadwords 4 and 5 */
+.quad 0x0000000000000000,0x0000000000000004
+.quad 0x0000000000000000,0x0000000000000005
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 32
+#else
+.p2align 5
+#endif /* __APPLE__ */
+L_vaes_aes_gcm_inc_y3: /* same layout as inc_y0: upper quadwords 6 and 7 */
+.quad 0x0000000000000000,0x0000000000000006
+.quad 0x0000000000000000,0x0000000000000007
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_GCM_generate_m0_vaes_rev8: /* byte indices 15 down to 0: as a vpshufb mask this reverses all 16 bytes */
+.quad 0x08090a0b0c0d0e0f,0x0001020304050607
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_GCM_generate_m0_vaes_mod2_128: /* only the top byte (0xe1) is set; GCM_generate_m0_vaes XORs this in when a shift drops a set bit */
+.quad 0x0000000000000000,0xe100000000000000
+#ifndef __APPLE__
+.text
+.globl GCM_generate_m0_vaes
+.type GCM_generate_m0_vaes,@function
+.align 16
+GCM_generate_m0_vaes: /* Fills a 512-byte table (32 entries of 16 bytes) at (%rsi) from the 16-byte block at (%rdi); presumably the GHASH key H - confirm with caller */
+#else
+.section __TEXT,__text
+.globl _GCM_generate_m0_vaes
+.p2align 4
+_GCM_generate_m0_vaes: /* Fills a 512-byte table (32 entries of 16 bytes) at (%rsi) from the 16-byte block at (%rdi); presumably the GHASH key H - confirm with caller */
+#endif /* __APPLE__ */
+ vmovdqu L_GCM_generate_m0_vaes_rev8(%rip), %xmm9 /* xmm9 = shuffle mask reversing all 16 bytes */
+ vmovdqu L_GCM_generate_m0_vaes_mod2_128(%rip), %xmm10 /* xmm10 = constant XORed in when a set bit is shifted out */
+ vpxor %xmm8, %xmm8, %xmm8
+ vmovdqu (%rdi), %xmm0 /* xmm0 = 16-byte input block */
+ vmovdqu %xmm8, (%rsi) /* entry 0 = all zero */
+ vmovdqu %xmm0, %xmm8 /* NOTE(review): copy looks unused - xmm8 is overwritten below before it is read */
+ vpshufb %xmm9, %xmm0, %xmm0 /* work on the byte-reversed input */
+ vpsllq $63, %xmm0, %xmm5 /* step 1: xmm1 = (xmm0 >> 1 over 128 bits) ^ (xmm10 if the dropped low bit was set) */
+ vpsrlq $0x01, %xmm0, %xmm4
+ vpslldq $8, %xmm5, %xmm1
+ vpsrldq $8, %xmm5, %xmm5 /* carry bit 0 of the high quadword into bit 63 of the low quadword */
+ vpshufd $0xff, %xmm1, %xmm1 /* broadcast the dword whose sign bit is the dropped bit */
+ vpor %xmm5, %xmm4, %xmm4
+ vpsrad $31, %xmm1, %xmm1 /* all-ones if the dropped bit was set, else zero */
+ vpand %xmm10, %xmm1, %xmm1
+ vpxor %xmm4, %xmm1, %xmm1
+ vpsllq $63, %xmm1, %xmm5 /* step 2: xmm2 = same shift-and-conditional-XOR applied to xmm1 */
+ vpsrlq $0x01, %xmm1, %xmm4
+ vpslldq $8, %xmm5, %xmm2
+ vpsrldq $8, %xmm5, %xmm5
+ vpshufd $0xff, %xmm2, %xmm2
+ vpor %xmm5, %xmm4, %xmm4
+ vpsrad $31, %xmm2, %xmm2
+ vpand %xmm10, %xmm2, %xmm2
+ vpxor %xmm4, %xmm2, %xmm2
+ vpsllq $63, %xmm2, %xmm5 /* step 3: xmm3 = same shift-and-conditional-XOR applied to xmm2 */
+ vpsrlq $0x01, %xmm2, %xmm4
+ vpslldq $8, %xmm5, %xmm3
+ vpsrldq $8, %xmm5, %xmm5
+ vpshufd $0xff, %xmm3, %xmm3
+ vpor %xmm5, %xmm4, %xmm4
+ vpsrad $31, %xmm3, %xmm3
+ vpand %xmm10, %xmm3, %xmm3
+ vpxor %xmm4, %xmm3, %xmm3
+ vpshufb %xmm9, %xmm3, %xmm3 /* restore the original byte order of all four values */
+ vpshufb %xmm9, %xmm2, %xmm2
+ vpshufb %xmm9, %xmm1, %xmm1
+ vpshufb %xmm9, %xmm0, %xmm0
+ vpxor %xmm2, %xmm3, %xmm8 /* entries 1..15: XOR of xmm3 (index bit 0), xmm2 (bit 1), xmm1 (bit 2), xmm0 (bit 3) */
+ vmovdqu %xmm3, 16(%rsi)
+ vmovdqu %xmm2, 32(%rsi)
+ vmovdqu %xmm8, 48(%rsi)
+ vmovdqu %xmm1, 64(%rsi)
+ vpxor %xmm1, %xmm3, %xmm4
+ vpxor %xmm1, %xmm2, %xmm5
+ vpxor %xmm1, %xmm8, %xmm6
+ vmovdqu %xmm4, 80(%rsi)
+ vmovdqu %xmm5, 96(%rsi)
+ vmovdqu %xmm6, 112(%rsi)
+ vmovdqu %xmm0, 128(%rsi) /* entry 8 = the input block itself */
+ vpxor %xmm0, %xmm1, %xmm1 /* xmm1 now holds entry 12 (input ^ step-1 value) */
+ vpxor %xmm0, %xmm3, %xmm4
+ vpxor %xmm0, %xmm2, %xmm6
+ vmovdqu %xmm4, 144(%rsi)
+ vmovdqu %xmm6, 160(%rsi)
+ vpxor %xmm6, %xmm3, %xmm6
+ vmovdqu %xmm6, 176(%rsi)
+ vmovdqu %xmm1, 192(%rsi)
+ vpxor %xmm1, %xmm3, %xmm4
+ vpxor %xmm1, %xmm2, %xmm5
+ vpxor %xmm1, %xmm8, %xmm6
+ vmovdqu %xmm4, 208(%rsi)
+ vmovdqu %xmm5, 224(%rsi)
+ vmovdqu %xmm6, 240(%rsi)
+ vmovdqu (%rsi), %xmm0 /* entries 16..19 = entries 0..3 shifted right 4 bits in byte-reversed order; shifted-out bits are discarded */
+ vmovdqu 16(%rsi), %xmm1
+ vmovdqu 32(%rsi), %xmm2
+ vmovdqu 48(%rsi), %xmm3
+ vpshufb %xmm9, %xmm0, %xmm0
+ vpshufb %xmm9, %xmm1, %xmm1
+ vpshufb %xmm9, %xmm2, %xmm2
+ vpshufb %xmm9, %xmm3, %xmm3
+ vpsllq $60, %xmm0, %xmm4 /* isolate the low 4 bits of each quadword */
+ vpsllq $60, %xmm1, %xmm5
+ vpsllq $60, %xmm2, %xmm6
+ vpsllq $60, %xmm3, %xmm7
+ vpsrlq $4, %xmm0, %xmm0
+ vpsrlq $4, %xmm1, %xmm1
+ vpsrlq $4, %xmm2, %xmm2
+ vpsrlq $4, %xmm3, %xmm3
+ vpsrldq $8, %xmm4, %xmm4 /* move the high quadword carry bits down to the low quadword */
+ vpsrldq $8, %xmm5, %xmm5
+ vpsrldq $8, %xmm6, %xmm6
+ vpsrldq $8, %xmm7, %xmm7
+ vpor %xmm4, %xmm0, %xmm0
+ vpor %xmm5, %xmm1, %xmm1
+ vpor %xmm6, %xmm2, %xmm2
+ vpor %xmm7, %xmm3, %xmm3
+ vpshufb %xmm9, %xmm0, %xmm0
+ vpshufb %xmm9, %xmm1, %xmm1
+ vpshufb %xmm9, %xmm2, %xmm2
+ vpshufb %xmm9, %xmm3, %xmm3
+ vmovdqu %xmm0, 256(%rsi)
+ vmovdqu %xmm1, 272(%rsi)
+ vmovdqu %xmm2, 288(%rsi)
+ vmovdqu %xmm3, 304(%rsi)
+ vmovdqu 64(%rsi), %xmm0 /* entries 20..23 = entries 4..7 with the same 4-bit shift */
+ vmovdqu 80(%rsi), %xmm1
+ vmovdqu 96(%rsi), %xmm2
+ vmovdqu 112(%rsi), %xmm3
+ vpshufb %xmm9, %xmm0, %xmm0
+ vpshufb %xmm9, %xmm1, %xmm1
+ vpshufb %xmm9, %xmm2, %xmm2
+ vpshufb %xmm9, %xmm3, %xmm3
+ vpsllq $60, %xmm0, %xmm4
+ vpsllq $60, %xmm1, %xmm5
+ vpsllq $60, %xmm2, %xmm6
+ vpsllq $60, %xmm3, %xmm7
+ vpsrlq $4, %xmm0, %xmm0
+ vpsrlq $4, %xmm1, %xmm1
+ vpsrlq $4, %xmm2, %xmm2
+ vpsrlq $4, %xmm3, %xmm3
+ vpsrldq $8, %xmm4, %xmm4
+ vpsrldq $8, %xmm5, %xmm5
+ vpsrldq $8, %xmm6, %xmm6
+ vpsrldq $8, %xmm7, %xmm7
+ vpor %xmm4, %xmm0, %xmm0
+ vpor %xmm5, %xmm1, %xmm1
+ vpor %xmm6, %xmm2, %xmm2
+ vpor %xmm7, %xmm3, %xmm3
+ vpshufb %xmm9, %xmm0, %xmm0
+ vpshufb %xmm9, %xmm1, %xmm1
+ vpshufb %xmm9, %xmm2, %xmm2
+ vpshufb %xmm9, %xmm3, %xmm3
+ vmovdqu %xmm0, 320(%rsi)
+ vmovdqu %xmm1, 336(%rsi)
+ vmovdqu %xmm2, 352(%rsi)
+ vmovdqu %xmm3, 368(%rsi)
+ vmovdqu 128(%rsi), %xmm0 /* entries 24..27 = entries 8..11 with the same 4-bit shift */
+ vmovdqu 144(%rsi), %xmm1
+ vmovdqu 160(%rsi), %xmm2
+ vmovdqu 176(%rsi), %xmm3
+ vpshufb %xmm9, %xmm0, %xmm0
+ vpshufb %xmm9, %xmm1, %xmm1
+ vpshufb %xmm9, %xmm2, %xmm2
+ vpshufb %xmm9, %xmm3, %xmm3
+ vpsllq $60, %xmm0, %xmm4
+ vpsllq $60, %xmm1, %xmm5
+ vpsllq $60, %xmm2, %xmm6
+ vpsllq $60, %xmm3, %xmm7
+ vpsrlq $4, %xmm0, %xmm0
+ vpsrlq $4, %xmm1, %xmm1
+ vpsrlq $4, %xmm2, %xmm2
+ vpsrlq $4, %xmm3, %xmm3
+ vpsrldq $8, %xmm4, %xmm4
+ vpsrldq $8, %xmm5, %xmm5
+ vpsrldq $8, %xmm6, %xmm6
+ vpsrldq $8, %xmm7, %xmm7
+ vpor %xmm4, %xmm0, %xmm0
+ vpor %xmm5, %xmm1, %xmm1
+ vpor %xmm6, %xmm2, %xmm2
+ vpor %xmm7, %xmm3, %xmm3
+ vpshufb %xmm9, %xmm0, %xmm0
+ vpshufb %xmm9, %xmm1, %xmm1
+ vpshufb %xmm9, %xmm2, %xmm2
+ vpshufb %xmm9, %xmm3, %xmm3
+ vmovdqu %xmm0, 384(%rsi)
+ vmovdqu %xmm1, 400(%rsi)
+ vmovdqu %xmm2, 416(%rsi)
+ vmovdqu %xmm3, 432(%rsi)
+ vmovdqu 192(%rsi), %xmm0 /* entries 28..31 = entries 12..15 with the same 4-bit shift */
+ vmovdqu 208(%rsi), %xmm1
+ vmovdqu 224(%rsi), %xmm2
+ vmovdqu 240(%rsi), %xmm3
+ vpshufb %xmm9, %xmm0, %xmm0
+ vpshufb %xmm9, %xmm1, %xmm1
+ vpshufb %xmm9, %xmm2, %xmm2
+ vpshufb %xmm9, %xmm3, %xmm3
+ vpsllq $60, %xmm0, %xmm4
+ vpsllq $60, %xmm1, %xmm5
+ vpsllq $60, %xmm2, %xmm6
+ vpsllq $60, %xmm3, %xmm7
+ vpsrlq $4, %xmm0, %xmm0
+ vpsrlq $4, %xmm1, %xmm1
+ vpsrlq $4, %xmm2, %xmm2
+ vpsrlq $4, %xmm3, %xmm3
+ vpsrldq $8, %xmm4, %xmm4
+ vpsrldq $8, %xmm5, %xmm5
+ vpsrldq $8, %xmm6, %xmm6
+ vpsrldq $8, %xmm7, %xmm7
+ vpor %xmm4, %xmm0, %xmm0
+ vpor %xmm5, %xmm1, %xmm1
+ vpor %xmm6, %xmm2, %xmm2
+ vpor %xmm7, %xmm3, %xmm3
+ vpshufb %xmm9, %xmm0, %xmm0
+ vpshufb %xmm9, %xmm1, %xmm1
+ vpshufb %xmm9, %xmm2, %xmm2
+ vpshufb %xmm9, %xmm3, %xmm3
+ vmovdqu %xmm0, 448(%rsi)
+ vmovdqu %xmm1, 464(%rsi)
+ vmovdqu %xmm2, 480(%rsi)
+ vmovdqu %xmm3, 496(%rsi)
+ repz retq /* no stack or callee-saved general registers were touched, so return directly */
+#ifndef __APPLE__
+.size GCM_generate_m0_vaes,.-GCM_generate_m0_vaes
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_vaes_aes_gcm_one: /* 16-byte constant: lower quadword 0, upper quadword 1 */
+.quad 0x0000000000000000,0x0000000000000001
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_vaes_aes_gcm_two: /* lower quadword 0, upper quadword 2 */
+.quad 0x0000000000000000,0x0000000000000002
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_vaes_aes_gcm_three: /* lower quadword 0, upper quadword 3 */
+.quad 0x0000000000000000,0x0000000000000003
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_vaes_aes_gcm_four: /* lower quadword 0, upper quadword 4 */
+.quad 0x0000000000000000,0x0000000000000004
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_vaes_aes_gcm_five: /* lower quadword 0, upper quadword 5 */
+.quad 0x0000000000000000,0x0000000000000005
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_vaes_aes_gcm_six: /* lower quadword 0, upper quadword 6 */
+.quad 0x0000000000000000,0x0000000000000006
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_vaes_aes_gcm_seven: /* lower quadword 0, upper quadword 7 */
+.quad 0x0000000000000000,0x0000000000000007
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_vaes_aes_gcm_eight: /* lower quadword 0, upper quadword 8 */
+.quad 0x0000000000000000,0x0000000000000008
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_vaes_aes_gcm_bswap_epi64: /* vpshufb mask: reverses the bytes inside each 64-bit half, halves stay in place */
+.quad 0x0001020304050607,0x08090a0b0c0d0e0f
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_vaes_aes_gcm_bswap_mask: /* vpshufb mask: reverses all 16 bytes */
+.quad 0x08090a0b0c0d0e0f,0x0001020304050607
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_vaes_aes_gcm_mod2_128: /* lower quadword 1, upper quadword with top byte 0xc2; operand of the vpand/vpclmulqdq steps in AES_GCM_encrypt_vaes */
+.quad 0x0000000000000001,0xc200000000000000
+#ifndef __APPLE__
+.text
+.globl AES_GCM_encrypt_vaes
+.type AES_GCM_encrypt_vaes,@function
+.align 16
+AES_GCM_encrypt_vaes:
+#else
+.section __TEXT,__text
+.globl _AES_GCM_encrypt_vaes
+.p2align 4
+_AES_GCM_encrypt_vaes:
+#endif /* __APPLE__ */
+ pushq %r13
+ pushq %r12
+ pushq %rbx
+ pushq %r14
+ pushq %r15
+ movq %rdx, %r12
+ movq %rcx, %rax
+ movl 48(%rsp), %r11d
+ movl 56(%rsp), %ebx
+ movl 64(%rsp), %r14d
+ movq 72(%rsp), %r15
+ movl 80(%rsp), %r10d
+ subq $0x230, %rsp
+ vpxor %xmm5, %xmm5, %xmm5
+ vpxor %xmm15, %xmm15, %xmm15
+ movl %ebx, %edx
+ cmpl $12, %edx
+ jne L_AES_GCM_encrypt_vaes_iv_not_12
+ # # Calculate values when IV is 12 bytes
+ # Set counter based on IV
+ movl $0x1000000, %ecx
+ vmovq (%rax), %xmm5
+ vpinsrd $2, 8(%rax), %xmm5, %xmm5
+ vpinsrd $3, %ecx, %xmm5, %xmm5
+ # H = Encrypt X(=0) and T = Encrypt counter
+ vmovdqa (%r15), %xmm6
+ vpxor %xmm6, %xmm5, %xmm1
+ vmovdqa 16(%r15), %xmm4
+ vaesenc %xmm4, %xmm6, %xmm6
+ vaesenc %xmm4, %xmm1, %xmm1
+ vmovdqa 32(%r15), %xmm4
+ vaesenc %xmm4, %xmm6, %xmm6
+ vaesenc %xmm4, %xmm1, %xmm1
+ vmovdqa 48(%r15), %xmm4
+ vaesenc %xmm4, %xmm6, %xmm6
+ vaesenc %xmm4, %xmm1, %xmm1
+ vmovdqa 64(%r15), %xmm4
+ vaesenc %xmm4, %xmm6, %xmm6
+ vaesenc %xmm4, %xmm1, %xmm1
+ vmovdqa 80(%r15), %xmm4
+ vaesenc %xmm4, %xmm6, %xmm6
+ vaesenc %xmm4, %xmm1, %xmm1
+ vmovdqa 96(%r15), %xmm4
+ vaesenc %xmm4, %xmm6, %xmm6
+ vaesenc %xmm4, %xmm1, %xmm1
+ vmovdqa 112(%r15), %xmm4
+ vaesenc %xmm4, %xmm6, %xmm6
+ vaesenc %xmm4, %xmm1, %xmm1
+ vmovdqa 128(%r15), %xmm4
+ vaesenc %xmm4, %xmm6, %xmm6
+ vaesenc %xmm4, %xmm1, %xmm1
+ vmovdqa 144(%r15), %xmm4
+ vaesenc %xmm4, %xmm6, %xmm6
+ vaesenc %xmm4, %xmm1, %xmm1
+ cmpl $11, %r10d
+ vmovdqa 160(%r15), %xmm4
+ jl L_AES_GCM_encrypt_vaes_calc_iv_12_last
+ vaesenc %xmm4, %xmm6, %xmm6
+ vaesenc %xmm4, %xmm1, %xmm1
+ vmovdqa 176(%r15), %xmm4
+ vaesenc %xmm4, %xmm6, %xmm6
+ vaesenc %xmm4, %xmm1, %xmm1
+ cmpl $13, %r10d
+ vmovdqa 192(%r15), %xmm4
+ jl L_AES_GCM_encrypt_vaes_calc_iv_12_last
+ vaesenc %xmm4, %xmm6, %xmm6
+ vaesenc %xmm4, %xmm1, %xmm1
+ vmovdqa 208(%r15), %xmm4
+ vaesenc %xmm4, %xmm6, %xmm6
+ vaesenc %xmm4, %xmm1, %xmm1
+ vmovdqa 224(%r15), %xmm4
+L_AES_GCM_encrypt_vaes_calc_iv_12_last:
+ vaesenclast %xmm4, %xmm6, %xmm6
+ vaesenclast %xmm4, %xmm1, %xmm1
+ vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm6, %xmm6
+ vmovdqu %xmm1, 528(%rsp)
+ jmp L_AES_GCM_encrypt_vaes_iv_done
+L_AES_GCM_encrypt_vaes_iv_not_12:
+ # Calculate values when IV is not 12 bytes
+ # H = Encrypt X(=0)
+ vmovdqa (%r15), %xmm6
+ vaesenc 16(%r15), %xmm6, %xmm6
+ vaesenc 32(%r15), %xmm6, %xmm6
+ vaesenc 48(%r15), %xmm6, %xmm6
+ vaesenc 64(%r15), %xmm6, %xmm6
+ vaesenc 80(%r15), %xmm6, %xmm6
+ vaesenc 96(%r15), %xmm6, %xmm6
+ vaesenc 112(%r15), %xmm6, %xmm6
+ vaesenc 128(%r15), %xmm6, %xmm6
+ vaesenc 144(%r15), %xmm6, %xmm6
+ cmpl $11, %r10d
+ vmovdqa 160(%r15), %xmm8
+ jl L_AES_GCM_encrypt_vaes_calc_iv_1_aesenc_avx_last
+ vaesenc %xmm8, %xmm6, %xmm6
+ vaesenc 176(%r15), %xmm6, %xmm6
+ cmpl $13, %r10d
+ vmovdqa 192(%r15), %xmm8
+ jl L_AES_GCM_encrypt_vaes_calc_iv_1_aesenc_avx_last
+ vaesenc %xmm8, %xmm6, %xmm6
+ vaesenc 208(%r15), %xmm6, %xmm6
+ vmovdqa 224(%r15), %xmm8
+L_AES_GCM_encrypt_vaes_calc_iv_1_aesenc_avx_last:
+ vaesenclast %xmm8, %xmm6, %xmm6
+ vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm6, %xmm6
+ # Calc counter
+ # Initialization vector
+ cmpl $0x00, %edx
+ movq $0x00, %rcx
+ je L_AES_GCM_encrypt_vaes_calc_iv_done
+ cmpl $16, %edx
+ jl L_AES_GCM_encrypt_vaes_calc_iv_lt16
+ andl $0xfffffff0, %edx
+L_AES_GCM_encrypt_vaes_calc_iv_16_loop:
+ vmovdqu (%rax,%rcx,1), %xmm7
+ vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7
+ vpxor %xmm7, %xmm5, %xmm5
+ # ghash_gfmul_avx
+ vpshufd $0x4e, %xmm5, %xmm1
+ vpshufd $0x4e, %xmm6, %xmm2
+ vpclmulqdq $0x11, %xmm5, %xmm6, %xmm3
+ vpclmulqdq $0x00, %xmm5, %xmm6, %xmm0
+ vpxor %xmm5, %xmm1, %xmm1
+ vpxor %xmm6, %xmm2, %xmm2
+ vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1
+ vpxor %xmm0, %xmm1, %xmm1
+ vpxor %xmm3, %xmm1, %xmm1
+ vmovdqa %xmm0, %xmm4
+ vmovdqa %xmm3, %xmm5
+ vpslldq $8, %xmm1, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpxor %xmm2, %xmm4, %xmm4
+ vpxor %xmm1, %xmm5, %xmm5
+ vpsrld $31, %xmm4, %xmm0
+ vpsrld $31, %xmm5, %xmm1
+ vpslld $0x01, %xmm4, %xmm4
+ vpslld $0x01, %xmm5, %xmm5
+ vpsrldq $12, %xmm0, %xmm2
+ vpslldq $4, %xmm0, %xmm0
+ vpslldq $4, %xmm1, %xmm1
+ vpor %xmm2, %xmm5, %xmm5
+ vpor %xmm0, %xmm4, %xmm4
+ vpor %xmm1, %xmm5, %xmm5
+ vpslld $31, %xmm4, %xmm0
+ vpslld $30, %xmm4, %xmm1
+ vpslld $25, %xmm4, %xmm2
+ vpxor %xmm1, %xmm0, %xmm0
+ vpxor %xmm2, %xmm0, %xmm0
+ vmovdqa %xmm0, %xmm1
+ vpsrldq $4, %xmm1, %xmm1
+ vpslldq $12, %xmm0, %xmm0
+ vpxor %xmm0, %xmm4, %xmm4
+ vpsrld $0x01, %xmm4, %xmm2
+ vpsrld $2, %xmm4, %xmm3
+ vpsrld $7, %xmm4, %xmm0
+ vpxor %xmm3, %xmm2, %xmm2
+ vpxor %xmm0, %xmm2, %xmm2
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm4, %xmm2, %xmm2
+ vpxor %xmm2, %xmm5, %xmm5
+ addl $16, %ecx
+ cmpl %edx, %ecx
+ jl L_AES_GCM_encrypt_vaes_calc_iv_16_loop
+ movl %ebx, %edx
+ cmpl %edx, %ecx
+ je L_AES_GCM_encrypt_vaes_calc_iv_done
+L_AES_GCM_encrypt_vaes_calc_iv_lt16:
+ subq $16, %rsp
+ vpxor %xmm7, %xmm7, %xmm7
+ xorl %ebx, %ebx
+ vmovdqu %xmm7, (%rsp)
+L_AES_GCM_encrypt_vaes_calc_iv_loop:
+ movzbl (%rax,%rcx,1), %r13d
+ movb %r13b, (%rsp,%rbx,1)
+ incl %ecx
+ incl %ebx
+ cmpl %edx, %ecx
+ jl L_AES_GCM_encrypt_vaes_calc_iv_loop
+ vmovdqu (%rsp), %xmm7
+ addq $16, %rsp
+ vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7
+ vpxor %xmm7, %xmm5, %xmm5
+ # ghash_gfmul_avx
+ vpshufd $0x4e, %xmm5, %xmm1
+ vpshufd $0x4e, %xmm6, %xmm2
+ vpclmulqdq $0x11, %xmm5, %xmm6, %xmm3
+ vpclmulqdq $0x00, %xmm5, %xmm6, %xmm0
+ vpxor %xmm5, %xmm1, %xmm1
+ vpxor %xmm6, %xmm2, %xmm2
+ vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1
+ vpxor %xmm0, %xmm1, %xmm1
+ vpxor %xmm3, %xmm1, %xmm1
+ vmovdqa %xmm0, %xmm4
+ vmovdqa %xmm3, %xmm5
+ vpslldq $8, %xmm1, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpxor %xmm2, %xmm4, %xmm4
+ vpxor %xmm1, %xmm5, %xmm5
+ vpsrld $31, %xmm4, %xmm0
+ vpsrld $31, %xmm5, %xmm1
+ vpslld $0x01, %xmm4, %xmm4
+ vpslld $0x01, %xmm5, %xmm5
+ vpsrldq $12, %xmm0, %xmm2
+ vpslldq $4, %xmm0, %xmm0
+ vpslldq $4, %xmm1, %xmm1
+ vpor %xmm2, %xmm5, %xmm5
+ vpor %xmm0, %xmm4, %xmm4
+ vpor %xmm1, %xmm5, %xmm5
+ vpslld $31, %xmm4, %xmm0
+ vpslld $30, %xmm4, %xmm1
+ vpslld $25, %xmm4, %xmm2
+ vpxor %xmm1, %xmm0, %xmm0
+ vpxor %xmm2, %xmm0, %xmm0
+ vmovdqa %xmm0, %xmm1
+ vpsrldq $4, %xmm1, %xmm1
+ vpslldq $12, %xmm0, %xmm0
+ vpxor %xmm0, %xmm4, %xmm4
+ vpsrld $0x01, %xmm4, %xmm2
+ vpsrld $2, %xmm4, %xmm3
+ vpsrld $7, %xmm4, %xmm0
+ vpxor %xmm3, %xmm2, %xmm2
+ vpxor %xmm0, %xmm2, %xmm2
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm4, %xmm2, %xmm2
+ vpxor %xmm2, %xmm5, %xmm5
+L_AES_GCM_encrypt_vaes_calc_iv_done:
+ # T = Encrypt counter
+ vpxor %xmm0, %xmm0, %xmm0
+ shll $3, %edx
+ vmovq %rdx, %xmm0
+ vpxor %xmm0, %xmm5, %xmm5
+ # ghash_gfmul_avx
+ vpshufd $0x4e, %xmm5, %xmm1
+ vpshufd $0x4e, %xmm6, %xmm2
+ vpclmulqdq $0x11, %xmm5, %xmm6, %xmm3
+ vpclmulqdq $0x00, %xmm5, %xmm6, %xmm0
+ vpxor %xmm5, %xmm1, %xmm1
+ vpxor %xmm6, %xmm2, %xmm2
+ vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1
+ vpxor %xmm0, %xmm1, %xmm1
+ vpxor %xmm3, %xmm1, %xmm1
+ vmovdqa %xmm0, %xmm4
+ vmovdqa %xmm3, %xmm5
+ vpslldq $8, %xmm1, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpxor %xmm2, %xmm4, %xmm4
+ vpxor %xmm1, %xmm5, %xmm5
+ vpsrld $31, %xmm4, %xmm0
+ vpsrld $31, %xmm5, %xmm1
+ vpslld $0x01, %xmm4, %xmm4
+ vpslld $0x01, %xmm5, %xmm5
+ vpsrldq $12, %xmm0, %xmm2
+ vpslldq $4, %xmm0, %xmm0
+ vpslldq $4, %xmm1, %xmm1
+ vpor %xmm2, %xmm5, %xmm5
+ vpor %xmm0, %xmm4, %xmm4
+ vpor %xmm1, %xmm5, %xmm5
+ vpslld $31, %xmm4, %xmm0
+ vpslld $30, %xmm4, %xmm1
+ vpslld $25, %xmm4, %xmm2
+ vpxor %xmm1, %xmm0, %xmm0
+ vpxor %xmm2, %xmm0, %xmm0
+ vmovdqa %xmm0, %xmm1
+ vpsrldq $4, %xmm1, %xmm1
+ vpslldq $12, %xmm0, %xmm0
+ vpxor %xmm0, %xmm4, %xmm4
+ vpsrld $0x01, %xmm4, %xmm2
+ vpsrld $2, %xmm4, %xmm3
+ vpsrld $7, %xmm4, %xmm0
+ vpxor %xmm3, %xmm2, %xmm2
+ vpxor %xmm0, %xmm2, %xmm2
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm4, %xmm2, %xmm2
+ vpxor %xmm2, %xmm5, %xmm5
+ vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5
+ # Encrypt counter
+ vmovdqa (%r15), %xmm7
+ vpxor %xmm5, %xmm7, %xmm7
+ vaesenc 16(%r15), %xmm7, %xmm7
+ vaesenc 32(%r15), %xmm7, %xmm7
+ vaesenc 48(%r15), %xmm7, %xmm7
+ vaesenc 64(%r15), %xmm7, %xmm7
+ vaesenc 80(%r15), %xmm7, %xmm7
+ vaesenc 96(%r15), %xmm7, %xmm7
+ vaesenc 112(%r15), %xmm7, %xmm7
+ vaesenc 128(%r15), %xmm7, %xmm7
+ vaesenc 144(%r15), %xmm7, %xmm7
+ cmpl $11, %r10d
+ vmovdqa 160(%r15), %xmm8
+ jl L_AES_GCM_encrypt_vaes_calc_iv_2_aesenc_avx_last
+ vaesenc %xmm8, %xmm7, %xmm7
+ vaesenc 176(%r15), %xmm7, %xmm7
+ cmpl $13, %r10d
+ vmovdqa 192(%r15), %xmm8
+ jl L_AES_GCM_encrypt_vaes_calc_iv_2_aesenc_avx_last
+ vaesenc %xmm8, %xmm7, %xmm7
+ vaesenc 208(%r15), %xmm7, %xmm7
+ vmovdqa 224(%r15), %xmm8
+L_AES_GCM_encrypt_vaes_calc_iv_2_aesenc_avx_last:
+ vaesenclast %xmm8, %xmm7, %xmm7
+ vmovdqu %xmm7, 528(%rsp)
+L_AES_GCM_encrypt_vaes_iv_done:
+ # Additional authentication data
+ movl %r11d, %edx
+ cmpl $0x00, %edx
+ je L_AES_GCM_encrypt_vaes_calc_aad_done
+ xorl %ecx, %ecx
+ cmpl $16, %edx
+ jl L_AES_GCM_encrypt_vaes_calc_aad_lt16
+ andl $0xfffffff0, %edx
+L_AES_GCM_encrypt_vaes_calc_aad_16_loop:
+ vmovdqu (%r12,%rcx,1), %xmm7
+ vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7
+ vpxor %xmm7, %xmm15, %xmm15
+ # ghash_gfmul_avx
+ vpshufd $0x4e, %xmm15, %xmm1
+ vpshufd $0x4e, %xmm6, %xmm2
+ vpclmulqdq $0x11, %xmm15, %xmm6, %xmm3
+ vpclmulqdq $0x00, %xmm15, %xmm6, %xmm0
+ vpxor %xmm15, %xmm1, %xmm1
+ vpxor %xmm6, %xmm2, %xmm2
+ vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1
+ vpxor %xmm0, %xmm1, %xmm1
+ vpxor %xmm3, %xmm1, %xmm1
+ vmovdqa %xmm0, %xmm4
+ vmovdqa %xmm3, %xmm15
+ vpslldq $8, %xmm1, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpxor %xmm2, %xmm4, %xmm4
+ vpxor %xmm1, %xmm15, %xmm15
+ vpsrld $31, %xmm4, %xmm0
+ vpsrld $31, %xmm15, %xmm1
+ vpslld $0x01, %xmm4, %xmm4
+ vpslld $0x01, %xmm15, %xmm15
+ vpsrldq $12, %xmm0, %xmm2
+ vpslldq $4, %xmm0, %xmm0
+ vpslldq $4, %xmm1, %xmm1
+ vpor %xmm2, %xmm15, %xmm15
+ vpor %xmm0, %xmm4, %xmm4
+ vpor %xmm1, %xmm15, %xmm15
+ vpslld $31, %xmm4, %xmm0
+ vpslld $30, %xmm4, %xmm1
+ vpslld $25, %xmm4, %xmm2
+ vpxor %xmm1, %xmm0, %xmm0
+ vpxor %xmm2, %xmm0, %xmm0
+ vmovdqa %xmm0, %xmm1
+ vpsrldq $4, %xmm1, %xmm1
+ vpslldq $12, %xmm0, %xmm0
+ vpxor %xmm0, %xmm4, %xmm4
+ vpsrld $0x01, %xmm4, %xmm2
+ vpsrld $2, %xmm4, %xmm3
+ vpsrld $7, %xmm4, %xmm0
+ vpxor %xmm3, %xmm2, %xmm2
+ vpxor %xmm0, %xmm2, %xmm2
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm4, %xmm2, %xmm2
+ vpxor %xmm2, %xmm15, %xmm15
+ addl $16, %ecx
+ cmpl %edx, %ecx
+ jl L_AES_GCM_encrypt_vaes_calc_aad_16_loop
+ movl %r11d, %edx
+ cmpl %edx, %ecx
+ je L_AES_GCM_encrypt_vaes_calc_aad_done
+L_AES_GCM_encrypt_vaes_calc_aad_lt16:
+ subq $16, %rsp
+ vpxor %xmm7, %xmm7, %xmm7
+ xorl %ebx, %ebx
+ vmovdqu %xmm7, (%rsp)
+L_AES_GCM_encrypt_vaes_calc_aad_loop:
+ movzbl (%r12,%rcx,1), %r13d
+ movb %r13b, (%rsp,%rbx,1)
+ incl %ecx
+ incl %ebx
+ cmpl %edx, %ecx
+ jl L_AES_GCM_encrypt_vaes_calc_aad_loop
+ vmovdqu (%rsp), %xmm7
+ addq $16, %rsp
+ vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7
+ vpxor %xmm7, %xmm15, %xmm15
+ # ghash_gfmul_avx
+ vpshufd $0x4e, %xmm15, %xmm1
+ vpshufd $0x4e, %xmm6, %xmm2
+ vpclmulqdq $0x11, %xmm15, %xmm6, %xmm3
+ vpclmulqdq $0x00, %xmm15, %xmm6, %xmm0
+ vpxor %xmm15, %xmm1, %xmm1
+ vpxor %xmm6, %xmm2, %xmm2
+ vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1
+ vpxor %xmm0, %xmm1, %xmm1
+ vpxor %xmm3, %xmm1, %xmm1
+ vmovdqa %xmm0, %xmm4
+ vmovdqa %xmm3, %xmm15
+ vpslldq $8, %xmm1, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpxor %xmm2, %xmm4, %xmm4
+ vpxor %xmm1, %xmm15, %xmm15
+ vpsrld $31, %xmm4, %xmm0
+ vpsrld $31, %xmm15, %xmm1
+ vpslld $0x01, %xmm4, %xmm4
+ vpslld $0x01, %xmm15, %xmm15
+ vpsrldq $12, %xmm0, %xmm2
+ vpslldq $4, %xmm0, %xmm0
+ vpslldq $4, %xmm1, %xmm1
+ vpor %xmm2, %xmm15, %xmm15
+ vpor %xmm0, %xmm4, %xmm4
+ vpor %xmm1, %xmm15, %xmm15
+ vpslld $31, %xmm4, %xmm0
+ vpslld $30, %xmm4, %xmm1
+ vpslld $25, %xmm4, %xmm2
+ vpxor %xmm1, %xmm0, %xmm0
+ vpxor %xmm2, %xmm0, %xmm0
+ vmovdqa %xmm0, %xmm1
+ vpsrldq $4, %xmm1, %xmm1
+ vpslldq $12, %xmm0, %xmm0
+ vpxor %xmm0, %xmm4, %xmm4
+ vpsrld $0x01, %xmm4, %xmm2
+ vpsrld $2, %xmm4, %xmm3
+ vpsrld $7, %xmm4, %xmm0
+ vpxor %xmm3, %xmm2, %xmm2
+ vpxor %xmm0, %xmm2, %xmm2
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm4, %xmm2, %xmm2
+ vpxor %xmm2, %xmm15, %xmm15
+L_AES_GCM_encrypt_vaes_calc_aad_done:
+ # Calculate counter and H
+ vpsrlq $63, %xmm6, %xmm8
+ vpsllq $0x01, %xmm6, %xmm7
+ vpslldq $8, %xmm8, %xmm8
+ vpor %xmm8, %xmm7, %xmm7
+ vpshufd $0xff, %xmm6, %xmm6
+ vpsrad $31, %xmm6, %xmm6
+ vpshufb L_vaes_aes_gcm_bswap_epi64(%rip), %xmm5, %xmm5
+ vpand L_vaes_aes_gcm_mod2_128(%rip), %xmm6, %xmm6
+ vpaddd L_vaes_aes_gcm_one(%rip), %xmm5, %xmm5
+ vpxor %xmm7, %xmm6, %xmm6
+ vmovdqu %xmm5, 512(%rsp)
+ xorl %ebx, %ebx
+ cmpl $0x80, %r9d
+ jl L_AES_GCM_encrypt_vaes_done_128
+ vmovdqa %xmm15, %xmm2
+ # H ^ 1
+ vmovdqu %xmm6, (%rsp)
+ # H ^ 2
+ vpclmulqdq $0x00, %xmm6, %xmm6, %xmm7
+ vpclmulqdq $0x11, %xmm6, %xmm6, %xmm10
+ vpxor %xmm8, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm0
+ vmovdqu %xmm0, 16(%rsp)
+ # H ^ 3
+ # ghash_gfmul_red_avx
+ vpclmulqdq $0x00, %xmm6, %xmm0, %xmm7
+ vpclmulqdq $0x01, %xmm6, %xmm0, %xmm8
+ vpclmulqdq $16, %xmm6, %xmm0, %xmm9
+ vpclmulqdq $0x11, %xmm6, %xmm0, %xmm10
+ vpxor %xmm9, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm1
+ vmovdqu %xmm1, 32(%rsp)
+ # H ^ 4
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10
+ vpxor %xmm8, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm3
+ vmovdqu %xmm3, 48(%rsp)
+ # H ^ 5
+ # ghash_gfmul_red_avx
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7
+ vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $16, %xmm0, %xmm1, %xmm9
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10
+ vpxor %xmm9, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 64(%rsp)
+ # H ^ 6
+ vpclmulqdq $0x00, %xmm1, %xmm1, %xmm7
+ vpclmulqdq $0x11, %xmm1, %xmm1, %xmm10
+ vpxor %xmm8, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 80(%rsp)
+ # H ^ 7
+ # ghash_gfmul_red_avx
+ vpclmulqdq $0x00, %xmm1, %xmm3, %xmm7
+ vpclmulqdq $0x01, %xmm1, %xmm3, %xmm8
+ vpclmulqdq $16, %xmm1, %xmm3, %xmm9
+ vpclmulqdq $0x11, %xmm1, %xmm3, %xmm10
+ vpxor %xmm9, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 96(%rsp)
+ # H ^ 8
+ vpclmulqdq $0x00, %xmm3, %xmm3, %xmm7
+ vpclmulqdq $0x11, %xmm3, %xmm3, %xmm10
+ vpxor %xmm8, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 112(%rsp)
+ cmpl $0x100, %r9d
+ jl L_AES_GCM_encrypt_vaes_no_ext
+ # H ^ 9
+ vmovdqu 48(%rsp), %xmm0
+ vmovdqu 64(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7
+ vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $16, %xmm0, %xmm1, %xmm9
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10
+ vpxor %xmm9, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 128(%rsp)
+ # H ^ 10
+ vmovdqu 64(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10
+ vpxor %xmm8, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 144(%rsp)
+ # H ^ 11
+ vmovdqu 64(%rsp), %xmm0
+ vmovdqu 80(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7
+ vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $16, %xmm0, %xmm1, %xmm9
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10
+ vpxor %xmm9, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 160(%rsp)
+ # H ^ 12
+ vmovdqu 80(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10
+ vpxor %xmm8, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 176(%rsp)
+ # H ^ 13
+ vmovdqu 80(%rsp), %xmm0
+ vmovdqu 96(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7
+ vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $16, %xmm0, %xmm1, %xmm9
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10
+ vpxor %xmm9, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 192(%rsp)
+ # H ^ 14
+ vmovdqu 96(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10
+ vpxor %xmm8, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 208(%rsp)
+ # H ^ 15
+ vmovdqu 96(%rsp), %xmm0
+ vmovdqu 112(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7
+ vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $16, %xmm0, %xmm1, %xmm9
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10
+ vpxor %xmm9, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 224(%rsp)
+ # H ^ 16
+ vmovdqu 112(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10
+ vpxor %xmm8, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 240(%rsp)
+ vmovdqu 224(%rsp), %ymm7
+ vpermq $0x4e, %ymm7, %ymm7
+ vmovdqu 192(%rsp), %ymm8
+ vpermq $0x4e, %ymm8, %ymm8
+ vmovdqu 160(%rsp), %ymm9
+ vpermq $0x4e, %ymm9, %ymm9
+ vmovdqu 128(%rsp), %ymm10
+ vpermq $0x4e, %ymm10, %ymm10
+ vmovdqu %ymm7, 256(%rsp)
+ vmovdqu %ymm8, 288(%rsp)
+ vmovdqu %ymm9, 320(%rsp)
+ vmovdqu %ymm10, 352(%rsp)
+ vmovdqu 96(%rsp), %ymm7
+ vpermq $0x4e, %ymm7, %ymm7
+ vmovdqu 64(%rsp), %ymm8
+ vpermq $0x4e, %ymm8, %ymm8
+ vmovdqu 32(%rsp), %ymm9
+ vpermq $0x4e, %ymm9, %ymm9
+ vmovdqu (%rsp), %ymm10
+ vpermq $0x4e, %ymm10, %ymm10
+ vmovdqu %ymm7, 384(%rsp)
+ vmovdqu %ymm8, 416(%rsp)
+ vmovdqu %ymm9, 448(%rsp)
+ vmovdqu %ymm10, 480(%rsp)
+L_AES_GCM_encrypt_vaes_no_ext:
+ vbroadcasti128 L_vaes_aes_gcm_mod2_128(%rip), %ymm14
+ cmpl $0x100, %r9d
+ jl L_AES_GCM_encrypt_vaes_after_256
+ movl %r9d, %r13d
+ andl $0xffffff00, %r13d
+L_AES_GCM_encrypt_vaes_loop_256:
+ # 256 bytes of input
+ leaq (%rsi,%rbx,1), %rcx
+ movq %rcx, 544(%rsp)
+ vbroadcasti128 L_vaes_aes_gcm_bswap_epi64(%rip), %ymm6
+ vbroadcasti128 512(%rsp), %ymm4
+ vpaddd L_vaes_aes_gcm_inc_y0(%rip), %ymm4, %ymm0
+ vpshufb %ymm6, %ymm0, %ymm0
+ vpaddd L_vaes_aes_gcm_inc_y1(%rip), %ymm4, %ymm1
+ vpshufb %ymm6, %ymm1, %ymm1
+ vpaddd L_vaes_aes_gcm_inc_y2(%rip), %ymm4, %ymm2
+ vpshufb %ymm6, %ymm2, %ymm2
+ vpaddd L_vaes_aes_gcm_inc_y3(%rip), %ymm4, %ymm3
+ vpshufb %ymm6, %ymm3, %ymm3
+ vmovdqu 512(%rsp), %xmm7
+ vpaddd L_vaes_aes_gcm_eight(%rip), %xmm7, %xmm7
+ vmovdqu %xmm7, 512(%rsp)
+ vbroadcasti128 (%r15), %ymm4
+ vpxor %ymm4, %ymm0, %ymm0
+ vpxor %ymm4, %ymm1, %ymm1
+ vpxor %ymm4, %ymm2, %ymm2
+ vpxor %ymm4, %ymm3, %ymm3
+ vbroadcasti128 16(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 32(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 48(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 64(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 80(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 96(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 112(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 128(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 144(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ cmpl $11, %r10d
+ vbroadcasti128 160(%r15), %ymm4
+ jl L_AES_GCM_encrypt_vaes_p1_vaes_ctr8_last
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 176(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ cmpl $13, %r10d
+ vbroadcasti128 192(%r15), %ymm4
+ jl L_AES_GCM_encrypt_vaes_p1_vaes_ctr8_last
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 208(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 224(%r15), %ymm4
+L_AES_GCM_encrypt_vaes_p1_vaes_ctr8_last:
+ vaesenclast %ymm4, %ymm0, %ymm0
+ vaesenclast %ymm4, %ymm1, %ymm1
+ vaesenclast %ymm4, %ymm2, %ymm2
+ vaesenclast %ymm4, %ymm3, %ymm3
+ leaq (%rdi,%rbx,1), %rcx
+ leaq (%rsi,%rbx,1), %rdx
+ vmovdqu (%rcx), %ymm5
+ vpxor %ymm5, %ymm0, %ymm0
+ vmovdqu %ymm0, (%rdx)
+ vmovdqu 32(%rcx), %ymm5
+ vpxor %ymm5, %ymm1, %ymm1
+ vmovdqu %ymm1, 32(%rdx)
+ vmovdqu 64(%rcx), %ymm5
+ vpxor %ymm5, %ymm2, %ymm2
+ vmovdqu %ymm2, 64(%rdx)
+ vmovdqu 96(%rcx), %ymm5
+ vpxor %ymm5, %ymm3, %ymm3
+ vmovdqu %ymm3, 96(%rdx)
+ addl $0x80, %ebx
+ vbroadcasti128 512(%rsp), %ymm4
+ vpaddd L_vaes_aes_gcm_inc_y0(%rip), %ymm4, %ymm0
+ vpshufb %ymm6, %ymm0, %ymm0
+ vpaddd L_vaes_aes_gcm_inc_y1(%rip), %ymm4, %ymm1
+ vpshufb %ymm6, %ymm1, %ymm1
+ vpaddd L_vaes_aes_gcm_inc_y2(%rip), %ymm4, %ymm2
+ vpshufb %ymm6, %ymm2, %ymm2
+ vpaddd L_vaes_aes_gcm_inc_y3(%rip), %ymm4, %ymm3
+ vpshufb %ymm6, %ymm3, %ymm3
+ vmovdqu 512(%rsp), %xmm7
+ vpaddd L_vaes_aes_gcm_eight(%rip), %xmm7, %xmm7
+ vmovdqu %xmm7, 512(%rsp)
+ vbroadcasti128 (%r15), %ymm4
+ vpxor %ymm4, %ymm0, %ymm0
+ vpxor %ymm4, %ymm1, %ymm1
+ vpxor %ymm4, %ymm2, %ymm2
+ vpxor %ymm4, %ymm3, %ymm3
+ vbroadcasti128 16(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 32(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 48(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 64(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 80(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 96(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 112(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 128(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 144(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ cmpl $11, %r10d
+ vbroadcasti128 160(%r15), %ymm4
+ jl L_AES_GCM_encrypt_vaes_p2_vaes_ctr8_last
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 176(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ cmpl $13, %r10d
+ vbroadcasti128 192(%r15), %ymm4
+ jl L_AES_GCM_encrypt_vaes_p2_vaes_ctr8_last
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 208(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 224(%r15), %ymm4
+L_AES_GCM_encrypt_vaes_p2_vaes_ctr8_last:
+ vaesenclast %ymm4, %ymm0, %ymm0
+ vaesenclast %ymm4, %ymm1, %ymm1
+ vaesenclast %ymm4, %ymm2, %ymm2
+ vaesenclast %ymm4, %ymm3, %ymm3
+ leaq (%rdi,%rbx,1), %rcx
+ leaq (%rsi,%rbx,1), %rdx
+ vmovdqu (%rcx), %ymm5
+ vpxor %ymm5, %ymm0, %ymm0
+ vmovdqu %ymm0, (%rdx)
+ vmovdqu 32(%rcx), %ymm5
+ vpxor %ymm5, %ymm1, %ymm1
+ vmovdqu %ymm1, 32(%rdx)
+ vmovdqu 64(%rcx), %ymm5
+ vpxor %ymm5, %ymm2, %ymm2
+ vmovdqu %ymm2, 64(%rdx)
+ vmovdqu 96(%rcx), %ymm5
+ vpxor %ymm5, %ymm3, %ymm3
+ vmovdqu %ymm3, 96(%rdx)
+ addl $0x80, %ebx
+ vbroadcasti128 L_vaes_aes_gcm_bswap_mask(%rip), %ymm6
+ movq 544(%rsp), %rcx
+ vpxor %ymm4, %ymm4, %ymm4
+ vinserti128 $0x00, %xmm15, %ymm4, %ymm4
+ vmovdqu 256(%rsp), %ymm7
+ vmovdqu 288(%rsp), %ymm8
+ vmovdqu 320(%rsp), %ymm9
+ vmovdqu 352(%rsp), %ymm10
+ vmovdqu (%rcx), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpxor %ymm4, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm7, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm7, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm7, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm7, %ymm5, %ymm3
+ vmovdqa %ymm0, %ymm11
+ vpxor %ymm1, %ymm2, %ymm12
+ vmovdqa %ymm3, %ymm13
+ vmovdqu 32(%rcx), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm8, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm8, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm8, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm8, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vmovdqu 64(%rcx), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm9, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm9, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm9, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm9, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vmovdqu 96(%rcx), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm10, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm10, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm10, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm10, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vmovdqu 384(%rsp), %ymm7
+ vmovdqu 416(%rsp), %ymm8
+ vmovdqu 448(%rsp), %ymm9
+ vmovdqu 480(%rsp), %ymm10
+ vmovdqu 128(%rcx), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm7, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm7, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm7, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm7, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vmovdqu 160(%rcx), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm8, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm8, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm8, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm8, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vmovdqu 192(%rcx), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm9, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm9, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm9, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm9, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vmovdqu 224(%rcx), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm10, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm10, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm10, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm10, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vpclmulqdq $0x01, %ymm11, %ymm14, %ymm5
+ vpshufd $0x4e, %ymm11, %ymm11
+ vpxor %ymm5, %ymm12, %ymm12
+ vpxor %ymm11, %ymm12, %ymm12
+ vpclmulqdq $0x01, %ymm12, %ymm14, %ymm5
+ vpshufd $0x4e, %ymm12, %ymm12
+ vpxor %ymm5, %ymm13, %ymm13
+ vpxor %ymm12, %ymm13, %ymm13
+ vextracti128 $0x01, %ymm13, %xmm0
+ vpxor %xmm0, %xmm13, %xmm15
+ cmpl %r13d, %ebx
+ jl L_AES_GCM_encrypt_vaes_loop_256
+L_AES_GCM_encrypt_vaes_after_256:
+ movl %r9d, %r13d
+ andl $0xffffff80, %r13d
+ cmpl %r13d, %ebx
+ jge L_AES_GCM_encrypt_vaes_after_128
+ # 128 bytes of input
+ leaq (%rsi,%rbx,1), %rcx
+ movq %rcx, 544(%rsp)
+ vbroadcasti128 L_vaes_aes_gcm_bswap_epi64(%rip), %ymm6
+ vbroadcasti128 512(%rsp), %ymm4
+ vpaddd L_vaes_aes_gcm_inc_y0(%rip), %ymm4, %ymm0
+ vpshufb %ymm6, %ymm0, %ymm0
+ vpaddd L_vaes_aes_gcm_inc_y1(%rip), %ymm4, %ymm1
+ vpshufb %ymm6, %ymm1, %ymm1
+ vpaddd L_vaes_aes_gcm_inc_y2(%rip), %ymm4, %ymm2
+ vpshufb %ymm6, %ymm2, %ymm2
+ vpaddd L_vaes_aes_gcm_inc_y3(%rip), %ymm4, %ymm3
+ vpshufb %ymm6, %ymm3, %ymm3
+ vmovdqu 512(%rsp), %xmm7
+ vpaddd L_vaes_aes_gcm_eight(%rip), %xmm7, %xmm7
+ vmovdqu %xmm7, 512(%rsp)
+ vbroadcasti128 (%r15), %ymm4
+ vpxor %ymm4, %ymm0, %ymm0
+ vpxor %ymm4, %ymm1, %ymm1
+ vpxor %ymm4, %ymm2, %ymm2
+ vpxor %ymm4, %ymm3, %ymm3
+ vbroadcasti128 16(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 32(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 48(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 64(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 80(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 96(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 112(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 128(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 144(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ cmpl $11, %r10d
+ vbroadcasti128 160(%r15), %ymm4
+ jl L_AES_GCM_encrypt_vaes_8_vaes_ctr8_last
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 176(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ cmpl $13, %r10d
+ vbroadcasti128 192(%r15), %ymm4
+ jl L_AES_GCM_encrypt_vaes_8_vaes_ctr8_last
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 208(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 224(%r15), %ymm4
+L_AES_GCM_encrypt_vaes_8_vaes_ctr8_last:
+ vaesenclast %ymm4, %ymm0, %ymm0
+ vaesenclast %ymm4, %ymm1, %ymm1
+ vaesenclast %ymm4, %ymm2, %ymm2
+ vaesenclast %ymm4, %ymm3, %ymm3
+ leaq (%rdi,%rbx,1), %rcx
+ leaq (%rsi,%rbx,1), %rdx
+ vmovdqu (%rcx), %ymm5
+ vpxor %ymm5, %ymm0, %ymm0
+ vmovdqu %ymm0, (%rdx)
+ vmovdqu 32(%rcx), %ymm5
+ vpxor %ymm5, %ymm1, %ymm1
+ vmovdqu %ymm1, 32(%rdx)
+ vmovdqu 64(%rcx), %ymm5
+ vpxor %ymm5, %ymm2, %ymm2
+ vmovdqu %ymm2, 64(%rdx)
+ vmovdqu 96(%rcx), %ymm5
+ vpxor %ymm5, %ymm3, %ymm3
+ vmovdqu %ymm3, 96(%rdx)
+ addl $0x80, %ebx
+ vmovdqu 96(%rsp), %ymm7
+ vpermq $0x4e, %ymm7, %ymm7
+ vmovdqu 64(%rsp), %ymm8
+ vpermq $0x4e, %ymm8, %ymm8
+ vmovdqu 32(%rsp), %ymm9
+ vpermq $0x4e, %ymm9, %ymm9
+ vmovdqu (%rsp), %ymm10
+ vpermq $0x4e, %ymm10, %ymm10
+ vbroadcasti128 L_vaes_aes_gcm_bswap_mask(%rip), %ymm6
+ movq 544(%rsp), %rcx
+ vpxor %ymm4, %ymm4, %ymm4
+ vinserti128 $0x00, %xmm15, %ymm4, %ymm4
+ vmovdqu (%rcx), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpxor %ymm4, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm7, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm7, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm7, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm7, %ymm5, %ymm3
+ vmovdqa %ymm0, %ymm11
+ vpxor %ymm1, %ymm2, %ymm12
+ vmovdqa %ymm3, %ymm13
+ vmovdqu 32(%rcx), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm8, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm8, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm8, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm8, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vmovdqu 64(%rcx), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm9, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm9, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm9, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm9, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vmovdqu 96(%rcx), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm10, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm10, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm10, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm10, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vpclmulqdq $0x01, %ymm11, %ymm14, %ymm5
+ vpshufd $0x4e, %ymm11, %ymm11
+ vpxor %ymm5, %ymm12, %ymm12
+ vpxor %ymm11, %ymm12, %ymm12
+ vpclmulqdq $0x01, %ymm12, %ymm14, %ymm5
+ vpshufd $0x4e, %ymm12, %ymm12
+ vpxor %ymm5, %ymm13, %ymm13
+ vpxor %ymm12, %ymm13, %ymm13
+ vextracti128 $0x01, %ymm13, %xmm0
+ vpxor %xmm0, %xmm13, %xmm15
+L_AES_GCM_encrypt_vaes_after_128:
+ vmovdqu (%rsp), %xmm6
+L_AES_GCM_encrypt_vaes_done_128:
+ movl %r9d, %edx
+ cmpl %edx, %ebx
+ jge L_AES_GCM_encrypt_vaes_done_enc
+ movl %r9d, %r13d
+ andl $0xfffffff0, %r13d
+ cmpl %r13d, %ebx
+ jge L_AES_GCM_encrypt_vaes_last_block_done
+ vmovdqu 512(%rsp), %xmm8
+ vpshufb L_vaes_aes_gcm_bswap_epi64(%rip), %xmm8, %xmm7
+ vpaddd L_vaes_aes_gcm_one(%rip), %xmm8, %xmm8
+ vmovdqu %xmm8, 512(%rsp)
+ vpxor (%r15), %xmm7, %xmm7
+ vaesenc 16(%r15), %xmm7, %xmm7
+ vaesenc 32(%r15), %xmm7, %xmm7
+ vaesenc 48(%r15), %xmm7, %xmm7
+ vaesenc 64(%r15), %xmm7, %xmm7
+ vaesenc 80(%r15), %xmm7, %xmm7
+ vaesenc 96(%r15), %xmm7, %xmm7
+ vaesenc 112(%r15), %xmm7, %xmm7
+ vaesenc 128(%r15), %xmm7, %xmm7
+ vaesenc 144(%r15), %xmm7, %xmm7
+ cmpl $11, %r10d
+ vmovdqa 160(%r15), %xmm8
+ jl L_AES_GCM_encrypt_vaes_aesenc_block_last
+ vaesenc %xmm8, %xmm7, %xmm7
+ vaesenc 176(%r15), %xmm7, %xmm7
+ cmpl $13, %r10d
+ vmovdqa 192(%r15), %xmm8
+ jl L_AES_GCM_encrypt_vaes_aesenc_block_last
+ vaesenc %xmm8, %xmm7, %xmm7
+ vaesenc 208(%r15), %xmm7, %xmm7
+ vmovdqa 224(%r15), %xmm8
+L_AES_GCM_encrypt_vaes_aesenc_block_last:
+ vaesenclast %xmm8, %xmm7, %xmm7
+ vmovdqu (%rdi,%rbx,1), %xmm8
+ vpxor %xmm8, %xmm7, %xmm7
+ vmovdqu %xmm7, (%rsi,%rbx,1)
+ vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7
+ vpxor %xmm7, %xmm15, %xmm15
+ addl $16, %ebx
+ cmpl %r13d, %ebx
+ jge L_AES_GCM_encrypt_vaes_last_block_ghash
+L_AES_GCM_encrypt_vaes_last_block_start:
+ vmovdqu (%rdi,%rbx,1), %xmm12
+ vmovdqu 512(%rsp), %xmm8
+ vpshufb L_vaes_aes_gcm_bswap_epi64(%rip), %xmm8, %xmm7
+ vpaddd L_vaes_aes_gcm_one(%rip), %xmm8, %xmm8
+ vmovdqu %xmm8, 512(%rsp)
+ vpxor (%r15), %xmm7, %xmm7
+ vpclmulqdq $16, %xmm6, %xmm15, %xmm9
+ vaesenc 16(%r15), %xmm7, %xmm7
+ vaesenc 32(%r15), %xmm7, %xmm7
+ vpclmulqdq $0x01, %xmm6, %xmm15, %xmm10
+ vaesenc 48(%r15), %xmm7, %xmm7
+ vaesenc 64(%r15), %xmm7, %xmm7
+ vpclmulqdq $0x00, %xmm6, %xmm15, %xmm11
+ vaesenc 80(%r15), %xmm7, %xmm7
+ vpclmulqdq $0x11, %xmm6, %xmm15, %xmm1
+ vaesenc 96(%r15), %xmm7, %xmm7
+ vpxor %xmm10, %xmm9, %xmm9
+ vpslldq $8, %xmm9, %xmm2
+ vpsrldq $8, %xmm9, %xmm9
+ vaesenc 112(%r15), %xmm7, %xmm7
+ vpxor %xmm11, %xmm2, %xmm2
+ vpxor %xmm9, %xmm1, %xmm3
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm0
+ vpclmulqdq $16, %xmm0, %xmm2, %xmm10
+ vaesenc 128(%r15), %xmm7, %xmm7
+ vpshufd $0x4e, %xmm2, %xmm9
+ vpxor %xmm10, %xmm9, %xmm9
+ vpclmulqdq $16, %xmm0, %xmm9, %xmm10
+ vaesenc 144(%r15), %xmm7, %xmm7
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpxor %xmm10, %xmm9, %xmm9
+ vpxor %xmm3, %xmm9, %xmm15
+ cmpl $11, %r10d
+ vmovdqa 160(%r15), %xmm8
+ jl L_AES_GCM_encrypt_vaes_aesenc_gfmul_last
+ vaesenc %xmm8, %xmm7, %xmm7
+ vaesenc 176(%r15), %xmm7, %xmm7
+ cmpl $13, %r10d
+ vmovdqa 192(%r15), %xmm8
+ jl L_AES_GCM_encrypt_vaes_aesenc_gfmul_last
+ vaesenc %xmm8, %xmm7, %xmm7
+ vaesenc 208(%r15), %xmm7, %xmm7
+ vmovdqa 224(%r15), %xmm8
+L_AES_GCM_encrypt_vaes_aesenc_gfmul_last:
+ vaesenclast %xmm8, %xmm7, %xmm7
+ vmovdqa %xmm12, %xmm0
+ vpxor %xmm0, %xmm7, %xmm7
+ vmovdqu %xmm7, (%rsi,%rbx,1)
+ vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7
+ addl $16, %ebx
+ vpxor %xmm7, %xmm15, %xmm15
+ cmpl %r13d, %ebx
+ jl L_AES_GCM_encrypt_vaes_last_block_start
+L_AES_GCM_encrypt_vaes_last_block_ghash:
+ # ghash_gfmul_red_avx
+ vpclmulqdq $0x00, %xmm6, %xmm15, %xmm7
+ vpclmulqdq $0x01, %xmm6, %xmm15, %xmm8
+ vpclmulqdq $16, %xmm6, %xmm15, %xmm9
+ vpclmulqdq $0x11, %xmm6, %xmm15, %xmm10
+ vpxor %xmm9, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm15
+L_AES_GCM_encrypt_vaes_last_block_done:
+ movl %r9d, %ecx
+ movl %ecx, %edx
+ andl $15, %ecx
+ jz L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_done
+ vmovdqu 512(%rsp), %xmm5
+ vpshufb L_vaes_aes_gcm_bswap_epi64(%rip), %xmm5, %xmm5
+ vpxor (%r15), %xmm5, %xmm5
+ vaesenc 16(%r15), %xmm5, %xmm5
+ vaesenc 32(%r15), %xmm5, %xmm5
+ vaesenc 48(%r15), %xmm5, %xmm5
+ vaesenc 64(%r15), %xmm5, %xmm5
+ vaesenc 80(%r15), %xmm5, %xmm5
+ vaesenc 96(%r15), %xmm5, %xmm5
+ vaesenc 112(%r15), %xmm5, %xmm5
+ vaesenc 128(%r15), %xmm5, %xmm5
+ vaesenc 144(%r15), %xmm5, %xmm5
+ cmpl $11, %r10d
+ vmovdqa 160(%r15), %xmm8
+ jl L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_aesenc_avx_last
+ vaesenc %xmm8, %xmm5, %xmm5
+ vaesenc 176(%r15), %xmm5, %xmm5
+ cmpl $13, %r10d
+ vmovdqa 192(%r15), %xmm8
+ jl L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_aesenc_avx_last
+ vaesenc %xmm8, %xmm5, %xmm5
+ vaesenc 208(%r15), %xmm5, %xmm5
+ vmovdqa 224(%r15), %xmm8
+L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_aesenc_avx_last:
+ vaesenclast %xmm8, %xmm5, %xmm5
+ subq $16, %rsp
+ xorl %ecx, %ecx
+ vmovdqu %xmm5, (%rsp)
+L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_loop:
+ movzbl (%rdi,%rbx,1), %r13d
+ xorb (%rsp,%rcx,1), %r13b
+ movb %r13b, (%rsi,%rbx,1)
+ movb %r13b, (%rsp,%rcx,1)
+ incl %ebx
+ incl %ecx
+ cmpl %edx, %ebx
+ jl L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_loop
+ xorq %r13, %r13
+ cmpl $16, %ecx
+ je L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_finish_enc
+L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_byte_loop:
+ movb %r13b, (%rsp,%rcx,1)
+ incl %ecx
+ cmpl $16, %ecx
+ jl L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_byte_loop
+L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_finish_enc:
+ vmovdqu (%rsp), %xmm5
+ addq $16, %rsp
+ vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5
+ vpxor %xmm5, %xmm15, %xmm15
+ # ghash_gfmul_red_avx
+ vpclmulqdq $0x00, %xmm6, %xmm15, %xmm7
+ vpclmulqdq $0x01, %xmm6, %xmm15, %xmm8
+ vpclmulqdq $16, %xmm6, %xmm15, %xmm9
+ vpclmulqdq $0x11, %xmm6, %xmm15, %xmm10
+ vpxor %xmm9, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm15
+L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_done:
+L_AES_GCM_encrypt_vaes_done_enc:
+ movl %r9d, %edx
+ movl %r11d, %ecx
+ shlq $3, %rdx
+ shlq $3, %rcx
+ vmovq %rdx, %xmm0
+ vmovq %rcx, %xmm1
+ vpunpcklqdq %xmm1, %xmm0, %xmm0
+ vpxor %xmm0, %xmm15, %xmm15
+ # ghash_gfmul_red_avx
+ vpclmulqdq $0x00, %xmm6, %xmm15, %xmm7
+ vpclmulqdq $0x01, %xmm6, %xmm15, %xmm8
+ vpclmulqdq $16, %xmm6, %xmm15, %xmm9
+ vpclmulqdq $0x11, %xmm6, %xmm15, %xmm10
+ vpxor %xmm9, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm15
+ vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm15, %xmm15
+ vmovdqu 528(%rsp), %xmm0
+ vpxor %xmm15, %xmm0, %xmm0
+ cmpl $16, %r14d
+ je L_AES_GCM_encrypt_vaes_store_tag_16
+ xorq %rcx, %rcx
+ vmovdqu %xmm0, (%rsp)
+L_AES_GCM_encrypt_vaes_store_tag_loop:
+ movzbl (%rsp,%rcx,1), %r13d
+ movb %r13b, (%r8,%rcx,1)
+ incl %ecx
+ cmpl %r14d, %ecx
+ jne L_AES_GCM_encrypt_vaes_store_tag_loop
+ jmp L_AES_GCM_encrypt_vaes_store_tag_done
+L_AES_GCM_encrypt_vaes_store_tag_16:
+ vmovdqu %xmm0, (%r8)
+L_AES_GCM_encrypt_vaes_store_tag_done:
+ vzeroupper
+ addq $0x230, %rsp
+ popq %r15
+ popq %r14
+ popq %rbx
+ popq %r12
+ popq %r13
+ repz retq
+#ifndef __APPLE__
+.size AES_GCM_encrypt_vaes,.-AES_GCM_encrypt_vaes
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl AES_GCM_decrypt_vaes
+.type AES_GCM_decrypt_vaes,@function
+.align 16
+AES_GCM_decrypt_vaes:
+#else
+.section __TEXT,__text
+.globl _AES_GCM_decrypt_vaes
+.p2align 4
+_AES_GCM_decrypt_vaes:
+#endif /* __APPLE__ */
+ pushq %r13
+ pushq %r12
+ pushq %rbx
+ pushq %r14
+ pushq %r15
+ pushq %rbp
+ movq %rdx, %r12
+ movq %rcx, %rax
+ movl 56(%rsp), %r11d
+ movl 64(%rsp), %ebx
+ movl 72(%rsp), %r14d
+ movq 80(%rsp), %r15
+ movl 88(%rsp), %r10d
+ movq 96(%rsp), %rbp
+ subq $0x220, %rsp
+ vpxor %xmm5, %xmm5, %xmm5
+ vpxor %xmm15, %xmm15, %xmm15
+ cmpl $12, %ebx
+ movl %ebx, %edx
+ jne L_AES_GCM_decrypt_vaes_iv_not_12
+ # Calculate values when IV is 12 bytes
+ # Set counter based on IV
+ movl $0x1000000, %ecx
+ vmovq (%rax), %xmm5
+ vpinsrd $2, 8(%rax), %xmm5, %xmm5
+ vpinsrd $3, %ecx, %xmm5, %xmm5
+ # H = Encrypt X(=0) and T = Encrypt counter
+ vmovdqa (%r15), %xmm6
+ vpxor %xmm6, %xmm5, %xmm1
+ vmovdqa 16(%r15), %xmm4
+ vaesenc %xmm4, %xmm6, %xmm6
+ vaesenc %xmm4, %xmm1, %xmm1
+ vmovdqa 32(%r15), %xmm4
+ vaesenc %xmm4, %xmm6, %xmm6
+ vaesenc %xmm4, %xmm1, %xmm1
+ vmovdqa 48(%r15), %xmm4
+ vaesenc %xmm4, %xmm6, %xmm6
+ vaesenc %xmm4, %xmm1, %xmm1
+ vmovdqa 64(%r15), %xmm4
+ vaesenc %xmm4, %xmm6, %xmm6
+ vaesenc %xmm4, %xmm1, %xmm1
+ vmovdqa 80(%r15), %xmm4
+ vaesenc %xmm4, %xmm6, %xmm6
+ vaesenc %xmm4, %xmm1, %xmm1
+ vmovdqa 96(%r15), %xmm4
+ vaesenc %xmm4, %xmm6, %xmm6
+ vaesenc %xmm4, %xmm1, %xmm1
+ vmovdqa 112(%r15), %xmm4
+ vaesenc %xmm4, %xmm6, %xmm6
+ vaesenc %xmm4, %xmm1, %xmm1
+ vmovdqa 128(%r15), %xmm4
+ vaesenc %xmm4, %xmm6, %xmm6
+ vaesenc %xmm4, %xmm1, %xmm1
+ vmovdqa 144(%r15), %xmm4
+ vaesenc %xmm4, %xmm6, %xmm6
+ vaesenc %xmm4, %xmm1, %xmm1
+ cmpl $11, %r10d
+ vmovdqa 160(%r15), %xmm4
+ jl L_AES_GCM_decrypt_vaes_calc_iv_12_last
+ vaesenc %xmm4, %xmm6, %xmm6
+ vaesenc %xmm4, %xmm1, %xmm1
+ vmovdqa 176(%r15), %xmm4
+ vaesenc %xmm4, %xmm6, %xmm6
+ vaesenc %xmm4, %xmm1, %xmm1
+ cmpl $13, %r10d
+ vmovdqa 192(%r15), %xmm4
+ jl L_AES_GCM_decrypt_vaes_calc_iv_12_last
+ vaesenc %xmm4, %xmm6, %xmm6
+ vaesenc %xmm4, %xmm1, %xmm1
+ vmovdqa 208(%r15), %xmm4
+ vaesenc %xmm4, %xmm6, %xmm6
+ vaesenc %xmm4, %xmm1, %xmm1
+ vmovdqa 224(%r15), %xmm4
+L_AES_GCM_decrypt_vaes_calc_iv_12_last:
+ vaesenclast %xmm4, %xmm6, %xmm6
+ vaesenclast %xmm4, %xmm1, %xmm1
+ vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm6, %xmm6
+ vmovdqu %xmm1, 528(%rsp)
+ jmp L_AES_GCM_decrypt_vaes_iv_done
+L_AES_GCM_decrypt_vaes_iv_not_12:
+ # Calculate values when IV is not 12 bytes
+ # H = Encrypt X(=0)
+ vmovdqa (%r15), %xmm6
+ vaesenc 16(%r15), %xmm6, %xmm6
+ vaesenc 32(%r15), %xmm6, %xmm6
+ vaesenc 48(%r15), %xmm6, %xmm6
+ vaesenc 64(%r15), %xmm6, %xmm6
+ vaesenc 80(%r15), %xmm6, %xmm6
+ vaesenc 96(%r15), %xmm6, %xmm6
+ vaesenc 112(%r15), %xmm6, %xmm6
+ vaesenc 128(%r15), %xmm6, %xmm6
+ vaesenc 144(%r15), %xmm6, %xmm6
+ cmpl $11, %r10d
+ vmovdqa 160(%r15), %xmm8
+ jl L_AES_GCM_decrypt_vaes_calc_iv_1_aesenc_avx_last
+ vaesenc %xmm8, %xmm6, %xmm6
+ vaesenc 176(%r15), %xmm6, %xmm6
+ cmpl $13, %r10d
+ vmovdqa 192(%r15), %xmm8
+ jl L_AES_GCM_decrypt_vaes_calc_iv_1_aesenc_avx_last
+ vaesenc %xmm8, %xmm6, %xmm6
+ vaesenc 208(%r15), %xmm6, %xmm6
+ vmovdqa 224(%r15), %xmm8
+L_AES_GCM_decrypt_vaes_calc_iv_1_aesenc_avx_last:
+ vaesenclast %xmm8, %xmm6, %xmm6
+ vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm6, %xmm6
+ # Calc counter
+ # Initialization vector
+ cmpl $0x00, %edx
+ movq $0x00, %rcx
+ je L_AES_GCM_decrypt_vaes_calc_iv_done
+ cmpl $16, %edx
+ jl L_AES_GCM_decrypt_vaes_calc_iv_lt16
+ andl $0xfffffff0, %edx
+L_AES_GCM_decrypt_vaes_calc_iv_16_loop:
+ vmovdqu (%rax,%rcx,1), %xmm7
+ vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7
+ vpxor %xmm7, %xmm5, %xmm5
+ # ghash_gfmul_avx
+ vpshufd $0x4e, %xmm5, %xmm1
+ vpshufd $0x4e, %xmm6, %xmm2
+ vpclmulqdq $0x11, %xmm5, %xmm6, %xmm3
+ vpclmulqdq $0x00, %xmm5, %xmm6, %xmm0
+ vpxor %xmm5, %xmm1, %xmm1
+ vpxor %xmm6, %xmm2, %xmm2
+ vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1
+ vpxor %xmm0, %xmm1, %xmm1
+ vpxor %xmm3, %xmm1, %xmm1
+ vmovdqa %xmm0, %xmm4
+ vmovdqa %xmm3, %xmm5
+ vpslldq $8, %xmm1, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpxor %xmm2, %xmm4, %xmm4
+ vpxor %xmm1, %xmm5, %xmm5
+ vpsrld $31, %xmm4, %xmm0
+ vpsrld $31, %xmm5, %xmm1
+ vpslld $0x01, %xmm4, %xmm4
+ vpslld $0x01, %xmm5, %xmm5
+ vpsrldq $12, %xmm0, %xmm2
+ vpslldq $4, %xmm0, %xmm0
+ vpslldq $4, %xmm1, %xmm1
+ vpor %xmm2, %xmm5, %xmm5
+ vpor %xmm0, %xmm4, %xmm4
+ vpor %xmm1, %xmm5, %xmm5
+ vpslld $31, %xmm4, %xmm0
+ vpslld $30, %xmm4, %xmm1
+ vpslld $25, %xmm4, %xmm2
+ vpxor %xmm1, %xmm0, %xmm0
+ vpxor %xmm2, %xmm0, %xmm0
+ vmovdqa %xmm0, %xmm1
+ vpsrldq $4, %xmm1, %xmm1
+ vpslldq $12, %xmm0, %xmm0
+ vpxor %xmm0, %xmm4, %xmm4
+ vpsrld $0x01, %xmm4, %xmm2
+ vpsrld $2, %xmm4, %xmm3
+ vpsrld $7, %xmm4, %xmm0
+ vpxor %xmm3, %xmm2, %xmm2
+ vpxor %xmm0, %xmm2, %xmm2
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm4, %xmm2, %xmm2
+ vpxor %xmm2, %xmm5, %xmm5
+ addl $16, %ecx
+ cmpl %edx, %ecx
+ jl L_AES_GCM_decrypt_vaes_calc_iv_16_loop
+ movl %ebx, %edx
+ cmpl %edx, %ecx
+ je L_AES_GCM_decrypt_vaes_calc_iv_done
+L_AES_GCM_decrypt_vaes_calc_iv_lt16:
+ subq $16, %rsp
+ vpxor %xmm7, %xmm7, %xmm7
+ xorl %ebx, %ebx
+ vmovdqu %xmm7, (%rsp)
+L_AES_GCM_decrypt_vaes_calc_iv_loop:
+ movzbl (%rax,%rcx,1), %r13d
+ movb %r13b, (%rsp,%rbx,1)
+ incl %ecx
+ incl %ebx
+ cmpl %edx, %ecx
+ jl L_AES_GCM_decrypt_vaes_calc_iv_loop
+ vmovdqu (%rsp), %xmm7
+ addq $16, %rsp
+ vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7
+ vpxor %xmm7, %xmm5, %xmm5
+ # ghash_gfmul_avx
+ vpshufd $0x4e, %xmm5, %xmm1
+ vpshufd $0x4e, %xmm6, %xmm2
+ vpclmulqdq $0x11, %xmm5, %xmm6, %xmm3
+ vpclmulqdq $0x00, %xmm5, %xmm6, %xmm0
+ vpxor %xmm5, %xmm1, %xmm1
+ vpxor %xmm6, %xmm2, %xmm2
+ vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1
+ vpxor %xmm0, %xmm1, %xmm1
+ vpxor %xmm3, %xmm1, %xmm1
+ vmovdqa %xmm0, %xmm4
+ vmovdqa %xmm3, %xmm5
+ vpslldq $8, %xmm1, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpxor %xmm2, %xmm4, %xmm4
+ vpxor %xmm1, %xmm5, %xmm5
+ vpsrld $31, %xmm4, %xmm0
+ vpsrld $31, %xmm5, %xmm1
+ vpslld $0x01, %xmm4, %xmm4
+ vpslld $0x01, %xmm5, %xmm5
+ vpsrldq $12, %xmm0, %xmm2
+ vpslldq $4, %xmm0, %xmm0
+ vpslldq $4, %xmm1, %xmm1
+ vpor %xmm2, %xmm5, %xmm5
+ vpor %xmm0, %xmm4, %xmm4
+ vpor %xmm1, %xmm5, %xmm5
+ vpslld $31, %xmm4, %xmm0
+ vpslld $30, %xmm4, %xmm1
+ vpslld $25, %xmm4, %xmm2
+ vpxor %xmm1, %xmm0, %xmm0
+ vpxor %xmm2, %xmm0, %xmm0
+ vmovdqa %xmm0, %xmm1
+ vpsrldq $4, %xmm1, %xmm1
+ vpslldq $12, %xmm0, %xmm0
+ vpxor %xmm0, %xmm4, %xmm4
+ vpsrld $0x01, %xmm4, %xmm2
+ vpsrld $2, %xmm4, %xmm3
+ vpsrld $7, %xmm4, %xmm0
+ vpxor %xmm3, %xmm2, %xmm2
+ vpxor %xmm0, %xmm2, %xmm2
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm4, %xmm2, %xmm2
+ vpxor %xmm2, %xmm5, %xmm5
+L_AES_GCM_decrypt_vaes_calc_iv_done:
+ # T = Encrypt counter
+ vpxor %xmm0, %xmm0, %xmm0
+ shll $3, %edx
+ vmovq %rdx, %xmm0
+ vpxor %xmm0, %xmm5, %xmm5
+ # ghash_gfmul_avx
+ vpshufd $0x4e, %xmm5, %xmm1
+ vpshufd $0x4e, %xmm6, %xmm2
+ vpclmulqdq $0x11, %xmm5, %xmm6, %xmm3
+ vpclmulqdq $0x00, %xmm5, %xmm6, %xmm0
+ vpxor %xmm5, %xmm1, %xmm1
+ vpxor %xmm6, %xmm2, %xmm2
+ vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1
+ vpxor %xmm0, %xmm1, %xmm1
+ vpxor %xmm3, %xmm1, %xmm1
+ vmovdqa %xmm0, %xmm4
+ vmovdqa %xmm3, %xmm5
+ vpslldq $8, %xmm1, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpxor %xmm2, %xmm4, %xmm4
+ vpxor %xmm1, %xmm5, %xmm5
+ vpsrld $31, %xmm4, %xmm0
+ vpsrld $31, %xmm5, %xmm1
+ vpslld $0x01, %xmm4, %xmm4
+ vpslld $0x01, %xmm5, %xmm5
+ vpsrldq $12, %xmm0, %xmm2
+ vpslldq $4, %xmm0, %xmm0
+ vpslldq $4, %xmm1, %xmm1
+ vpor %xmm2, %xmm5, %xmm5
+ vpor %xmm0, %xmm4, %xmm4
+ vpor %xmm1, %xmm5, %xmm5
+ vpslld $31, %xmm4, %xmm0
+ vpslld $30, %xmm4, %xmm1
+ vpslld $25, %xmm4, %xmm2
+ vpxor %xmm1, %xmm0, %xmm0
+ vpxor %xmm2, %xmm0, %xmm0
+ vmovdqa %xmm0, %xmm1
+ vpsrldq $4, %xmm1, %xmm1
+ vpslldq $12, %xmm0, %xmm0
+ vpxor %xmm0, %xmm4, %xmm4
+ vpsrld $0x01, %xmm4, %xmm2
+ vpsrld $2, %xmm4, %xmm3
+ vpsrld $7, %xmm4, %xmm0
+ vpxor %xmm3, %xmm2, %xmm2
+ vpxor %xmm0, %xmm2, %xmm2
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm4, %xmm2, %xmm2
+ vpxor %xmm2, %xmm5, %xmm5
+ vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5
+ # Encrypt counter
+ vmovdqa (%r15), %xmm7
+ vpxor %xmm5, %xmm7, %xmm7
+ vaesenc 16(%r15), %xmm7, %xmm7
+ vaesenc 32(%r15), %xmm7, %xmm7
+ vaesenc 48(%r15), %xmm7, %xmm7
+ vaesenc 64(%r15), %xmm7, %xmm7
+ vaesenc 80(%r15), %xmm7, %xmm7
+ vaesenc 96(%r15), %xmm7, %xmm7
+ vaesenc 112(%r15), %xmm7, %xmm7
+ vaesenc 128(%r15), %xmm7, %xmm7
+ vaesenc 144(%r15), %xmm7, %xmm7
+ cmpl $11, %r10d
+ vmovdqa 160(%r15), %xmm8
+ jl L_AES_GCM_decrypt_vaes_calc_iv_2_aesenc_avx_last
+ vaesenc %xmm8, %xmm7, %xmm7
+ vaesenc 176(%r15), %xmm7, %xmm7
+ cmpl $13, %r10d
+ vmovdqa 192(%r15), %xmm8
+ jl L_AES_GCM_decrypt_vaes_calc_iv_2_aesenc_avx_last
+ vaesenc %xmm8, %xmm7, %xmm7
+ vaesenc 208(%r15), %xmm7, %xmm7
+ vmovdqa 224(%r15), %xmm8
+L_AES_GCM_decrypt_vaes_calc_iv_2_aesenc_avx_last:
+ vaesenclast %xmm8, %xmm7, %xmm7
+ vmovdqu %xmm7, 528(%rsp)
+L_AES_GCM_decrypt_vaes_iv_done:
+ # Additional authentication data
+ movl %r11d, %edx
+ cmpl $0x00, %edx
+ je L_AES_GCM_decrypt_vaes_calc_aad_done
+ xorl %ecx, %ecx
+ cmpl $16, %edx
+ jl L_AES_GCM_decrypt_vaes_calc_aad_lt16
+ andl $0xfffffff0, %edx
+L_AES_GCM_decrypt_vaes_calc_aad_16_loop:
+ vmovdqu (%r12,%rcx,1), %xmm7
+ vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7
+ vpxor %xmm7, %xmm15, %xmm15
+ # ghash_gfmul_avx
+ vpshufd $0x4e, %xmm15, %xmm1
+ vpshufd $0x4e, %xmm6, %xmm2
+ vpclmulqdq $0x11, %xmm15, %xmm6, %xmm3
+ vpclmulqdq $0x00, %xmm15, %xmm6, %xmm0
+ vpxor %xmm15, %xmm1, %xmm1
+ vpxor %xmm6, %xmm2, %xmm2
+ vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1
+ vpxor %xmm0, %xmm1, %xmm1
+ vpxor %xmm3, %xmm1, %xmm1
+ vmovdqa %xmm0, %xmm4
+ vmovdqa %xmm3, %xmm15
+ vpslldq $8, %xmm1, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpxor %xmm2, %xmm4, %xmm4
+ vpxor %xmm1, %xmm15, %xmm15
+ vpsrld $31, %xmm4, %xmm0
+ vpsrld $31, %xmm15, %xmm1
+ vpslld $0x01, %xmm4, %xmm4
+ vpslld $0x01, %xmm15, %xmm15
+ vpsrldq $12, %xmm0, %xmm2
+ vpslldq $4, %xmm0, %xmm0
+ vpslldq $4, %xmm1, %xmm1
+ vpor %xmm2, %xmm15, %xmm15
+ vpor %xmm0, %xmm4, %xmm4
+ vpor %xmm1, %xmm15, %xmm15
+ vpslld $31, %xmm4, %xmm0
+ vpslld $30, %xmm4, %xmm1
+ vpslld $25, %xmm4, %xmm2
+ vpxor %xmm1, %xmm0, %xmm0
+ vpxor %xmm2, %xmm0, %xmm0
+ vmovdqa %xmm0, %xmm1
+ vpsrldq $4, %xmm1, %xmm1
+ vpslldq $12, %xmm0, %xmm0
+ vpxor %xmm0, %xmm4, %xmm4
+ vpsrld $0x01, %xmm4, %xmm2
+ vpsrld $2, %xmm4, %xmm3
+ vpsrld $7, %xmm4, %xmm0
+ vpxor %xmm3, %xmm2, %xmm2
+ vpxor %xmm0, %xmm2, %xmm2
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm4, %xmm2, %xmm2
+ vpxor %xmm2, %xmm15, %xmm15
+ addl $16, %ecx
+ cmpl %edx, %ecx
+ jl L_AES_GCM_decrypt_vaes_calc_aad_16_loop
+ movl %r11d, %edx
+ cmpl %edx, %ecx
+ je L_AES_GCM_decrypt_vaes_calc_aad_done
+L_AES_GCM_decrypt_vaes_calc_aad_lt16:
+ subq $16, %rsp
+ vpxor %xmm7, %xmm7, %xmm7
+ xorl %ebx, %ebx
+ vmovdqu %xmm7, (%rsp)
+L_AES_GCM_decrypt_vaes_calc_aad_loop:
+ movzbl (%r12,%rcx,1), %r13d
+ movb %r13b, (%rsp,%rbx,1)
+ incl %ecx
+ incl %ebx
+ cmpl %edx, %ecx
+ jl L_AES_GCM_decrypt_vaes_calc_aad_loop
+ vmovdqu (%rsp), %xmm7
+ addq $16, %rsp
+ vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7
+ vpxor %xmm7, %xmm15, %xmm15
+ # ghash_gfmul_avx
+ vpshufd $0x4e, %xmm15, %xmm1
+ vpshufd $0x4e, %xmm6, %xmm2
+ vpclmulqdq $0x11, %xmm15, %xmm6, %xmm3
+ vpclmulqdq $0x00, %xmm15, %xmm6, %xmm0
+ vpxor %xmm15, %xmm1, %xmm1
+ vpxor %xmm6, %xmm2, %xmm2
+ vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1
+ vpxor %xmm0, %xmm1, %xmm1
+ vpxor %xmm3, %xmm1, %xmm1
+ vmovdqa %xmm0, %xmm4
+ vmovdqa %xmm3, %xmm15
+ vpslldq $8, %xmm1, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpxor %xmm2, %xmm4, %xmm4
+ vpxor %xmm1, %xmm15, %xmm15
+ vpsrld $31, %xmm4, %xmm0
+ vpsrld $31, %xmm15, %xmm1
+ vpslld $0x01, %xmm4, %xmm4
+ vpslld $0x01, %xmm15, %xmm15
+ vpsrldq $12, %xmm0, %xmm2
+ vpslldq $4, %xmm0, %xmm0
+ vpslldq $4, %xmm1, %xmm1
+ vpor %xmm2, %xmm15, %xmm15
+ vpor %xmm0, %xmm4, %xmm4
+ vpor %xmm1, %xmm15, %xmm15
+ vpslld $31, %xmm4, %xmm0
+ vpslld $30, %xmm4, %xmm1
+ vpslld $25, %xmm4, %xmm2
+ vpxor %xmm1, %xmm0, %xmm0
+ vpxor %xmm2, %xmm0, %xmm0
+ vmovdqa %xmm0, %xmm1
+ vpsrldq $4, %xmm1, %xmm1
+ vpslldq $12, %xmm0, %xmm0
+ vpxor %xmm0, %xmm4, %xmm4
+ vpsrld $0x01, %xmm4, %xmm2
+ vpsrld $2, %xmm4, %xmm3
+ vpsrld $7, %xmm4, %xmm0
+ vpxor %xmm3, %xmm2, %xmm2
+ vpxor %xmm0, %xmm2, %xmm2
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm4, %xmm2, %xmm2
+ vpxor %xmm2, %xmm15, %xmm15
+L_AES_GCM_decrypt_vaes_calc_aad_done:
+ # Calculate counter and H
+ vpsrlq $63, %xmm6, %xmm8
+ vpsllq $0x01, %xmm6, %xmm7
+ vpslldq $8, %xmm8, %xmm8
+ vpor %xmm8, %xmm7, %xmm7
+ vpshufd $0xff, %xmm6, %xmm6
+ vpsrad $31, %xmm6, %xmm6
+ vpshufb L_vaes_aes_gcm_bswap_epi64(%rip), %xmm5, %xmm5
+ vpand L_vaes_aes_gcm_mod2_128(%rip), %xmm6, %xmm6
+ vpaddd L_vaes_aes_gcm_one(%rip), %xmm5, %xmm5
+ vpxor %xmm7, %xmm6, %xmm6
+ vmovdqu %xmm5, 512(%rsp)
+ xorl %ebx, %ebx
+ cmpl $0x80, %r9d
+ jl L_AES_GCM_decrypt_vaes_done_128
+ vmovdqa %xmm15, %xmm2
+ # H ^ 1
+ vmovdqu %xmm6, (%rsp)
+ # H ^ 2
+ vpclmulqdq $0x00, %xmm6, %xmm6, %xmm7
+ vpclmulqdq $0x11, %xmm6, %xmm6, %xmm10
+ vpxor %xmm8, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm0
+ vmovdqu %xmm0, 16(%rsp)
+ # H ^ 3
+ # ghash_gfmul_red_avx
+ vpclmulqdq $0x00, %xmm6, %xmm0, %xmm7
+ vpclmulqdq $0x01, %xmm6, %xmm0, %xmm8
+ vpclmulqdq $16, %xmm6, %xmm0, %xmm9
+ vpclmulqdq $0x11, %xmm6, %xmm0, %xmm10
+ vpxor %xmm9, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm1
+ vmovdqu %xmm1, 32(%rsp)
+ # H ^ 4
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10
+ vpxor %xmm8, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm3
+ vmovdqu %xmm3, 48(%rsp)
+ # H ^ 5
+ # ghash_gfmul_red_avx
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7
+ vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $16, %xmm0, %xmm1, %xmm9
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10
+ vpxor %xmm9, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 64(%rsp)
+ # H ^ 6
+ vpclmulqdq $0x00, %xmm1, %xmm1, %xmm7
+ vpclmulqdq $0x11, %xmm1, %xmm1, %xmm10
+ vpxor %xmm8, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 80(%rsp)
+ # H ^ 7
+ # ghash_gfmul_red_avx
+ vpclmulqdq $0x00, %xmm1, %xmm3, %xmm7
+ vpclmulqdq $0x01, %xmm1, %xmm3, %xmm8
+ vpclmulqdq $16, %xmm1, %xmm3, %xmm9
+ vpclmulqdq $0x11, %xmm1, %xmm3, %xmm10
+ vpxor %xmm9, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 96(%rsp)
+ # H ^ 8
+ vpclmulqdq $0x00, %xmm3, %xmm3, %xmm7
+ vpclmulqdq $0x11, %xmm3, %xmm3, %xmm10
+ vpxor %xmm8, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 112(%rsp)
+ cmpl $0x100, %r9d
+ jl L_AES_GCM_decrypt_vaes_no_ext
+ # H ^ 9
+ vmovdqu 48(%rsp), %xmm0
+ vmovdqu 64(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7
+ vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $16, %xmm0, %xmm1, %xmm9
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10
+ vpxor %xmm9, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 128(%rsp)
+ # H ^ 10
+ vmovdqu 64(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10
+ vpxor %xmm8, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 144(%rsp)
+ # H ^ 11
+ vmovdqu 64(%rsp), %xmm0
+ vmovdqu 80(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7
+ vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $16, %xmm0, %xmm1, %xmm9
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10
+ vpxor %xmm9, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 160(%rsp)
+ # H ^ 12
+ vmovdqu 80(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10
+ vpxor %xmm8, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 176(%rsp)
+ # H ^ 13
+ vmovdqu 80(%rsp), %xmm0
+ vmovdqu 96(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7
+ vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $16, %xmm0, %xmm1, %xmm9
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10
+ vpxor %xmm9, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 192(%rsp)
+ # H ^ 14
+ vmovdqu 96(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10
+ vpxor %xmm8, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 208(%rsp)
+ # H ^ 15
+ vmovdqu 96(%rsp), %xmm0
+ vmovdqu 112(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7
+ vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $16, %xmm0, %xmm1, %xmm9
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10
+ vpxor %xmm9, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 224(%rsp)
+ # H ^ 16
+ vmovdqu 112(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10
+ vpxor %xmm8, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 240(%rsp)
+ vmovdqu 224(%rsp), %ymm7
+ vpermq $0x4e, %ymm7, %ymm7
+ vmovdqu 192(%rsp), %ymm8
+ vpermq $0x4e, %ymm8, %ymm8
+ vmovdqu 160(%rsp), %ymm9
+ vpermq $0x4e, %ymm9, %ymm9
+ vmovdqu 128(%rsp), %ymm10
+ vpermq $0x4e, %ymm10, %ymm10
+ vmovdqu %ymm7, 256(%rsp)
+ vmovdqu %ymm8, 288(%rsp)
+ vmovdqu %ymm9, 320(%rsp)
+ vmovdqu %ymm10, 352(%rsp)
+ vmovdqu 96(%rsp), %ymm7
+ vpermq $0x4e, %ymm7, %ymm7
+ vmovdqu 64(%rsp), %ymm8
+ vpermq $0x4e, %ymm8, %ymm8
+ vmovdqu 32(%rsp), %ymm9
+ vpermq $0x4e, %ymm9, %ymm9
+ vmovdqu (%rsp), %ymm10
+ vpermq $0x4e, %ymm10, %ymm10
+ vmovdqu %ymm7, 384(%rsp)
+ vmovdqu %ymm8, 416(%rsp)
+ vmovdqu %ymm9, 448(%rsp)
+ vmovdqu %ymm10, 480(%rsp)
+L_AES_GCM_decrypt_vaes_no_ext:
+ vbroadcasti128 L_vaes_aes_gcm_mod2_128(%rip), %ymm14
+ cmpl $0x100, %r9d
+ jl L_AES_GCM_decrypt_vaes_after_256
+ movl %r9d, %r13d
+ andl $0xffffff00, %r13d
+L_AES_GCM_decrypt_vaes_loop_256:
+ # 256 bytes of input
+ leaq (%rdi,%rbx,1), %rax
+ vbroadcasti128 L_vaes_aes_gcm_bswap_mask(%rip), %ymm6
+ vpxor %ymm4, %ymm4, %ymm4
+ vinserti128 $0x00, %xmm15, %ymm4, %ymm4
+ vmovdqu 256(%rsp), %ymm7
+ vmovdqu 288(%rsp), %ymm8
+ vmovdqu 320(%rsp), %ymm9
+ vmovdqu 352(%rsp), %ymm10
+ vmovdqu (%rax), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpxor %ymm4, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm7, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm7, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm7, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm7, %ymm5, %ymm3
+ vmovdqa %ymm0, %ymm11
+ vpxor %ymm1, %ymm2, %ymm12
+ vmovdqa %ymm3, %ymm13
+ vmovdqu 32(%rax), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm8, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm8, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm8, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm8, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vmovdqu 64(%rax), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm9, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm9, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm9, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm9, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vmovdqu 96(%rax), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm10, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm10, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm10, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm10, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vmovdqu 384(%rsp), %ymm7
+ vmovdqu 416(%rsp), %ymm8
+ vmovdqu 448(%rsp), %ymm9
+ vmovdqu 480(%rsp), %ymm10
+ vmovdqu 128(%rax), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm7, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm7, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm7, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm7, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vmovdqu 160(%rax), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm8, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm8, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm8, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm8, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vmovdqu 192(%rax), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm9, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm9, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm9, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm9, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vmovdqu 224(%rax), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm10, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm10, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm10, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm10, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vpclmulqdq $0x01, %ymm11, %ymm14, %ymm5
+ vpshufd $0x4e, %ymm11, %ymm11
+ vpxor %ymm5, %ymm12, %ymm12
+ vpxor %ymm11, %ymm12, %ymm12
+ vpclmulqdq $0x01, %ymm12, %ymm14, %ymm5
+ vpshufd $0x4e, %ymm12, %ymm12
+ vpxor %ymm5, %ymm13, %ymm13
+ vpxor %ymm12, %ymm13, %ymm13
+ vextracti128 $0x01, %ymm13, %xmm0
+ vpxor %xmm0, %xmm13, %xmm15
+ vbroadcasti128 L_vaes_aes_gcm_bswap_epi64(%rip), %ymm6
+ vbroadcasti128 512(%rsp), %ymm4
+ vpaddd L_vaes_aes_gcm_inc_y0(%rip), %ymm4, %ymm0
+ vpshufb %ymm6, %ymm0, %ymm0
+ vpaddd L_vaes_aes_gcm_inc_y1(%rip), %ymm4, %ymm1
+ vpshufb %ymm6, %ymm1, %ymm1
+ vpaddd L_vaes_aes_gcm_inc_y2(%rip), %ymm4, %ymm2
+ vpshufb %ymm6, %ymm2, %ymm2
+ vpaddd L_vaes_aes_gcm_inc_y3(%rip), %ymm4, %ymm3
+ vpshufb %ymm6, %ymm3, %ymm3
+ vmovdqu 512(%rsp), %xmm7
+ vpaddd L_vaes_aes_gcm_eight(%rip), %xmm7, %xmm7
+ vmovdqu %xmm7, 512(%rsp)
+ vbroadcasti128 (%r15), %ymm4
+ vpxor %ymm4, %ymm0, %ymm0
+ vpxor %ymm4, %ymm1, %ymm1
+ vpxor %ymm4, %ymm2, %ymm2
+ vpxor %ymm4, %ymm3, %ymm3
+ vbroadcasti128 16(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 32(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 48(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 64(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 80(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 96(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 112(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 128(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 144(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ cmpl $11, %r10d
+ vbroadcasti128 160(%r15), %ymm4
+ jl L_AES_GCM_decrypt_vaes_l1_vaes_ctr8_last
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 176(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ cmpl $13, %r10d
+ vbroadcasti128 192(%r15), %ymm4
+ jl L_AES_GCM_decrypt_vaes_l1_vaes_ctr8_last
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 208(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 224(%r15), %ymm4
+L_AES_GCM_decrypt_vaes_l1_vaes_ctr8_last:
+ vaesenclast %ymm4, %ymm0, %ymm0
+ vaesenclast %ymm4, %ymm1, %ymm1
+ vaesenclast %ymm4, %ymm2, %ymm2
+ vaesenclast %ymm4, %ymm3, %ymm3
+ leaq (%rdi,%rbx,1), %rcx
+ leaq (%rsi,%rbx,1), %rdx
+ vmovdqu (%rcx), %ymm5
+ vpxor %ymm5, %ymm0, %ymm0
+ vmovdqu %ymm0, (%rdx)
+ vmovdqu 32(%rcx), %ymm5
+ vpxor %ymm5, %ymm1, %ymm1
+ vmovdqu %ymm1, 32(%rdx)
+ vmovdqu 64(%rcx), %ymm5
+ vpxor %ymm5, %ymm2, %ymm2
+ vmovdqu %ymm2, 64(%rdx)
+ vmovdqu 96(%rcx), %ymm5
+ vpxor %ymm5, %ymm3, %ymm3
+ vmovdqu %ymm3, 96(%rdx)
+ addl $0x80, %ebx
+ vbroadcasti128 512(%rsp), %ymm4
+ vpaddd L_vaes_aes_gcm_inc_y0(%rip), %ymm4, %ymm0
+ vpshufb %ymm6, %ymm0, %ymm0
+ vpaddd L_vaes_aes_gcm_inc_y1(%rip), %ymm4, %ymm1
+ vpshufb %ymm6, %ymm1, %ymm1
+ vpaddd L_vaes_aes_gcm_inc_y2(%rip), %ymm4, %ymm2
+ vpshufb %ymm6, %ymm2, %ymm2
+ vpaddd L_vaes_aes_gcm_inc_y3(%rip), %ymm4, %ymm3
+ vpshufb %ymm6, %ymm3, %ymm3
+ vmovdqu 512(%rsp), %xmm7
+ vpaddd L_vaes_aes_gcm_eight(%rip), %xmm7, %xmm7
+ vmovdqu %xmm7, 512(%rsp)
+ vbroadcasti128 (%r15), %ymm4
+ vpxor %ymm4, %ymm0, %ymm0
+ vpxor %ymm4, %ymm1, %ymm1
+ vpxor %ymm4, %ymm2, %ymm2
+ vpxor %ymm4, %ymm3, %ymm3
+ vbroadcasti128 16(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 32(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 48(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 64(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 80(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 96(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 112(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 128(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 144(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ cmpl $11, %r10d
+ vbroadcasti128 160(%r15), %ymm4
+ jl L_AES_GCM_decrypt_vaes_l2_vaes_ctr8_last
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 176(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ cmpl $13, %r10d
+ vbroadcasti128 192(%r15), %ymm4
+ jl L_AES_GCM_decrypt_vaes_l2_vaes_ctr8_last
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 208(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 224(%r15), %ymm4
+L_AES_GCM_decrypt_vaes_l2_vaes_ctr8_last:
+ vaesenclast %ymm4, %ymm0, %ymm0
+ vaesenclast %ymm4, %ymm1, %ymm1
+ vaesenclast %ymm4, %ymm2, %ymm2
+ vaesenclast %ymm4, %ymm3, %ymm3
+ leaq (%rdi,%rbx,1), %rcx
+ leaq (%rsi,%rbx,1), %rdx
+ vmovdqu (%rcx), %ymm5
+ vpxor %ymm5, %ymm0, %ymm0
+ vmovdqu %ymm0, (%rdx)
+ vmovdqu 32(%rcx), %ymm5
+ vpxor %ymm5, %ymm1, %ymm1
+ vmovdqu %ymm1, 32(%rdx)
+ vmovdqu 64(%rcx), %ymm5
+ vpxor %ymm5, %ymm2, %ymm2
+ vmovdqu %ymm2, 64(%rdx)
+ vmovdqu 96(%rcx), %ymm5
+ vpxor %ymm5, %ymm3, %ymm3
+ vmovdqu %ymm3, 96(%rdx)
+ addl $0x80, %ebx
+ cmpl %r13d, %ebx
+ jl L_AES_GCM_decrypt_vaes_loop_256
+L_AES_GCM_decrypt_vaes_after_256:
+ vmovdqu 96(%rsp), %ymm7
+ vpermq $0x4e, %ymm7, %ymm7
+ vmovdqu 64(%rsp), %ymm8
+ vpermq $0x4e, %ymm8, %ymm8
+ vmovdqu 32(%rsp), %ymm9
+ vpermq $0x4e, %ymm9, %ymm9
+ vmovdqu (%rsp), %ymm10
+ vpermq $0x4e, %ymm10, %ymm10
+ movl %r9d, %r13d
+ andl $0xffffff80, %r13d
+ cmpl %r13d, %ebx
+ jge L_AES_GCM_decrypt_vaes_after_128
+ # 128 bytes of input
+ leaq (%rdi,%rbx,1), %rax
+ vbroadcasti128 L_vaes_aes_gcm_bswap_mask(%rip), %ymm6
+ vpxor %ymm4, %ymm4, %ymm4
+ vinserti128 $0x00, %xmm15, %ymm4, %ymm4
+ vmovdqu (%rax), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpxor %ymm4, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm7, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm7, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm7, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm7, %ymm5, %ymm3
+ vmovdqa %ymm0, %ymm11
+ vpxor %ymm1, %ymm2, %ymm12
+ vmovdqa %ymm3, %ymm13
+ vmovdqu 32(%rax), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm8, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm8, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm8, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm8, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vmovdqu 64(%rax), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm9, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm9, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm9, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm9, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vmovdqu 96(%rax), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm10, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm10, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm10, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm10, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vpclmulqdq $0x01, %ymm11, %ymm14, %ymm5
+ vpshufd $0x4e, %ymm11, %ymm11
+ vpxor %ymm5, %ymm12, %ymm12
+ vpxor %ymm11, %ymm12, %ymm12
+ vpclmulqdq $0x01, %ymm12, %ymm14, %ymm5
+ vpshufd $0x4e, %ymm12, %ymm12
+ vpxor %ymm5, %ymm13, %ymm13
+ vpxor %ymm12, %ymm13, %ymm13
+ vextracti128 $0x01, %ymm13, %xmm0
+ vpxor %xmm0, %xmm13, %xmm15
+ vbroadcasti128 L_vaes_aes_gcm_bswap_epi64(%rip), %ymm6
+ vbroadcasti128 512(%rsp), %ymm4
+ vpaddd L_vaes_aes_gcm_inc_y0(%rip), %ymm4, %ymm0
+ vpshufb %ymm6, %ymm0, %ymm0
+ vpaddd L_vaes_aes_gcm_inc_y1(%rip), %ymm4, %ymm1
+ vpshufb %ymm6, %ymm1, %ymm1
+ vpaddd L_vaes_aes_gcm_inc_y2(%rip), %ymm4, %ymm2
+ vpshufb %ymm6, %ymm2, %ymm2
+ vpaddd L_vaes_aes_gcm_inc_y3(%rip), %ymm4, %ymm3
+ vpshufb %ymm6, %ymm3, %ymm3
+ vmovdqu 512(%rsp), %xmm7
+ vpaddd L_vaes_aes_gcm_eight(%rip), %xmm7, %xmm7
+ vmovdqu %xmm7, 512(%rsp)
+ vbroadcasti128 (%r15), %ymm4
+ vpxor %ymm4, %ymm0, %ymm0
+ vpxor %ymm4, %ymm1, %ymm1
+ vpxor %ymm4, %ymm2, %ymm2
+ vpxor %ymm4, %ymm3, %ymm3
+ vbroadcasti128 16(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 32(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 48(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 64(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 80(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 96(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 112(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 128(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 144(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ cmpl $11, %r10d
+ vbroadcasti128 160(%r15), %ymm4
+ jl L_AES_GCM_decrypt_vaes_t_vaes_ctr8_last
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 176(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ cmpl $13, %r10d
+ vbroadcasti128 192(%r15), %ymm4
+ jl L_AES_GCM_decrypt_vaes_t_vaes_ctr8_last
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 208(%r15), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 224(%r15), %ymm4
+L_AES_GCM_decrypt_vaes_t_vaes_ctr8_last:
+ vaesenclast %ymm4, %ymm0, %ymm0
+ vaesenclast %ymm4, %ymm1, %ymm1
+ vaesenclast %ymm4, %ymm2, %ymm2
+ vaesenclast %ymm4, %ymm3, %ymm3
+ leaq (%rdi,%rbx,1), %rcx
+ leaq (%rsi,%rbx,1), %rdx
+ vmovdqu (%rcx), %ymm5
+ vpxor %ymm5, %ymm0, %ymm0
+ vmovdqu %ymm0, (%rdx)
+ vmovdqu 32(%rcx), %ymm5
+ vpxor %ymm5, %ymm1, %ymm1
+ vmovdqu %ymm1, 32(%rdx)
+ vmovdqu 64(%rcx), %ymm5
+ vpxor %ymm5, %ymm2, %ymm2
+ vmovdqu %ymm2, 64(%rdx)
+ vmovdqu 96(%rcx), %ymm5
+ vpxor %ymm5, %ymm3, %ymm3
+ vmovdqu %ymm3, 96(%rdx)
+ addl $0x80, %ebx
+L_AES_GCM_decrypt_vaes_after_128:
+ vmovdqu (%rsp), %xmm6
+L_AES_GCM_decrypt_vaes_done_128:
+ movl %r9d, %edx
+ cmpl %edx, %ebx
+ jge L_AES_GCM_decrypt_vaes_done_dec
+ movl %r9d, %r13d
+ andl $0xfffffff0, %r13d
+ cmpl %r13d, %ebx
+ jge L_AES_GCM_decrypt_vaes_last_block_done
+L_AES_GCM_decrypt_vaes_last_block_start:
+ vmovdqu (%rdi,%rbx,1), %xmm12
+ vmovdqa %xmm6, %xmm0
+ vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm12, %xmm1
+ vpxor %xmm15, %xmm1, %xmm1
+ vmovdqu 512(%rsp), %xmm8
+ vpshufb L_vaes_aes_gcm_bswap_epi64(%rip), %xmm8, %xmm7
+ vpaddd L_vaes_aes_gcm_one(%rip), %xmm8, %xmm8
+ vmovdqu %xmm8, 512(%rsp)
+ vpxor (%r15), %xmm7, %xmm7
+ vpclmulqdq $16, %xmm0, %xmm1, %xmm9
+ vaesenc 16(%r15), %xmm7, %xmm7
+ vaesenc 32(%r15), %xmm7, %xmm7
+ vpclmulqdq $0x01, %xmm0, %xmm1, %xmm10
+ vaesenc 48(%r15), %xmm7, %xmm7
+ vaesenc 64(%r15), %xmm7, %xmm7
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm11
+ vaesenc 80(%r15), %xmm7, %xmm7
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm1
+ vaesenc 96(%r15), %xmm7, %xmm7
+ vpxor %xmm10, %xmm9, %xmm9
+ vpslldq $8, %xmm9, %xmm2
+ vpsrldq $8, %xmm9, %xmm9
+ vaesenc 112(%r15), %xmm7, %xmm7
+ vpxor %xmm11, %xmm2, %xmm2
+ vpxor %xmm9, %xmm1, %xmm3
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm0
+ vpclmulqdq $16, %xmm0, %xmm2, %xmm10
+ vaesenc 128(%r15), %xmm7, %xmm7
+ vpshufd $0x4e, %xmm2, %xmm9
+ vpxor %xmm10, %xmm9, %xmm9
+ vpclmulqdq $16, %xmm0, %xmm9, %xmm10
+ vaesenc 144(%r15), %xmm7, %xmm7
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpxor %xmm10, %xmm9, %xmm9
+ vpxor %xmm3, %xmm9, %xmm15
+ cmpl $11, %r10d
+ vmovdqa 160(%r15), %xmm8
+ jl L_AES_GCM_decrypt_vaes_aesenc_gfmul_last
+ vaesenc %xmm8, %xmm7, %xmm7
+ vaesenc 176(%r15), %xmm7, %xmm7
+ cmpl $13, %r10d
+ vmovdqa 192(%r15), %xmm8
+ jl L_AES_GCM_decrypt_vaes_aesenc_gfmul_last
+ vaesenc %xmm8, %xmm7, %xmm7
+ vaesenc 208(%r15), %xmm7, %xmm7
+ vmovdqa 224(%r15), %xmm8
+L_AES_GCM_decrypt_vaes_aesenc_gfmul_last:
+ vaesenclast %xmm8, %xmm7, %xmm7
+ vmovdqa %xmm12, %xmm0
+ vpxor %xmm0, %xmm7, %xmm7
+ vmovdqu %xmm7, (%rsi,%rbx,1)
+ addl $16, %ebx
+ cmpl %r13d, %ebx
+ jl L_AES_GCM_decrypt_vaes_last_block_start
+L_AES_GCM_decrypt_vaes_last_block_done:
+ movl %r9d, %ecx
+ movl %ecx, %edx
+ andl $15, %ecx
+ jz L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_done
+ vmovdqu 512(%rsp), %xmm5
+ vpshufb L_vaes_aes_gcm_bswap_epi64(%rip), %xmm5, %xmm5
+ vpxor (%r15), %xmm5, %xmm5
+ vaesenc 16(%r15), %xmm5, %xmm5
+ vaesenc 32(%r15), %xmm5, %xmm5
+ vaesenc 48(%r15), %xmm5, %xmm5
+ vaesenc 64(%r15), %xmm5, %xmm5
+ vaesenc 80(%r15), %xmm5, %xmm5
+ vaesenc 96(%r15), %xmm5, %xmm5
+ vaesenc 112(%r15), %xmm5, %xmm5
+ vaesenc 128(%r15), %xmm5, %xmm5
+ vaesenc 144(%r15), %xmm5, %xmm5
+ cmpl $11, %r10d
+ vmovdqa 160(%r15), %xmm8
+ jl L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_aesenc_avx_last
+ vaesenc %xmm8, %xmm5, %xmm5
+ vaesenc 176(%r15), %xmm5, %xmm5
+ cmpl $13, %r10d
+ vmovdqa 192(%r15), %xmm8
+ jl L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_aesenc_avx_last
+ vaesenc %xmm8, %xmm5, %xmm5
+ vaesenc 208(%r15), %xmm5, %xmm5
+ vmovdqa 224(%r15), %xmm8
+L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_aesenc_avx_last:
+ vaesenclast %xmm8, %xmm5, %xmm5
+ subq $32, %rsp
+ xorl %ecx, %ecx
+ vmovdqu %xmm5, (%rsp)
+ vpxor %xmm0, %xmm0, %xmm0
+ vmovdqu %xmm0, 16(%rsp)
+L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_loop:
+ movzbl (%rdi,%rbx,1), %r13d
+ movb %r13b, 16(%rsp,%rcx,1)
+ xorb (%rsp,%rcx,1), %r13b
+ movb %r13b, (%rsi,%rbx,1)
+ incl %ebx
+ incl %ecx
+ cmpl %edx, %ebx
+ jl L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_loop
+ vmovdqu 16(%rsp), %xmm5
+ addq $32, %rsp
+ vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5
+ vpxor %xmm5, %xmm15, %xmm15
+ # ghash_gfmul_red_avx
+ vpclmulqdq $0x00, %xmm6, %xmm15, %xmm7
+ vpclmulqdq $0x01, %xmm6, %xmm15, %xmm8
+ vpclmulqdq $16, %xmm6, %xmm15, %xmm9
+ vpclmulqdq $0x11, %xmm6, %xmm15, %xmm10
+ vpxor %xmm9, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm15
+L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_done:
+L_AES_GCM_decrypt_vaes_done_dec:
+ movl %r9d, %edx
+ movl %r11d, %ecx
+ shlq $3, %rdx
+ shlq $3, %rcx
+ vmovq %rdx, %xmm0
+ vmovq %rcx, %xmm1
+ vpunpcklqdq %xmm1, %xmm0, %xmm0
+ vpxor %xmm0, %xmm15, %xmm15
+ # ghash_gfmul_red_avx
+ vpclmulqdq $0x00, %xmm6, %xmm15, %xmm7
+ vpclmulqdq $0x01, %xmm6, %xmm15, %xmm8
+ vpclmulqdq $16, %xmm6, %xmm15, %xmm9
+ vpclmulqdq $0x11, %xmm6, %xmm15, %xmm10
+ vpxor %xmm9, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm15
+ vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm15, %xmm15
+ vmovdqu 528(%rsp), %xmm0
+ vpxor %xmm15, %xmm0, %xmm0
+ cmpl $16, %r14d
+ je L_AES_GCM_decrypt_vaes_cmp_tag_16
+ subq $16, %rsp
+ xorq %rcx, %rcx
+ xorq %rbx, %rbx
+ vmovdqu %xmm0, (%rsp)
+L_AES_GCM_decrypt_vaes_cmp_tag_loop:
+ movzbl (%rsp,%rcx,1), %r13d
+ xorb (%r8,%rcx,1), %r13b
+ orb %r13b, %bl
+ incl %ecx
+ cmpl %r14d, %ecx
+ jne L_AES_GCM_decrypt_vaes_cmp_tag_loop
+ cmpb $0x00, %bl
+ sete %bl
+ addq $16, %rsp
+ xorq %rcx, %rcx
+ jmp L_AES_GCM_decrypt_vaes_cmp_tag_done
+L_AES_GCM_decrypt_vaes_cmp_tag_16:
+ vmovdqu (%r8), %xmm1
+ vpcmpeqb %xmm1, %xmm0, %xmm0
+ vpmovmskb %xmm0, %rdx
+ # %%edx == 0xFFFF then return 1 else => return 0
+ xorl %ebx, %ebx
+ cmpl $0xffff, %edx
+ sete %bl
+L_AES_GCM_decrypt_vaes_cmp_tag_done:
+ movl %ebx, (%rbp)
+ vzeroupper
+ addq $0x220, %rsp
+ popq %rbp
+ popq %r15
+ popq %r14
+ popq %rbx
+ popq %r12
+ popq %r13
+ repz retq
+#ifndef __APPLE__
+.size AES_GCM_decrypt_vaes,.-AES_GCM_decrypt_vaes
+#endif /* __APPLE__ */
+#ifdef WOLFSSL_AESGCM_STREAM
+/*
+ * AES_GCM_init_vaes: compute the GHASH key H, the starting counter block and
+ * the encrypted initial counter block from an expanded AES key and an IV.
+ *
+ * Arguments as read by the code below (the registers match the System V
+ * AMD64 integer argument order):
+ *   %rdi      - expanded round keys, read with aligned 16-byte loads at
+ *               offsets 0, 16, ..., 224
+ *   %esi      - round count; compared with 11 and 13 so that the last round
+ *               key used is the one at offset 160, 192 or 224
+ *   %rdx      - IV bytes (kept in %r10)
+ *   %ecx      - IV length in bytes (kept in %r11d)
+ *   %r8       - output: H after the bswap_mask shuffle (aligned store)
+ *   %r9       - output: counter block after the bswap_epi64 shuffle and the
+ *               addition of L_vaes_aes_gcm_one (aligned store)
+ *   stack arg - output: encrypted initial counter block (aligned store)
+ *
+ * A 12-byte IV takes the short path (counter = IV || 00 00 00 01); any other
+ * length is run through the GHASH multiply to derive the counter.
+ * The L_vaes_aes_gcm_* constants are defined outside this part of the file.
+ */
+#ifndef __APPLE__
+.text
+.globl AES_GCM_init_vaes
+.type AES_GCM_init_vaes,@function
+.align 16
+AES_GCM_init_vaes:
+#else
+.section __TEXT,__text
+.globl _AES_GCM_init_vaes
+.p2align 4
+_AES_GCM_init_vaes:
+#endif /* __APPLE__ */
+ pushq %r12
+ pushq %r13
+ # Keep the IV pointer and IV length in r10 and r11d; rdx and ecx are reused below
+ movq %rdx, %r10
+ movl %ecx, %r11d
+ # First stack argument: two pushes plus the return address put it at 24(%rsp)
+ movq 24(%rsp), %rax
+ # NOTE(review): these 16 bytes are never referenced in this function; the
+ # short-tail path below reserves its own buffer. Looks like unused scratch - confirm.
+ subq $16, %rsp
+ # Zero xmm4; it is the GHASH accumulator on the non-12-byte path
+ vpxor %xmm4, %xmm4, %xmm4
+ movl %r11d, %edx
+ cmpl $12, %edx
+ jne L_AES_GCM_init_vaes_iv_not_12
+ # Calculate values when IV is 12 bytes
+ # Set counter based on IV
+ # Dword 3 becomes 0x01000000, that is bytes 00 00 00 01 in memory order
+ movl $0x1000000, %ecx
+ vmovq (%r10), %xmm4
+ vpinsrd $2, 8(%r10), %xmm4, %xmm4
+ vpinsrd $3, %ecx, %xmm4, %xmm4
+ # H = Encrypt X(=0) and T = Encrypt counter
+ # Round key 0 alone is the zero block after the initial XOR (xmm5);
+ # xmm1 is the counter block after the initial XOR. Both are encrypted together.
+ vmovdqa (%rdi), %xmm5
+ vpxor %xmm5, %xmm4, %xmm1
+ vmovdqa 16(%rdi), %xmm6
+ vaesenc %xmm6, %xmm5, %xmm5
+ vaesenc %xmm6, %xmm1, %xmm1
+ vmovdqa 32(%rdi), %xmm6
+ vaesenc %xmm6, %xmm5, %xmm5
+ vaesenc %xmm6, %xmm1, %xmm1
+ vmovdqa 48(%rdi), %xmm6
+ vaesenc %xmm6, %xmm5, %xmm5
+ vaesenc %xmm6, %xmm1, %xmm1
+ vmovdqa 64(%rdi), %xmm6
+ vaesenc %xmm6, %xmm5, %xmm5
+ vaesenc %xmm6, %xmm1, %xmm1
+ vmovdqa 80(%rdi), %xmm6
+ vaesenc %xmm6, %xmm5, %xmm5
+ vaesenc %xmm6, %xmm1, %xmm1
+ vmovdqa 96(%rdi), %xmm6
+ vaesenc %xmm6, %xmm5, %xmm5
+ vaesenc %xmm6, %xmm1, %xmm1
+ vmovdqa 112(%rdi), %xmm6
+ vaesenc %xmm6, %xmm5, %xmm5
+ vaesenc %xmm6, %xmm1, %xmm1
+ vmovdqa 128(%rdi), %xmm6
+ vaesenc %xmm6, %xmm5, %xmm5
+ vaesenc %xmm6, %xmm1, %xmm1
+ vmovdqa 144(%rdi), %xmm6
+ vaesenc %xmm6, %xmm5, %xmm5
+ vaesenc %xmm6, %xmm1, %xmm1
+ # The load between cmpl and jl does not change flags; xmm6 always holds
+ # the key for the final round when the branch to the last label is taken
+ cmpl $11, %esi
+ vmovdqa 160(%rdi), %xmm6
+ jl L_AES_GCM_init_vaes_calc_iv_12_last
+ vaesenc %xmm6, %xmm5, %xmm5
+ vaesenc %xmm6, %xmm1, %xmm1
+ vmovdqa 176(%rdi), %xmm6
+ vaesenc %xmm6, %xmm5, %xmm5
+ vaesenc %xmm6, %xmm1, %xmm1
+ cmpl $13, %esi
+ vmovdqa 192(%rdi), %xmm6
+ jl L_AES_GCM_init_vaes_calc_iv_12_last
+ vaesenc %xmm6, %xmm5, %xmm5
+ vaesenc %xmm6, %xmm1, %xmm1
+ vmovdqa 208(%rdi), %xmm6
+ vaesenc %xmm6, %xmm5, %xmm5
+ vaesenc %xmm6, %xmm1, %xmm1
+ vmovdqa 224(%rdi), %xmm6
+L_AES_GCM_init_vaes_calc_iv_12_last:
+ vaesenclast %xmm6, %xmm5, %xmm5
+ vaesenclast %xmm6, %xmm1, %xmm1
+ # H is kept in shuffled form; xmm15 carries the encrypted counter to the common exit
+ vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5
+ vmovdqu %xmm1, %xmm15
+ jmp L_AES_GCM_init_vaes_iv_done
+L_AES_GCM_init_vaes_iv_not_12:
+ # Calculate values when IV is not 12 bytes
+ # H = Encrypt X(=0)
+ vmovdqa (%rdi), %xmm5
+ vaesenc 16(%rdi), %xmm5, %xmm5
+ vaesenc 32(%rdi), %xmm5, %xmm5
+ vaesenc 48(%rdi), %xmm5, %xmm5
+ vaesenc 64(%rdi), %xmm5, %xmm5
+ vaesenc 80(%rdi), %xmm5, %xmm5
+ vaesenc 96(%rdi), %xmm5, %xmm5
+ vaesenc 112(%rdi), %xmm5, %xmm5
+ vaesenc 128(%rdi), %xmm5, %xmm5
+ vaesenc 144(%rdi), %xmm5, %xmm5
+ cmpl $11, %esi
+ vmovdqa 160(%rdi), %xmm8
+ jl L_AES_GCM_init_vaes_calc_iv_1_aesenc_avx_last
+ vaesenc %xmm8, %xmm5, %xmm5
+ vaesenc 176(%rdi), %xmm5, %xmm5
+ cmpl $13, %esi
+ vmovdqa 192(%rdi), %xmm8
+ jl L_AES_GCM_init_vaes_calc_iv_1_aesenc_avx_last
+ vaesenc %xmm8, %xmm5, %xmm5
+ vaesenc 208(%rdi), %xmm5, %xmm5
+ vmovdqa 224(%rdi), %xmm8
+L_AES_GCM_init_vaes_calc_iv_1_aesenc_avx_last:
+ vaesenclast %xmm8, %xmm5, %xmm5
+ vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5
+ # Calc counter
+ # Initialization vector
+ # movq does not modify flags, so the je below still tests the cmpl result
+ cmpl $0x00, %edx
+ movq $0x00, %rcx
+ je L_AES_GCM_init_vaes_calc_iv_done
+ cmpl $16, %edx
+ jl L_AES_GCM_init_vaes_calc_iv_lt16
+ # Round the length down to a multiple of 16 for the whole-block loop
+ andl $0xfffffff0, %edx
+L_AES_GCM_init_vaes_calc_iv_16_loop:
+ vmovdqu (%r10,%rcx,1), %xmm7
+ vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7
+ vpxor %xmm7, %xmm4, %xmm4
+ # ghash_gfmul_avx
+ # xmm4 = xmm4 * xmm5 (accumulator times H), in four steps:
+ # 1. Three carry-less multiplies: high halves, low halves, and the XOR of
+ #    the halves of each operand; the middle term is recovered by XOR.
+ vpshufd $0x4e, %xmm4, %xmm1
+ vpshufd $0x4e, %xmm5, %xmm2
+ vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3
+ vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0
+ vpxor %xmm4, %xmm1, %xmm1
+ vpxor %xmm5, %xmm2, %xmm2
+ vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1
+ vpxor %xmm0, %xmm1, %xmm1
+ vpxor %xmm3, %xmm1, %xmm1
+ # 2. Fold the middle term into the low (xmm6) and high (xmm4) 128 bits
+ vmovdqa %xmm0, %xmm6
+ vmovdqa %xmm3, %xmm4
+ vpslldq $8, %xmm1, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpxor %xmm2, %xmm6, %xmm6
+ vpxor %xmm1, %xmm4, %xmm4
+ # 3. Shift the 256-bit product xmm4:xmm6 left by one bit, carrying
+ #    between dwords and from the low half into the high half
+ vpsrld $31, %xmm6, %xmm0
+ vpsrld $31, %xmm4, %xmm1
+ vpslld $0x01, %xmm6, %xmm6
+ vpslld $0x01, %xmm4, %xmm4
+ vpsrldq $12, %xmm0, %xmm2
+ vpslldq $4, %xmm0, %xmm0
+ vpslldq $4, %xmm1, %xmm1
+ vpor %xmm2, %xmm4, %xmm4
+ vpor %xmm0, %xmm6, %xmm6
+ vpor %xmm1, %xmm4, %xmm4
+ # 4. Shift-and-XOR reduction of the low half into the high half
+ #    (shifts by 31/30/25, then 1/2/7; presumably the GCM polynomial)
+ vpslld $31, %xmm6, %xmm0
+ vpslld $30, %xmm6, %xmm1
+ vpslld $25, %xmm6, %xmm2
+ vpxor %xmm1, %xmm0, %xmm0
+ vpxor %xmm2, %xmm0, %xmm0
+ vmovdqa %xmm0, %xmm1
+ vpsrldq $4, %xmm1, %xmm1
+ vpslldq $12, %xmm0, %xmm0
+ vpxor %xmm0, %xmm6, %xmm6
+ vpsrld $0x01, %xmm6, %xmm2
+ vpsrld $2, %xmm6, %xmm3
+ vpsrld $7, %xmm6, %xmm0
+ vpxor %xmm3, %xmm2, %xmm2
+ vpxor %xmm0, %xmm2, %xmm2
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm6, %xmm2, %xmm2
+ vpxor %xmm2, %xmm4, %xmm4
+ addl $16, %ecx
+ cmpl %edx, %ecx
+ jl L_AES_GCM_init_vaes_calc_iv_16_loop
+ # Restore the full IV length; done when no partial block remains
+ movl %r11d, %edx
+ cmpl %edx, %ecx
+ je L_AES_GCM_init_vaes_calc_iv_done
+L_AES_GCM_init_vaes_calc_iv_lt16:
+ # Copy the remaining IV bytes into a zeroed 16-byte stack buffer so the
+ # partial block is zero padded
+ subq $16, %rsp
+ vpxor %xmm7, %xmm7, %xmm7
+ xorl %r13d, %r13d
+ vmovdqu %xmm7, (%rsp)
+L_AES_GCM_init_vaes_calc_iv_loop:
+ movzbl (%r10,%rcx,1), %r12d
+ movb %r12b, (%rsp,%r13,1)
+ incl %ecx
+ incl %r13d
+ cmpl %edx, %ecx
+ jl L_AES_GCM_init_vaes_calc_iv_loop
+ vmovdqu (%rsp), %xmm7
+ addq $16, %rsp
+ vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7
+ vpxor %xmm7, %xmm4, %xmm4
+ # ghash_gfmul_avx
+ # Same multiply, one-bit shift and reduction sequence as in the 16-byte loop
+ vpshufd $0x4e, %xmm4, %xmm1
+ vpshufd $0x4e, %xmm5, %xmm2
+ vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3
+ vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0
+ vpxor %xmm4, %xmm1, %xmm1
+ vpxor %xmm5, %xmm2, %xmm2
+ vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1
+ vpxor %xmm0, %xmm1, %xmm1
+ vpxor %xmm3, %xmm1, %xmm1
+ vmovdqa %xmm0, %xmm6
+ vmovdqa %xmm3, %xmm4
+ vpslldq $8, %xmm1, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpxor %xmm2, %xmm6, %xmm6
+ vpxor %xmm1, %xmm4, %xmm4
+ vpsrld $31, %xmm6, %xmm0
+ vpsrld $31, %xmm4, %xmm1
+ vpslld $0x01, %xmm6, %xmm6
+ vpslld $0x01, %xmm4, %xmm4
+ vpsrldq $12, %xmm0, %xmm2
+ vpslldq $4, %xmm0, %xmm0
+ vpslldq $4, %xmm1, %xmm1
+ vpor %xmm2, %xmm4, %xmm4
+ vpor %xmm0, %xmm6, %xmm6
+ vpor %xmm1, %xmm4, %xmm4
+ vpslld $31, %xmm6, %xmm0
+ vpslld $30, %xmm6, %xmm1
+ vpslld $25, %xmm6, %xmm2
+ vpxor %xmm1, %xmm0, %xmm0
+ vpxor %xmm2, %xmm0, %xmm0
+ vmovdqa %xmm0, %xmm1
+ vpsrldq $4, %xmm1, %xmm1
+ vpslldq $12, %xmm0, %xmm0
+ vpxor %xmm0, %xmm6, %xmm6
+ vpsrld $0x01, %xmm6, %xmm2
+ vpsrld $2, %xmm6, %xmm3
+ vpsrld $7, %xmm6, %xmm0
+ vpxor %xmm3, %xmm2, %xmm2
+ vpxor %xmm0, %xmm2, %xmm2
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm6, %xmm2, %xmm2
+ vpxor %xmm2, %xmm4, %xmm4
+L_AES_GCM_init_vaes_calc_iv_done:
+ # T = Encrypt counter
+ # Fold the IV length in bits (byte length shifted left by 3) into the low
+ # qword of the accumulator, then do one more multiply by H
+ vpxor %xmm0, %xmm0, %xmm0
+ shll $3, %edx
+ vmovq %rdx, %xmm0
+ vpxor %xmm0, %xmm4, %xmm4
+ # ghash_gfmul_avx
+ # Same multiply, one-bit shift and reduction sequence as in the 16-byte loop
+ vpshufd $0x4e, %xmm4, %xmm1
+ vpshufd $0x4e, %xmm5, %xmm2
+ vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3
+ vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0
+ vpxor %xmm4, %xmm1, %xmm1
+ vpxor %xmm5, %xmm2, %xmm2
+ vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1
+ vpxor %xmm0, %xmm1, %xmm1
+ vpxor %xmm3, %xmm1, %xmm1
+ vmovdqa %xmm0, %xmm6
+ vmovdqa %xmm3, %xmm4
+ vpslldq $8, %xmm1, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpxor %xmm2, %xmm6, %xmm6
+ vpxor %xmm1, %xmm4, %xmm4
+ vpsrld $31, %xmm6, %xmm0
+ vpsrld $31, %xmm4, %xmm1
+ vpslld $0x01, %xmm6, %xmm6
+ vpslld $0x01, %xmm4, %xmm4
+ vpsrldq $12, %xmm0, %xmm2
+ vpslldq $4, %xmm0, %xmm0
+ vpslldq $4, %xmm1, %xmm1
+ vpor %xmm2, %xmm4, %xmm4
+ vpor %xmm0, %xmm6, %xmm6
+ vpor %xmm1, %xmm4, %xmm4
+ vpslld $31, %xmm6, %xmm0
+ vpslld $30, %xmm6, %xmm1
+ vpslld $25, %xmm6, %xmm2
+ vpxor %xmm1, %xmm0, %xmm0
+ vpxor %xmm2, %xmm0, %xmm0
+ vmovdqa %xmm0, %xmm1
+ vpsrldq $4, %xmm1, %xmm1
+ vpslldq $12, %xmm0, %xmm0
+ vpxor %xmm0, %xmm6, %xmm6
+ vpsrld $0x01, %xmm6, %xmm2
+ vpsrld $2, %xmm6, %xmm3
+ vpsrld $7, %xmm6, %xmm0
+ vpxor %xmm3, %xmm2, %xmm2
+ vpxor %xmm0, %xmm2, %xmm2
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm6, %xmm2, %xmm2
+ vpxor %xmm2, %xmm4, %xmm4
+ # Shuffle the hash result back; xmm4 is now the counter block
+ vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4
+ # Encrypt counter
+ vmovdqa (%rdi), %xmm7
+ vpxor %xmm4, %xmm7, %xmm7
+ vaesenc 16(%rdi), %xmm7, %xmm7
+ vaesenc 32(%rdi), %xmm7, %xmm7
+ vaesenc 48(%rdi), %xmm7, %xmm7
+ vaesenc 64(%rdi), %xmm7, %xmm7
+ vaesenc 80(%rdi), %xmm7, %xmm7
+ vaesenc 96(%rdi), %xmm7, %xmm7
+ vaesenc 112(%rdi), %xmm7, %xmm7
+ vaesenc 128(%rdi), %xmm7, %xmm7
+ vaesenc 144(%rdi), %xmm7, %xmm7
+ cmpl $11, %esi
+ vmovdqa 160(%rdi), %xmm8
+ jl L_AES_GCM_init_vaes_calc_iv_2_aesenc_avx_last
+ vaesenc %xmm8, %xmm7, %xmm7
+ vaesenc 176(%rdi), %xmm7, %xmm7
+ cmpl $13, %esi
+ vmovdqa 192(%rdi), %xmm8
+ jl L_AES_GCM_init_vaes_calc_iv_2_aesenc_avx_last
+ vaesenc %xmm8, %xmm7, %xmm7
+ vaesenc 208(%rdi), %xmm7, %xmm7
+ vmovdqa 224(%rdi), %xmm8
+L_AES_GCM_init_vaes_calc_iv_2_aesenc_avx_last:
+ vaesenclast %xmm8, %xmm7, %xmm7
+ vmovdqu %xmm7, %xmm15
+L_AES_GCM_init_vaes_iv_done:
+ # Common exit for both IV paths: xmm15 = encrypted counter, xmm5 = H,
+ # xmm4 = counter block. All three stores need 16-byte aligned destinations.
+ vmovdqa %xmm15, (%rax)
+ # Shuffle the counter with bswap_epi64 and add L_vaes_aes_gcm_one before storing
+ vpshufb L_vaes_aes_gcm_bswap_epi64(%rip), %xmm4, %xmm4
+ vpaddd L_vaes_aes_gcm_one(%rip), %xmm4, %xmm4
+ vmovdqa %xmm5, (%r8)
+ vmovdqa %xmm4, (%r9)
+ addq $16, %rsp
+ popq %r13
+ popq %r12
+ repz retq
+#ifndef __APPLE__
+.size AES_GCM_init_vaes,.-AES_GCM_init_vaes
+#endif /* __APPLE__ */
+/*
+ * AES_GCM_aad_update_vaes: fold a run of 16-byte blocks into a running
+ * GHASH value.
+ *
+ * Arguments as read by the code below:
+ *   %rdi - input bytes, read 16 at a time with unaligned loads
+ *   %esi - input length in bytes
+ *   %rdx - 16-byte running hash value, loaded and stored with aligned moves
+ *   %rcx - 16-byte hash key H (copied to %rax), aligned load
+ *
+ * Only xmm registers are used, so there is no vzeroupper before returning.
+ */
+#ifndef __APPLE__
+.text
+.globl AES_GCM_aad_update_vaes
+.type AES_GCM_aad_update_vaes,@function
+.align 16
+AES_GCM_aad_update_vaes:
+#else
+.section __TEXT,__text
+.globl _AES_GCM_aad_update_vaes
+.p2align 4
+_AES_GCM_aad_update_vaes:
+#endif /* __APPLE__ */
+ # Free rcx for use as the byte offset; xmm5 = running hash, xmm6 = H
+ movq %rcx, %rax
+ vmovdqa (%rdx), %xmm5
+ vmovdqa (%rax), %xmm6
+ xorl %ecx, %ecx
+ # NOTE(review): the length is tested only after a block has been processed,
+ # so a length of 0 still reads and hashes 16 bytes, and a length that is not
+ # a multiple of 16 reads past the end. Presumably callers pass a positive
+ # multiple of 16 - confirm.
+L_AES_GCM_aad_update_vaes_16_loop:
+ vmovdqu (%rdi,%rcx,1), %xmm7
+ vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7
+ vpxor %xmm7, %xmm5, %xmm5
+ # ghash_gfmul_avx
+ # xmm5 = xmm5 * xmm6. Three carry-less multiplies (high halves, low halves,
+ # XOR of halves) give the 256-bit product in xmm5:xmm4.
+ vpshufd $0x4e, %xmm5, %xmm1
+ vpshufd $0x4e, %xmm6, %xmm2
+ vpclmulqdq $0x11, %xmm5, %xmm6, %xmm3
+ vpclmulqdq $0x00, %xmm5, %xmm6, %xmm0
+ vpxor %xmm5, %xmm1, %xmm1
+ vpxor %xmm6, %xmm2, %xmm2
+ vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1
+ vpxor %xmm0, %xmm1, %xmm1
+ vpxor %xmm3, %xmm1, %xmm1
+ vmovdqa %xmm0, %xmm4
+ vmovdqa %xmm3, %xmm5
+ vpslldq $8, %xmm1, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpxor %xmm2, %xmm4, %xmm4
+ vpxor %xmm1, %xmm5, %xmm5
+ # Shift the 256-bit product left by one bit, carrying across dwords and
+ # from the low half (xmm4) into the high half (xmm5)
+ vpsrld $31, %xmm4, %xmm0
+ vpsrld $31, %xmm5, %xmm1
+ vpslld $0x01, %xmm4, %xmm4
+ vpslld $0x01, %xmm5, %xmm5
+ vpsrldq $12, %xmm0, %xmm2
+ vpslldq $4, %xmm0, %xmm0
+ vpslldq $4, %xmm1, %xmm1
+ vpor %xmm2, %xmm5, %xmm5
+ vpor %xmm0, %xmm4, %xmm4
+ vpor %xmm1, %xmm5, %xmm5
+ # Shift-and-XOR reduction of the low half into the high half
+ # (shifts by 31/30/25, then 1/2/7; presumably the GCM polynomial)
+ vpslld $31, %xmm4, %xmm0
+ vpslld $30, %xmm4, %xmm1
+ vpslld $25, %xmm4, %xmm2
+ vpxor %xmm1, %xmm0, %xmm0
+ vpxor %xmm2, %xmm0, %xmm0
+ vmovdqa %xmm0, %xmm1
+ vpsrldq $4, %xmm1, %xmm1
+ vpslldq $12, %xmm0, %xmm0
+ vpxor %xmm0, %xmm4, %xmm4
+ vpsrld $0x01, %xmm4, %xmm2
+ vpsrld $2, %xmm4, %xmm3
+ vpsrld $7, %xmm4, %xmm0
+ vpxor %xmm3, %xmm2, %xmm2
+ vpxor %xmm0, %xmm2, %xmm2
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm4, %xmm2, %xmm2
+ vpxor %xmm2, %xmm5, %xmm5
+ addl $16, %ecx
+ cmpl %esi, %ecx
+ jl L_AES_GCM_aad_update_vaes_16_loop
+ # Write the updated running hash back to the caller
+ vmovdqa %xmm5, (%rdx)
+ repz retq
+#ifndef __APPLE__
+.size AES_GCM_aad_update_vaes,.-AES_GCM_aad_update_vaes
+#endif /* __APPLE__ */
+/*
+ * AES_GCM_encrypt_block_vaes: encrypt one 16-byte block in counter mode and
+ * advance the stored counter.
+ *
+ * Arguments as read by the code below:
+ *   %rdi - expanded round keys (aligned 16-byte reads at offsets 0..224)
+ *   %esi - round count; compared with 11 and 13 to pick the final round key
+ *   %rdx - output block (kept in %r10), unaligned store
+ *   %rcx - input block (kept in %r11), unaligned load
+ *   %r8  - 16-byte counter, read and written back with unaligned moves
+ */
+#ifndef __APPLE__
+.text
+.globl AES_GCM_encrypt_block_vaes
+.type AES_GCM_encrypt_block_vaes,@function
+.align 16
+AES_GCM_encrypt_block_vaes:
+#else
+.section __TEXT,__text
+.globl _AES_GCM_encrypt_block_vaes
+.p2align 4
+_AES_GCM_encrypt_block_vaes:
+#endif /* __APPLE__ */
+ movq %rdx, %r10
+ movq %rcx, %r11
+ # xmm0 = current counter shuffled with bswap_epi64 for encryption;
+ # the stored counter is advanced by L_vaes_aes_gcm_one for the next call
+ vmovdqu (%r8), %xmm1
+ vpshufb L_vaes_aes_gcm_bswap_epi64(%rip), %xmm1, %xmm0
+ vpaddd L_vaes_aes_gcm_one(%rip), %xmm1, %xmm1
+ vmovdqu %xmm1, (%r8)
+ # AES rounds on the counter block
+ vpxor (%rdi), %xmm0, %xmm0
+ vaesenc 16(%rdi), %xmm0, %xmm0
+ vaesenc 32(%rdi), %xmm0, %xmm0
+ vaesenc 48(%rdi), %xmm0, %xmm0
+ vaesenc 64(%rdi), %xmm0, %xmm0
+ vaesenc 80(%rdi), %xmm0, %xmm0
+ vaesenc 96(%rdi), %xmm0, %xmm0
+ vaesenc 112(%rdi), %xmm0, %xmm0
+ vaesenc 128(%rdi), %xmm0, %xmm0
+ vaesenc 144(%rdi), %xmm0, %xmm0
+ # The load between cmpl and jl does not change flags; xmm1 holds the
+ # final round key whenever the branch is taken
+ cmpl $11, %esi
+ vmovdqa 160(%rdi), %xmm1
+ jl L_AES_GCM_encrypt_block_vaes_aesenc_block_last
+ vaesenc %xmm1, %xmm0, %xmm0
+ vaesenc 176(%rdi), %xmm0, %xmm0
+ cmpl $13, %esi
+ vmovdqa 192(%rdi), %xmm1
+ jl L_AES_GCM_encrypt_block_vaes_aesenc_block_last
+ vaesenc %xmm1, %xmm0, %xmm0
+ vaesenc 208(%rdi), %xmm0, %xmm0
+ vmovdqa 224(%rdi), %xmm1
+L_AES_GCM_encrypt_block_vaes_aesenc_block_last:
+ vaesenclast %xmm1, %xmm0, %xmm0
+ # XOR the keystream block with the input and store the result
+ vmovdqu (%r11), %xmm1
+ vpxor %xmm1, %xmm0, %xmm0
+ vmovdqu %xmm0, (%r10)
+ # NOTE(review): the shuffled output block is left in xmm0 only and is not
+ # written to memory in this function - confirm whether this is intentional
+ vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
+ vzeroupper
+ repz retq
+#ifndef __APPLE__
+.size AES_GCM_encrypt_block_vaes,.-AES_GCM_encrypt_block_vaes
+#endif /* __APPLE__ */
+/*
+ * AES_GCM_ghash_block_vaes: fold one 16-byte block into a running GHASH
+ * value.
+ *
+ * Arguments as read by the code below:
+ *   %rdi - 16-byte data block, unaligned load
+ *   %rsi - 16-byte running hash value, loaded and stored with aligned moves
+ *   %rdx - 16-byte hash key H, aligned load
+ */
+#ifndef __APPLE__
+.text
+.globl AES_GCM_ghash_block_vaes
+.type AES_GCM_ghash_block_vaes,@function
+.align 16
+AES_GCM_ghash_block_vaes:
+#else
+.section __TEXT,__text
+.globl _AES_GCM_ghash_block_vaes
+.p2align 4
+_AES_GCM_ghash_block_vaes:
+#endif /* __APPLE__ */
+ # xmm4 = running hash, xmm5 = H, xmm7 = data block
+ vmovdqa (%rsi), %xmm4
+ vmovdqa (%rdx), %xmm5
+ vmovdqu (%rdi), %xmm7
+ vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7
+ vpxor %xmm7, %xmm4, %xmm4
+ # ghash_gfmul_avx
+ # xmm4 = xmm4 * xmm5. Three carry-less multiplies (high halves, low halves,
+ # XOR of halves) give the 256-bit product in xmm4:xmm6.
+ vpshufd $0x4e, %xmm4, %xmm1
+ vpshufd $0x4e, %xmm5, %xmm2
+ vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3
+ vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0
+ vpxor %xmm4, %xmm1, %xmm1
+ vpxor %xmm5, %xmm2, %xmm2
+ vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1
+ vpxor %xmm0, %xmm1, %xmm1
+ vpxor %xmm3, %xmm1, %xmm1
+ vmovdqa %xmm0, %xmm6
+ vmovdqa %xmm3, %xmm4
+ vpslldq $8, %xmm1, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpxor %xmm2, %xmm6, %xmm6
+ vpxor %xmm1, %xmm4, %xmm4
+ # Shift the 256-bit product left by one bit, carrying across dwords and
+ # from the low half (xmm6) into the high half (xmm4)
+ vpsrld $31, %xmm6, %xmm0
+ vpsrld $31, %xmm4, %xmm1
+ vpslld $0x01, %xmm6, %xmm6
+ vpslld $0x01, %xmm4, %xmm4
+ vpsrldq $12, %xmm0, %xmm2
+ vpslldq $4, %xmm0, %xmm0
+ vpslldq $4, %xmm1, %xmm1
+ vpor %xmm2, %xmm4, %xmm4
+ vpor %xmm0, %xmm6, %xmm6
+ vpor %xmm1, %xmm4, %xmm4
+ # Shift-and-XOR reduction of the low half into the high half
+ # (shifts by 31/30/25, then 1/2/7; presumably the GCM polynomial)
+ vpslld $31, %xmm6, %xmm0
+ vpslld $30, %xmm6, %xmm1
+ vpslld $25, %xmm6, %xmm2
+ vpxor %xmm1, %xmm0, %xmm0
+ vpxor %xmm2, %xmm0, %xmm0
+ vmovdqa %xmm0, %xmm1
+ vpsrldq $4, %xmm1, %xmm1
+ vpslldq $12, %xmm0, %xmm0
+ vpxor %xmm0, %xmm6, %xmm6
+ vpsrld $0x01, %xmm6, %xmm2
+ vpsrld $2, %xmm6, %xmm3
+ vpsrld $7, %xmm6, %xmm0
+ vpxor %xmm3, %xmm2, %xmm2
+ vpxor %xmm0, %xmm2, %xmm2
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm6, %xmm2, %xmm2
+ vpxor %xmm2, %xmm4, %xmm4
+ # Write the updated running hash back to the caller
+ vmovdqa %xmm4, (%rsi)
+ vzeroupper
+ repz retq
+#ifndef __APPLE__
+.size AES_GCM_ghash_block_vaes,.-AES_GCM_ghash_block_vaes
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl AES_GCM_encrypt_update_vaes
+.type AES_GCM_encrypt_update_vaes,@function
+.align 16
+AES_GCM_encrypt_update_vaes:
+#else
+.section __TEXT,__text
+.globl _AES_GCM_encrypt_update_vaes
+.p2align 4
+_AES_GCM_encrypt_update_vaes:
+#endif /* __APPLE__ */
+ pushq %r13
+ pushq %r12
+ pushq %r14
+ pushq %r15
+ pushq %rbx
+ movq %rdx, %r10
+ movq %rcx, %r11
+ movq 48(%rsp), %rax
+ movq 56(%rsp), %r12
+ subq $0x210, %rsp
+ vmovdqa (%r9), %xmm15
+ vmovdqa (%rax), %xmm6
+ vpsrlq $63, %xmm6, %xmm8
+ vpsllq $0x01, %xmm6, %xmm7
+ vpslldq $8, %xmm8, %xmm8
+ vpor %xmm8, %xmm7, %xmm7
+ vpshufd $0xff, %xmm6, %xmm6
+ vpsrad $31, %xmm6, %xmm6
+ vpand L_vaes_aes_gcm_mod2_128(%rip), %xmm6, %xmm6
+ vpxor %xmm7, %xmm6, %xmm6
+ xorl %r14d, %r14d
+ cmpl $0x80, %r8d
+ jl L_AES_GCM_encrypt_update_vaes_done_128
+ vmovdqa %xmm15, %xmm2
+ # H ^ 1
+ vmovdqu %xmm6, (%rsp)
+ # H ^ 2
+ vpclmulqdq $0x00, %xmm6, %xmm6, %xmm7
+ vpclmulqdq $0x11, %xmm6, %xmm6, %xmm10
+ vpxor %xmm8, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm0
+ vmovdqu %xmm0, 16(%rsp)
+ # H ^ 3
+ # ghash_gfmul_red_avx
+ vpclmulqdq $0x00, %xmm6, %xmm0, %xmm7
+ vpclmulqdq $0x01, %xmm6, %xmm0, %xmm8
+ vpclmulqdq $16, %xmm6, %xmm0, %xmm9
+ vpclmulqdq $0x11, %xmm6, %xmm0, %xmm10
+ vpxor %xmm9, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm1
+ vmovdqu %xmm1, 32(%rsp)
+ # H ^ 4
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10
+ vpxor %xmm8, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm3
+ vmovdqu %xmm3, 48(%rsp)
+ # H ^ 5
+ # ghash_gfmul_red_avx
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7
+ vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $16, %xmm0, %xmm1, %xmm9
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10
+ vpxor %xmm9, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 64(%rsp)
+ # H ^ 6
+ vpclmulqdq $0x00, %xmm1, %xmm1, %xmm7
+ vpclmulqdq $0x11, %xmm1, %xmm1, %xmm10
+ vpxor %xmm8, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 80(%rsp)
+ # H ^ 7
+ # ghash_gfmul_red_avx
+ vpclmulqdq $0x00, %xmm1, %xmm3, %xmm7
+ vpclmulqdq $0x01, %xmm1, %xmm3, %xmm8
+ vpclmulqdq $16, %xmm1, %xmm3, %xmm9
+ vpclmulqdq $0x11, %xmm1, %xmm3, %xmm10
+ vpxor %xmm9, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 96(%rsp)
+ # H ^ 8
+ vpclmulqdq $0x00, %xmm3, %xmm3, %xmm7
+ vpclmulqdq $0x11, %xmm3, %xmm3, %xmm10
+ vpxor %xmm8, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 112(%rsp)
+ cmpl $0x100, %r8d
+ jl L_AES_GCM_encrypt_update_vaes_no_ext
+ # H ^ 9
+ vmovdqu 48(%rsp), %xmm0
+ vmovdqu 64(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7
+ vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $16, %xmm0, %xmm1, %xmm9
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10
+ vpxor %xmm9, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 128(%rsp)
+ # H ^ 10
+ vmovdqu 64(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10
+ vpxor %xmm8, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 144(%rsp)
+ # H ^ 11
+ vmovdqu 64(%rsp), %xmm0
+ vmovdqu 80(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7
+ vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $16, %xmm0, %xmm1, %xmm9
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10
+ vpxor %xmm9, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 160(%rsp)
+ # H ^ 12
+ vmovdqu 80(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10
+ vpxor %xmm8, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 176(%rsp)
+ # H ^ 13
+ vmovdqu 80(%rsp), %xmm0
+ vmovdqu 96(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7
+ vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $16, %xmm0, %xmm1, %xmm9
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10
+ vpxor %xmm9, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 192(%rsp)
+ # H ^ 14
+ vmovdqu 96(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10
+ vpxor %xmm8, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 208(%rsp)
+ # H ^ 15
+ vmovdqu 96(%rsp), %xmm0
+ vmovdqu 112(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7
+ vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $16, %xmm0, %xmm1, %xmm9
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10
+ vpxor %xmm9, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 224(%rsp)
+ # H ^ 16
+ vmovdqu 112(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10
+ vpxor %xmm8, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 240(%rsp)
+ vmovdqu 224(%rsp), %ymm7
+ vpermq $0x4e, %ymm7, %ymm7
+ vmovdqu 192(%rsp), %ymm8
+ vpermq $0x4e, %ymm8, %ymm8
+ vmovdqu 160(%rsp), %ymm9
+ vpermq $0x4e, %ymm9, %ymm9
+ vmovdqu 128(%rsp), %ymm10
+ vpermq $0x4e, %ymm10, %ymm10
+ vmovdqu %ymm7, 256(%rsp)
+ vmovdqu %ymm8, 288(%rsp)
+ vmovdqu %ymm9, 320(%rsp)
+ vmovdqu %ymm10, 352(%rsp)
+ vmovdqu 96(%rsp), %ymm7
+ vpermq $0x4e, %ymm7, %ymm7
+ vmovdqu 64(%rsp), %ymm8
+ vpermq $0x4e, %ymm8, %ymm8
+ vmovdqu 32(%rsp), %ymm9
+ vpermq $0x4e, %ymm9, %ymm9
+ vmovdqu (%rsp), %ymm10
+ vpermq $0x4e, %ymm10, %ymm10
+ vmovdqu %ymm7, 384(%rsp)
+ vmovdqu %ymm8, 416(%rsp)
+ vmovdqu %ymm9, 448(%rsp)
+ vmovdqu %ymm10, 480(%rsp)
+L_AES_GCM_encrypt_update_vaes_no_ext:
+ vbroadcasti128 L_vaes_aes_gcm_mod2_128(%rip), %ymm14
+ cmpl $0x100, %r8d
+ jl L_AES_GCM_encrypt_update_vaes_after_256
+ movl %r8d, %r13d
+ andl $0xffffff00, %r13d
+L_AES_GCM_encrypt_update_vaes_loop_256:
+ # 256 bytes of input
+ leaq (%r10,%r14,1), %r15
+ vbroadcasti128 L_vaes_aes_gcm_bswap_epi64(%rip), %ymm6
+ vbroadcasti128 (%r12), %ymm4
+ vpaddd L_vaes_aes_gcm_inc_y0(%rip), %ymm4, %ymm0
+ vpshufb %ymm6, %ymm0, %ymm0
+ vpaddd L_vaes_aes_gcm_inc_y1(%rip), %ymm4, %ymm1
+ vpshufb %ymm6, %ymm1, %ymm1
+ vpaddd L_vaes_aes_gcm_inc_y2(%rip), %ymm4, %ymm2
+ vpshufb %ymm6, %ymm2, %ymm2
+ vpaddd L_vaes_aes_gcm_inc_y3(%rip), %ymm4, %ymm3
+ vpshufb %ymm6, %ymm3, %ymm3
+ vmovdqu (%r12), %xmm7
+ vpaddd L_vaes_aes_gcm_eight(%rip), %xmm7, %xmm7
+ vmovdqu %xmm7, (%r12)
+ vbroadcasti128 (%rdi), %ymm4
+ vpxor %ymm4, %ymm0, %ymm0
+ vpxor %ymm4, %ymm1, %ymm1
+ vpxor %ymm4, %ymm2, %ymm2
+ vpxor %ymm4, %ymm3, %ymm3
+ vbroadcasti128 16(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 32(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 48(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 64(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 80(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 96(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 112(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 128(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 144(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ cmpl $11, %esi
+ vbroadcasti128 160(%rdi), %ymm4
+ jl L_AES_GCM_encrypt_update_vaes_p1_vaes_ctr8_last
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 176(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ cmpl $13, %esi
+ vbroadcasti128 192(%rdi), %ymm4
+ jl L_AES_GCM_encrypt_update_vaes_p1_vaes_ctr8_last
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 208(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 224(%rdi), %ymm4
+L_AES_GCM_encrypt_update_vaes_p1_vaes_ctr8_last:
+ vaesenclast %ymm4, %ymm0, %ymm0
+ vaesenclast %ymm4, %ymm1, %ymm1
+ vaesenclast %ymm4, %ymm2, %ymm2
+ vaesenclast %ymm4, %ymm3, %ymm3
+ leaq (%r11,%r14,1), %rcx
+ leaq (%r10,%r14,1), %rdx
+ vmovdqu (%rcx), %ymm5
+ vpxor %ymm5, %ymm0, %ymm0
+ vmovdqu %ymm0, (%rdx)
+ vmovdqu 32(%rcx), %ymm5
+ vpxor %ymm5, %ymm1, %ymm1
+ vmovdqu %ymm1, 32(%rdx)
+ vmovdqu 64(%rcx), %ymm5
+ vpxor %ymm5, %ymm2, %ymm2
+ vmovdqu %ymm2, 64(%rdx)
+ vmovdqu 96(%rcx), %ymm5
+ vpxor %ymm5, %ymm3, %ymm3
+ vmovdqu %ymm3, 96(%rdx)
+ addl $0x80, %r14d
+ vbroadcasti128 (%r12), %ymm4
+ vpaddd L_vaes_aes_gcm_inc_y0(%rip), %ymm4, %ymm0
+ vpshufb %ymm6, %ymm0, %ymm0
+ vpaddd L_vaes_aes_gcm_inc_y1(%rip), %ymm4, %ymm1
+ vpshufb %ymm6, %ymm1, %ymm1
+ vpaddd L_vaes_aes_gcm_inc_y2(%rip), %ymm4, %ymm2
+ vpshufb %ymm6, %ymm2, %ymm2
+ vpaddd L_vaes_aes_gcm_inc_y3(%rip), %ymm4, %ymm3
+ vpshufb %ymm6, %ymm3, %ymm3
+ vmovdqu (%r12), %xmm7
+ vpaddd L_vaes_aes_gcm_eight(%rip), %xmm7, %xmm7
+ vmovdqu %xmm7, (%r12)
+ vbroadcasti128 (%rdi), %ymm4
+ vpxor %ymm4, %ymm0, %ymm0
+ vpxor %ymm4, %ymm1, %ymm1
+ vpxor %ymm4, %ymm2, %ymm2
+ vpxor %ymm4, %ymm3, %ymm3
+ vbroadcasti128 16(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 32(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 48(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 64(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 80(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 96(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 112(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 128(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 144(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ cmpl $11, %esi
+ vbroadcasti128 160(%rdi), %ymm4
+ jl L_AES_GCM_encrypt_update_vaes_p2_vaes_ctr8_last
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 176(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ cmpl $13, %esi
+ vbroadcasti128 192(%rdi), %ymm4
+ jl L_AES_GCM_encrypt_update_vaes_p2_vaes_ctr8_last
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 208(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 224(%rdi), %ymm4
+L_AES_GCM_encrypt_update_vaes_p2_vaes_ctr8_last:
+ vaesenclast %ymm4, %ymm0, %ymm0
+ vaesenclast %ymm4, %ymm1, %ymm1
+ vaesenclast %ymm4, %ymm2, %ymm2
+ vaesenclast %ymm4, %ymm3, %ymm3
+ leaq (%r11,%r14,1), %rcx
+ leaq (%r10,%r14,1), %rdx
+ vmovdqu (%rcx), %ymm5
+ vpxor %ymm5, %ymm0, %ymm0
+ vmovdqu %ymm0, (%rdx)
+ vmovdqu 32(%rcx), %ymm5
+ vpxor %ymm5, %ymm1, %ymm1
+ vmovdqu %ymm1, 32(%rdx)
+ vmovdqu 64(%rcx), %ymm5
+ vpxor %ymm5, %ymm2, %ymm2
+ vmovdqu %ymm2, 64(%rdx)
+ vmovdqu 96(%rcx), %ymm5
+ vpxor %ymm5, %ymm3, %ymm3
+ vmovdqu %ymm3, 96(%rdx)
+ addl $0x80, %r14d
+ vbroadcasti128 L_vaes_aes_gcm_bswap_mask(%rip), %ymm6
+ vpxor %ymm4, %ymm4, %ymm4
+ vinserti128 $0x00, %xmm15, %ymm4, %ymm4
+ vmovdqu 256(%rsp), %ymm7
+ vmovdqu 288(%rsp), %ymm8
+ vmovdqu 320(%rsp), %ymm9
+ vmovdqu 352(%rsp), %ymm10
+ vmovdqu (%r15), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpxor %ymm4, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm7, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm7, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm7, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm7, %ymm5, %ymm3
+ vmovdqa %ymm0, %ymm11
+ vpxor %ymm1, %ymm2, %ymm12
+ vmovdqa %ymm3, %ymm13
+ vmovdqu 32(%r15), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm8, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm8, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm8, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm8, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vmovdqu 64(%r15), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm9, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm9, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm9, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm9, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vmovdqu 96(%r15), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm10, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm10, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm10, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm10, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vmovdqu 384(%rsp), %ymm7
+ vmovdqu 416(%rsp), %ymm8
+ vmovdqu 448(%rsp), %ymm9
+ vmovdqu 480(%rsp), %ymm10
+ vmovdqu 128(%r15), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm7, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm7, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm7, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm7, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vmovdqu 160(%r15), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm8, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm8, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm8, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm8, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vmovdqu 192(%r15), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm9, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm9, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm9, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm9, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vmovdqu 224(%r15), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm10, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm10, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm10, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm10, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vpclmulqdq $0x01, %ymm11, %ymm14, %ymm5
+ vpshufd $0x4e, %ymm11, %ymm11
+ vpxor %ymm5, %ymm12, %ymm12
+ vpxor %ymm11, %ymm12, %ymm12
+ vpclmulqdq $0x01, %ymm12, %ymm14, %ymm5
+ vpshufd $0x4e, %ymm12, %ymm12
+ vpxor %ymm5, %ymm13, %ymm13
+ vpxor %ymm12, %ymm13, %ymm13
+ vextracti128 $0x01, %ymm13, %xmm0
+ vpxor %xmm0, %xmm13, %xmm15
+ cmpl %r13d, %r14d
+ jl L_AES_GCM_encrypt_update_vaes_loop_256
+L_AES_GCM_encrypt_update_vaes_after_256:
+ movl %r8d, %r13d
+ andl $0xffffff80, %r13d
+ cmpl %r13d, %r14d
+ jge L_AES_GCM_encrypt_update_vaes_after_128
+ # 128 bytes of input
+ leaq (%r10,%r14,1), %r15
+ vbroadcasti128 L_vaes_aes_gcm_bswap_epi64(%rip), %ymm6
+ vbroadcasti128 (%r12), %ymm4
+ vpaddd L_vaes_aes_gcm_inc_y0(%rip), %ymm4, %ymm0
+ vpshufb %ymm6, %ymm0, %ymm0
+ vpaddd L_vaes_aes_gcm_inc_y1(%rip), %ymm4, %ymm1
+ vpshufb %ymm6, %ymm1, %ymm1
+ vpaddd L_vaes_aes_gcm_inc_y2(%rip), %ymm4, %ymm2
+ vpshufb %ymm6, %ymm2, %ymm2
+ vpaddd L_vaes_aes_gcm_inc_y3(%rip), %ymm4, %ymm3
+ vpshufb %ymm6, %ymm3, %ymm3
+ vmovdqu (%r12), %xmm7
+ vpaddd L_vaes_aes_gcm_eight(%rip), %xmm7, %xmm7
+ vmovdqu %xmm7, (%r12)
+ vbroadcasti128 (%rdi), %ymm4
+ vpxor %ymm4, %ymm0, %ymm0
+ vpxor %ymm4, %ymm1, %ymm1
+ vpxor %ymm4, %ymm2, %ymm2
+ vpxor %ymm4, %ymm3, %ymm3
+ vbroadcasti128 16(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 32(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 48(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 64(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 80(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 96(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 112(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 128(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 144(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ cmpl $11, %esi
+ vbroadcasti128 160(%rdi), %ymm4
+ jl L_AES_GCM_encrypt_update_vaes_8_vaes_ctr8_last
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 176(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ cmpl $13, %esi
+ vbroadcasti128 192(%rdi), %ymm4
+ jl L_AES_GCM_encrypt_update_vaes_8_vaes_ctr8_last
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 208(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 224(%rdi), %ymm4
+L_AES_GCM_encrypt_update_vaes_8_vaes_ctr8_last:
+ vaesenclast %ymm4, %ymm0, %ymm0
+ vaesenclast %ymm4, %ymm1, %ymm1
+ vaesenclast %ymm4, %ymm2, %ymm2
+ vaesenclast %ymm4, %ymm3, %ymm3
+ leaq (%r11,%r14,1), %rcx
+ leaq (%r10,%r14,1), %rdx
+ vmovdqu (%rcx), %ymm5
+ vpxor %ymm5, %ymm0, %ymm0
+ vmovdqu %ymm0, (%rdx)
+ vmovdqu 32(%rcx), %ymm5
+ vpxor %ymm5, %ymm1, %ymm1
+ vmovdqu %ymm1, 32(%rdx)
+ vmovdqu 64(%rcx), %ymm5
+ vpxor %ymm5, %ymm2, %ymm2
+ vmovdqu %ymm2, 64(%rdx)
+ vmovdqu 96(%rcx), %ymm5
+ vpxor %ymm5, %ymm3, %ymm3
+ vmovdqu %ymm3, 96(%rdx)
+ addl $0x80, %r14d
+ vmovdqu 96(%rsp), %ymm7
+ vpermq $0x4e, %ymm7, %ymm7
+ vmovdqu 64(%rsp), %ymm8
+ vpermq $0x4e, %ymm8, %ymm8
+ vmovdqu 32(%rsp), %ymm9
+ vpermq $0x4e, %ymm9, %ymm9
+ vmovdqu (%rsp), %ymm10
+ vpermq $0x4e, %ymm10, %ymm10
+ vbroadcasti128 L_vaes_aes_gcm_bswap_mask(%rip), %ymm6
+ vpxor %ymm4, %ymm4, %ymm4
+ vinserti128 $0x00, %xmm15, %ymm4, %ymm4
+ vmovdqu (%r15), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpxor %ymm4, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm7, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm7, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm7, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm7, %ymm5, %ymm3
+ vmovdqa %ymm0, %ymm11
+ vpxor %ymm1, %ymm2, %ymm12
+ vmovdqa %ymm3, %ymm13
+ vmovdqu 32(%r15), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm8, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm8, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm8, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm8, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vmovdqu 64(%r15), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm9, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm9, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm9, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm9, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vmovdqu 96(%r15), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm10, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm10, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm10, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm10, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vpclmulqdq $0x01, %ymm11, %ymm14, %ymm5
+ vpshufd $0x4e, %ymm11, %ymm11
+ vpxor %ymm5, %ymm12, %ymm12
+ vpxor %ymm11, %ymm12, %ymm12
+ vpclmulqdq $0x01, %ymm12, %ymm14, %ymm5
+ vpshufd $0x4e, %ymm12, %ymm12
+ vpxor %ymm5, %ymm13, %ymm13
+ vpxor %ymm12, %ymm13, %ymm13
+ vextracti128 $0x01, %ymm13, %xmm0
+ vpxor %xmm0, %xmm13, %xmm15
+L_AES_GCM_encrypt_update_vaes_after_128:
+ vmovdqu (%rsp), %xmm6
+L_AES_GCM_encrypt_update_vaes_done_128:
+ movl %r8d, %edx
+ cmpl %edx, %r14d
+ jge L_AES_GCM_encrypt_update_vaes_done_enc
+ movl %r8d, %r13d
+ andl $0xfffffff0, %r13d
+ cmpl %r13d, %r14d
+ jge L_AES_GCM_encrypt_update_vaes_last_block_done
+ vmovdqu (%r12), %xmm8
+ vpshufb L_vaes_aes_gcm_bswap_epi64(%rip), %xmm8, %xmm7
+ vpaddd L_vaes_aes_gcm_one(%rip), %xmm8, %xmm8
+ vmovdqu %xmm8, (%r12)
+ vpxor (%rdi), %xmm7, %xmm7
+ vaesenc 16(%rdi), %xmm7, %xmm7
+ vaesenc 32(%rdi), %xmm7, %xmm7
+ vaesenc 48(%rdi), %xmm7, %xmm7
+ vaesenc 64(%rdi), %xmm7, %xmm7
+ vaesenc 80(%rdi), %xmm7, %xmm7
+ vaesenc 96(%rdi), %xmm7, %xmm7
+ vaesenc 112(%rdi), %xmm7, %xmm7
+ vaesenc 128(%rdi), %xmm7, %xmm7
+ vaesenc 144(%rdi), %xmm7, %xmm7
+ cmpl $11, %esi
+ vmovdqa 160(%rdi), %xmm8
+ jl L_AES_GCM_encrypt_update_vaes_aesenc_block_last
+ vaesenc %xmm8, %xmm7, %xmm7
+ vaesenc 176(%rdi), %xmm7, %xmm7
+ cmpl $13, %esi
+ vmovdqa 192(%rdi), %xmm8
+ jl L_AES_GCM_encrypt_update_vaes_aesenc_block_last
+ vaesenc %xmm8, %xmm7, %xmm7
+ vaesenc 208(%rdi), %xmm7, %xmm7
+ vmovdqa 224(%rdi), %xmm8
+L_AES_GCM_encrypt_update_vaes_aesenc_block_last:
+ vaesenclast %xmm8, %xmm7, %xmm7
+ vmovdqu (%r11,%r14,1), %xmm8
+ vpxor %xmm8, %xmm7, %xmm7
+ vmovdqu %xmm7, (%r10,%r14,1)
+ vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7
+ vpxor %xmm7, %xmm15, %xmm15
+ addl $16, %r14d
+ cmpl %r13d, %r14d
+ jge L_AES_GCM_encrypt_update_vaes_last_block_ghash
+L_AES_GCM_encrypt_update_vaes_last_block_start:
+ vmovdqu (%r11,%r14,1), %xmm12
+ vmovdqu (%r12), %xmm8
+ vpshufb L_vaes_aes_gcm_bswap_epi64(%rip), %xmm8, %xmm7
+ vpaddd L_vaes_aes_gcm_one(%rip), %xmm8, %xmm8
+ vmovdqu %xmm8, (%r12)
+ vpxor (%rdi), %xmm7, %xmm7
+ vpclmulqdq $16, %xmm6, %xmm15, %xmm9
+ vaesenc 16(%rdi), %xmm7, %xmm7
+ vaesenc 32(%rdi), %xmm7, %xmm7
+ vpclmulqdq $0x01, %xmm6, %xmm15, %xmm10
+ vaesenc 48(%rdi), %xmm7, %xmm7
+ vaesenc 64(%rdi), %xmm7, %xmm7
+ vpclmulqdq $0x00, %xmm6, %xmm15, %xmm11
+ vaesenc 80(%rdi), %xmm7, %xmm7
+ vpclmulqdq $0x11, %xmm6, %xmm15, %xmm1
+ vaesenc 96(%rdi), %xmm7, %xmm7
+ vpxor %xmm10, %xmm9, %xmm9
+ vpslldq $8, %xmm9, %xmm2
+ vpsrldq $8, %xmm9, %xmm9
+ vaesenc 112(%rdi), %xmm7, %xmm7
+ vpxor %xmm11, %xmm2, %xmm2
+ vpxor %xmm9, %xmm1, %xmm3
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm0
+ vpclmulqdq $16, %xmm0, %xmm2, %xmm10
+ vaesenc 128(%rdi), %xmm7, %xmm7
+ vpshufd $0x4e, %xmm2, %xmm9
+ vpxor %xmm10, %xmm9, %xmm9
+ vpclmulqdq $16, %xmm0, %xmm9, %xmm10
+ vaesenc 144(%rdi), %xmm7, %xmm7
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpxor %xmm10, %xmm9, %xmm9
+ vpxor %xmm3, %xmm9, %xmm15
+ cmpl $11, %esi
+ vmovdqa 160(%rdi), %xmm8
+ jl L_AES_GCM_encrypt_update_vaes_aesenc_gfmul_last
+ vaesenc %xmm8, %xmm7, %xmm7
+ vaesenc 176(%rdi), %xmm7, %xmm7
+ cmpl $13, %esi
+ vmovdqa 192(%rdi), %xmm8
+ jl L_AES_GCM_encrypt_update_vaes_aesenc_gfmul_last
+ vaesenc %xmm8, %xmm7, %xmm7
+ vaesenc 208(%rdi), %xmm7, %xmm7
+ vmovdqa 224(%rdi), %xmm8
+L_AES_GCM_encrypt_update_vaes_aesenc_gfmul_last:
+ vaesenclast %xmm8, %xmm7, %xmm7
+ vmovdqa %xmm12, %xmm0
+ vpxor %xmm0, %xmm7, %xmm7
+ vmovdqu %xmm7, (%r10,%r14,1)
+ vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7
+ addl $16, %r14d
+ vpxor %xmm7, %xmm15, %xmm15
+ cmpl %r13d, %r14d
+ jl L_AES_GCM_encrypt_update_vaes_last_block_start
+L_AES_GCM_encrypt_update_vaes_last_block_ghash:
+ # ghash_gfmul_red_avx
+ vpclmulqdq $0x00, %xmm6, %xmm15, %xmm7
+ vpclmulqdq $0x01, %xmm6, %xmm15, %xmm8
+ vpclmulqdq $16, %xmm6, %xmm15, %xmm9
+ vpclmulqdq $0x11, %xmm6, %xmm15, %xmm10
+ vpxor %xmm9, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm15
+L_AES_GCM_encrypt_update_vaes_last_block_done:
+L_AES_GCM_encrypt_update_vaes_done_enc:
+ vmovdqa %xmm15, (%r9)
+ vzeroupper
+ addq $0x210, %rsp
+ popq %rbx
+ popq %r15
+ popq %r14
+ popq %r12
+ popq %r13
+ repz retq
+#ifndef __APPLE__
+.size AES_GCM_encrypt_update_vaes,.-AES_GCM_encrypt_update_vaes
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl AES_GCM_encrypt_final_vaes
+.type AES_GCM_encrypt_final_vaes,@function
+.align 16
+AES_GCM_encrypt_final_vaes:
+#else
+.section __TEXT,__text
+.globl _AES_GCM_encrypt_final_vaes
+.p2align 4
+_AES_GCM_encrypt_final_vaes:
+#endif /* __APPLE__ */
+/* AES_GCM_encrypt_final_vaes: fold the length block into the running hash
+ * state, multiply once more by the adjusted hash key, XOR with a final
+ * 16-byte block and write the requested number of tag bytes.
+ *
+ * Arguments as consumed by the code below (System V AMD64 register order):
+ *   %rdi       - 16-byte running hash state (read only here)
+ *   %rsi       - output buffer that receives the tag bytes
+ *   %edx       - number of tag bytes to write
+ *   %ecx       - 32-bit count; multiplied by 8 into the low qword of the
+ *                length block (presumably the message length in bytes -
+ *                confirm against the C prototype)
+ *   %r8d       - 32-bit count; multiplied by 8 into the high qword of the
+ *                length block (presumably the AAD length in bytes -
+ *                confirm against the C prototype)
+ *   %r9        - 16-byte hash key block
+ *   stack arg  - pointer to a 16-byte block XORed into the result
+ *                (presumably the encrypted initial counter - confirm)
+ *
+ * The three 16-byte blocks are loaded with vmovdqa, so their addresses
+ * must be 16-byte aligned.
+ *
+ * NOTE(review): the byte-wise tag copy tests its counter after the store,
+ * so a tag length of 0 is not handled, and a length above 16 reads past
+ * the 16-byte stack scratch. Callers presumably validate the range
+ * 1..16 - verify.
+ */
+ # Preserve callee-saved r13; it is the byte scratch in the tag copy loop.
+ pushq %r13
+ # Stash the three 32-bit arguments before rdx, rcx and r8 are reused.
+ movl %edx, %eax
+ movl %ecx, %r10d
+ movl %r8d, %r11d
+ # After the push: 0(%rsp) = saved r13, 8(%rsp) = return address,
+ # 16(%rsp) = first stack-passed argument.
+ movq 16(%rsp), %r8
+ # Reserve 16 bytes of scratch, used only when a partial tag is copied out.
+ subq $16, %rsp
+ # xmm4 = block at rdi, xmm5 = block at r9, xmm6 = block at the stack argument.
+ vmovdqa (%rdi), %xmm4
+ vmovdqa (%r9), %xmm5
+ vmovdqa (%r8), %xmm6
+ # 128-bit left shift of xmm5 by one: shift each qword left, then carry
+ # bit 63 of the low qword into bit 0 of the high qword.
+ vpsrlq $63, %xmm5, %xmm8
+ vpsllq $0x01, %xmm5, %xmm7
+ vpslldq $8, %xmm8, %xmm8
+ vpor %xmm8, %xmm7, %xmm7
+ # Broadcast the top dword and arithmetic-shift it into an all-ones mask
+ # when bit 127 of the original value was set, then XOR the masked
+ # mod2_128 constant into the shifted value.
+ vpshufd $0xff, %xmm5, %xmm5
+ vpsrad $31, %xmm5, %xmm5
+ vpand L_vaes_aes_gcm_mod2_128(%rip), %xmm5, %xmm5
+ vpxor %xmm7, %xmm5, %xmm5
+ # Build the length block: low qword = 8 * (4th argument),
+ # high qword = 8 * (5th argument). The movl writes zero-extend into the
+ # 64-bit registers, so the 64-bit shifts by 3 cannot drop bits.
+ movl %r10d, %edx
+ movl %r11d, %ecx
+ shlq $3, %rdx
+ shlq $3, %rcx
+ vmovq %rdx, %xmm0
+ vmovq %rcx, %xmm1
+ vpunpcklqdq %xmm1, %xmm0, %xmm0
+ # XOR the length block into the hash state.
+ vpxor %xmm0, %xmm4, %xmm4
+ # ghash_gfmul_red_avx
+ # Carry-less multiply xmm4 by xmm5: xmm7 = low product, xmm8/xmm9 = the
+ # two cross products, xmm10 = high product.
+ vpclmulqdq $0x00, %xmm5, %xmm4, %xmm7
+ vpclmulqdq $0x01, %xmm5, %xmm4, %xmm8
+ vpclmulqdq $16, %xmm5, %xmm4, %xmm9
+ vpclmulqdq $0x11, %xmm5, %xmm4, %xmm10
+ vpxor %xmm9, %xmm8, %xmm8
+ # Two folding steps with the mod2_128 constant: low part into the middle
+ # part, then middle part into the high part. The result ends up in xmm10.
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ # Byte-shuffle with the bswap_mask table (defined outside this view;
+ # presumably a full 16-byte reversal - confirm), then XOR with the block
+ # loaded from the stack argument. xmm0 now holds the 16 tag bytes.
+ vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4
+ vpxor %xmm6, %xmm4, %xmm0
+ # Fast path: a 16-byte tag is written with a single unaligned store.
+ cmpl $16, %eax
+ je L_AES_GCM_encrypt_final_vaes_store_tag_16
+ # Partial tag: spill all 16 bytes to the stack scratch and copy the first
+ # eax bytes one at a time, with rcx as the byte index.
+ xorq %rcx, %rcx
+ vmovdqu %xmm0, (%rsp)
+L_AES_GCM_encrypt_final_vaes_store_tag_loop:
+ movzbl (%rsp,%rcx,1), %r13d
+ movb %r13b, (%rsi,%rcx,1)
+ incl %ecx
+ # The comparison runs after the store, so at least one byte is written.
+ cmpl %eax, %ecx
+ jne L_AES_GCM_encrypt_final_vaes_store_tag_loop
+ jmp L_AES_GCM_encrypt_final_vaes_store_tag_done
+L_AES_GCM_encrypt_final_vaes_store_tag_16:
+ vmovdqu %xmm0, (%rsi)
+L_AES_GCM_encrypt_final_vaes_store_tag_done:
+ # Clear the upper halves of the vector registers before returning, then
+ # release the scratch area and restore r13.
+ vzeroupper
+ addq $16, %rsp
+ popq %r13
+ repz retq
+#ifndef __APPLE__
+.size AES_GCM_encrypt_final_vaes,.-AES_GCM_encrypt_final_vaes
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl AES_GCM_decrypt_update_vaes
+.type AES_GCM_decrypt_update_vaes,@function
+.align 16
+AES_GCM_decrypt_update_vaes:
+#else
+.section __TEXT,__text
+.globl _AES_GCM_decrypt_update_vaes
+.p2align 4
+_AES_GCM_decrypt_update_vaes:
+#endif /* __APPLE__ */
+ pushq %r13
+ pushq %r12
+ pushq %r14
+ pushq %r15
+ pushq %rbx
+ movq %rdx, %r10
+ movq %rcx, %r11
+ movq 48(%rsp), %rax
+ movq 56(%rsp), %r12
+ subq $0x210, %rsp
+ vmovdqa (%r9), %xmm15
+ vmovdqa (%rax), %xmm6
+ vpsrlq $63, %xmm6, %xmm8
+ vpsllq $0x01, %xmm6, %xmm7
+ vpslldq $8, %xmm8, %xmm8
+ vpor %xmm8, %xmm7, %xmm7
+ vpshufd $0xff, %xmm6, %xmm6
+ vpsrad $31, %xmm6, %xmm6
+ vpand L_vaes_aes_gcm_mod2_128(%rip), %xmm6, %xmm6
+ vpxor %xmm7, %xmm6, %xmm6
+ xorl %r14d, %r14d
+ cmpl $0x80, %r8d
+ jl L_AES_GCM_decrypt_update_vaes_done_128
+ vmovdqa %xmm15, %xmm2
+ # H ^ 1
+ vmovdqu %xmm6, (%rsp)
+ # H ^ 2
+ vpclmulqdq $0x00, %xmm6, %xmm6, %xmm7
+ vpclmulqdq $0x11, %xmm6, %xmm6, %xmm10
+ vpxor %xmm8, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm0
+ vmovdqu %xmm0, 16(%rsp)
+ # H ^ 3
+ # ghash_gfmul_red_avx
+ vpclmulqdq $0x00, %xmm6, %xmm0, %xmm7
+ vpclmulqdq $0x01, %xmm6, %xmm0, %xmm8
+ vpclmulqdq $16, %xmm6, %xmm0, %xmm9
+ vpclmulqdq $0x11, %xmm6, %xmm0, %xmm10
+ vpxor %xmm9, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm1
+ vmovdqu %xmm1, 32(%rsp)
+ # H ^ 4
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10
+ vpxor %xmm8, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm3
+ vmovdqu %xmm3, 48(%rsp)
+ # H ^ 5
+ # ghash_gfmul_red_avx
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7
+ vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $16, %xmm0, %xmm1, %xmm9
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10
+ vpxor %xmm9, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 64(%rsp)
+ # H ^ 6
+ vpclmulqdq $0x00, %xmm1, %xmm1, %xmm7
+ vpclmulqdq $0x11, %xmm1, %xmm1, %xmm10
+ vpxor %xmm8, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 80(%rsp)
+ # H ^ 7
+ # ghash_gfmul_red_avx
+ vpclmulqdq $0x00, %xmm1, %xmm3, %xmm7
+ vpclmulqdq $0x01, %xmm1, %xmm3, %xmm8
+ vpclmulqdq $16, %xmm1, %xmm3, %xmm9
+ vpclmulqdq $0x11, %xmm1, %xmm3, %xmm10
+ vpxor %xmm9, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 96(%rsp)
+ # H ^ 8
+ vpclmulqdq $0x00, %xmm3, %xmm3, %xmm7
+ vpclmulqdq $0x11, %xmm3, %xmm3, %xmm10
+ vpxor %xmm8, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 112(%rsp)
+ cmpl $0x100, %r8d
+ jl L_AES_GCM_decrypt_update_vaes_no_ext
+ # H ^ 9
+ vmovdqu 48(%rsp), %xmm0
+ vmovdqu 64(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7
+ vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $16, %xmm0, %xmm1, %xmm9
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10
+ vpxor %xmm9, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 128(%rsp)
+ # H ^ 10
+ vmovdqu 64(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10
+ vpxor %xmm8, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 144(%rsp)
+ # H ^ 11
+ vmovdqu 64(%rsp), %xmm0
+ vmovdqu 80(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7
+ vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $16, %xmm0, %xmm1, %xmm9
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10
+ vpxor %xmm9, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 160(%rsp)
+ # H ^ 12
+ vmovdqu 80(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10
+ vpxor %xmm8, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 176(%rsp)
+ # H ^ 13
+ vmovdqu 80(%rsp), %xmm0
+ vmovdqu 96(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7
+ vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $16, %xmm0, %xmm1, %xmm9
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10
+ vpxor %xmm9, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 192(%rsp)
+ # H ^ 14
+ vmovdqu 96(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10
+ vpxor %xmm8, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 208(%rsp)
+ # H ^ 15
+ vmovdqu 96(%rsp), %xmm0
+ vmovdqu 112(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7
+ vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $16, %xmm0, %xmm1, %xmm9
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10
+ vpxor %xmm9, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 224(%rsp)
+ # H ^ 16
+ vmovdqu 112(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10
+ vpxor %xmm8, %xmm8, %xmm8
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm7, %xmm7
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10
+ vmovdqa %xmm10, %xmm4
+ vmovdqu %xmm4, 240(%rsp)
+ vmovdqu 224(%rsp), %ymm7
+ vpermq $0x4e, %ymm7, %ymm7
+ vmovdqu 192(%rsp), %ymm8
+ vpermq $0x4e, %ymm8, %ymm8
+ vmovdqu 160(%rsp), %ymm9
+ vpermq $0x4e, %ymm9, %ymm9
+ vmovdqu 128(%rsp), %ymm10
+ vpermq $0x4e, %ymm10, %ymm10
+ vmovdqu %ymm7, 256(%rsp)
+ vmovdqu %ymm8, 288(%rsp)
+ vmovdqu %ymm9, 320(%rsp)
+ vmovdqu %ymm10, 352(%rsp)
+ vmovdqu 96(%rsp), %ymm7
+ vpermq $0x4e, %ymm7, %ymm7
+ vmovdqu 64(%rsp), %ymm8
+ vpermq $0x4e, %ymm8, %ymm8
+ vmovdqu 32(%rsp), %ymm9
+ vpermq $0x4e, %ymm9, %ymm9
+ vmovdqu (%rsp), %ymm10
+ vpermq $0x4e, %ymm10, %ymm10
+ vmovdqu %ymm7, 384(%rsp)
+ vmovdqu %ymm8, 416(%rsp)
+ vmovdqu %ymm9, 448(%rsp)
+ vmovdqu %ymm10, 480(%rsp)
+L_AES_GCM_decrypt_update_vaes_no_ext:
+ vbroadcasti128 L_vaes_aes_gcm_mod2_128(%rip), %ymm14
+ cmpl $0x100, %r8d
+ jl L_AES_GCM_decrypt_update_vaes_after_256
+ movl %r8d, %r13d
+ andl $0xffffff00, %r13d
+L_AES_GCM_decrypt_update_vaes_loop_256:
+ # 256 bytes of input
+ leaq (%r11,%r14,1), %rbx
+ vbroadcasti128 L_vaes_aes_gcm_bswap_mask(%rip), %ymm6
+ vpxor %ymm4, %ymm4, %ymm4
+ vinserti128 $0x00, %xmm15, %ymm4, %ymm4
+ vmovdqu 256(%rsp), %ymm7
+ vmovdqu 288(%rsp), %ymm8
+ vmovdqu 320(%rsp), %ymm9
+ vmovdqu 352(%rsp), %ymm10
+ vmovdqu (%rbx), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpxor %ymm4, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm7, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm7, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm7, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm7, %ymm5, %ymm3
+ vmovdqa %ymm0, %ymm11
+ vpxor %ymm1, %ymm2, %ymm12
+ vmovdqa %ymm3, %ymm13
+ vmovdqu 32(%rbx), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm8, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm8, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm8, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm8, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vmovdqu 64(%rbx), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm9, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm9, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm9, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm9, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vmovdqu 96(%rbx), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm10, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm10, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm10, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm10, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vmovdqu 384(%rsp), %ymm7
+ vmovdqu 416(%rsp), %ymm8
+ vmovdqu 448(%rsp), %ymm9
+ vmovdqu 480(%rsp), %ymm10
+ vmovdqu 128(%rbx), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm7, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm7, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm7, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm7, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vmovdqu 160(%rbx), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm8, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm8, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm8, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm8, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vmovdqu 192(%rbx), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm9, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm9, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm9, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm9, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vmovdqu 224(%rbx), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm10, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm10, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm10, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm10, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vpclmulqdq $0x01, %ymm11, %ymm14, %ymm5
+ vpshufd $0x4e, %ymm11, %ymm11
+ vpxor %ymm5, %ymm12, %ymm12
+ vpxor %ymm11, %ymm12, %ymm12
+ vpclmulqdq $0x01, %ymm12, %ymm14, %ymm5
+ vpshufd $0x4e, %ymm12, %ymm12
+ vpxor %ymm5, %ymm13, %ymm13
+ vpxor %ymm12, %ymm13, %ymm13
+ vextracti128 $0x01, %ymm13, %xmm0
+ vpxor %xmm0, %xmm13, %xmm15
+ vbroadcasti128 L_vaes_aes_gcm_bswap_epi64(%rip), %ymm6
+ vbroadcasti128 (%r12), %ymm4
+ vpaddd L_vaes_aes_gcm_inc_y0(%rip), %ymm4, %ymm0
+ vpshufb %ymm6, %ymm0, %ymm0
+ vpaddd L_vaes_aes_gcm_inc_y1(%rip), %ymm4, %ymm1
+ vpshufb %ymm6, %ymm1, %ymm1
+ vpaddd L_vaes_aes_gcm_inc_y2(%rip), %ymm4, %ymm2
+ vpshufb %ymm6, %ymm2, %ymm2
+ vpaddd L_vaes_aes_gcm_inc_y3(%rip), %ymm4, %ymm3
+ vpshufb %ymm6, %ymm3, %ymm3
+ vmovdqu (%r12), %xmm7
+ vpaddd L_vaes_aes_gcm_eight(%rip), %xmm7, %xmm7
+ vmovdqu %xmm7, (%r12)
+ vbroadcasti128 (%rdi), %ymm4
+ vpxor %ymm4, %ymm0, %ymm0
+ vpxor %ymm4, %ymm1, %ymm1
+ vpxor %ymm4, %ymm2, %ymm2
+ vpxor %ymm4, %ymm3, %ymm3
+ vbroadcasti128 16(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 32(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 48(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 64(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 80(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 96(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 112(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 128(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 144(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ cmpl $11, %esi
+ vbroadcasti128 160(%rdi), %ymm4
+ jl L_AES_GCM_decrypt_update_vaes_l1_vaes_ctr8_last
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 176(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ cmpl $13, %esi
+ vbroadcasti128 192(%rdi), %ymm4
+ jl L_AES_GCM_decrypt_update_vaes_l1_vaes_ctr8_last
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 208(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 224(%rdi), %ymm4
+L_AES_GCM_decrypt_update_vaes_l1_vaes_ctr8_last:
+ vaesenclast %ymm4, %ymm0, %ymm0
+ vaesenclast %ymm4, %ymm1, %ymm1
+ vaesenclast %ymm4, %ymm2, %ymm2
+ vaesenclast %ymm4, %ymm3, %ymm3
+ leaq (%r11,%r14,1), %rcx
+ leaq (%r10,%r14,1), %rdx
+ vmovdqu (%rcx), %ymm5
+ vpxor %ymm5, %ymm0, %ymm0
+ vmovdqu %ymm0, (%rdx)
+ vmovdqu 32(%rcx), %ymm5
+ vpxor %ymm5, %ymm1, %ymm1
+ vmovdqu %ymm1, 32(%rdx)
+ vmovdqu 64(%rcx), %ymm5
+ vpxor %ymm5, %ymm2, %ymm2
+ vmovdqu %ymm2, 64(%rdx)
+ vmovdqu 96(%rcx), %ymm5
+ vpxor %ymm5, %ymm3, %ymm3
+ vmovdqu %ymm3, 96(%rdx)
+ addl $0x80, %r14d
+ vbroadcasti128 (%r12), %ymm4
+ vpaddd L_vaes_aes_gcm_inc_y0(%rip), %ymm4, %ymm0
+ vpshufb %ymm6, %ymm0, %ymm0
+ vpaddd L_vaes_aes_gcm_inc_y1(%rip), %ymm4, %ymm1
+ vpshufb %ymm6, %ymm1, %ymm1
+ vpaddd L_vaes_aes_gcm_inc_y2(%rip), %ymm4, %ymm2
+ vpshufb %ymm6, %ymm2, %ymm2
+ vpaddd L_vaes_aes_gcm_inc_y3(%rip), %ymm4, %ymm3
+ vpshufb %ymm6, %ymm3, %ymm3
+ vmovdqu (%r12), %xmm7
+ vpaddd L_vaes_aes_gcm_eight(%rip), %xmm7, %xmm7
+ vmovdqu %xmm7, (%r12)
+ vbroadcasti128 (%rdi), %ymm4
+ vpxor %ymm4, %ymm0, %ymm0
+ vpxor %ymm4, %ymm1, %ymm1
+ vpxor %ymm4, %ymm2, %ymm2
+ vpxor %ymm4, %ymm3, %ymm3
+ vbroadcasti128 16(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 32(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 48(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 64(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 80(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 96(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 112(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 128(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 144(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ cmpl $11, %esi
+ vbroadcasti128 160(%rdi), %ymm4
+ jl L_AES_GCM_decrypt_update_vaes_l2_vaes_ctr8_last
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 176(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ cmpl $13, %esi
+ vbroadcasti128 192(%rdi), %ymm4
+ jl L_AES_GCM_decrypt_update_vaes_l2_vaes_ctr8_last
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 208(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 224(%rdi), %ymm4
+L_AES_GCM_decrypt_update_vaes_l2_vaes_ctr8_last:
+ vaesenclast %ymm4, %ymm0, %ymm0
+ vaesenclast %ymm4, %ymm1, %ymm1
+ vaesenclast %ymm4, %ymm2, %ymm2
+ vaesenclast %ymm4, %ymm3, %ymm3
+ leaq (%r11,%r14,1), %rcx
+ leaq (%r10,%r14,1), %rdx
+ vmovdqu (%rcx), %ymm5
+ vpxor %ymm5, %ymm0, %ymm0
+ vmovdqu %ymm0, (%rdx)
+ vmovdqu 32(%rcx), %ymm5
+ vpxor %ymm5, %ymm1, %ymm1
+ vmovdqu %ymm1, 32(%rdx)
+ vmovdqu 64(%rcx), %ymm5
+ vpxor %ymm5, %ymm2, %ymm2
+ vmovdqu %ymm2, 64(%rdx)
+ vmovdqu 96(%rcx), %ymm5
+ vpxor %ymm5, %ymm3, %ymm3
+ vmovdqu %ymm3, 96(%rdx)
+ addl $0x80, %r14d
+ cmpl %r13d, %r14d
+ jl L_AES_GCM_decrypt_update_vaes_loop_256
+L_AES_GCM_decrypt_update_vaes_after_256:
+ vmovdqu 96(%rsp), %ymm7
+ vpermq $0x4e, %ymm7, %ymm7
+ vmovdqu 64(%rsp), %ymm8
+ vpermq $0x4e, %ymm8, %ymm8
+ vmovdqu 32(%rsp), %ymm9
+ vpermq $0x4e, %ymm9, %ymm9
+ vmovdqu (%rsp), %ymm10
+ vpermq $0x4e, %ymm10, %ymm10
+ movl %r8d, %r13d
+ andl $0xffffff80, %r13d
+ cmpl %r13d, %r14d
+ jge L_AES_GCM_decrypt_update_vaes_after_128
+ # 128 bytes of input
+ leaq (%r11,%r14,1), %rbx
+ vbroadcasti128 L_vaes_aes_gcm_bswap_mask(%rip), %ymm6
+ vpxor %ymm4, %ymm4, %ymm4
+ vinserti128 $0x00, %xmm15, %ymm4, %ymm4
+ vmovdqu (%rbx), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpxor %ymm4, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm7, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm7, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm7, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm7, %ymm5, %ymm3
+ vmovdqa %ymm0, %ymm11
+ vpxor %ymm1, %ymm2, %ymm12
+ vmovdqa %ymm3, %ymm13
+ vmovdqu 32(%rbx), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm8, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm8, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm8, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm8, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vmovdqu 64(%rbx), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm9, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm9, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm9, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm9, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vmovdqu 96(%rbx), %ymm5
+ vpshufb %ymm6, %ymm5, %ymm5
+ vpclmulqdq $0x00, %ymm10, %ymm5, %ymm0
+ vpclmulqdq $0x01, %ymm10, %ymm5, %ymm1
+ vpclmulqdq $16, %ymm10, %ymm5, %ymm2
+ vpclmulqdq $0x11, %ymm10, %ymm5, %ymm3
+ vpxor %ymm0, %ymm11, %ymm11
+ vpxor %ymm1, %ymm12, %ymm12
+ vpxor %ymm2, %ymm12, %ymm12
+ vpxor %ymm3, %ymm13, %ymm13
+ vpclmulqdq $0x01, %ymm11, %ymm14, %ymm5
+ vpshufd $0x4e, %ymm11, %ymm11
+ vpxor %ymm5, %ymm12, %ymm12
+ vpxor %ymm11, %ymm12, %ymm12
+ vpclmulqdq $0x01, %ymm12, %ymm14, %ymm5
+ vpshufd $0x4e, %ymm12, %ymm12
+ vpxor %ymm5, %ymm13, %ymm13
+ vpxor %ymm12, %ymm13, %ymm13
+ vextracti128 $0x01, %ymm13, %xmm0
+ vpxor %xmm0, %xmm13, %xmm15
+ vbroadcasti128 L_vaes_aes_gcm_bswap_epi64(%rip), %ymm6
+ vbroadcasti128 (%r12), %ymm4
+ vpaddd L_vaes_aes_gcm_inc_y0(%rip), %ymm4, %ymm0
+ vpshufb %ymm6, %ymm0, %ymm0
+ vpaddd L_vaes_aes_gcm_inc_y1(%rip), %ymm4, %ymm1
+ vpshufb %ymm6, %ymm1, %ymm1
+ vpaddd L_vaes_aes_gcm_inc_y2(%rip), %ymm4, %ymm2
+ vpshufb %ymm6, %ymm2, %ymm2
+ vpaddd L_vaes_aes_gcm_inc_y3(%rip), %ymm4, %ymm3
+ vpshufb %ymm6, %ymm3, %ymm3
+ vmovdqu (%r12), %xmm7
+ vpaddd L_vaes_aes_gcm_eight(%rip), %xmm7, %xmm7
+ vmovdqu %xmm7, (%r12)
+ vbroadcasti128 (%rdi), %ymm4
+ vpxor %ymm4, %ymm0, %ymm0
+ vpxor %ymm4, %ymm1, %ymm1
+ vpxor %ymm4, %ymm2, %ymm2
+ vpxor %ymm4, %ymm3, %ymm3
+ vbroadcasti128 16(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 32(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 48(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 64(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 80(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 96(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 112(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 128(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 144(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ cmpl $11, %esi
+ vbroadcasti128 160(%rdi), %ymm4
+ jl L_AES_GCM_decrypt_update_vaes_t_vaes_ctr8_last
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 176(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ cmpl $13, %esi
+ vbroadcasti128 192(%rdi), %ymm4
+ jl L_AES_GCM_decrypt_update_vaes_t_vaes_ctr8_last
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 208(%rdi), %ymm4
+ vaesenc %ymm4, %ymm0, %ymm0
+ vaesenc %ymm4, %ymm1, %ymm1
+ vaesenc %ymm4, %ymm2, %ymm2
+ vaesenc %ymm4, %ymm3, %ymm3
+ vbroadcasti128 224(%rdi), %ymm4
+L_AES_GCM_decrypt_update_vaes_t_vaes_ctr8_last:
+ vaesenclast %ymm4, %ymm0, %ymm0
+ vaesenclast %ymm4, %ymm1, %ymm1
+ vaesenclast %ymm4, %ymm2, %ymm2
+ vaesenclast %ymm4, %ymm3, %ymm3
+ leaq (%r11,%r14,1), %rcx
+ leaq (%r10,%r14,1), %rdx
+ vmovdqu (%rcx), %ymm5
+ vpxor %ymm5, %ymm0, %ymm0
+ vmovdqu %ymm0, (%rdx)
+ vmovdqu 32(%rcx), %ymm5
+ vpxor %ymm5, %ymm1, %ymm1
+ vmovdqu %ymm1, 32(%rdx)
+ vmovdqu 64(%rcx), %ymm5
+ vpxor %ymm5, %ymm2, %ymm2
+ vmovdqu %ymm2, 64(%rdx)
+ vmovdqu 96(%rcx), %ymm5
+ vpxor %ymm5, %ymm3, %ymm3
+ vmovdqu %ymm3, 96(%rdx)
+ addl $0x80, %r14d
+L_AES_GCM_decrypt_update_vaes_after_128:
+ vmovdqu (%rsp), %xmm6
+L_AES_GCM_decrypt_update_vaes_done_128:
+ movl %r8d, %edx
+ cmpl %edx, %r14d
+ jge L_AES_GCM_decrypt_update_vaes_done_dec
+ movl %r8d, %r13d
+ andl $0xfffffff0, %r13d
+ cmpl %r13d, %r14d
+ jge L_AES_GCM_decrypt_update_vaes_last_block_done
+L_AES_GCM_decrypt_update_vaes_last_block_start:
+ vmovdqu (%r11,%r14,1), %xmm12
+ vmovdqa %xmm6, %xmm0
+ vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm12, %xmm1
+ vpxor %xmm15, %xmm1, %xmm1
+ vmovdqu (%r12), %xmm8
+ vpshufb L_vaes_aes_gcm_bswap_epi64(%rip), %xmm8, %xmm7
+ vpaddd L_vaes_aes_gcm_one(%rip), %xmm8, %xmm8
+ vmovdqu %xmm8, (%r12)
+ vpxor (%rdi), %xmm7, %xmm7
+ vpclmulqdq $16, %xmm0, %xmm1, %xmm9
+ vaesenc 16(%rdi), %xmm7, %xmm7
+ vaesenc 32(%rdi), %xmm7, %xmm7
+ vpclmulqdq $0x01, %xmm0, %xmm1, %xmm10
+ vaesenc 48(%rdi), %xmm7, %xmm7
+ vaesenc 64(%rdi), %xmm7, %xmm7
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm11
+ vaesenc 80(%rdi), %xmm7, %xmm7
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm1
+ vaesenc 96(%rdi), %xmm7, %xmm7
+ vpxor %xmm10, %xmm9, %xmm9
+ vpslldq $8, %xmm9, %xmm2
+ vpsrldq $8, %xmm9, %xmm9
+ vaesenc 112(%rdi), %xmm7, %xmm7
+ vpxor %xmm11, %xmm2, %xmm2
+ vpxor %xmm9, %xmm1, %xmm3
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm0
+ vpclmulqdq $16, %xmm0, %xmm2, %xmm10
+ vaesenc 128(%rdi), %xmm7, %xmm7
+ vpshufd $0x4e, %xmm2, %xmm9
+ vpxor %xmm10, %xmm9, %xmm9
+ vpclmulqdq $16, %xmm0, %xmm9, %xmm10
+ vaesenc 144(%rdi), %xmm7, %xmm7
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpxor %xmm10, %xmm9, %xmm9
+ vpxor %xmm3, %xmm9, %xmm15
+ cmpl $11, %esi
+ vmovdqa 160(%rdi), %xmm8
+ jl L_AES_GCM_decrypt_update_vaes_aesenc_gfmul_last
+ vaesenc %xmm8, %xmm7, %xmm7
+ vaesenc 176(%rdi), %xmm7, %xmm7
+ cmpl $13, %esi
+ vmovdqa 192(%rdi), %xmm8
+ jl L_AES_GCM_decrypt_update_vaes_aesenc_gfmul_last
+ vaesenc %xmm8, %xmm7, %xmm7
+ vaesenc 208(%rdi), %xmm7, %xmm7
+ vmovdqa 224(%rdi), %xmm8
+L_AES_GCM_decrypt_update_vaes_aesenc_gfmul_last:
+ vaesenclast %xmm8, %xmm7, %xmm7
+ vmovdqa %xmm12, %xmm0
+ vpxor %xmm0, %xmm7, %xmm7
+ vmovdqu %xmm7, (%r10,%r14,1)
+ addl $16, %r14d
+ cmpl %r13d, %r14d
+ jl L_AES_GCM_decrypt_update_vaes_last_block_start
+L_AES_GCM_decrypt_update_vaes_last_block_done:
+L_AES_GCM_decrypt_update_vaes_done_dec:
+ vmovdqa %xmm15, (%r9)
+ vzeroupper
+ addq $0x210, %rsp
+ popq %rbx
+ popq %r15
+ popq %r14
+ popq %r12
+ popq %r13
+ repz retq
+#ifndef __APPLE__
+.size AES_GCM_decrypt_update_vaes,.-AES_GCM_decrypt_update_vaes
+#endif /* __APPLE__ */
+#ifndef __APPLE__ /* AES_GCM_decrypt_final_vaes: fold two scaled counts into the 16-byte state at rdi, multiply by the value at r9, XOR with 16 bytes from the 1st stack argument, compare with bytes at rsi, store 1 (equal) or 0 through the 2nd stack argument */
+.text
+.globl AES_GCM_decrypt_final_vaes
+.type AES_GCM_decrypt_final_vaes,@function
+.align 16
+AES_GCM_decrypt_final_vaes: /* ELF entry point; arguments are read from rdi, rsi, edx, ecx, r8d, r9 and two stack slots */
+#else
+.section __TEXT,__text
+.globl _AES_GCM_decrypt_final_vaes
+.p2align 4
+_AES_GCM_decrypt_final_vaes: /* Mach-O spelling of the same entry point */
+#endif /* __APPLE__ */
+ pushq %r13 /* r13, rbp, r12 are used as scratch below and restored in reverse order at exit */
+ pushq %rbp
+ pushq %r12
+ movl %edx, %eax /* eax = number of bytes to compare (tested against 16 below) */
+ movl %ecx, %r10d /* r10d = first 32-bit count (presumably the data length in bytes - confirm against the C prototype) */
+ movl %r8d, %r11d /* r11d = second 32-bit count (presumably the AAD length in bytes - confirm against the C prototype) */
+ movq 32(%rsp), %r8 /* 1st stack argument (three pushes plus return address = 32): pointer to 16 bytes XORed into the result */
+ movq 40(%rsp), %rbp /* 2nd stack argument: pointer to the 32-bit result word */
+ subq $16, %rsp /* reserves 16 bytes; no visible instruction references them before the matching addq at exit */
+ vmovdqa (%rdi), %xmm6 /* xmm6 = 16 bytes at rdi (aligned load: rdi must be 16-byte aligned); presumably the GHASH accumulator */
+ vmovdqa (%r9), %xmm5 /* xmm5 = 16 bytes at r9 (aligned load); used as the multiplier below, presumably the hash key */
+ vmovdqa (%r8), %xmm15 /* xmm15 = 16 bytes at the 1st stack argument (aligned load) */
+ vpsrlq $63, %xmm5, %xmm8 /* bit 63 of each qword moved down to bit 0 */
+ vpsllq $0x01, %xmm5, %xmm7 /* each qword shifted left by one bit */
+ vpslldq $8, %xmm8, %xmm8 /* carry out of the low qword moved into the high qword */
+ vpor %xmm8, %xmm7, %xmm7 /* xmm7 = xmm5 shifted left by one bit across all 128 bits */
+ vpshufd $0xff, %xmm5, %xmm5 /* broadcast the top dword of the unshifted value */
+ vpsrad $31, %xmm5, %xmm5 /* all-ones when bit 127 was set, otherwise zero */
+ vpand L_vaes_aes_gcm_mod2_128(%rip), %xmm5, %xmm5 /* keep the mod2_128 constant only when a bit was shifted out */
+ vpxor %xmm7, %xmm5, %xmm5 /* xmm5 = shifted multiplier with the conditional constant applied */
+ movl %r10d, %edx /* 32-bit move zero-extends the first count into rdx */
+ movl %r11d, %ecx /* 32-bit move zero-extends the second count into rcx */
+ shlq $3, %rdx /* multiply by 8 (presumably bytes to bits) */
+ shlq $3, %rcx /* multiply by 8 (presumably bytes to bits) */
+ vmovq %rdx, %xmm0
+ vmovq %rcx, %xmm1
+ vpunpcklqdq %xmm1, %xmm0, %xmm0 /* xmm0 = low qword rdx, high qword rcx */
+ vpxor %xmm0, %xmm6, %xmm6 /* fold both scaled counts into the state loaded from rdi */
+ # ghash_gfmul_red_avx
+ vpclmulqdq $0x00, %xmm5, %xmm6, %xmm7 /* xmm7 = xmm6.lo x xmm5.lo (carry-less) */
+ vpclmulqdq $0x01, %xmm5, %xmm6, %xmm8 /* xmm8 = xmm6.hi x xmm5.lo */
+ vpclmulqdq $16, %xmm5, %xmm6, %xmm9 /* xmm9 = xmm6.lo x xmm5.hi */
+ vpclmulqdq $0x11, %xmm5, %xmm6, %xmm10 /* xmm10 = xmm6.hi x xmm5.hi */
+ vpxor %xmm9, %xmm8, %xmm8 /* xmm8 = sum of the two cross products */
+ vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 /* xmm9 = reduction constant */
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 /* first fold: high qword of the constant x xmm7.lo */
+ vpshufd $0x4e, %xmm7, %xmm7 /* swap the two qword halves of xmm7 */
+ vpxor %xmm11, %xmm8, %xmm8
+ vpxor %xmm7, %xmm8, %xmm8 /* middle term now absorbs the folded low product */
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 /* second fold: high qword of the constant x xmm8.lo */
+ vpshufd $0x4e, %xmm8, %xmm8 /* swap the two qword halves of xmm8 */
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm10 /* xmm10 = reduced 128-bit product */
+ vmovdqa %xmm10, %xmm6
+ vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm6, %xmm6 /* byte shuffle by bswap_mask (table defined outside this section) */
+ vpxor %xmm15, %xmm6, %xmm0 /* xmm0 = computed 16-byte value that is compared with the bytes at rsi */
+ cmpl $16, %eax
+ je L_AES_GCM_decrypt_final_vaes_cmp_tag_16 /* a full 16-byte compare takes the vector path */
+ subq $16, %rsp /* scratch for a byte-addressable copy of the computed value */
+ xorq %rcx, %rcx /* rcx = byte index */
+ xorq %r12, %r12 /* r12b accumulates the OR of all byte differences */
+ vmovdqu %xmm0, (%rsp)
+L_AES_GCM_decrypt_final_vaes_cmp_tag_loop: /* NOTE(review): tested after the first byte with jne, so eax is assumed nonzero and at most 16 (size of the spill) - confirm callers guarantee this */
+ movzbl (%rsp,%rcx,1), %r13d /* computed byte */
+ xorb (%rsi,%rcx,1), %r13b /* XOR with the caller byte: zero when they match */
+ orb %r13b, %r12b /* accumulate; there is no early exit on a mismatch */
+ incl %ecx
+ cmpl %eax, %ecx
+ jne L_AES_GCM_decrypt_final_vaes_cmp_tag_loop
+ cmpb $0x00, %r12b
+ sete %r12b /* r12b = 1 when every compared byte matched, else 0 (upper bits were cleared by the xorq above) */
+ addq $16, %rsp /* release the spill; NOTE(review): the stack copy is not overwritten before release */
+ xorq %rcx, %rcx /* clears the index; rcx is not read again in this function */
+ jmp L_AES_GCM_decrypt_final_vaes_cmp_tag_done
+L_AES_GCM_decrypt_final_vaes_cmp_tag_16: /* 16-byte compare done in one vector operation */
+ vmovdqu (%rsi), %xmm1 /* unaligned load of the 16 caller bytes */
+ vpcmpeqb %xmm1, %xmm0, %xmm0 /* each byte becomes 0xff where equal, 0x00 where different */
+ vpmovmskb %xmm0, %rdx /* one mask bit per byte */
+ # %%edx == 0xFFFF then return 1 else => return 0
+ xorl %r12d, %r12d
+ cmpl $0xffff, %edx
+ sete %r12b /* r12d = 1 only when all 16 bytes matched */
+L_AES_GCM_decrypt_final_vaes_cmp_tag_done:
+ movl %r12d, (%rbp) /* store the 32-bit 1/0 result through the 2nd stack argument */
+ vzeroupper
+ addq $16, %rsp /* release the 16 bytes reserved after the argument loads */
+ popq %r12
+ popq %rbp
+ popq %r13
+ repz retq
+#ifndef __APPLE__
+.size AES_GCM_decrypt_final_vaes,.-AES_GCM_decrypt_final_vaes
+#endif /* __APPLE__ */
+#endif /* WOLFSSL_AESGCM_STREAM */
+#endif /* HAVE_INTEL_VAES */
+#ifdef HAVE_INTEL_AVX512
+#ifndef __APPLE__ /* Data tables under HAVE_INTEL_AVX512: per-lane increments 0..15 and the constant 16; the code that reads them is outside this section */
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 32
+#else
+.p2align 5
+#endif /* __APPLE__ */
+L_avx512_aes_gcm_inc_z0: /* 64 bytes, 32-byte aligned: four 16-byte lanes with low qword 0 and high qword 0, 1, 2, 3 */
+.quad 0x0000000000000000,0x0000000000000000
+.quad 0x0000000000000000,0x0000000000000001
+.quad 0x0000000000000000,0x0000000000000002
+.quad 0x0000000000000000,0x0000000000000003
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 32
+#else
+.p2align 5
+#endif /* __APPLE__ */
+L_avx512_aes_gcm_inc_z1: /* same layout, high qwords 4, 5, 6, 7 */
+.quad 0x0000000000000000,0x0000000000000004
+.quad 0x0000000000000000,0x0000000000000005
+.quad 0x0000000000000000,0x0000000000000006
+.quad 0x0000000000000000,0x0000000000000007
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 32
+#else
+.p2align 5
+#endif /* __APPLE__ */
+L_avx512_aes_gcm_inc_z2: /* same layout, high qwords 8, 9, 10, 11 */
+.quad 0x0000000000000000,0x0000000000000008
+.quad 0x0000000000000000,0x0000000000000009
+.quad 0x0000000000000000,0x000000000000000a
+.quad 0x0000000000000000,0x000000000000000b
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 32
+#else
+.p2align 5
+#endif /* __APPLE__ */
+L_avx512_aes_gcm_inc_z3: /* same layout, high qwords 12, 13, 14, 15 */
+.quad 0x0000000000000000,0x000000000000000c
+.quad 0x0000000000000000,0x000000000000000d
+.quad 0x0000000000000000,0x000000000000000e
+.quad 0x0000000000000000,0x000000000000000f
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_avx512_aes_gcm_sixteen: /* single 16-byte lane, 16-byte aligned: low qword 0, high qword 16 (presumably the counter step after 16 blocks - confirm at the use site) */
+.quad 0x0000000000000000,0x0000000000000010
+#ifndef __APPLE__ /* Constants read by GCM_generate_m0_avx512 */
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_GCM_generate_m0_avx512_rev8: /* vpshufb control: stored bytes are 0x0f down to 0x00, so output byte i takes input byte 15-i (full 16-byte reversal) */
+.quad 0x08090a0b0c0d0e0f,0x0001020304050607
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_GCM_generate_m0_avx512_mod2_128: /* low qword 0, high qword 0xe1 in the top byte; ANDed with a carry mask and XORed in after each one-bit right shift */
+.quad 0x0000000000000000,0xe100000000000000
+#ifndef __APPLE__ /* GCM_generate_m0_avx512: read 16 bytes at rdi (called H below) and write a 32-entry table of 16-byte values (512 bytes) at rsi */
+.text
+.globl GCM_generate_m0_avx512
+.type GCM_generate_m0_avx512,@function
+.align 16
+GCM_generate_m0_avx512: /* ELF entry point; only xmm0-xmm10 are used, every access to rdi and rsi is unaligned-safe (vmovdqu) */
+#else
+.section __TEXT,__text
+.globl _GCM_generate_m0_avx512
+.p2align 4
+_GCM_generate_m0_avx512: /* Mach-O spelling of the same entry point */
+#endif /* __APPLE__ */
+ vmovdqu L_GCM_generate_m0_avx512_rev8(%rip), %xmm9 /* xmm9 = shuffle control that reverses all 16 bytes */
+ vmovdqu L_GCM_generate_m0_avx512_mod2_128(%rip), %xmm10 /* xmm10 = constant XORed in when a one-bit shift drops a set bit */
+ vpxor %xmm8, %xmm8, %xmm8
+ vmovdqu (%rdi), %xmm0 /* xmm0 = H */
+ vmovdqu %xmm8, (%rsi) /* entry 0 = all zero */
+ vmovdqu %xmm0, %xmm8 /* copy of H; xmm8 is overwritten by the vpxor into xmm8 below before it is read again */
+ vpshufb %xmm9, %xmm0, %xmm0 /* byte-reverse H so the qword shifts below act on it as one 128-bit integer */
+ vpsllq $63, %xmm0, %xmm5 /* bit 0 of each qword moved up to bit 63 */
+ vpsrlq $0x01, %xmm0, %xmm4 /* each qword shifted right by one bit */
+ vpslldq $8, %xmm5, %xmm1 /* bit dropped from the low qword placed at bit 127 */
+ vpsrldq $8, %xmm5, %xmm5 /* bit dropped from the high qword carried down to bit 63 */
+ vpshufd $0xff, %xmm1, %xmm1 /* broadcast the dword that holds the dropped bit */
+ vpor %xmm5, %xmm4, %xmm4 /* xmm4 = value shifted right by one bit across all 128 bits */
+ vpsrad $31, %xmm1, %xmm1 /* all-ones when a set bit was dropped, otherwise zero */
+ vpand %xmm10, %xmm1, %xmm1
+ vpxor %xmm4, %xmm1, %xmm1 /* xmm1 = S1: one shift-and-conditional-XOR step applied to H */
+ vpsllq $63, %xmm1, %xmm5 /* same step applied to S1 */
+ vpsrlq $0x01, %xmm1, %xmm4
+ vpslldq $8, %xmm5, %xmm2
+ vpsrldq $8, %xmm5, %xmm5
+ vpshufd $0xff, %xmm2, %xmm2
+ vpor %xmm5, %xmm4, %xmm4
+ vpsrad $31, %xmm2, %xmm2
+ vpand %xmm10, %xmm2, %xmm2
+ vpxor %xmm4, %xmm2, %xmm2 /* xmm2 = S2 */
+ vpsllq $63, %xmm2, %xmm5 /* same step applied to S2 */
+ vpsrlq $0x01, %xmm2, %xmm4
+ vpslldq $8, %xmm5, %xmm3
+ vpsrldq $8, %xmm5, %xmm5
+ vpshufd $0xff, %xmm3, %xmm3
+ vpor %xmm5, %xmm4, %xmm4
+ vpsrad $31, %xmm3, %xmm3
+ vpand %xmm10, %xmm3, %xmm3
+ vpxor %xmm4, %xmm3, %xmm3 /* xmm3 = S3 */
+ vpshufb %xmm9, %xmm3, %xmm3 /* reverse S3, S2, S1 and H back to the input byte order */
+ vpshufb %xmm9, %xmm2, %xmm2
+ vpshufb %xmm9, %xmm1, %xmm1
+ vpshufb %xmm9, %xmm0, %xmm0
+ vpxor %xmm2, %xmm3, %xmm8 /* xmm8 = S2 ^ S3; entries 1..15: index bit 3 selects H, bit 2 S1, bit 1 S2, bit 0 S3 */
+ vmovdqu %xmm3, 16(%rsi) /* entry 1 = S3 */
+ vmovdqu %xmm2, 32(%rsi) /* entry 2 = S2 */
+ vmovdqu %xmm8, 48(%rsi) /* entry 3 = S2 ^ S3 */
+ vmovdqu %xmm1, 64(%rsi) /* entry 4 = S1 */
+ vpxor %xmm1, %xmm3, %xmm4
+ vpxor %xmm1, %xmm2, %xmm5
+ vpxor %xmm1, %xmm8, %xmm6
+ vmovdqu %xmm4, 80(%rsi) /* entry 5 = S1 ^ S3 */
+ vmovdqu %xmm5, 96(%rsi) /* entry 6 = S1 ^ S2 */
+ vmovdqu %xmm6, 112(%rsi) /* entry 7 = S1 ^ S2 ^ S3 */
+ vmovdqu %xmm0, 128(%rsi) /* entry 8 = H */
+ vpxor %xmm0, %xmm1, %xmm1 /* xmm1 = H ^ S1 from here on */
+ vpxor %xmm0, %xmm3, %xmm4
+ vpxor %xmm0, %xmm2, %xmm6
+ vmovdqu %xmm4, 144(%rsi) /* entry 9 = H ^ S3 */
+ vmovdqu %xmm6, 160(%rsi) /* entry 10 = H ^ S2 */
+ vpxor %xmm6, %xmm3, %xmm6
+ vmovdqu %xmm6, 176(%rsi) /* entry 11 = H ^ S2 ^ S3 */
+ vmovdqu %xmm1, 192(%rsi) /* entry 12 = H ^ S1 */
+ vpxor %xmm1, %xmm3, %xmm4
+ vpxor %xmm1, %xmm2, %xmm5
+ vpxor %xmm1, %xmm8, %xmm6
+ vmovdqu %xmm4, 208(%rsi) /* entry 13 = H ^ S1 ^ S3 */
+ vmovdqu %xmm5, 224(%rsi) /* entry 14 = H ^ S1 ^ S2 */
+ vmovdqu %xmm6, 240(%rsi) /* entry 15 = H ^ S1 ^ S2 ^ S3 */
+ vmovdqu (%rsi), %xmm0 /* second half: entry 16+i = entry i shifted right by 4 bits in byte-reversed form; this group handles entries 0..3 */
+ vmovdqu 16(%rsi), %xmm1
+ vmovdqu 32(%rsi), %xmm2
+ vmovdqu 48(%rsi), %xmm3
+ vpshufb %xmm9, %xmm0, %xmm0 /* byte-reverse so the shift acts on one 128-bit integer */
+ vpshufb %xmm9, %xmm1, %xmm1
+ vpshufb %xmm9, %xmm2, %xmm2
+ vpshufb %xmm9, %xmm3, %xmm3
+ vpsllq $60, %xmm0, %xmm4 /* low 4 bits of each qword moved to the top of that qword */
+ vpsllq $60, %xmm1, %xmm5
+ vpsllq $60, %xmm2, %xmm6
+ vpsllq $60, %xmm3, %xmm7
+ vpsrlq $4, %xmm0, %xmm0 /* each qword shifted right by 4 bits */
+ vpsrlq $4, %xmm1, %xmm1
+ vpsrlq $4, %xmm2, %xmm2
+ vpsrlq $4, %xmm3, %xmm3
+ vpsrldq $8, %xmm4, %xmm4 /* carry from the high qword moves to the low qword; the 4 bits leaving the low qword are discarded */
+ vpsrldq $8, %xmm5, %xmm5
+ vpsrldq $8, %xmm6, %xmm6
+ vpsrldq $8, %xmm7, %xmm7
+ vpor %xmm4, %xmm0, %xmm0 /* 128-bit value shifted right by 4, with no constant XORed in */
+ vpor %xmm5, %xmm1, %xmm1
+ vpor %xmm6, %xmm2, %xmm2
+ vpor %xmm7, %xmm3, %xmm3
+ vpshufb %xmm9, %xmm0, %xmm0 /* back to the input byte order */
+ vpshufb %xmm9, %xmm1, %xmm1
+ vpshufb %xmm9, %xmm2, %xmm2
+ vpshufb %xmm9, %xmm3, %xmm3
+ vmovdqu %xmm0, 256(%rsi) /* entries 16..19 */
+ vmovdqu %xmm1, 272(%rsi)
+ vmovdqu %xmm2, 288(%rsi)
+ vmovdqu %xmm3, 304(%rsi)
+ vmovdqu 64(%rsi), %xmm0 /* same 4-bit shift for entries 4..7 */
+ vmovdqu 80(%rsi), %xmm1
+ vmovdqu 96(%rsi), %xmm2
+ vmovdqu 112(%rsi), %xmm3
+ vpshufb %xmm9, %xmm0, %xmm0
+ vpshufb %xmm9, %xmm1, %xmm1
+ vpshufb %xmm9, %xmm2, %xmm2
+ vpshufb %xmm9, %xmm3, %xmm3
+ vpsllq $60, %xmm0, %xmm4
+ vpsllq $60, %xmm1, %xmm5
+ vpsllq $60, %xmm2, %xmm6
+ vpsllq $60, %xmm3, %xmm7
+ vpsrlq $4, %xmm0, %xmm0
+ vpsrlq $4, %xmm1, %xmm1
+ vpsrlq $4, %xmm2, %xmm2
+ vpsrlq $4, %xmm3, %xmm3
+ vpsrldq $8, %xmm4, %xmm4
+ vpsrldq $8, %xmm5, %xmm5
+ vpsrldq $8, %xmm6, %xmm6
+ vpsrldq $8, %xmm7, %xmm7
+ vpor %xmm4, %xmm0, %xmm0
+ vpor %xmm5, %xmm1, %xmm1
+ vpor %xmm6, %xmm2, %xmm2
+ vpor %xmm7, %xmm3, %xmm3
+ vpshufb %xmm9, %xmm0, %xmm0
+ vpshufb %xmm9, %xmm1, %xmm1
+ vpshufb %xmm9, %xmm2, %xmm2
+ vpshufb %xmm9, %xmm3, %xmm3
+ vmovdqu %xmm0, 320(%rsi) /* entries 20..23 */
+ vmovdqu %xmm1, 336(%rsi)
+ vmovdqu %xmm2, 352(%rsi)
+ vmovdqu %xmm3, 368(%rsi)
+ vmovdqu 128(%rsi), %xmm0 /* same 4-bit shift for entries 8..11 */
+ vmovdqu 144(%rsi), %xmm1
+ vmovdqu 160(%rsi), %xmm2
+ vmovdqu 176(%rsi), %xmm3
+ vpshufb %xmm9, %xmm0, %xmm0
+ vpshufb %xmm9, %xmm1, %xmm1
+ vpshufb %xmm9, %xmm2, %xmm2
+ vpshufb %xmm9, %xmm3, %xmm3
+ vpsllq $60, %xmm0, %xmm4
+ vpsllq $60, %xmm1, %xmm5
+ vpsllq $60, %xmm2, %xmm6
+ vpsllq $60, %xmm3, %xmm7
+ vpsrlq $4, %xmm0, %xmm0
+ vpsrlq $4, %xmm1, %xmm1
+ vpsrlq $4, %xmm2, %xmm2
+ vpsrlq $4, %xmm3, %xmm3
+ vpsrldq $8, %xmm4, %xmm4
+ vpsrldq $8, %xmm5, %xmm5
+ vpsrldq $8, %xmm6, %xmm6
+ vpsrldq $8, %xmm7, %xmm7
+ vpor %xmm4, %xmm0, %xmm0
+ vpor %xmm5, %xmm1, %xmm1
+ vpor %xmm6, %xmm2, %xmm2
+ vpor %xmm7, %xmm3, %xmm3
+ vpshufb %xmm9, %xmm0, %xmm0
+ vpshufb %xmm9, %xmm1, %xmm1
+ vpshufb %xmm9, %xmm2, %xmm2
+ vpshufb %xmm9, %xmm3, %xmm3
+ vmovdqu %xmm0, 384(%rsi) /* entries 24..27 */
+ vmovdqu %xmm1, 400(%rsi)
+ vmovdqu %xmm2, 416(%rsi)
+ vmovdqu %xmm3, 432(%rsi)
+ vmovdqu 192(%rsi), %xmm0 /* same 4-bit shift for entries 12..15 */
+ vmovdqu 208(%rsi), %xmm1
+ vmovdqu 224(%rsi), %xmm2
+ vmovdqu 240(%rsi), %xmm3
+ vpshufb %xmm9, %xmm0, %xmm0
+ vpshufb %xmm9, %xmm1, %xmm1
+ vpshufb %xmm9, %xmm2, %xmm2
+ vpshufb %xmm9, %xmm3, %xmm3
+ vpsllq $60, %xmm0, %xmm4
+ vpsllq $60, %xmm1, %xmm5
+ vpsllq $60, %xmm2, %xmm6
+ vpsllq $60, %xmm3, %xmm7
+ vpsrlq $4, %xmm0, %xmm0
+ vpsrlq $4, %xmm1, %xmm1
+ vpsrlq $4, %xmm2, %xmm2
+ vpsrlq $4, %xmm3, %xmm3
+ vpsrldq $8, %xmm4, %xmm4
+ vpsrldq $8, %xmm5, %xmm5
+ vpsrldq $8, %xmm6, %xmm6
+ vpsrldq $8, %xmm7, %xmm7
+ vpor %xmm4, %xmm0, %xmm0
+ vpor %xmm5, %xmm1, %xmm1
+ vpor %xmm6, %xmm2, %xmm2
+ vpor %xmm7, %xmm3, %xmm3
+ vpshufb %xmm9, %xmm0, %xmm0
+ vpshufb %xmm9, %xmm1, %xmm1
+ vpshufb %xmm9, %xmm2, %xmm2
+ vpshufb %xmm9, %xmm3, %xmm3
+ vmovdqu %xmm0, 448(%rsi) /* entries 28..31; the last store ends at byte 511 of the table */
+ vmovdqu %xmm1, 464(%rsi)
+ vmovdqu %xmm2, 480(%rsi)
+ vmovdqu %xmm3, 496(%rsi)
+ repz retq /* no stack or general-purpose register state was changed */
+#ifndef __APPLE__
+.size GCM_generate_m0_avx512,.-GCM_generate_m0_avx512
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_avx512_aes_gcm_one: /* 128-bit constant in the data section, 16-byte aligned on both platform branches */
+.quad 0x0000000000000000,0x0000000000000001 /* low quadword 0, high quadword 1; AES_GCM_encrypt_avx512 adds it to %xmm4 with vpaddd while setting up the counter */
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_avx512_aes_gcm_two: /* 128-bit constant in the data section, 16-byte aligned on both platform branches */
+.quad 0x0000000000000000,0x0000000000000002 /* low quadword 0, high quadword 2; presumably a counter increment like L_avx512_aes_gcm_one - TODO confirm at the use site */
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_avx512_aes_gcm_three: /* 128-bit constant in the data section, 16-byte aligned on both platform branches */
+.quad 0x0000000000000000,0x0000000000000003 /* low quadword 0, high quadword 3; presumably a counter increment like L_avx512_aes_gcm_one - TODO confirm at the use site */
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_avx512_aes_gcm_four: /* 128-bit constant in the data section, 16-byte aligned on both platform branches */
+.quad 0x0000000000000000,0x0000000000000004 /* low quadword 0, high quadword 4; presumably a counter increment like L_avx512_aes_gcm_one - TODO confirm at the use site */
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_avx512_aes_gcm_five: /* 128-bit constant in the data section, 16-byte aligned on both platform branches */
+.quad 0x0000000000000000,0x0000000000000005 /* low quadword 0, high quadword 5; presumably a counter increment like L_avx512_aes_gcm_one - TODO confirm at the use site */
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_avx512_aes_gcm_six: /* 128-bit constant in the data section, 16-byte aligned on both platform branches */
+.quad 0x0000000000000000,0x0000000000000006 /* low quadword 0, high quadword 6; presumably a counter increment like L_avx512_aes_gcm_one - TODO confirm at the use site */
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_avx512_aes_gcm_seven: /* 128-bit constant in the data section, 16-byte aligned on both platform branches */
+.quad 0x0000000000000000,0x0000000000000007 /* low quadword 0, high quadword 7; presumably a counter increment like L_avx512_aes_gcm_one - TODO confirm at the use site */
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_avx512_aes_gcm_eight: /* 128-bit constant in the data section, 16-byte aligned on both platform branches */
+.quad 0x0000000000000000,0x0000000000000008 /* low quadword 0, high quadword 8; presumably a counter increment like L_avx512_aes_gcm_one - TODO confirm at the use site */
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_avx512_aes_gcm_bswap_epi64: /* 16-byte vpshufb control mask; AES_GCM_encrypt_avx512 applies it to %xmm4 and broadcasts it into %zmm22 */
+.quad 0x0001020304050607,0x08090a0b0c0d0e0f /* index bytes in memory order 07..00 then 0f..08: reverses the bytes inside each 64-bit half */
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_avx512_aes_gcm_bswap_mask: /* 16-byte vpshufb control mask; AES_GCM_encrypt_avx512 applies it to H and to each IV/AAD block before the GHASH multiply */
+.quad 0x08090a0b0c0d0e0f,0x0001020304050607 /* index bytes in memory order 0f..00: reverses all 16 bytes of the register */
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_avx512_aes_gcm_mod2_128: /* 128-bit constant; AES_GCM_encrypt_avx512 masks with it (vpand) when deriving H and feeds it to vpclmulqdq after each H-power multiply */
+.quad 0x0000000000000001,0xc200000000000000 /* low quadword 0x1, high quadword 0xc2 in the top byte; presumably the GHASH reduction polynomial constant - TODO confirm */
+#ifndef __APPLE__
+.text
+.globl AES_GCM_encrypt_avx512
+.type AES_GCM_encrypt_avx512,@function
+.align 16
+AES_GCM_encrypt_avx512:
+#else
+.section __TEXT,__text
+.globl _AES_GCM_encrypt_avx512
+.p2align 4
+_AES_GCM_encrypt_avx512:
+#endif /* __APPLE__ */
+ pushq %r13
+ pushq %r12
+ pushq %rbx
+ pushq %r14
+ pushq %r15
+ movq %rdx, %r12
+ movq %rcx, %rax
+ movl 48(%rsp), %r11d
+ movl 56(%rsp), %ebx
+ movl 64(%rsp), %r14d
+ movq 72(%rsp), %r15
+ movl 80(%rsp), %r10d
+ subq $0x440, %rsp
+ vpxor %xmm4, %xmm4, %xmm4
+ vpxor %xmm6, %xmm6, %xmm6
+ movl %ebx, %edx
+ cmpl $12, %edx
+ jne L_AES_GCM_encrypt_avx512_iv_not_12
+ # # Calculate values when IV is 12 bytes
+ # Set counter based on IV
+ movl $0x1000000, %ecx
+ vmovq (%rax), %xmm4
+ vpinsrd $2, 8(%rax), %xmm4, %xmm4
+ vpinsrd $3, %ecx, %xmm4, %xmm4
+ # H = Encrypt X(=0) and T = Encrypt counter
+ vmovdqa (%r15), %xmm5
+ vpxor %xmm5, %xmm4, %xmm1
+ vmovdqa 16(%r15), %xmm7
+ vaesenc %xmm7, %xmm5, %xmm5
+ vaesenc %xmm7, %xmm1, %xmm1
+ vmovdqa 32(%r15), %xmm7
+ vaesenc %xmm7, %xmm5, %xmm5
+ vaesenc %xmm7, %xmm1, %xmm1
+ vmovdqa 48(%r15), %xmm7
+ vaesenc %xmm7, %xmm5, %xmm5
+ vaesenc %xmm7, %xmm1, %xmm1
+ vmovdqa 64(%r15), %xmm7
+ vaesenc %xmm7, %xmm5, %xmm5
+ vaesenc %xmm7, %xmm1, %xmm1
+ vmovdqa 80(%r15), %xmm7
+ vaesenc %xmm7, %xmm5, %xmm5
+ vaesenc %xmm7, %xmm1, %xmm1
+ vmovdqa 96(%r15), %xmm7
+ vaesenc %xmm7, %xmm5, %xmm5
+ vaesenc %xmm7, %xmm1, %xmm1
+ vmovdqa 112(%r15), %xmm7
+ vaesenc %xmm7, %xmm5, %xmm5
+ vaesenc %xmm7, %xmm1, %xmm1
+ vmovdqa 128(%r15), %xmm7
+ vaesenc %xmm7, %xmm5, %xmm5
+ vaesenc %xmm7, %xmm1, %xmm1
+ vmovdqa 144(%r15), %xmm7
+ vaesenc %xmm7, %xmm5, %xmm5
+ vaesenc %xmm7, %xmm1, %xmm1
+ cmpl $11, %r10d
+ vmovdqa 160(%r15), %xmm7
+ jl L_AES_GCM_encrypt_avx512_calc_iv_12_last
+ vaesenc %xmm7, %xmm5, %xmm5
+ vaesenc %xmm7, %xmm1, %xmm1
+ vmovdqa 176(%r15), %xmm7
+ vaesenc %xmm7, %xmm5, %xmm5
+ vaesenc %xmm7, %xmm1, %xmm1
+ cmpl $13, %r10d
+ vmovdqa 192(%r15), %xmm7
+ jl L_AES_GCM_encrypt_avx512_calc_iv_12_last
+ vaesenc %xmm7, %xmm5, %xmm5
+ vaesenc %xmm7, %xmm1, %xmm1
+ vmovdqa 208(%r15), %xmm7
+ vaesenc %xmm7, %xmm5, %xmm5
+ vaesenc %xmm7, %xmm1, %xmm1
+ vmovdqa 224(%r15), %xmm7
+L_AES_GCM_encrypt_avx512_calc_iv_12_last:
+ vaesenclast %xmm7, %xmm5, %xmm5
+ vaesenclast %xmm7, %xmm1, %xmm1
+ vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5
+ vmovdqu %xmm1, 1040(%rsp)
+ jmp L_AES_GCM_encrypt_avx512_iv_done
+L_AES_GCM_encrypt_avx512_iv_not_12:
+ # Calculate values when IV is not 12 bytes
+ # H = Encrypt X(=0)
+ vmovdqa (%r15), %xmm5
+ vaesenc 16(%r15), %xmm5, %xmm5
+ vaesenc 32(%r15), %xmm5, %xmm5
+ vaesenc 48(%r15), %xmm5, %xmm5
+ vaesenc 64(%r15), %xmm5, %xmm5
+ vaesenc 80(%r15), %xmm5, %xmm5
+ vaesenc 96(%r15), %xmm5, %xmm5
+ vaesenc 112(%r15), %xmm5, %xmm5
+ vaesenc 128(%r15), %xmm5, %xmm5
+ vaesenc 144(%r15), %xmm5, %xmm5
+ cmpl $11, %r10d
+ vmovdqa 160(%r15), %xmm9
+ jl L_AES_GCM_encrypt_avx512_calc_iv_1_aesenc_avx_last
+ vaesenc %xmm9, %xmm5, %xmm5
+ vaesenc 176(%r15), %xmm5, %xmm5
+ cmpl $13, %r10d
+ vmovdqa 192(%r15), %xmm9
+ jl L_AES_GCM_encrypt_avx512_calc_iv_1_aesenc_avx_last
+ vaesenc %xmm9, %xmm5, %xmm5
+ vaesenc 208(%r15), %xmm5, %xmm5
+ vmovdqa 224(%r15), %xmm9
+L_AES_GCM_encrypt_avx512_calc_iv_1_aesenc_avx_last:
+ vaesenclast %xmm9, %xmm5, %xmm5
+ vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5
+ # Calc counter
+ # Initialization vector
+ cmpl $0x00, %edx
+ movq $0x00, %rcx
+ je L_AES_GCM_encrypt_avx512_calc_iv_done
+ cmpl $16, %edx
+ jl L_AES_GCM_encrypt_avx512_calc_iv_lt16
+ andl $0xfffffff0, %edx
+L_AES_GCM_encrypt_avx512_calc_iv_16_loop:
+ vmovdqu (%rax,%rcx,1), %xmm8
+ vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8
+ vpxor %xmm8, %xmm4, %xmm4
+ # ghash_gfmul_avx
+ vpshufd $0x4e, %xmm4, %xmm1
+ vpshufd $0x4e, %xmm5, %xmm2
+ vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3
+ vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0
+ vpxor %xmm4, %xmm1, %xmm1
+ vpxor %xmm5, %xmm2, %xmm2
+ vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1
+ vpxor %xmm0, %xmm1, %xmm1
+ vpxor %xmm3, %xmm1, %xmm1
+ vmovdqa %xmm0, %xmm7
+ vmovdqa %xmm3, %xmm4
+ vpslldq $8, %xmm1, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpxor %xmm2, %xmm7, %xmm7
+ vpxor %xmm1, %xmm4, %xmm4
+ vpsrld $31, %xmm7, %xmm0
+ vpsrld $31, %xmm4, %xmm1
+ vpslld $0x01, %xmm7, %xmm7
+ vpslld $0x01, %xmm4, %xmm4
+ vpsrldq $12, %xmm0, %xmm2
+ vpslldq $4, %xmm0, %xmm0
+ vpslldq $4, %xmm1, %xmm1
+ vpor %xmm2, %xmm4, %xmm4
+ vpor %xmm0, %xmm7, %xmm7
+ vpor %xmm1, %xmm4, %xmm4
+ vpslld $31, %xmm7, %xmm0
+ vpslld $30, %xmm7, %xmm1
+ vpslld $25, %xmm7, %xmm2
+ vpxor %xmm1, %xmm0, %xmm0
+ vpxor %xmm2, %xmm0, %xmm0
+ vmovdqa %xmm0, %xmm1
+ vpsrldq $4, %xmm1, %xmm1
+ vpslldq $12, %xmm0, %xmm0
+ vpxor %xmm0, %xmm7, %xmm7
+ vpsrld $0x01, %xmm7, %xmm2
+ vpsrld $2, %xmm7, %xmm3
+ vpsrld $7, %xmm7, %xmm0
+ vpxor %xmm3, %xmm2, %xmm2
+ vpxor %xmm0, %xmm2, %xmm2
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm7, %xmm2, %xmm2
+ vpxor %xmm2, %xmm4, %xmm4
+ addl $16, %ecx
+ cmpl %edx, %ecx
+ jl L_AES_GCM_encrypt_avx512_calc_iv_16_loop
+ movl %ebx, %edx
+ cmpl %edx, %ecx
+ je L_AES_GCM_encrypt_avx512_calc_iv_done
+L_AES_GCM_encrypt_avx512_calc_iv_lt16:
+ subq $16, %rsp
+ vpxor %xmm8, %xmm8, %xmm8
+ xorl %ebx, %ebx
+ vmovdqu %xmm8, (%rsp)
+L_AES_GCM_encrypt_avx512_calc_iv_loop:
+ movzbl (%rax,%rcx,1), %r13d
+ movb %r13b, (%rsp,%rbx,1)
+ incl %ecx
+ incl %ebx
+ cmpl %edx, %ecx
+ jl L_AES_GCM_encrypt_avx512_calc_iv_loop
+ vmovdqu (%rsp), %xmm8
+ addq $16, %rsp
+ vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8
+ vpxor %xmm8, %xmm4, %xmm4
+ # ghash_gfmul_avx
+ vpshufd $0x4e, %xmm4, %xmm1
+ vpshufd $0x4e, %xmm5, %xmm2
+ vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3
+ vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0
+ vpxor %xmm4, %xmm1, %xmm1
+ vpxor %xmm5, %xmm2, %xmm2
+ vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1
+ vpxor %xmm0, %xmm1, %xmm1
+ vpxor %xmm3, %xmm1, %xmm1
+ vmovdqa %xmm0, %xmm7
+ vmovdqa %xmm3, %xmm4
+ vpslldq $8, %xmm1, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpxor %xmm2, %xmm7, %xmm7
+ vpxor %xmm1, %xmm4, %xmm4
+ vpsrld $31, %xmm7, %xmm0
+ vpsrld $31, %xmm4, %xmm1
+ vpslld $0x01, %xmm7, %xmm7
+ vpslld $0x01, %xmm4, %xmm4
+ vpsrldq $12, %xmm0, %xmm2
+ vpslldq $4, %xmm0, %xmm0
+ vpslldq $4, %xmm1, %xmm1
+ vpor %xmm2, %xmm4, %xmm4
+ vpor %xmm0, %xmm7, %xmm7
+ vpor %xmm1, %xmm4, %xmm4
+ vpslld $31, %xmm7, %xmm0
+ vpslld $30, %xmm7, %xmm1
+ vpslld $25, %xmm7, %xmm2
+ vpxor %xmm1, %xmm0, %xmm0
+ vpxor %xmm2, %xmm0, %xmm0
+ vmovdqa %xmm0, %xmm1
+ vpsrldq $4, %xmm1, %xmm1
+ vpslldq $12, %xmm0, %xmm0
+ vpxor %xmm0, %xmm7, %xmm7
+ vpsrld $0x01, %xmm7, %xmm2
+ vpsrld $2, %xmm7, %xmm3
+ vpsrld $7, %xmm7, %xmm0
+ vpxor %xmm3, %xmm2, %xmm2
+ vpxor %xmm0, %xmm2, %xmm2
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm7, %xmm2, %xmm2
+ vpxor %xmm2, %xmm4, %xmm4
+L_AES_GCM_encrypt_avx512_calc_iv_done:
+ # T = Encrypt counter
+ vpxor %xmm0, %xmm0, %xmm0
+ shll $3, %edx
+ vmovq %rdx, %xmm0
+ vpxor %xmm0, %xmm4, %xmm4
+ # ghash_gfmul_avx
+ vpshufd $0x4e, %xmm4, %xmm1
+ vpshufd $0x4e, %xmm5, %xmm2
+ vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3
+ vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0
+ vpxor %xmm4, %xmm1, %xmm1
+ vpxor %xmm5, %xmm2, %xmm2
+ vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1
+ vpxor %xmm0, %xmm1, %xmm1
+ vpxor %xmm3, %xmm1, %xmm1
+ vmovdqa %xmm0, %xmm7
+ vmovdqa %xmm3, %xmm4
+ vpslldq $8, %xmm1, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpxor %xmm2, %xmm7, %xmm7
+ vpxor %xmm1, %xmm4, %xmm4
+ vpsrld $31, %xmm7, %xmm0
+ vpsrld $31, %xmm4, %xmm1
+ vpslld $0x01, %xmm7, %xmm7
+ vpslld $0x01, %xmm4, %xmm4
+ vpsrldq $12, %xmm0, %xmm2
+ vpslldq $4, %xmm0, %xmm0
+ vpslldq $4, %xmm1, %xmm1
+ vpor %xmm2, %xmm4, %xmm4
+ vpor %xmm0, %xmm7, %xmm7
+ vpor %xmm1, %xmm4, %xmm4
+ vpslld $31, %xmm7, %xmm0
+ vpslld $30, %xmm7, %xmm1
+ vpslld $25, %xmm7, %xmm2
+ vpxor %xmm1, %xmm0, %xmm0
+ vpxor %xmm2, %xmm0, %xmm0
+ vmovdqa %xmm0, %xmm1
+ vpsrldq $4, %xmm1, %xmm1
+ vpslldq $12, %xmm0, %xmm0
+ vpxor %xmm0, %xmm7, %xmm7
+ vpsrld $0x01, %xmm7, %xmm2
+ vpsrld $2, %xmm7, %xmm3
+ vpsrld $7, %xmm7, %xmm0
+ vpxor %xmm3, %xmm2, %xmm2
+ vpxor %xmm0, %xmm2, %xmm2
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm7, %xmm2, %xmm2
+ vpxor %xmm2, %xmm4, %xmm4
+ vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4
+ # Encrypt counter
+ vmovdqa (%r15), %xmm8
+ vpxor %xmm4, %xmm8, %xmm8
+ vaesenc 16(%r15), %xmm8, %xmm8
+ vaesenc 32(%r15), %xmm8, %xmm8
+ vaesenc 48(%r15), %xmm8, %xmm8
+ vaesenc 64(%r15), %xmm8, %xmm8
+ vaesenc 80(%r15), %xmm8, %xmm8
+ vaesenc 96(%r15), %xmm8, %xmm8
+ vaesenc 112(%r15), %xmm8, %xmm8
+ vaesenc 128(%r15), %xmm8, %xmm8
+ vaesenc 144(%r15), %xmm8, %xmm8
+ cmpl $11, %r10d
+ vmovdqa 160(%r15), %xmm9
+ jl L_AES_GCM_encrypt_avx512_calc_iv_2_aesenc_avx_last
+ vaesenc %xmm9, %xmm8, %xmm8
+ vaesenc 176(%r15), %xmm8, %xmm8
+ cmpl $13, %r10d
+ vmovdqa 192(%r15), %xmm9
+ jl L_AES_GCM_encrypt_avx512_calc_iv_2_aesenc_avx_last
+ vaesenc %xmm9, %xmm8, %xmm8
+ vaesenc 208(%r15), %xmm8, %xmm8
+ vmovdqa 224(%r15), %xmm9
+L_AES_GCM_encrypt_avx512_calc_iv_2_aesenc_avx_last:
+ vaesenclast %xmm9, %xmm8, %xmm8
+ vmovdqu %xmm8, 1040(%rsp)
+L_AES_GCM_encrypt_avx512_iv_done:
+ # Additional authentication data
+ movl %r11d, %edx
+ cmpl $0x00, %edx
+ je L_AES_GCM_encrypt_avx512_calc_aad_done
+ xorl %ecx, %ecx
+ cmpl $16, %edx
+ jl L_AES_GCM_encrypt_avx512_calc_aad_lt16
+ andl $0xfffffff0, %edx
+L_AES_GCM_encrypt_avx512_calc_aad_16_loop:
+ vmovdqu (%r12,%rcx,1), %xmm8
+ vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8
+ vpxor %xmm8, %xmm6, %xmm6
+ # ghash_gfmul_avx
+ vpshufd $0x4e, %xmm6, %xmm1
+ vpshufd $0x4e, %xmm5, %xmm2
+ vpclmulqdq $0x11, %xmm6, %xmm5, %xmm3
+ vpclmulqdq $0x00, %xmm6, %xmm5, %xmm0
+ vpxor %xmm6, %xmm1, %xmm1
+ vpxor %xmm5, %xmm2, %xmm2
+ vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1
+ vpxor %xmm0, %xmm1, %xmm1
+ vpxor %xmm3, %xmm1, %xmm1
+ vmovdqa %xmm0, %xmm7
+ vmovdqa %xmm3, %xmm6
+ vpslldq $8, %xmm1, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpxor %xmm2, %xmm7, %xmm7
+ vpxor %xmm1, %xmm6, %xmm6
+ vpsrld $31, %xmm7, %xmm0
+ vpsrld $31, %xmm6, %xmm1
+ vpslld $0x01, %xmm7, %xmm7
+ vpslld $0x01, %xmm6, %xmm6
+ vpsrldq $12, %xmm0, %xmm2
+ vpslldq $4, %xmm0, %xmm0
+ vpslldq $4, %xmm1, %xmm1
+ vpor %xmm2, %xmm6, %xmm6
+ vpor %xmm0, %xmm7, %xmm7
+ vpor %xmm1, %xmm6, %xmm6
+ vpslld $31, %xmm7, %xmm0
+ vpslld $30, %xmm7, %xmm1
+ vpslld $25, %xmm7, %xmm2
+ vpxor %xmm1, %xmm0, %xmm0
+ vpxor %xmm2, %xmm0, %xmm0
+ vmovdqa %xmm0, %xmm1
+ vpsrldq $4, %xmm1, %xmm1
+ vpslldq $12, %xmm0, %xmm0
+ vpxor %xmm0, %xmm7, %xmm7
+ vpsrld $0x01, %xmm7, %xmm2
+ vpsrld $2, %xmm7, %xmm3
+ vpsrld $7, %xmm7, %xmm0
+ vpxor %xmm3, %xmm2, %xmm2
+ vpxor %xmm0, %xmm2, %xmm2
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm7, %xmm2, %xmm2
+ vpxor %xmm2, %xmm6, %xmm6
+ addl $16, %ecx
+ cmpl %edx, %ecx
+ jl L_AES_GCM_encrypt_avx512_calc_aad_16_loop
+ movl %r11d, %edx
+ cmpl %edx, %ecx
+ je L_AES_GCM_encrypt_avx512_calc_aad_done
+L_AES_GCM_encrypt_avx512_calc_aad_lt16:
+ subq $16, %rsp
+ vpxor %xmm8, %xmm8, %xmm8
+ xorl %ebx, %ebx
+ vmovdqu %xmm8, (%rsp)
+L_AES_GCM_encrypt_avx512_calc_aad_loop:
+ movzbl (%r12,%rcx,1), %r13d
+ movb %r13b, (%rsp,%rbx,1)
+ incl %ecx
+ incl %ebx
+ cmpl %edx, %ecx
+ jl L_AES_GCM_encrypt_avx512_calc_aad_loop
+ vmovdqu (%rsp), %xmm8
+ addq $16, %rsp
+ vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8
+ vpxor %xmm8, %xmm6, %xmm6
+ # ghash_gfmul_avx
+ vpshufd $0x4e, %xmm6, %xmm1
+ vpshufd $0x4e, %xmm5, %xmm2
+ vpclmulqdq $0x11, %xmm6, %xmm5, %xmm3
+ vpclmulqdq $0x00, %xmm6, %xmm5, %xmm0
+ vpxor %xmm6, %xmm1, %xmm1
+ vpxor %xmm5, %xmm2, %xmm2
+ vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1
+ vpxor %xmm0, %xmm1, %xmm1
+ vpxor %xmm3, %xmm1, %xmm1
+ vmovdqa %xmm0, %xmm7
+ vmovdqa %xmm3, %xmm6
+ vpslldq $8, %xmm1, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpxor %xmm2, %xmm7, %xmm7
+ vpxor %xmm1, %xmm6, %xmm6
+ vpsrld $31, %xmm7, %xmm0
+ vpsrld $31, %xmm6, %xmm1
+ vpslld $0x01, %xmm7, %xmm7
+ vpslld $0x01, %xmm6, %xmm6
+ vpsrldq $12, %xmm0, %xmm2
+ vpslldq $4, %xmm0, %xmm0
+ vpslldq $4, %xmm1, %xmm1
+ vpor %xmm2, %xmm6, %xmm6
+ vpor %xmm0, %xmm7, %xmm7
+ vpor %xmm1, %xmm6, %xmm6
+ vpslld $31, %xmm7, %xmm0
+ vpslld $30, %xmm7, %xmm1
+ vpslld $25, %xmm7, %xmm2
+ vpxor %xmm1, %xmm0, %xmm0
+ vpxor %xmm2, %xmm0, %xmm0
+ vmovdqa %xmm0, %xmm1
+ vpsrldq $4, %xmm1, %xmm1
+ vpslldq $12, %xmm0, %xmm0
+ vpxor %xmm0, %xmm7, %xmm7
+ vpsrld $0x01, %xmm7, %xmm2
+ vpsrld $2, %xmm7, %xmm3
+ vpsrld $7, %xmm7, %xmm0
+ vpxor %xmm3, %xmm2, %xmm2
+ vpxor %xmm0, %xmm2, %xmm2
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm7, %xmm2, %xmm2
+ vpxor %xmm2, %xmm6, %xmm6
+L_AES_GCM_encrypt_avx512_calc_aad_done:
+ # Calculate counter and H
+ vpsrlq $63, %xmm5, %xmm9
+ vpsllq $0x01, %xmm5, %xmm8
+ vpslldq $8, %xmm9, %xmm9
+ vpor %xmm9, %xmm8, %xmm8
+ vpshufd $0xff, %xmm5, %xmm5
+ vpsrad $31, %xmm5, %xmm5
+ vpshufb L_avx512_aes_gcm_bswap_epi64(%rip), %xmm4, %xmm4
+ vpand L_avx512_aes_gcm_mod2_128(%rip), %xmm5, %xmm5
+ vpaddd L_avx512_aes_gcm_one(%rip), %xmm4, %xmm4
+ vpxor %xmm8, %xmm5, %xmm5
+ vmovdqu %xmm4, 1024(%rsp)
+ xorl %ebx, %ebx
+ cmpl $0x100, %r9d
+ jl L_AES_GCM_encrypt_avx512_done_128
+ vmovdqa %xmm6, %xmm2
+ # H ^ 1
+ vmovdqu %xmm5, (%rsp)
+ # H ^ 2
+ vpclmulqdq $0x00, %xmm5, %xmm5, %xmm8
+ vpclmulqdq $0x11, %xmm5, %xmm5, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm0
+ vmovdqu %xmm0, 16(%rsp)
+ # H ^ 3
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm5, %xmm9
+ vpxor %xmm5, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm0, %xmm10
+ vpxor %xmm0, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm5, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm5, %xmm0, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm1
+ vmovdqu %xmm1, 32(%rsp)
+ # H ^ 4
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm3
+ vmovdqu %xmm3, 48(%rsp)
+ # H ^ 5
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 64(%rsp)
+ # H ^ 6
+ vpclmulqdq $0x00, %xmm1, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm1, %xmm1, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 80(%rsp)
+ # H ^ 7
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm1, %xmm9
+ vpxor %xmm1, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm3, %xmm10
+ vpxor %xmm3, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm1, %xmm3, %xmm8
+ vpclmulqdq $0x11, %xmm1, %xmm3, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 96(%rsp)
+ # H ^ 8
+ vpclmulqdq $0x00, %xmm3, %xmm3, %xmm8
+ vpclmulqdq $0x11, %xmm3, %xmm3, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 112(%rsp)
+ # H ^ 9
+ vmovdqu 48(%rsp), %xmm0
+ vmovdqu 64(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 128(%rsp)
+ # H ^ 10
+ vmovdqu 64(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 144(%rsp)
+ # H ^ 11
+ vmovdqu 64(%rsp), %xmm0
+ vmovdqu 80(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 160(%rsp)
+ # H ^ 12
+ vmovdqu 80(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 176(%rsp)
+ # H ^ 13
+ vmovdqu 80(%rsp), %xmm0
+ vmovdqu 96(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 192(%rsp)
+ # H ^ 14
+ vmovdqu 96(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 208(%rsp)
+ # H ^ 15
+ vmovdqu 96(%rsp), %xmm0
+ vmovdqu 112(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 224(%rsp)
+ # H ^ 16
+ vmovdqu 112(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 240(%rsp)
+ cmpl $0x200, %r9d
+ jl L_AES_GCM_encrypt_avx512_no_ext
+ # H ^ 17
+ vmovdqu 112(%rsp), %xmm0
+ vmovdqu 128(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 256(%rsp)
+ # H ^ 18
+ vmovdqu 128(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 272(%rsp)
+ # H ^ 19
+ vmovdqu 128(%rsp), %xmm0
+ vmovdqu 144(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 288(%rsp)
+ # H ^ 20
+ vmovdqu 144(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 304(%rsp)
+ # H ^ 21
+ vmovdqu 144(%rsp), %xmm0
+ vmovdqu 160(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 320(%rsp)
+ # H ^ 22
+ vmovdqu 160(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 336(%rsp)
+ # H ^ 23
+ vmovdqu 160(%rsp), %xmm0
+ vmovdqu 176(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 352(%rsp)
+ # H ^ 24
+ vmovdqu 176(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 368(%rsp)
+ # H ^ 25
+ vmovdqu 176(%rsp), %xmm0
+ vmovdqu 192(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 384(%rsp)
+ # H ^ 26
+ vmovdqu 192(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 400(%rsp)
+ # H ^ 27
+ vmovdqu 192(%rsp), %xmm0
+ vmovdqu 208(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 416(%rsp)
+ # H ^ 28
+ vmovdqu 208(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 432(%rsp)
+ # H ^ 29
+ vmovdqu 208(%rsp), %xmm0
+ vmovdqu 224(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 448(%rsp)
+ # H ^ 30
+ vmovdqu 224(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 464(%rsp)
+ # H ^ 31
+ vmovdqu 224(%rsp), %xmm0
+ vmovdqu 240(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 480(%rsp)
+ # H ^ 32
+ vmovdqu 240(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 496(%rsp)
+L_AES_GCM_encrypt_avx512_no_ext:
+ vbroadcasti32x4 L_avx512_aes_gcm_bswap_epi64(%rip), %zmm22
+ vbroadcasti32x4 L_avx512_aes_gcm_bswap_mask(%rip), %zmm30
+ vbroadcasti32x4 L_avx512_aes_gcm_mod2_128(%rip), %zmm31
+ vbroadcasti32x4 (%r15), %zmm9
+ vbroadcasti32x4 16(%r15), %zmm10
+ vbroadcasti32x4 32(%r15), %zmm11
+ vbroadcasti32x4 48(%r15), %zmm12
+ vbroadcasti32x4 64(%r15), %zmm13
+ vbroadcasti32x4 80(%r15), %zmm14
+ vbroadcasti32x4 96(%r15), %zmm15
+ vbroadcasti32x4 112(%r15), %zmm1
+ vbroadcasti32x4 128(%r15), %zmm2
+ vbroadcasti32x4 144(%r15), %zmm3
+ cmpl $0x200, %r9d
+ jl L_AES_GCM_encrypt_avx512_no_windows
+ movl %r9d, %r13d
+ andl $0xfffffe00, %r13d
+ vmovdqu64 448(%rsp), %zmm23
+ vshufi64x2 $27, %zmm23, %zmm23, %zmm23
+ vmovdqu64 384(%rsp), %zmm24
+ vshufi64x2 $27, %zmm24, %zmm24, %zmm24
+ vmovdqu64 320(%rsp), %zmm25
+ vshufi64x2 $27, %zmm25, %zmm25, %zmm25
+ vmovdqu64 256(%rsp), %zmm26
+ vshufi64x2 $27, %zmm26, %zmm26, %zmm26
+ vmovdqu64 %zmm23, 512(%rsp)
+ vmovdqu64 %zmm24, 576(%rsp)
+ vmovdqu64 %zmm25, 640(%rsp)
+ vmovdqu64 %zmm26, 704(%rsp)
+ vmovdqu64 192(%rsp), %zmm23
+ vshufi64x2 $27, %zmm23, %zmm23, %zmm23
+ vmovdqu64 128(%rsp), %zmm24
+ vshufi64x2 $27, %zmm24, %zmm24, %zmm24
+ vmovdqu64 64(%rsp), %zmm25
+ vshufi64x2 $27, %zmm25, %zmm25, %zmm25
+ vmovdqu64 (%rsp), %zmm26
+ vshufi64x2 $27, %zmm26, %zmm26, %zmm26
+ vmovdqu64 %zmm23, 768(%rsp)
+ vmovdqu64 %zmm24, 832(%rsp)
+ vmovdqu64 %zmm25, 896(%rsp)
+ vmovdqu64 %zmm26, 960(%rsp)
+ # 512 bytes of input
+ leaq (%rsi,%rbx,1), %rcx
+ movq %rcx, 1056(%rsp)
+ vbroadcasti32x4 1024(%rsp), %zmm20
+ vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16
+ vpshufb %zmm22, %zmm16, %zmm16
+ vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17
+ vpshufb %zmm22, %zmm17, %zmm17
+ vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18
+ vpshufb %zmm22, %zmm18, %zmm18
+ vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19
+ vpshufb %zmm22, %zmm19, %zmm19
+ vmovdqu 1024(%rsp), %xmm8
+ vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8
+ vmovdqu %xmm8, 1024(%rsp)
+ vpxorq %zmm9, %zmm16, %zmm16
+ vpxorq %zmm9, %zmm17, %zmm17
+ vpxorq %zmm9, %zmm18, %zmm18
+ vpxorq %zmm9, %zmm19, %zmm19
+ vaesenc %zmm10, %zmm16, %zmm16
+ vaesenc %zmm10, %zmm17, %zmm17
+ vaesenc %zmm10, %zmm18, %zmm18
+ vaesenc %zmm10, %zmm19, %zmm19
+ vaesenc %zmm11, %zmm16, %zmm16
+ vaesenc %zmm11, %zmm17, %zmm17
+ vaesenc %zmm11, %zmm18, %zmm18
+ vaesenc %zmm11, %zmm19, %zmm19
+ vaesenc %zmm12, %zmm16, %zmm16
+ vaesenc %zmm12, %zmm17, %zmm17
+ vaesenc %zmm12, %zmm18, %zmm18
+ vaesenc %zmm12, %zmm19, %zmm19
+ vaesenc %zmm13, %zmm16, %zmm16
+ vaesenc %zmm13, %zmm17, %zmm17
+ vaesenc %zmm13, %zmm18, %zmm18
+ vaesenc %zmm13, %zmm19, %zmm19
+ vaesenc %zmm14, %zmm16, %zmm16
+ vaesenc %zmm14, %zmm17, %zmm17
+ vaesenc %zmm14, %zmm18, %zmm18
+ vaesenc %zmm14, %zmm19, %zmm19
+ vaesenc %zmm15, %zmm16, %zmm16
+ vaesenc %zmm15, %zmm17, %zmm17
+ vaesenc %zmm15, %zmm18, %zmm18
+ vaesenc %zmm15, %zmm19, %zmm19
+ vaesenc %zmm1, %zmm16, %zmm16
+ vaesenc %zmm1, %zmm17, %zmm17
+ vaesenc %zmm1, %zmm18, %zmm18
+ vaesenc %zmm1, %zmm19, %zmm19
+ vaesenc %zmm2, %zmm16, %zmm16
+ vaesenc %zmm2, %zmm17, %zmm17
+ vaesenc %zmm2, %zmm18, %zmm18
+ vaesenc %zmm2, %zmm19, %zmm19
+ vaesenc %zmm3, %zmm16, %zmm16
+ vaesenc %zmm3, %zmm17, %zmm17
+ vaesenc %zmm3, %zmm18, %zmm18
+ vaesenc %zmm3, %zmm19, %zmm19
+ cmpl $11, %r10d
+ vbroadcasti32x4 160(%r15), %zmm20
+ jl L_AES_GCM_encrypt_avx512_p1_avx512_ctr16_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 176(%r15), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ cmpl $13, %r10d
+ vbroadcasti32x4 192(%r15), %zmm20
+ jl L_AES_GCM_encrypt_avx512_p1_avx512_ctr16_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 208(%r15), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 224(%r15), %zmm20
+L_AES_GCM_encrypt_avx512_p1_avx512_ctr16_last:
+ vaesenclast %zmm20, %zmm16, %zmm16
+ vaesenclast %zmm20, %zmm17, %zmm17
+ vaesenclast %zmm20, %zmm18, %zmm18
+ vaesenclast %zmm20, %zmm19, %zmm19
+ leaq (%rdi,%rbx,1), %rcx
+ leaq (%rsi,%rbx,1), %rdx
+ vmovdqu64 (%rcx), %zmm21
+ vpxorq %zmm21, %zmm16, %zmm16
+ vmovdqu64 %zmm16, (%rdx)
+ vmovdqu64 64(%rcx), %zmm21
+ vpxorq %zmm21, %zmm17, %zmm17
+ vmovdqu64 %zmm17, 64(%rdx)
+ vmovdqu64 128(%rcx), %zmm21
+ vpxorq %zmm21, %zmm18, %zmm18
+ vmovdqu64 %zmm18, 128(%rdx)
+ vmovdqu64 192(%rcx), %zmm21
+ vpxorq %zmm21, %zmm19, %zmm19
+ vmovdqu64 %zmm19, 192(%rdx)
+ addl $0x100, %ebx
+ vbroadcasti32x4 1024(%rsp), %zmm20
+ vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16
+ vpshufb %zmm22, %zmm16, %zmm16
+ vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17
+ vpshufb %zmm22, %zmm17, %zmm17
+ vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18
+ vpshufb %zmm22, %zmm18, %zmm18
+ vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19
+ vpshufb %zmm22, %zmm19, %zmm19
+ vmovdqu 1024(%rsp), %xmm8
+ vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8
+ vmovdqu %xmm8, 1024(%rsp)
+ vpxorq %zmm9, %zmm16, %zmm16
+ vpxorq %zmm9, %zmm17, %zmm17
+ vpxorq %zmm9, %zmm18, %zmm18
+ vpxorq %zmm9, %zmm19, %zmm19
+ vaesenc %zmm10, %zmm16, %zmm16
+ vaesenc %zmm10, %zmm17, %zmm17
+ vaesenc %zmm10, %zmm18, %zmm18
+ vaesenc %zmm10, %zmm19, %zmm19
+ vaesenc %zmm11, %zmm16, %zmm16
+ vaesenc %zmm11, %zmm17, %zmm17
+ vaesenc %zmm11, %zmm18, %zmm18
+ vaesenc %zmm11, %zmm19, %zmm19
+ vaesenc %zmm12, %zmm16, %zmm16
+ vaesenc %zmm12, %zmm17, %zmm17
+ vaesenc %zmm12, %zmm18, %zmm18
+ vaesenc %zmm12, %zmm19, %zmm19
+ vaesenc %zmm13, %zmm16, %zmm16
+ vaesenc %zmm13, %zmm17, %zmm17
+ vaesenc %zmm13, %zmm18, %zmm18
+ vaesenc %zmm13, %zmm19, %zmm19
+ vaesenc %zmm14, %zmm16, %zmm16
+ vaesenc %zmm14, %zmm17, %zmm17
+ vaesenc %zmm14, %zmm18, %zmm18
+ vaesenc %zmm14, %zmm19, %zmm19
+ vaesenc %zmm15, %zmm16, %zmm16
+ vaesenc %zmm15, %zmm17, %zmm17
+ vaesenc %zmm15, %zmm18, %zmm18
+ vaesenc %zmm15, %zmm19, %zmm19
+ vaesenc %zmm1, %zmm16, %zmm16
+ vaesenc %zmm1, %zmm17, %zmm17
+ vaesenc %zmm1, %zmm18, %zmm18
+ vaesenc %zmm1, %zmm19, %zmm19
+ vaesenc %zmm2, %zmm16, %zmm16
+ vaesenc %zmm2, %zmm17, %zmm17
+ vaesenc %zmm2, %zmm18, %zmm18
+ vaesenc %zmm2, %zmm19, %zmm19
+ vaesenc %zmm3, %zmm16, %zmm16
+ vaesenc %zmm3, %zmm17, %zmm17
+ vaesenc %zmm3, %zmm18, %zmm18
+ vaesenc %zmm3, %zmm19, %zmm19
+ cmpl $11, %r10d
+ vbroadcasti32x4 160(%r15), %zmm20
+ jl L_AES_GCM_encrypt_avx512_p2_avx512_ctr16_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 176(%r15), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ cmpl $13, %r10d
+ vbroadcasti32x4 192(%r15), %zmm20
+ jl L_AES_GCM_encrypt_avx512_p2_avx512_ctr16_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 208(%r15), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 224(%r15), %zmm20
+L_AES_GCM_encrypt_avx512_p2_avx512_ctr16_last:
+ vaesenclast %zmm20, %zmm16, %zmm16
+ vaesenclast %zmm20, %zmm17, %zmm17
+ vaesenclast %zmm20, %zmm18, %zmm18
+ vaesenclast %zmm20, %zmm19, %zmm19
+ leaq (%rdi,%rbx,1), %rcx
+ leaq (%rsi,%rbx,1), %rdx
+ vmovdqu64 (%rcx), %zmm21
+ vpxorq %zmm21, %zmm16, %zmm16
+ vmovdqu64 %zmm16, (%rdx)
+ vmovdqu64 64(%rcx), %zmm21
+ vpxorq %zmm21, %zmm17, %zmm17
+ vmovdqu64 %zmm17, 64(%rdx)
+ vmovdqu64 128(%rcx), %zmm21
+ vpxorq %zmm21, %zmm18, %zmm18
+ vmovdqu64 %zmm18, 128(%rdx)
+ vmovdqu64 192(%rcx), %zmm21
+ vpxorq %zmm21, %zmm19, %zmm19
+ vmovdqu64 %zmm19, 192(%rdx)
+ addl $0x100, %ebx
+ cmpl %r13d, %ebx
+ jge L_AES_GCM_encrypt_avx512_last_win
+L_AES_GCM_encrypt_avx512_win_loop:
+ leaq (%rsi,%rbx,1), %rcx
+ movq %rcx, 1072(%rsp)
+ movq 1056(%rsp), %r12
+ vpxorq %zmm21, %zmm21, %zmm21
+ vinserti32x4 $0x00, %xmm6, %zmm21, %zmm21
+ vbroadcasti32x4 1024(%rsp), %zmm20
+ vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16
+ vpshufb %zmm22, %zmm16, %zmm16
+ vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17
+ vpshufb %zmm22, %zmm17, %zmm17
+ vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18
+ vpshufb %zmm22, %zmm18, %zmm18
+ vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19
+ vpshufb %zmm22, %zmm19, %zmm19
+ vmovdqu 1024(%rsp), %xmm8
+ vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8
+ vmovdqu %xmm8, 1024(%rsp)
+ vpxorq %zmm9, %zmm16, %zmm16
+ vpxorq %zmm9, %zmm17, %zmm17
+ vpxorq %zmm9, %zmm18, %zmm18
+ vpxorq %zmm9, %zmm19, %zmm19
+ vmovdqu64 (%r12), %zmm31
+ vpshufb %zmm30, %zmm31, %zmm31
+ vpxorq %zmm21, %zmm31, %zmm31
+ vaesenc %zmm10, %zmm16, %zmm16
+ vaesenc %zmm10, %zmm17, %zmm17
+ vaesenc %zmm10, %zmm18, %zmm18
+ vaesenc %zmm10, %zmm19, %zmm19
+ vpclmulqdq $0x00, 512(%rsp), %zmm31, %zmm23
+ vpclmulqdq $0x01, 512(%rsp), %zmm31, %zmm24
+ vpclmulqdq $16, 512(%rsp), %zmm31, %zmm25
+ vpclmulqdq $0x11, 512(%rsp), %zmm31, %zmm26
+ vmovdqa64 %zmm23, %zmm27
+ vpxorq %zmm24, %zmm25, %zmm28
+ vmovdqa64 %zmm26, %zmm29
+ vaesenc %zmm11, %zmm16, %zmm16
+ vaesenc %zmm11, %zmm17, %zmm17
+ vaesenc %zmm11, %zmm18, %zmm18
+ vaesenc %zmm11, %zmm19, %zmm19
+ vmovdqu64 64(%r12), %zmm31
+ vpshufb %zmm30, %zmm31, %zmm31
+ vaesenc %zmm12, %zmm16, %zmm16
+ vaesenc %zmm12, %zmm17, %zmm17
+ vaesenc %zmm12, %zmm18, %zmm18
+ vaesenc %zmm12, %zmm19, %zmm19
+ vpclmulqdq $0x00, 576(%rsp), %zmm31, %zmm23
+ vpclmulqdq $0x01, 576(%rsp), %zmm31, %zmm24
+ vpclmulqdq $16, 576(%rsp), %zmm31, %zmm25
+ vpclmulqdq $0x11, 576(%rsp), %zmm31, %zmm26
+ vpxorq %zmm23, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm24, %zmm25, %zmm28
+ vpxorq %zmm26, %zmm29, %zmm29
+ vaesenc %zmm13, %zmm16, %zmm16
+ vaesenc %zmm13, %zmm17, %zmm17
+ vaesenc %zmm13, %zmm18, %zmm18
+ vaesenc %zmm13, %zmm19, %zmm19
+ vmovdqu64 128(%r12), %zmm31
+ vpshufb %zmm30, %zmm31, %zmm31
+ vaesenc %zmm14, %zmm16, %zmm16
+ vaesenc %zmm14, %zmm17, %zmm17
+ vaesenc %zmm14, %zmm18, %zmm18
+ vaesenc %zmm14, %zmm19, %zmm19
+ vpclmulqdq $0x00, 640(%rsp), %zmm31, %zmm23
+ vpclmulqdq $0x01, 640(%rsp), %zmm31, %zmm24
+ vpclmulqdq $16, 640(%rsp), %zmm31, %zmm25
+ vpclmulqdq $0x11, 640(%rsp), %zmm31, %zmm26
+ vpxorq %zmm23, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm24, %zmm25, %zmm28
+ vpxorq %zmm26, %zmm29, %zmm29
+ vaesenc %zmm15, %zmm16, %zmm16
+ vaesenc %zmm15, %zmm17, %zmm17
+ vaesenc %zmm15, %zmm18, %zmm18
+ vaesenc %zmm15, %zmm19, %zmm19
+ vmovdqu64 192(%r12), %zmm31
+ vpshufb %zmm30, %zmm31, %zmm31
+ vaesenc %zmm1, %zmm16, %zmm16
+ vaesenc %zmm1, %zmm17, %zmm17
+ vaesenc %zmm1, %zmm18, %zmm18
+ vaesenc %zmm1, %zmm19, %zmm19
+ vpclmulqdq $0x00, 704(%rsp), %zmm31, %zmm23
+ vpclmulqdq $0x01, 704(%rsp), %zmm31, %zmm24
+ vpclmulqdq $16, 704(%rsp), %zmm31, %zmm25
+ vpclmulqdq $0x11, 704(%rsp), %zmm31, %zmm26
+ vpxorq %zmm23, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm24, %zmm25, %zmm28
+ vpxorq %zmm26, %zmm29, %zmm29
+ vaesenc %zmm2, %zmm16, %zmm16
+ vaesenc %zmm2, %zmm17, %zmm17
+ vaesenc %zmm2, %zmm18, %zmm18
+ vaesenc %zmm2, %zmm19, %zmm19
+ vaesenc %zmm3, %zmm16, %zmm16
+ vaesenc %zmm3, %zmm17, %zmm17
+ vaesenc %zmm3, %zmm18, %zmm18
+ vaesenc %zmm3, %zmm19, %zmm19
+ cmpl $11, %r10d
+ vbroadcasti32x4 160(%r15), %zmm20
+ jl L_AES_GCM_encrypt_avx512_a_il_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 176(%r15), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ cmpl $13, %r10d
+ vbroadcasti32x4 192(%r15), %zmm20
+ jl L_AES_GCM_encrypt_avx512_a_il_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 208(%r15), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 224(%r15), %zmm20
+L_AES_GCM_encrypt_avx512_a_il_last:
+ vaesenclast %zmm20, %zmm16, %zmm16
+ vaesenclast %zmm20, %zmm17, %zmm17
+ vaesenclast %zmm20, %zmm18, %zmm18
+ vaesenclast %zmm20, %zmm19, %zmm19
+ leaq (%rdi,%rbx,1), %rcx
+ leaq (%rsi,%rbx,1), %rdx
+ vmovdqu64 (%rcx), %zmm21
+ vpxorq %zmm21, %zmm16, %zmm16
+ vmovdqu64 %zmm16, (%rdx)
+ vmovdqu64 64(%rcx), %zmm21
+ vpxorq %zmm21, %zmm17, %zmm17
+ vmovdqu64 %zmm17, 64(%rdx)
+ vmovdqu64 128(%rcx), %zmm21
+ vpxorq %zmm21, %zmm18, %zmm18
+ vmovdqu64 %zmm18, 128(%rdx)
+ vmovdqu64 192(%rcx), %zmm21
+ vpxorq %zmm21, %zmm19, %zmm19
+ vmovdqu64 %zmm19, 192(%rdx)
+ addl $0x100, %ebx
+ vbroadcasti32x4 1024(%rsp), %zmm20
+ vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16
+ vpshufb %zmm22, %zmm16, %zmm16
+ vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17
+ vpshufb %zmm22, %zmm17, %zmm17
+ vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18
+ vpshufb %zmm22, %zmm18, %zmm18
+ vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19
+ vpshufb %zmm22, %zmm19, %zmm19
+ vmovdqu 1024(%rsp), %xmm8
+ vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8
+ vmovdqu %xmm8, 1024(%rsp)
+ vpxorq %zmm9, %zmm16, %zmm16
+ vpxorq %zmm9, %zmm17, %zmm17
+ vpxorq %zmm9, %zmm18, %zmm18
+ vpxorq %zmm9, %zmm19, %zmm19
+ vmovdqu64 256(%r12), %zmm31
+ vpshufb %zmm30, %zmm31, %zmm31
+ vaesenc %zmm10, %zmm16, %zmm16
+ vaesenc %zmm10, %zmm17, %zmm17
+ vaesenc %zmm10, %zmm18, %zmm18
+ vaesenc %zmm10, %zmm19, %zmm19
+ vpclmulqdq $0x00, 768(%rsp), %zmm31, %zmm23
+ vpclmulqdq $0x01, 768(%rsp), %zmm31, %zmm24
+ vpclmulqdq $16, 768(%rsp), %zmm31, %zmm25
+ vpclmulqdq $0x11, 768(%rsp), %zmm31, %zmm26
+ vpxorq %zmm23, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm24, %zmm25, %zmm28
+ vpxorq %zmm26, %zmm29, %zmm29
+ vaesenc %zmm11, %zmm16, %zmm16
+ vaesenc %zmm11, %zmm17, %zmm17
+ vaesenc %zmm11, %zmm18, %zmm18
+ vaesenc %zmm11, %zmm19, %zmm19
+ vmovdqu64 320(%r12), %zmm31
+ vpshufb %zmm30, %zmm31, %zmm31
+ vaesenc %zmm12, %zmm16, %zmm16
+ vaesenc %zmm12, %zmm17, %zmm17
+ vaesenc %zmm12, %zmm18, %zmm18
+ vaesenc %zmm12, %zmm19, %zmm19
+ vpclmulqdq $0x00, 832(%rsp), %zmm31, %zmm23
+ vpclmulqdq $0x01, 832(%rsp), %zmm31, %zmm24
+ vpclmulqdq $16, 832(%rsp), %zmm31, %zmm25
+ vpclmulqdq $0x11, 832(%rsp), %zmm31, %zmm26
+ vpxorq %zmm23, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm24, %zmm25, %zmm28
+ vpxorq %zmm26, %zmm29, %zmm29
+ vaesenc %zmm13, %zmm16, %zmm16
+ vaesenc %zmm13, %zmm17, %zmm17
+ vaesenc %zmm13, %zmm18, %zmm18
+ vaesenc %zmm13, %zmm19, %zmm19
+ vmovdqu64 384(%r12), %zmm31
+ vpshufb %zmm30, %zmm31, %zmm31
+ vaesenc %zmm14, %zmm16, %zmm16
+ vaesenc %zmm14, %zmm17, %zmm17
+ vaesenc %zmm14, %zmm18, %zmm18
+ vaesenc %zmm14, %zmm19, %zmm19
+ vpclmulqdq $0x00, 896(%rsp), %zmm31, %zmm23
+ vpclmulqdq $0x01, 896(%rsp), %zmm31, %zmm24
+ vpclmulqdq $16, 896(%rsp), %zmm31, %zmm25
+ vpclmulqdq $0x11, 896(%rsp), %zmm31, %zmm26
+ vpxorq %zmm23, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm24, %zmm25, %zmm28
+ vpxorq %zmm26, %zmm29, %zmm29
+ vaesenc %zmm15, %zmm16, %zmm16
+ vaesenc %zmm15, %zmm17, %zmm17
+ vaesenc %zmm15, %zmm18, %zmm18
+ vaesenc %zmm15, %zmm19, %zmm19
+ vmovdqu64 448(%r12), %zmm31
+ vpshufb %zmm30, %zmm31, %zmm31
+ vaesenc %zmm1, %zmm16, %zmm16
+ vaesenc %zmm1, %zmm17, %zmm17
+ vaesenc %zmm1, %zmm18, %zmm18
+ vaesenc %zmm1, %zmm19, %zmm19
+ vpclmulqdq $0x00, 960(%rsp), %zmm31, %zmm23
+ vpclmulqdq $0x01, 960(%rsp), %zmm31, %zmm24
+ vpclmulqdq $16, 960(%rsp), %zmm31, %zmm25
+ vpclmulqdq $0x11, 960(%rsp), %zmm31, %zmm26
+ vpxorq %zmm23, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm24, %zmm25, %zmm28
+ vpxorq %zmm26, %zmm29, %zmm29
+ vaesenc %zmm2, %zmm16, %zmm16
+ vaesenc %zmm2, %zmm17, %zmm17
+ vaesenc %zmm2, %zmm18, %zmm18
+ vaesenc %zmm2, %zmm19, %zmm19
+ vaesenc %zmm3, %zmm16, %zmm16
+ vaesenc %zmm3, %zmm17, %zmm17
+ vaesenc %zmm3, %zmm18, %zmm18
+ vaesenc %zmm3, %zmm19, %zmm19
+ cmpl $11, %r10d
+ vbroadcasti32x4 160(%r15), %zmm20
+ jl L_AES_GCM_encrypt_avx512_b_il_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 176(%r15), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ cmpl $13, %r10d
+ vbroadcasti32x4 192(%r15), %zmm20
+ jl L_AES_GCM_encrypt_avx512_b_il_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 208(%r15), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 224(%r15), %zmm20
+L_AES_GCM_encrypt_avx512_b_il_last:
+ vaesenclast %zmm20, %zmm16, %zmm16
+ vaesenclast %zmm20, %zmm17, %zmm17
+ vaesenclast %zmm20, %zmm18, %zmm18
+ vaesenclast %zmm20, %zmm19, %zmm19
+ leaq (%rdi,%rbx,1), %rcx
+ leaq (%rsi,%rbx,1), %rdx
+ vmovdqu64 (%rcx), %zmm21
+ vpxorq %zmm21, %zmm16, %zmm16
+ vmovdqu64 %zmm16, (%rdx)
+ vmovdqu64 64(%rcx), %zmm21
+ vpxorq %zmm21, %zmm17, %zmm17
+ vmovdqu64 %zmm17, 64(%rdx)
+ vmovdqu64 128(%rcx), %zmm21
+ vpxorq %zmm21, %zmm18, %zmm18
+ vmovdqu64 %zmm18, 128(%rdx)
+ vmovdqu64 192(%rcx), %zmm21
+ vpxorq %zmm21, %zmm19, %zmm19
+ vmovdqu64 %zmm19, 192(%rdx)
+ addl $0x100, %ebx
+ vbroadcasti32x4 L_avx512_aes_gcm_mod2_128(%rip), %zmm31
+ vpclmulqdq $0x01, %zmm27, %zmm31, %zmm23
+ vpshufd $0x4e, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm23, %zmm27, %zmm28
+ vpclmulqdq $0x01, %zmm28, %zmm31, %zmm23
+ vpshufd $0x4e, %zmm28, %zmm28
+ vpternlogq $0x96, %zmm23, %zmm28, %zmm29
+ vextracti32x4 $0x01, %zmm29, %xmm0
+ vextracti32x4 $2, %zmm29, %xmm4
+ vextracti32x4 $3, %zmm29, %xmm5
+ vpxorq %xmm0, %xmm29, %xmm6
+ vpternlogq $0x96, %xmm4, %xmm5, %xmm6
+ movq 1072(%rsp), %rcx
+ movq %rcx, 1056(%rsp)
+ cmpl %r13d, %ebx
+ jl L_AES_GCM_encrypt_avx512_win_loop
+L_AES_GCM_encrypt_avx512_last_win:
+ movq 1056(%rsp), %rcx
+ vpxorq %zmm20, %zmm20, %zmm20
+ vinserti32x4 $0x00, %xmm6, %zmm20, %zmm20
+ vmovdqu64 512(%rsp), %zmm23
+ vmovdqu64 576(%rsp), %zmm24
+ vmovdqu64 640(%rsp), %zmm25
+ vmovdqu64 704(%rsp), %zmm26
+ vmovdqu64 (%rcx), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpxorq %zmm20, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm23, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19
+ vmovdqa64 %zmm16, %zmm27
+ vpxorq %zmm17, %zmm18, %zmm28
+ vmovdqa64 %zmm19, %zmm29
+ vmovdqu64 64(%rcx), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm24, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vmovdqu64 128(%rcx), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm25, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vmovdqu64 192(%rcx), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm26, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vmovdqu64 768(%rsp), %zmm23
+ vmovdqu64 832(%rsp), %zmm24
+ vmovdqu64 896(%rsp), %zmm25
+ vmovdqu64 960(%rsp), %zmm26
+ vmovdqu64 256(%rcx), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm23, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vmovdqu64 320(%rcx), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm24, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vmovdqu64 384(%rcx), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm25, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vmovdqu64 448(%rcx), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm26, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vpclmulqdq $0x01, %zmm27, %zmm31, %zmm21
+ vpshufd $0x4e, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm21, %zmm27, %zmm28
+ vpclmulqdq $0x01, %zmm28, %zmm31, %zmm21
+ vpshufd $0x4e, %zmm28, %zmm28
+ vpternlogq $0x96, %zmm21, %zmm28, %zmm29
+ vextracti32x4 $0x01, %zmm29, %xmm0
+ vextracti32x4 $2, %zmm29, %xmm4
+ vextracti32x4 $3, %zmm29, %xmm5
+ vpxorq %xmm0, %xmm29, %xmm6
+ vpternlogq $0x96, %xmm4, %xmm5, %xmm6
+L_AES_GCM_encrypt_avx512_no_windows:
+ vmovdqu64 192(%rsp), %zmm23
+ vshufi64x2 $27, %zmm23, %zmm23, %zmm23
+ vmovdqu64 128(%rsp), %zmm24
+ vshufi64x2 $27, %zmm24, %zmm24, %zmm24
+ vmovdqu64 64(%rsp), %zmm25
+ vshufi64x2 $27, %zmm25, %zmm25, %zmm25
+ vmovdqu64 (%rsp), %zmm26
+ vshufi64x2 $27, %zmm26, %zmm26, %zmm26
+ movl %r9d, %r13d
+ andl $0xffffff00, %r13d
+ cmpl %r13d, %ebx
+ jge L_AES_GCM_encrypt_avx512_after_256
+ # 256 bytes of input
+ vbroadcasti32x4 1024(%rsp), %zmm20
+ vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16
+ vpshufb %zmm22, %zmm16, %zmm16
+ vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17
+ vpshufb %zmm22, %zmm17, %zmm17
+ vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18
+ vpshufb %zmm22, %zmm18, %zmm18
+ vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19
+ vpshufb %zmm22, %zmm19, %zmm19
+ vmovdqu 1024(%rsp), %xmm8
+ vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8
+ vmovdqu %xmm8, 1024(%rsp)
+ vpxorq %zmm9, %zmm16, %zmm16
+ vpxorq %zmm9, %zmm17, %zmm17
+ vpxorq %zmm9, %zmm18, %zmm18
+ vpxorq %zmm9, %zmm19, %zmm19
+ vaesenc %zmm10, %zmm16, %zmm16
+ vaesenc %zmm10, %zmm17, %zmm17
+ vaesenc %zmm10, %zmm18, %zmm18
+ vaesenc %zmm10, %zmm19, %zmm19
+ vaesenc %zmm11, %zmm16, %zmm16
+ vaesenc %zmm11, %zmm17, %zmm17
+ vaesenc %zmm11, %zmm18, %zmm18
+ vaesenc %zmm11, %zmm19, %zmm19
+ vaesenc %zmm12, %zmm16, %zmm16
+ vaesenc %zmm12, %zmm17, %zmm17
+ vaesenc %zmm12, %zmm18, %zmm18
+ vaesenc %zmm12, %zmm19, %zmm19
+ vaesenc %zmm13, %zmm16, %zmm16
+ vaesenc %zmm13, %zmm17, %zmm17
+ vaesenc %zmm13, %zmm18, %zmm18
+ vaesenc %zmm13, %zmm19, %zmm19
+ vaesenc %zmm14, %zmm16, %zmm16
+ vaesenc %zmm14, %zmm17, %zmm17
+ vaesenc %zmm14, %zmm18, %zmm18
+ vaesenc %zmm14, %zmm19, %zmm19
+ vaesenc %zmm15, %zmm16, %zmm16
+ vaesenc %zmm15, %zmm17, %zmm17
+ vaesenc %zmm15, %zmm18, %zmm18
+ vaesenc %zmm15, %zmm19, %zmm19
+ vaesenc %zmm1, %zmm16, %zmm16
+ vaesenc %zmm1, %zmm17, %zmm17
+ vaesenc %zmm1, %zmm18, %zmm18
+ vaesenc %zmm1, %zmm19, %zmm19
+ vaesenc %zmm2, %zmm16, %zmm16
+ vaesenc %zmm2, %zmm17, %zmm17
+ vaesenc %zmm2, %zmm18, %zmm18
+ vaesenc %zmm2, %zmm19, %zmm19
+ vaesenc %zmm3, %zmm16, %zmm16
+ vaesenc %zmm3, %zmm17, %zmm17
+ vaesenc %zmm3, %zmm18, %zmm18
+ vaesenc %zmm3, %zmm19, %zmm19
+ cmpl $11, %r10d
+ vbroadcasti32x4 160(%r15), %zmm20
+ jl L_AES_GCM_encrypt_avx512_pro_avx512_ctr16_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 176(%r15), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ cmpl $13, %r10d
+ vbroadcasti32x4 192(%r15), %zmm20
+ jl L_AES_GCM_encrypt_avx512_pro_avx512_ctr16_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 208(%r15), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 224(%r15), %zmm20
+L_AES_GCM_encrypt_avx512_pro_avx512_ctr16_last:
+ vaesenclast %zmm20, %zmm16, %zmm16
+ vaesenclast %zmm20, %zmm17, %zmm17
+ vaesenclast %zmm20, %zmm18, %zmm18
+ vaesenclast %zmm20, %zmm19, %zmm19
+ leaq (%rdi,%rbx,1), %rcx
+ leaq (%rsi,%rbx,1), %rdx
+ vmovdqu64 (%rcx), %zmm21
+ vpxorq %zmm21, %zmm16, %zmm16
+ vmovdqu64 %zmm16, (%rdx)
+ vmovdqu64 64(%rcx), %zmm21
+ vpxorq %zmm21, %zmm17, %zmm17
+ vmovdqu64 %zmm17, 64(%rdx)
+ vmovdqu64 128(%rcx), %zmm21
+ vpxorq %zmm21, %zmm18, %zmm18
+ vmovdqu64 %zmm18, 128(%rdx)
+ vmovdqu64 192(%rcx), %zmm21
+ vpxorq %zmm21, %zmm19, %zmm19
+ vmovdqu64 %zmm19, 192(%rdx)
+ movq %rdx, 1056(%rsp)
+ addl $0x100, %ebx
+ cmpl %r13d, %ebx
+ jge L_AES_GCM_encrypt_avx512_last_ghash
+L_AES_GCM_encrypt_avx512_ghash_128:
+ vbroadcasti32x4 1024(%rsp), %zmm20
+ vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16
+ vpshufb %zmm22, %zmm16, %zmm16
+ vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17
+ vpshufb %zmm22, %zmm17, %zmm17
+ vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18
+ vpshufb %zmm22, %zmm18, %zmm18
+ vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19
+ vpshufb %zmm22, %zmm19, %zmm19
+ vmovdqu 1024(%rsp), %xmm8
+ vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8
+ vmovdqu %xmm8, 1024(%rsp)
+ vpxorq %zmm9, %zmm16, %zmm16
+ vpxorq %zmm9, %zmm17, %zmm17
+ vpxorq %zmm9, %zmm18, %zmm18
+ vpxorq %zmm9, %zmm19, %zmm19
+ vaesenc %zmm10, %zmm16, %zmm16
+ vaesenc %zmm10, %zmm17, %zmm17
+ vaesenc %zmm10, %zmm18, %zmm18
+ vaesenc %zmm10, %zmm19, %zmm19
+ vaesenc %zmm11, %zmm16, %zmm16
+ vaesenc %zmm11, %zmm17, %zmm17
+ vaesenc %zmm11, %zmm18, %zmm18
+ vaesenc %zmm11, %zmm19, %zmm19
+ vaesenc %zmm12, %zmm16, %zmm16
+ vaesenc %zmm12, %zmm17, %zmm17
+ vaesenc %zmm12, %zmm18, %zmm18
+ vaesenc %zmm12, %zmm19, %zmm19
+ vaesenc %zmm13, %zmm16, %zmm16
+ vaesenc %zmm13, %zmm17, %zmm17
+ vaesenc %zmm13, %zmm18, %zmm18
+ vaesenc %zmm13, %zmm19, %zmm19
+ vaesenc %zmm14, %zmm16, %zmm16
+ vaesenc %zmm14, %zmm17, %zmm17
+ vaesenc %zmm14, %zmm18, %zmm18
+ vaesenc %zmm14, %zmm19, %zmm19
+ vaesenc %zmm15, %zmm16, %zmm16
+ vaesenc %zmm15, %zmm17, %zmm17
+ vaesenc %zmm15, %zmm18, %zmm18
+ vaesenc %zmm15, %zmm19, %zmm19
+ vaesenc %zmm1, %zmm16, %zmm16
+ vaesenc %zmm1, %zmm17, %zmm17
+ vaesenc %zmm1, %zmm18, %zmm18
+ vaesenc %zmm1, %zmm19, %zmm19
+ vaesenc %zmm2, %zmm16, %zmm16
+ vaesenc %zmm2, %zmm17, %zmm17
+ vaesenc %zmm2, %zmm18, %zmm18
+ vaesenc %zmm2, %zmm19, %zmm19
+ vaesenc %zmm3, %zmm16, %zmm16
+ vaesenc %zmm3, %zmm17, %zmm17
+ vaesenc %zmm3, %zmm18, %zmm18
+ vaesenc %zmm3, %zmm19, %zmm19
+ cmpl $11, %r10d
+ vbroadcasti32x4 160(%r15), %zmm20
+ jl L_AES_GCM_encrypt_avx512_pip_avx512_ctr16_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 176(%r15), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ cmpl $13, %r10d
+ vbroadcasti32x4 192(%r15), %zmm20
+ jl L_AES_GCM_encrypt_avx512_pip_avx512_ctr16_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 208(%r15), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 224(%r15), %zmm20
+L_AES_GCM_encrypt_avx512_pip_avx512_ctr16_last:
+ vaesenclast %zmm20, %zmm16, %zmm16
+ vaesenclast %zmm20, %zmm17, %zmm17
+ vaesenclast %zmm20, %zmm18, %zmm18
+ vaesenclast %zmm20, %zmm19, %zmm19
+ leaq (%rdi,%rbx,1), %rcx
+ leaq (%rsi,%rbx,1), %rdx
+ vmovdqu64 (%rcx), %zmm21
+ vpxorq %zmm21, %zmm16, %zmm16
+ vmovdqu64 %zmm16, (%rdx)
+ vmovdqu64 64(%rcx), %zmm21
+ vpxorq %zmm21, %zmm17, %zmm17
+ vmovdqu64 %zmm17, 64(%rdx)
+ vmovdqu64 128(%rcx), %zmm21
+ vpxorq %zmm21, %zmm18, %zmm18
+ vmovdqu64 %zmm18, 128(%rdx)
+ vmovdqu64 192(%rcx), %zmm21
+ vpxorq %zmm21, %zmm19, %zmm19
+ vmovdqu64 %zmm19, 192(%rdx)
+ movq 1056(%rsp), %rcx
+ vpxorq %zmm20, %zmm20, %zmm20
+ vinserti32x4 $0x00, %xmm6, %zmm20, %zmm20
+ vmovdqu64 (%rcx), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpxorq %zmm20, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm23, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19
+ vmovdqa64 %zmm16, %zmm27
+ vpxorq %zmm17, %zmm18, %zmm28
+ vmovdqa64 %zmm19, %zmm29
+ vmovdqu64 64(%rcx), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm24, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vmovdqu64 128(%rcx), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm25, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vmovdqu64 192(%rcx), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm26, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vpclmulqdq $0x01, %zmm27, %zmm31, %zmm21
+ vpshufd $0x4e, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm21, %zmm27, %zmm28
+ vpclmulqdq $0x01, %zmm28, %zmm31, %zmm21
+ vpshufd $0x4e, %zmm28, %zmm28
+ vpternlogq $0x96, %zmm21, %zmm28, %zmm29
+ vextracti32x4 $0x01, %zmm29, %xmm0
+ vextracti32x4 $2, %zmm29, %xmm4
+ vextracti32x4 $3, %zmm29, %xmm5
+ vpxorq %xmm0, %xmm29, %xmm6
+ vpternlogq $0x96, %xmm4, %xmm5, %xmm6
+ movq %rdx, 1056(%rsp)
+ addl $0x100, %ebx
+ cmpl %r13d, %ebx
+ jl L_AES_GCM_encrypt_avx512_ghash_128
+L_AES_GCM_encrypt_avx512_last_ghash:
+ movq 1056(%rsp), %rcx
+ vpxorq %zmm20, %zmm20, %zmm20
+ vinserti32x4 $0x00, %xmm6, %zmm20, %zmm20
+ vmovdqu64 (%rcx), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpxorq %zmm20, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm23, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19
+ vmovdqa64 %zmm16, %zmm27
+ vpxorq %zmm17, %zmm18, %zmm28
+ vmovdqa64 %zmm19, %zmm29
+ vmovdqu64 64(%rcx), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm24, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vmovdqu64 128(%rcx), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm25, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vmovdqu64 192(%rcx), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm26, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vpclmulqdq $0x01, %zmm27, %zmm31, %zmm21
+ vpshufd $0x4e, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm21, %zmm27, %zmm28
+ vpclmulqdq $0x01, %zmm28, %zmm31, %zmm21
+ vpshufd $0x4e, %zmm28, %zmm28
+ vpternlogq $0x96, %zmm21, %zmm28, %zmm29
+ vextracti32x4 $0x01, %zmm29, %xmm0
+ vextracti32x4 $2, %zmm29, %xmm4
+ vextracti32x4 $3, %zmm29, %xmm5
+ vpxorq %xmm0, %xmm29, %xmm6
+ vpternlogq $0x96, %xmm4, %xmm5, %xmm6
+L_AES_GCM_encrypt_avx512_after_256:
+ vmovdqu (%rsp), %xmm5
+L_AES_GCM_encrypt_avx512_done_128:
+ movl %r9d, %edx
+ cmpl %edx, %ebx
+ jge L_AES_GCM_encrypt_avx512_done_enc
+ movl %r9d, %r13d
+ andl $0xfffffff0, %r13d
+ cmpl %r13d, %ebx
+ jge L_AES_GCM_encrypt_avx512_last_block_done
+ vmovdqu 1024(%rsp), %xmm9
+ vpshufb L_avx512_aes_gcm_bswap_epi64(%rip), %xmm9, %xmm8
+ vpaddd L_avx512_aes_gcm_one(%rip), %xmm9, %xmm9
+ vmovdqu %xmm9, 1024(%rsp)
+ vpxor (%r15), %xmm8, %xmm8
+ vaesenc 16(%r15), %xmm8, %xmm8
+ vaesenc 32(%r15), %xmm8, %xmm8
+ vaesenc 48(%r15), %xmm8, %xmm8
+ vaesenc 64(%r15), %xmm8, %xmm8
+ vaesenc 80(%r15), %xmm8, %xmm8
+ vaesenc 96(%r15), %xmm8, %xmm8
+ vaesenc 112(%r15), %xmm8, %xmm8
+ vaesenc 128(%r15), %xmm8, %xmm8
+ vaesenc 144(%r15), %xmm8, %xmm8
+ cmpl $11, %r10d
+ vmovdqa 160(%r15), %xmm9
+ jl L_AES_GCM_encrypt_avx512_aesenc_block_last
+ vaesenc %xmm9, %xmm8, %xmm8
+ vaesenc 176(%r15), %xmm8, %xmm8
+ cmpl $13, %r10d
+ vmovdqa 192(%r15), %xmm9
+ jl L_AES_GCM_encrypt_avx512_aesenc_block_last
+ vaesenc %xmm9, %xmm8, %xmm8
+ vaesenc 208(%r15), %xmm8, %xmm8
+ vmovdqa 224(%r15), %xmm9
+L_AES_GCM_encrypt_avx512_aesenc_block_last:
+ vaesenclast %xmm9, %xmm8, %xmm8
+ vmovdqu (%rdi,%rbx,1), %xmm9
+ vpxor %xmm9, %xmm8, %xmm8
+ vmovdqu %xmm8, (%rsi,%rbx,1)
+ vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8
+ vpxor %xmm8, %xmm6, %xmm6
+ addl $16, %ebx
+ cmpl %r13d, %ebx
+ jge L_AES_GCM_encrypt_avx512_last_block_ghash
+L_AES_GCM_encrypt_avx512_last_block_start:
+ vmovdqu (%rdi,%rbx,1), %xmm13
+ vmovdqu 1024(%rsp), %xmm9
+ vpshufb L_avx512_aes_gcm_bswap_epi64(%rip), %xmm9, %xmm8
+ vpaddd L_avx512_aes_gcm_one(%rip), %xmm9, %xmm9
+ vmovdqu %xmm9, 1024(%rsp)
+ vpxor (%r15), %xmm8, %xmm8
+ vpclmulqdq $16, %xmm5, %xmm6, %xmm10
+ vaesenc 16(%r15), %xmm8, %xmm8
+ vaesenc 32(%r15), %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm5, %xmm6, %xmm11
+ vaesenc 48(%r15), %xmm8, %xmm8
+ vaesenc 64(%r15), %xmm8, %xmm8
+ vpclmulqdq $0x00, %xmm5, %xmm6, %xmm12
+ vaesenc 80(%r15), %xmm8, %xmm8
+ vpclmulqdq $0x11, %xmm5, %xmm6, %xmm1
+ vaesenc 96(%r15), %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpslldq $8, %xmm10, %xmm2
+ vpsrldq $8, %xmm10, %xmm10
+ vaesenc 112(%r15), %xmm8, %xmm8
+ vpxor %xmm12, %xmm2, %xmm2
+ vpxor %xmm10, %xmm1, %xmm3
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm0
+ vpclmulqdq $16, %xmm0, %xmm2, %xmm11
+ vaesenc 128(%r15), %xmm8, %xmm8
+ vpshufd $0x4e, %xmm2, %xmm10
+ vpxor %xmm11, %xmm10, %xmm10
+ vpclmulqdq $16, %xmm0, %xmm10, %xmm11
+ vaesenc 144(%r15), %xmm8, %xmm8
+ vpshufd $0x4e, %xmm10, %xmm10
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm3, %xmm10, %xmm6
+ cmpl $11, %r10d
+ vmovdqa 160(%r15), %xmm9
+ jl L_AES_GCM_encrypt_avx512_aesenc_gfmul_last
+ vaesenc %xmm9, %xmm8, %xmm8
+ vaesenc 176(%r15), %xmm8, %xmm8
+ cmpl $13, %r10d
+ vmovdqa 192(%r15), %xmm9
+ jl L_AES_GCM_encrypt_avx512_aesenc_gfmul_last
+ vaesenc %xmm9, %xmm8, %xmm8
+ vaesenc 208(%r15), %xmm8, %xmm8
+ vmovdqa 224(%r15), %xmm9
+L_AES_GCM_encrypt_avx512_aesenc_gfmul_last:
+ vaesenclast %xmm9, %xmm8, %xmm8
+ vmovdqa %xmm13, %xmm0
+ vpxor %xmm0, %xmm8, %xmm8
+ vmovdqu %xmm8, (%rsi,%rbx,1)
+ vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8
+ addl $16, %ebx
+ vpxor %xmm8, %xmm6, %xmm6
+ cmpl %r13d, %ebx
+ jl L_AES_GCM_encrypt_avx512_last_block_start
+L_AES_GCM_encrypt_avx512_last_block_ghash:
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm5, %xmm9
+ vpxor %xmm5, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm6, %xmm10
+ vpxor %xmm6, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm5, %xmm6, %xmm8
+ vpclmulqdq $0x11, %xmm5, %xmm6, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm6
+L_AES_GCM_encrypt_avx512_last_block_done:
+ movl %r9d, %ecx
+ movl %ecx, %edx
+ andl $15, %ecx
+ jz L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_done
+ vmovdqu 1024(%rsp), %xmm4
+ vpshufb L_avx512_aes_gcm_bswap_epi64(%rip), %xmm4, %xmm4
+ vpxor (%r15), %xmm4, %xmm4
+ vaesenc 16(%r15), %xmm4, %xmm4
+ vaesenc 32(%r15), %xmm4, %xmm4
+ vaesenc 48(%r15), %xmm4, %xmm4
+ vaesenc 64(%r15), %xmm4, %xmm4
+ vaesenc 80(%r15), %xmm4, %xmm4
+ vaesenc 96(%r15), %xmm4, %xmm4
+ vaesenc 112(%r15), %xmm4, %xmm4
+ vaesenc 128(%r15), %xmm4, %xmm4
+ vaesenc 144(%r15), %xmm4, %xmm4
+ cmpl $11, %r10d
+ vmovdqa 160(%r15), %xmm9
+ jl L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_aesenc_avx_last
+ vaesenc %xmm9, %xmm4, %xmm4
+ vaesenc 176(%r15), %xmm4, %xmm4
+ cmpl $13, %r10d
+ vmovdqa 192(%r15), %xmm9
+ jl L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_aesenc_avx_last
+ vaesenc %xmm9, %xmm4, %xmm4
+ vaesenc 208(%r15), %xmm4, %xmm4
+ vmovdqa 224(%r15), %xmm9
+L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_aesenc_avx_last:
+ vaesenclast %xmm9, %xmm4, %xmm4
+ subq $16, %rsp
+ xorl %ecx, %ecx
+ vmovdqu %xmm4, (%rsp)
+L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_loop:
+ movzbl (%rdi,%rbx,1), %r13d
+ xorb (%rsp,%rcx,1), %r13b
+ movb %r13b, (%rsi,%rbx,1)
+ movb %r13b, (%rsp,%rcx,1)
+ incl %ebx
+ incl %ecx
+ cmpl %edx, %ebx
+ jl L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_loop
+ xorq %r13, %r13
+ cmpl $16, %ecx
+ je L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_finish_enc
+L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_byte_loop:
+ movb %r13b, (%rsp,%rcx,1)
+ incl %ecx
+ cmpl $16, %ecx
+ jl L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_byte_loop
+L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_finish_enc:
+ vmovdqu (%rsp), %xmm4
+ addq $16, %rsp
+ vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4
+ vpxor %xmm4, %xmm6, %xmm6
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm5, %xmm9
+ vpxor %xmm5, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm6, %xmm10
+ vpxor %xmm6, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm5, %xmm6, %xmm8
+ vpclmulqdq $0x11, %xmm5, %xmm6, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm6
+L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_done:
+L_AES_GCM_encrypt_avx512_done_enc:
+ movl %r9d, %edx
+ movl %r11d, %ecx
+ shlq $3, %rdx
+ shlq $3, %rcx
+ vmovq %rdx, %xmm0
+ vmovq %rcx, %xmm1
+ vpunpcklqdq %xmm1, %xmm0, %xmm0
+ vpxor %xmm0, %xmm6, %xmm6
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm5, %xmm9
+ vpxor %xmm5, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm6, %xmm10
+ vpxor %xmm6, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm5, %xmm6, %xmm8
+ vpclmulqdq $0x11, %xmm5, %xmm6, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm6
+ vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm6, %xmm6
+ vmovdqu 1040(%rsp), %xmm0
+ vpxor %xmm6, %xmm0, %xmm0
+ cmpl $16, %r14d
+ je L_AES_GCM_encrypt_avx512_store_tag_16
+ xorq %rcx, %rcx
+ vmovdqu %xmm0, (%rsp)
+L_AES_GCM_encrypt_avx512_store_tag_loop:
+ movzbl (%rsp,%rcx,1), %r13d
+ movb %r13b, (%r8,%rcx,1)
+ incl %ecx
+ cmpl %r14d, %ecx
+ jne L_AES_GCM_encrypt_avx512_store_tag_loop
+ jmp L_AES_GCM_encrypt_avx512_store_tag_done
+L_AES_GCM_encrypt_avx512_store_tag_16:
+ vmovdqu %xmm0, (%r8)
+L_AES_GCM_encrypt_avx512_store_tag_done:
+ vzeroupper
+ addq $0x440, %rsp
+ popq %r15
+ popq %r14
+ popq %rbx
+ popq %r12
+ popq %r13
+ repz retq
+#ifndef __APPLE__
+.size AES_GCM_encrypt_avx512,.-AES_GCM_encrypt_avx512
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl AES_GCM_decrypt_avx512
+.type AES_GCM_decrypt_avx512,@function
+.align 16
+AES_GCM_decrypt_avx512:
+#else
+.section __TEXT,__text
+.globl _AES_GCM_decrypt_avx512
+.p2align 4
+_AES_GCM_decrypt_avx512:
+#endif /* __APPLE__ */
+ pushq %r13
+ pushq %r12
+ pushq %rbx
+ pushq %r14
+ pushq %r15
+ pushq %rbp
+ movq %rdx, %r12
+ movq %rcx, %rax
+ movl 56(%rsp), %r11d
+ movl 64(%rsp), %ebx
+ movl 72(%rsp), %r14d
+ movq 80(%rsp), %r15
+ movl 88(%rsp), %r10d
+ movq 96(%rsp), %rbp
+ subq $0x420, %rsp
+ vpxor %xmm4, %xmm4, %xmm4
+ vpxor %xmm6, %xmm6, %xmm6
+ cmpl $12, %ebx
+ movl %ebx, %edx
+ jne L_AES_GCM_decrypt_avx512_iv_not_12
+ # Calculate values when IV is 12 bytes
+ # Set counter based on IV
+ movl $0x1000000, %ecx
+ vmovq (%rax), %xmm4
+ vpinsrd $2, 8(%rax), %xmm4, %xmm4
+ vpinsrd $3, %ecx, %xmm4, %xmm4
+ # H = Encrypt X(=0) and T = Encrypt counter
+ vmovdqa (%r15), %xmm5
+ vpxor %xmm5, %xmm4, %xmm1
+ vmovdqa 16(%r15), %xmm7
+ vaesenc %xmm7, %xmm5, %xmm5
+ vaesenc %xmm7, %xmm1, %xmm1
+ vmovdqa 32(%r15), %xmm7
+ vaesenc %xmm7, %xmm5, %xmm5
+ vaesenc %xmm7, %xmm1, %xmm1
+ vmovdqa 48(%r15), %xmm7
+ vaesenc %xmm7, %xmm5, %xmm5
+ vaesenc %xmm7, %xmm1, %xmm1
+ vmovdqa 64(%r15), %xmm7
+ vaesenc %xmm7, %xmm5, %xmm5
+ vaesenc %xmm7, %xmm1, %xmm1
+ vmovdqa 80(%r15), %xmm7
+ vaesenc %xmm7, %xmm5, %xmm5
+ vaesenc %xmm7, %xmm1, %xmm1
+ vmovdqa 96(%r15), %xmm7
+ vaesenc %xmm7, %xmm5, %xmm5
+ vaesenc %xmm7, %xmm1, %xmm1
+ vmovdqa 112(%r15), %xmm7
+ vaesenc %xmm7, %xmm5, %xmm5
+ vaesenc %xmm7, %xmm1, %xmm1
+ vmovdqa 128(%r15), %xmm7
+ vaesenc %xmm7, %xmm5, %xmm5
+ vaesenc %xmm7, %xmm1, %xmm1
+ vmovdqa 144(%r15), %xmm7
+ vaesenc %xmm7, %xmm5, %xmm5
+ vaesenc %xmm7, %xmm1, %xmm1
+ cmpl $11, %r10d
+ vmovdqa 160(%r15), %xmm7
+ jl L_AES_GCM_decrypt_avx512_calc_iv_12_last
+ vaesenc %xmm7, %xmm5, %xmm5
+ vaesenc %xmm7, %xmm1, %xmm1
+ vmovdqa 176(%r15), %xmm7
+ vaesenc %xmm7, %xmm5, %xmm5
+ vaesenc %xmm7, %xmm1, %xmm1
+ cmpl $13, %r10d
+ vmovdqa 192(%r15), %xmm7
+ jl L_AES_GCM_decrypt_avx512_calc_iv_12_last
+ vaesenc %xmm7, %xmm5, %xmm5
+ vaesenc %xmm7, %xmm1, %xmm1
+ vmovdqa 208(%r15), %xmm7
+ vaesenc %xmm7, %xmm5, %xmm5
+ vaesenc %xmm7, %xmm1, %xmm1
+ vmovdqa 224(%r15), %xmm7
+L_AES_GCM_decrypt_avx512_calc_iv_12_last:
+ vaesenclast %xmm7, %xmm5, %xmm5
+ vaesenclast %xmm7, %xmm1, %xmm1
+ vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5
+ vmovdqu %xmm1, 1040(%rsp)
+ jmp L_AES_GCM_decrypt_avx512_iv_done
+L_AES_GCM_decrypt_avx512_iv_not_12:
+ # Calculate values when IV is not 12 bytes
+ # H = Encrypt X(=0)
+ vmovdqa (%r15), %xmm5
+ vaesenc 16(%r15), %xmm5, %xmm5
+ vaesenc 32(%r15), %xmm5, %xmm5
+ vaesenc 48(%r15), %xmm5, %xmm5
+ vaesenc 64(%r15), %xmm5, %xmm5
+ vaesenc 80(%r15), %xmm5, %xmm5
+ vaesenc 96(%r15), %xmm5, %xmm5
+ vaesenc 112(%r15), %xmm5, %xmm5
+ vaesenc 128(%r15), %xmm5, %xmm5
+ vaesenc 144(%r15), %xmm5, %xmm5
+ cmpl $11, %r10d
+ vmovdqa 160(%r15), %xmm9
+ jl L_AES_GCM_decrypt_avx512_calc_iv_1_aesenc_avx_last
+ vaesenc %xmm9, %xmm5, %xmm5
+ vaesenc 176(%r15), %xmm5, %xmm5
+ cmpl $13, %r10d
+ vmovdqa 192(%r15), %xmm9
+ jl L_AES_GCM_decrypt_avx512_calc_iv_1_aesenc_avx_last
+ vaesenc %xmm9, %xmm5, %xmm5
+ vaesenc 208(%r15), %xmm5, %xmm5
+ vmovdqa 224(%r15), %xmm9
+L_AES_GCM_decrypt_avx512_calc_iv_1_aesenc_avx_last:
+ vaesenclast %xmm9, %xmm5, %xmm5
+ vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5
+ # Calc counter
+ # Initialization vector
+ cmpl $0x00, %edx
+ movq $0x00, %rcx
+ je L_AES_GCM_decrypt_avx512_calc_iv_done
+ cmpl $16, %edx
+ jl L_AES_GCM_decrypt_avx512_calc_iv_lt16
+ andl $0xfffffff0, %edx
+L_AES_GCM_decrypt_avx512_calc_iv_16_loop:
+ vmovdqu (%rax,%rcx,1), %xmm8
+ vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8
+ vpxor %xmm8, %xmm4, %xmm4
+ # ghash_gfmul_avx
+ vpshufd $0x4e, %xmm4, %xmm1
+ vpshufd $0x4e, %xmm5, %xmm2
+ vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3
+ vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0
+ vpxor %xmm4, %xmm1, %xmm1
+ vpxor %xmm5, %xmm2, %xmm2
+ vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1
+ vpxor %xmm0, %xmm1, %xmm1
+ vpxor %xmm3, %xmm1, %xmm1
+ vmovdqa %xmm0, %xmm7
+ vmovdqa %xmm3, %xmm4
+ vpslldq $8, %xmm1, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpxor %xmm2, %xmm7, %xmm7
+ vpxor %xmm1, %xmm4, %xmm4
+ vpsrld $31, %xmm7, %xmm0
+ vpsrld $31, %xmm4, %xmm1
+ vpslld $0x01, %xmm7, %xmm7
+ vpslld $0x01, %xmm4, %xmm4
+ vpsrldq $12, %xmm0, %xmm2
+ vpslldq $4, %xmm0, %xmm0
+ vpslldq $4, %xmm1, %xmm1
+ vpor %xmm2, %xmm4, %xmm4
+ vpor %xmm0, %xmm7, %xmm7
+ vpor %xmm1, %xmm4, %xmm4
+ vpslld $31, %xmm7, %xmm0
+ vpslld $30, %xmm7, %xmm1
+ vpslld $25, %xmm7, %xmm2
+ vpxor %xmm1, %xmm0, %xmm0
+ vpxor %xmm2, %xmm0, %xmm0
+ vmovdqa %xmm0, %xmm1
+ vpsrldq $4, %xmm1, %xmm1
+ vpslldq $12, %xmm0, %xmm0
+ vpxor %xmm0, %xmm7, %xmm7
+ vpsrld $0x01, %xmm7, %xmm2
+ vpsrld $2, %xmm7, %xmm3
+ vpsrld $7, %xmm7, %xmm0
+ vpxor %xmm3, %xmm2, %xmm2
+ vpxor %xmm0, %xmm2, %xmm2
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm7, %xmm2, %xmm2
+ vpxor %xmm2, %xmm4, %xmm4
+ addl $16, %ecx
+ cmpl %edx, %ecx
+ jl L_AES_GCM_decrypt_avx512_calc_iv_16_loop
+ movl %ebx, %edx
+ cmpl %edx, %ecx
+ je L_AES_GCM_decrypt_avx512_calc_iv_done
+L_AES_GCM_decrypt_avx512_calc_iv_lt16:
+ subq $16, %rsp
+ vpxor %xmm8, %xmm8, %xmm8
+ xorl %ebx, %ebx
+ vmovdqu %xmm8, (%rsp)
+L_AES_GCM_decrypt_avx512_calc_iv_loop:
+ movzbl (%rax,%rcx,1), %r13d
+ movb %r13b, (%rsp,%rbx,1)
+ incl %ecx
+ incl %ebx
+ cmpl %edx, %ecx
+ jl L_AES_GCM_decrypt_avx512_calc_iv_loop
+ vmovdqu (%rsp), %xmm8
+ addq $16, %rsp
+ vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8
+ vpxor %xmm8, %xmm4, %xmm4
+ # ghash_gfmul_avx
+ vpshufd $0x4e, %xmm4, %xmm1
+ vpshufd $0x4e, %xmm5, %xmm2
+ vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3
+ vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0
+ vpxor %xmm4, %xmm1, %xmm1
+ vpxor %xmm5, %xmm2, %xmm2
+ vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1
+ vpxor %xmm0, %xmm1, %xmm1
+ vpxor %xmm3, %xmm1, %xmm1
+ vmovdqa %xmm0, %xmm7
+ vmovdqa %xmm3, %xmm4
+ vpslldq $8, %xmm1, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpxor %xmm2, %xmm7, %xmm7
+ vpxor %xmm1, %xmm4, %xmm4
+ vpsrld $31, %xmm7, %xmm0
+ vpsrld $31, %xmm4, %xmm1
+ vpslld $0x01, %xmm7, %xmm7
+ vpslld $0x01, %xmm4, %xmm4
+ vpsrldq $12, %xmm0, %xmm2
+ vpslldq $4, %xmm0, %xmm0
+ vpslldq $4, %xmm1, %xmm1
+ vpor %xmm2, %xmm4, %xmm4
+ vpor %xmm0, %xmm7, %xmm7
+ vpor %xmm1, %xmm4, %xmm4
+ vpslld $31, %xmm7, %xmm0
+ vpslld $30, %xmm7, %xmm1
+ vpslld $25, %xmm7, %xmm2
+ vpxor %xmm1, %xmm0, %xmm0
+ vpxor %xmm2, %xmm0, %xmm0
+ vmovdqa %xmm0, %xmm1
+ vpsrldq $4, %xmm1, %xmm1
+ vpslldq $12, %xmm0, %xmm0
+ vpxor %xmm0, %xmm7, %xmm7
+ vpsrld $0x01, %xmm7, %xmm2
+ vpsrld $2, %xmm7, %xmm3
+ vpsrld $7, %xmm7, %xmm0
+ vpxor %xmm3, %xmm2, %xmm2
+ vpxor %xmm0, %xmm2, %xmm2
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm7, %xmm2, %xmm2
+ vpxor %xmm2, %xmm4, %xmm4
+L_AES_GCM_decrypt_avx512_calc_iv_done:
+ # T = Encrypt counter
+ vpxor %xmm0, %xmm0, %xmm0
+ shll $3, %edx
+ vmovq %rdx, %xmm0
+ vpxor %xmm0, %xmm4, %xmm4
+ # ghash_gfmul_avx
+ vpshufd $0x4e, %xmm4, %xmm1
+ vpshufd $0x4e, %xmm5, %xmm2
+ vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3
+ vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0
+ vpxor %xmm4, %xmm1, %xmm1
+ vpxor %xmm5, %xmm2, %xmm2
+ vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1
+ vpxor %xmm0, %xmm1, %xmm1
+ vpxor %xmm3, %xmm1, %xmm1
+ vmovdqa %xmm0, %xmm7
+ vmovdqa %xmm3, %xmm4
+ vpslldq $8, %xmm1, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpxor %xmm2, %xmm7, %xmm7
+ vpxor %xmm1, %xmm4, %xmm4
+ vpsrld $31, %xmm7, %xmm0
+ vpsrld $31, %xmm4, %xmm1
+ vpslld $0x01, %xmm7, %xmm7
+ vpslld $0x01, %xmm4, %xmm4
+ vpsrldq $12, %xmm0, %xmm2
+ vpslldq $4, %xmm0, %xmm0
+ vpslldq $4, %xmm1, %xmm1
+ vpor %xmm2, %xmm4, %xmm4
+ vpor %xmm0, %xmm7, %xmm7
+ vpor %xmm1, %xmm4, %xmm4
+ vpslld $31, %xmm7, %xmm0
+ vpslld $30, %xmm7, %xmm1
+ vpslld $25, %xmm7, %xmm2
+ vpxor %xmm1, %xmm0, %xmm0
+ vpxor %xmm2, %xmm0, %xmm0
+ vmovdqa %xmm0, %xmm1
+ vpsrldq $4, %xmm1, %xmm1
+ vpslldq $12, %xmm0, %xmm0
+ vpxor %xmm0, %xmm7, %xmm7
+ vpsrld $0x01, %xmm7, %xmm2
+ vpsrld $2, %xmm7, %xmm3
+ vpsrld $7, %xmm7, %xmm0
+ vpxor %xmm3, %xmm2, %xmm2
+ vpxor %xmm0, %xmm2, %xmm2
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm7, %xmm2, %xmm2
+ vpxor %xmm2, %xmm4, %xmm4
+ vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4
+ # Encrypt counter
+ vmovdqa (%r15), %xmm8
+ vpxor %xmm4, %xmm8, %xmm8
+ vaesenc 16(%r15), %xmm8, %xmm8
+ vaesenc 32(%r15), %xmm8, %xmm8
+ vaesenc 48(%r15), %xmm8, %xmm8
+ vaesenc 64(%r15), %xmm8, %xmm8
+ vaesenc 80(%r15), %xmm8, %xmm8
+ vaesenc 96(%r15), %xmm8, %xmm8
+ vaesenc 112(%r15), %xmm8, %xmm8
+ vaesenc 128(%r15), %xmm8, %xmm8
+ vaesenc 144(%r15), %xmm8, %xmm8
+ cmpl $11, %r10d
+ vmovdqa 160(%r15), %xmm9
+ jl L_AES_GCM_decrypt_avx512_calc_iv_2_aesenc_avx_last
+ vaesenc %xmm9, %xmm8, %xmm8
+ vaesenc 176(%r15), %xmm8, %xmm8
+ cmpl $13, %r10d
+ vmovdqa 192(%r15), %xmm9
+ jl L_AES_GCM_decrypt_avx512_calc_iv_2_aesenc_avx_last
+ vaesenc %xmm9, %xmm8, %xmm8
+ vaesenc 208(%r15), %xmm8, %xmm8
+ vmovdqa 224(%r15), %xmm9
+L_AES_GCM_decrypt_avx512_calc_iv_2_aesenc_avx_last:
+ vaesenclast %xmm9, %xmm8, %xmm8
+ vmovdqu %xmm8, 1040(%rsp)
+L_AES_GCM_decrypt_avx512_iv_done:
+ # Additional authentication data
+ movl %r11d, %edx
+ cmpl $0x00, %edx
+ je L_AES_GCM_decrypt_avx512_calc_aad_done
+ xorl %ecx, %ecx
+ cmpl $16, %edx
+ jl L_AES_GCM_decrypt_avx512_calc_aad_lt16
+ andl $0xfffffff0, %edx
+L_AES_GCM_decrypt_avx512_calc_aad_16_loop:
+ vmovdqu (%r12,%rcx,1), %xmm8
+ vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8
+ vpxor %xmm8, %xmm6, %xmm6
+ # ghash_gfmul_avx
+ vpshufd $0x4e, %xmm6, %xmm1
+ vpshufd $0x4e, %xmm5, %xmm2
+ vpclmulqdq $0x11, %xmm6, %xmm5, %xmm3
+ vpclmulqdq $0x00, %xmm6, %xmm5, %xmm0
+ vpxor %xmm6, %xmm1, %xmm1
+ vpxor %xmm5, %xmm2, %xmm2
+ vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1
+ vpxor %xmm0, %xmm1, %xmm1
+ vpxor %xmm3, %xmm1, %xmm1
+ vmovdqa %xmm0, %xmm7
+ vmovdqa %xmm3, %xmm6
+ vpslldq $8, %xmm1, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpxor %xmm2, %xmm7, %xmm7
+ vpxor %xmm1, %xmm6, %xmm6
+ vpsrld $31, %xmm7, %xmm0
+ vpsrld $31, %xmm6, %xmm1
+ vpslld $0x01, %xmm7, %xmm7
+ vpslld $0x01, %xmm6, %xmm6
+ vpsrldq $12, %xmm0, %xmm2
+ vpslldq $4, %xmm0, %xmm0
+ vpslldq $4, %xmm1, %xmm1
+ vpor %xmm2, %xmm6, %xmm6
+ vpor %xmm0, %xmm7, %xmm7
+ vpor %xmm1, %xmm6, %xmm6
+ vpslld $31, %xmm7, %xmm0
+ vpslld $30, %xmm7, %xmm1
+ vpslld $25, %xmm7, %xmm2
+ vpxor %xmm1, %xmm0, %xmm0
+ vpxor %xmm2, %xmm0, %xmm0
+ vmovdqa %xmm0, %xmm1
+ vpsrldq $4, %xmm1, %xmm1
+ vpslldq $12, %xmm0, %xmm0
+ vpxor %xmm0, %xmm7, %xmm7
+ vpsrld $0x01, %xmm7, %xmm2
+ vpsrld $2, %xmm7, %xmm3
+ vpsrld $7, %xmm7, %xmm0
+ vpxor %xmm3, %xmm2, %xmm2
+ vpxor %xmm0, %xmm2, %xmm2
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm7, %xmm2, %xmm2
+ vpxor %xmm2, %xmm6, %xmm6
+ addl $16, %ecx
+ cmpl %edx, %ecx
+ jl L_AES_GCM_decrypt_avx512_calc_aad_16_loop
+ movl %r11d, %edx
+ cmpl %edx, %ecx
+ je L_AES_GCM_decrypt_avx512_calc_aad_done
+L_AES_GCM_decrypt_avx512_calc_aad_lt16:
+ subq $16, %rsp
+ vpxor %xmm8, %xmm8, %xmm8
+ xorl %ebx, %ebx
+ vmovdqu %xmm8, (%rsp)
+L_AES_GCM_decrypt_avx512_calc_aad_loop:
+ movzbl (%r12,%rcx,1), %r13d
+ movb %r13b, (%rsp,%rbx,1)
+ incl %ecx
+ incl %ebx
+ cmpl %edx, %ecx
+ jl L_AES_GCM_decrypt_avx512_calc_aad_loop
+ vmovdqu (%rsp), %xmm8
+ addq $16, %rsp
+ vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8
+ vpxor %xmm8, %xmm6, %xmm6
+ # ghash_gfmul_avx
+ vpshufd $0x4e, %xmm6, %xmm1
+ vpshufd $0x4e, %xmm5, %xmm2
+ vpclmulqdq $0x11, %xmm6, %xmm5, %xmm3
+ vpclmulqdq $0x00, %xmm6, %xmm5, %xmm0
+ vpxor %xmm6, %xmm1, %xmm1
+ vpxor %xmm5, %xmm2, %xmm2
+ vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1
+ vpxor %xmm0, %xmm1, %xmm1
+ vpxor %xmm3, %xmm1, %xmm1
+ vmovdqa %xmm0, %xmm7
+ vmovdqa %xmm3, %xmm6
+ vpslldq $8, %xmm1, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpxor %xmm2, %xmm7, %xmm7
+ vpxor %xmm1, %xmm6, %xmm6
+ vpsrld $31, %xmm7, %xmm0
+ vpsrld $31, %xmm6, %xmm1
+ vpslld $0x01, %xmm7, %xmm7
+ vpslld $0x01, %xmm6, %xmm6
+ vpsrldq $12, %xmm0, %xmm2
+ vpslldq $4, %xmm0, %xmm0
+ vpslldq $4, %xmm1, %xmm1
+ vpor %xmm2, %xmm6, %xmm6
+ vpor %xmm0, %xmm7, %xmm7
+ vpor %xmm1, %xmm6, %xmm6
+ vpslld $31, %xmm7, %xmm0
+ vpslld $30, %xmm7, %xmm1
+ vpslld $25, %xmm7, %xmm2
+ vpxor %xmm1, %xmm0, %xmm0
+ vpxor %xmm2, %xmm0, %xmm0
+ vmovdqa %xmm0, %xmm1
+ vpsrldq $4, %xmm1, %xmm1
+ vpslldq $12, %xmm0, %xmm0
+ vpxor %xmm0, %xmm7, %xmm7
+ vpsrld $0x01, %xmm7, %xmm2
+ vpsrld $2, %xmm7, %xmm3
+ vpsrld $7, %xmm7, %xmm0
+ vpxor %xmm3, %xmm2, %xmm2
+ vpxor %xmm0, %xmm2, %xmm2
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm7, %xmm2, %xmm2
+ vpxor %xmm2, %xmm6, %xmm6
+L_AES_GCM_decrypt_avx512_calc_aad_done:
+ # Calculate counter and H
+ vpsrlq $63, %xmm5, %xmm9
+ vpsllq $0x01, %xmm5, %xmm8
+ vpslldq $8, %xmm9, %xmm9
+ vpor %xmm9, %xmm8, %xmm8
+ vpshufd $0xff, %xmm5, %xmm5
+ vpsrad $31, %xmm5, %xmm5
+ vpshufb L_avx512_aes_gcm_bswap_epi64(%rip), %xmm4, %xmm4
+ vpand L_avx512_aes_gcm_mod2_128(%rip), %xmm5, %xmm5
+ vpaddd L_avx512_aes_gcm_one(%rip), %xmm4, %xmm4
+ vpxor %xmm8, %xmm5, %xmm5
+ vmovdqu %xmm4, 1024(%rsp)
+ xorl %ebx, %ebx
+ cmpl $0x100, %r9d
+ jl L_AES_GCM_decrypt_avx512_done_128
+ vmovdqa %xmm6, %xmm2
+ # H ^ 1
+ vmovdqu %xmm5, (%rsp)
+ # H ^ 2
+ vpclmulqdq $0x00, %xmm5, %xmm5, %xmm8
+ vpclmulqdq $0x11, %xmm5, %xmm5, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm0
+ vmovdqu %xmm0, 16(%rsp)
+ # H ^ 3
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm5, %xmm9
+ vpxor %xmm5, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm0, %xmm10
+ vpxor %xmm0, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm5, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm5, %xmm0, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm1
+ vmovdqu %xmm1, 32(%rsp)
+ # H ^ 4
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm3
+ vmovdqu %xmm3, 48(%rsp)
+ # H ^ 5
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 64(%rsp)
+ # H ^ 6
+ vpclmulqdq $0x00, %xmm1, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm1, %xmm1, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 80(%rsp)
+ # H ^ 7
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm1, %xmm9
+ vpxor %xmm1, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm3, %xmm10
+ vpxor %xmm3, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm1, %xmm3, %xmm8
+ vpclmulqdq $0x11, %xmm1, %xmm3, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 96(%rsp)
+ # H ^ 8
+ vpclmulqdq $0x00, %xmm3, %xmm3, %xmm8
+ vpclmulqdq $0x11, %xmm3, %xmm3, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 112(%rsp)
+ # H ^ 9
+ vmovdqu 48(%rsp), %xmm0
+ vmovdqu 64(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 128(%rsp)
+ # H ^ 10
+ vmovdqu 64(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 144(%rsp)
+ # H ^ 11
+ vmovdqu 64(%rsp), %xmm0
+ vmovdqu 80(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 160(%rsp)
+ # H ^ 12
+ vmovdqu 80(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 176(%rsp)
+ # H ^ 13
+ vmovdqu 80(%rsp), %xmm0
+ vmovdqu 96(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 192(%rsp)
+ # H ^ 14
+ vmovdqu 96(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 208(%rsp)
+ # H ^ 15
+ vmovdqu 96(%rsp), %xmm0
+ vmovdqu 112(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 224(%rsp)
+ # H ^ 16
+ vmovdqu 112(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 240(%rsp)
+ cmpl $0x200, %r9d
+ jl L_AES_GCM_decrypt_avx512_no_ext
+ # H ^ 17
+ vmovdqu 112(%rsp), %xmm0
+ vmovdqu 128(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 256(%rsp)
+ # H ^ 18
+ vmovdqu 128(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 272(%rsp)
+ # H ^ 19
+ vmovdqu 128(%rsp), %xmm0
+ vmovdqu 144(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 288(%rsp)
+ # H ^ 20
+ vmovdqu 144(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 304(%rsp)
+ # H ^ 21
+ vmovdqu 144(%rsp), %xmm0
+ vmovdqu 160(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 320(%rsp)
+ # H ^ 22
+ vmovdqu 160(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 336(%rsp)
+ # H ^ 23
+ vmovdqu 160(%rsp), %xmm0
+ vmovdqu 176(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 352(%rsp)
+ # H ^ 24
+ vmovdqu 176(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 368(%rsp)
+ # H ^ 25
+ vmovdqu 176(%rsp), %xmm0
+ vmovdqu 192(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 384(%rsp)
+ # H ^ 26
+ vmovdqu 192(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 400(%rsp)
+ # H ^ 27
+ vmovdqu 192(%rsp), %xmm0
+ vmovdqu 208(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 416(%rsp)
+ # H ^ 28
+ vmovdqu 208(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 432(%rsp)
+ # H ^ 29
+ vmovdqu 208(%rsp), %xmm0
+ vmovdqu 224(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 448(%rsp)
+ # H ^ 30
+ vmovdqu 224(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 464(%rsp)
+ # H ^ 31
+ vmovdqu 224(%rsp), %xmm0
+ vmovdqu 240(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 480(%rsp)
+ # H ^ 32
+ vmovdqu 240(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 496(%rsp)
+L_AES_GCM_decrypt_avx512_no_ext:
+ vbroadcasti32x4 L_avx512_aes_gcm_bswap_epi64(%rip), %zmm22
+ vbroadcasti32x4 L_avx512_aes_gcm_bswap_mask(%rip), %zmm30
+ vbroadcasti32x4 L_avx512_aes_gcm_mod2_128(%rip), %zmm31
+ vbroadcasti32x4 (%r15), %zmm9
+ vbroadcasti32x4 16(%r15), %zmm10
+ vbroadcasti32x4 32(%r15), %zmm11
+ vbroadcasti32x4 48(%r15), %zmm12
+ vbroadcasti32x4 64(%r15), %zmm13
+ vbroadcasti32x4 80(%r15), %zmm14
+ vbroadcasti32x4 96(%r15), %zmm15
+ vbroadcasti32x4 112(%r15), %zmm1
+ vbroadcasti32x4 128(%r15), %zmm2
+ vbroadcasti32x4 144(%r15), %zmm3
+ cmpl $0x200, %r9d
+ jl L_AES_GCM_decrypt_avx512_no_windows
+ movl %r9d, %r13d
+ andl $0xfffffe00, %r13d
+ vmovdqu64 448(%rsp), %zmm23
+ vshufi64x2 $27, %zmm23, %zmm23, %zmm23
+ vmovdqu64 384(%rsp), %zmm24
+ vshufi64x2 $27, %zmm24, %zmm24, %zmm24
+ vmovdqu64 320(%rsp), %zmm25
+ vshufi64x2 $27, %zmm25, %zmm25, %zmm25
+ vmovdqu64 256(%rsp), %zmm26
+ vshufi64x2 $27, %zmm26, %zmm26, %zmm26
+ vmovdqu64 %zmm23, 512(%rsp)
+ vmovdqu64 %zmm24, 576(%rsp)
+ vmovdqu64 %zmm25, 640(%rsp)
+ vmovdqu64 %zmm26, 704(%rsp)
+ vmovdqu64 192(%rsp), %zmm23
+ vshufi64x2 $27, %zmm23, %zmm23, %zmm23
+ vmovdqu64 128(%rsp), %zmm24
+ vshufi64x2 $27, %zmm24, %zmm24, %zmm24
+ vmovdqu64 64(%rsp), %zmm25
+ vshufi64x2 $27, %zmm25, %zmm25, %zmm25
+ vmovdqu64 (%rsp), %zmm26
+ vshufi64x2 $27, %zmm26, %zmm26, %zmm26
+ vmovdqu64 %zmm23, 768(%rsp)
+ vmovdqu64 %zmm24, 832(%rsp)
+ vmovdqu64 %zmm25, 896(%rsp)
+ vmovdqu64 %zmm26, 960(%rsp)
+ # 512 bytes of input
+ xorl %r12d, %r12d
+ leaq (%rdi,%rbx,1), %rax
+ vpxorq %zmm20, %zmm20, %zmm20
+ vinserti32x4 $0x00, %xmm6, %zmm20, %zmm20
+ vmovdqu64 512(%rsp), %zmm23
+ vmovdqu64 576(%rsp), %zmm24
+ vmovdqu64 640(%rsp), %zmm25
+ vmovdqu64 704(%rsp), %zmm26
+ vmovdqu64 (%rax), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpxorq %zmm20, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm23, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19
+ vmovdqa64 %zmm16, %zmm27
+ vpxorq %zmm17, %zmm18, %zmm28
+ vmovdqa64 %zmm19, %zmm29
+ vmovdqu64 64(%rax), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm24, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vmovdqu64 128(%rax), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm25, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vmovdqu64 192(%rax), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm26, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vmovdqu64 768(%rsp), %zmm23
+ vmovdqu64 832(%rsp), %zmm24
+ vmovdqu64 896(%rsp), %zmm25
+ vmovdqu64 960(%rsp), %zmm26
+ vmovdqu64 256(%rax), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm23, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vmovdqu64 320(%rax), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm24, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vmovdqu64 384(%rax), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm25, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vmovdqu64 448(%rax), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm26, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vpclmulqdq $0x01, %zmm27, %zmm31, %zmm21
+ vpshufd $0x4e, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm21, %zmm27, %zmm28
+ vpclmulqdq $0x01, %zmm28, %zmm31, %zmm21
+ vpshufd $0x4e, %zmm28, %zmm28
+ vpternlogq $0x96, %zmm21, %zmm28, %zmm29
+ vextracti32x4 $0x01, %zmm29, %xmm0
+ vextracti32x4 $2, %zmm29, %xmm4
+ vextracti32x4 $3, %zmm29, %xmm5
+ vpxorq %xmm0, %xmm29, %xmm6
+ vpternlogq $0x96, %xmm4, %xmm5, %xmm6
+ addl $0x200, %ebx
+ cmpl %r13d, %ebx
+ jge L_AES_GCM_decrypt_avx512_last_aes
+L_AES_GCM_decrypt_avx512_win_loop:
+ leaq (%rdi,%rbx,1), %rax
+ vpxorq %zmm21, %zmm21, %zmm21
+ vinserti32x4 $0x00, %xmm6, %zmm21, %zmm21
+ vbroadcasti32x4 1024(%rsp), %zmm20
+ vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16
+ vpshufb %zmm22, %zmm16, %zmm16
+ vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17
+ vpshufb %zmm22, %zmm17, %zmm17
+ vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18
+ vpshufb %zmm22, %zmm18, %zmm18
+ vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19
+ vpshufb %zmm22, %zmm19, %zmm19
+ vmovdqu 1024(%rsp), %xmm8
+ vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8
+ vmovdqu %xmm8, 1024(%rsp)
+ vpxorq %zmm9, %zmm16, %zmm16
+ vpxorq %zmm9, %zmm17, %zmm17
+ vpxorq %zmm9, %zmm18, %zmm18
+ vpxorq %zmm9, %zmm19, %zmm19
+ vmovdqu64 (%rax), %zmm31
+ vpshufb %zmm30, %zmm31, %zmm31
+ vpxorq %zmm21, %zmm31, %zmm31
+ vaesenc %zmm10, %zmm16, %zmm16
+ vaesenc %zmm10, %zmm17, %zmm17
+ vaesenc %zmm10, %zmm18, %zmm18
+ vaesenc %zmm10, %zmm19, %zmm19
+ vpclmulqdq $0x00, 512(%rsp), %zmm31, %zmm23
+ vpclmulqdq $0x01, 512(%rsp), %zmm31, %zmm24
+ vpclmulqdq $16, 512(%rsp), %zmm31, %zmm25
+ vpclmulqdq $0x11, 512(%rsp), %zmm31, %zmm26
+ vmovdqa64 %zmm23, %zmm27
+ vpxorq %zmm24, %zmm25, %zmm28
+ vmovdqa64 %zmm26, %zmm29
+ vaesenc %zmm11, %zmm16, %zmm16
+ vaesenc %zmm11, %zmm17, %zmm17
+ vaesenc %zmm11, %zmm18, %zmm18
+ vaesenc %zmm11, %zmm19, %zmm19
+ vmovdqu64 64(%rax), %zmm31
+ vpshufb %zmm30, %zmm31, %zmm31
+ vaesenc %zmm12, %zmm16, %zmm16
+ vaesenc %zmm12, %zmm17, %zmm17
+ vaesenc %zmm12, %zmm18, %zmm18
+ vaesenc %zmm12, %zmm19, %zmm19
+ vpclmulqdq $0x00, 576(%rsp), %zmm31, %zmm23
+ vpclmulqdq $0x01, 576(%rsp), %zmm31, %zmm24
+ vpclmulqdq $16, 576(%rsp), %zmm31, %zmm25
+ vpclmulqdq $0x11, 576(%rsp), %zmm31, %zmm26
+ vpxorq %zmm23, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm24, %zmm25, %zmm28
+ vpxorq %zmm26, %zmm29, %zmm29
+ vaesenc %zmm13, %zmm16, %zmm16
+ vaesenc %zmm13, %zmm17, %zmm17
+ vaesenc %zmm13, %zmm18, %zmm18
+ vaesenc %zmm13, %zmm19, %zmm19
+ vmovdqu64 128(%rax), %zmm31
+ vpshufb %zmm30, %zmm31, %zmm31
+ vaesenc %zmm14, %zmm16, %zmm16
+ vaesenc %zmm14, %zmm17, %zmm17
+ vaesenc %zmm14, %zmm18, %zmm18
+ vaesenc %zmm14, %zmm19, %zmm19
+ vpclmulqdq $0x00, 640(%rsp), %zmm31, %zmm23
+ vpclmulqdq $0x01, 640(%rsp), %zmm31, %zmm24
+ vpclmulqdq $16, 640(%rsp), %zmm31, %zmm25
+ vpclmulqdq $0x11, 640(%rsp), %zmm31, %zmm26
+ vpxorq %zmm23, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm24, %zmm25, %zmm28
+ vpxorq %zmm26, %zmm29, %zmm29
+ vaesenc %zmm15, %zmm16, %zmm16
+ vaesenc %zmm15, %zmm17, %zmm17
+ vaesenc %zmm15, %zmm18, %zmm18
+ vaesenc %zmm15, %zmm19, %zmm19
+ vmovdqu64 192(%rax), %zmm31
+ vpshufb %zmm30, %zmm31, %zmm31
+ vaesenc %zmm1, %zmm16, %zmm16
+ vaesenc %zmm1, %zmm17, %zmm17
+ vaesenc %zmm1, %zmm18, %zmm18
+ vaesenc %zmm1, %zmm19, %zmm19
+ vpclmulqdq $0x00, 704(%rsp), %zmm31, %zmm23
+ vpclmulqdq $0x01, 704(%rsp), %zmm31, %zmm24
+ vpclmulqdq $16, 704(%rsp), %zmm31, %zmm25
+ vpclmulqdq $0x11, 704(%rsp), %zmm31, %zmm26
+ vpxorq %zmm23, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm24, %zmm25, %zmm28
+ vpxorq %zmm26, %zmm29, %zmm29
+ vaesenc %zmm2, %zmm16, %zmm16
+ vaesenc %zmm2, %zmm17, %zmm17
+ vaesenc %zmm2, %zmm18, %zmm18
+ vaesenc %zmm2, %zmm19, %zmm19
+ vaesenc %zmm3, %zmm16, %zmm16
+ vaesenc %zmm3, %zmm17, %zmm17
+ vaesenc %zmm3, %zmm18, %zmm18
+ vaesenc %zmm3, %zmm19, %zmm19
+ cmpl $11, %r10d
+ vbroadcasti32x4 160(%r15), %zmm20
+ jl L_AES_GCM_decrypt_avx512_a_il_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 176(%r15), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ cmpl $13, %r10d
+ vbroadcasti32x4 192(%r15), %zmm20
+ jl L_AES_GCM_decrypt_avx512_a_il_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 208(%r15), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 224(%r15), %zmm20
+L_AES_GCM_decrypt_avx512_a_il_last:
+ vaesenclast %zmm20, %zmm16, %zmm16
+ vaesenclast %zmm20, %zmm17, %zmm17
+ vaesenclast %zmm20, %zmm18, %zmm18
+ vaesenclast %zmm20, %zmm19, %zmm19
+ leaq (%rdi,%r12,1), %rcx
+ leaq (%rsi,%r12,1), %rdx
+ vmovdqu64 (%rcx), %zmm21
+ vpxorq %zmm21, %zmm16, %zmm16
+ vmovdqu64 %zmm16, (%rdx)
+ vmovdqu64 64(%rcx), %zmm21
+ vpxorq %zmm21, %zmm17, %zmm17
+ vmovdqu64 %zmm17, 64(%rdx)
+ vmovdqu64 128(%rcx), %zmm21
+ vpxorq %zmm21, %zmm18, %zmm18
+ vmovdqu64 %zmm18, 128(%rdx)
+ vmovdqu64 192(%rcx), %zmm21
+ vpxorq %zmm21, %zmm19, %zmm19
+ vmovdqu64 %zmm19, 192(%rdx)
+ addl $0x100, %r12d
+ vbroadcasti32x4 1024(%rsp), %zmm20
+ vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16
+ vpshufb %zmm22, %zmm16, %zmm16
+ vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17
+ vpshufb %zmm22, %zmm17, %zmm17
+ vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18
+ vpshufb %zmm22, %zmm18, %zmm18
+ vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19
+ vpshufb %zmm22, %zmm19, %zmm19
+ vmovdqu 1024(%rsp), %xmm8
+ vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8
+ vmovdqu %xmm8, 1024(%rsp)
+ vpxorq %zmm9, %zmm16, %zmm16
+ vpxorq %zmm9, %zmm17, %zmm17
+ vpxorq %zmm9, %zmm18, %zmm18
+ vpxorq %zmm9, %zmm19, %zmm19
+ vmovdqu64 256(%rax), %zmm31
+ vpshufb %zmm30, %zmm31, %zmm31
+ vaesenc %zmm10, %zmm16, %zmm16
+ vaesenc %zmm10, %zmm17, %zmm17
+ vaesenc %zmm10, %zmm18, %zmm18
+ vaesenc %zmm10, %zmm19, %zmm19
+ vpclmulqdq $0x00, 768(%rsp), %zmm31, %zmm23
+ vpclmulqdq $0x01, 768(%rsp), %zmm31, %zmm24
+ vpclmulqdq $16, 768(%rsp), %zmm31, %zmm25
+ vpclmulqdq $0x11, 768(%rsp), %zmm31, %zmm26
+ vpxorq %zmm23, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm24, %zmm25, %zmm28
+ vpxorq %zmm26, %zmm29, %zmm29
+ vaesenc %zmm11, %zmm16, %zmm16
+ vaesenc %zmm11, %zmm17, %zmm17
+ vaesenc %zmm11, %zmm18, %zmm18
+ vaesenc %zmm11, %zmm19, %zmm19
+ vmovdqu64 320(%rax), %zmm31
+ vpshufb %zmm30, %zmm31, %zmm31
+ vaesenc %zmm12, %zmm16, %zmm16
+ vaesenc %zmm12, %zmm17, %zmm17
+ vaesenc %zmm12, %zmm18, %zmm18
+ vaesenc %zmm12, %zmm19, %zmm19
+ vpclmulqdq $0x00, 832(%rsp), %zmm31, %zmm23
+ vpclmulqdq $0x01, 832(%rsp), %zmm31, %zmm24
+ vpclmulqdq $16, 832(%rsp), %zmm31, %zmm25
+ vpclmulqdq $0x11, 832(%rsp), %zmm31, %zmm26
+ vpxorq %zmm23, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm24, %zmm25, %zmm28
+ vpxorq %zmm26, %zmm29, %zmm29
+ vaesenc %zmm13, %zmm16, %zmm16
+ vaesenc %zmm13, %zmm17, %zmm17
+ vaesenc %zmm13, %zmm18, %zmm18
+ vaesenc %zmm13, %zmm19, %zmm19
+ vmovdqu64 384(%rax), %zmm31
+ vpshufb %zmm30, %zmm31, %zmm31
+ vaesenc %zmm14, %zmm16, %zmm16
+ vaesenc %zmm14, %zmm17, %zmm17
+ vaesenc %zmm14, %zmm18, %zmm18
+ vaesenc %zmm14, %zmm19, %zmm19
+ vpclmulqdq $0x00, 896(%rsp), %zmm31, %zmm23
+ vpclmulqdq $0x01, 896(%rsp), %zmm31, %zmm24
+ vpclmulqdq $16, 896(%rsp), %zmm31, %zmm25
+ vpclmulqdq $0x11, 896(%rsp), %zmm31, %zmm26
+ vpxorq %zmm23, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm24, %zmm25, %zmm28
+ vpxorq %zmm26, %zmm29, %zmm29
+ vaesenc %zmm15, %zmm16, %zmm16
+ vaesenc %zmm15, %zmm17, %zmm17
+ vaesenc %zmm15, %zmm18, %zmm18
+ vaesenc %zmm15, %zmm19, %zmm19
+ vmovdqu64 448(%rax), %zmm31
+ vpshufb %zmm30, %zmm31, %zmm31
+ vaesenc %zmm1, %zmm16, %zmm16
+ vaesenc %zmm1, %zmm17, %zmm17
+ vaesenc %zmm1, %zmm18, %zmm18
+ vaesenc %zmm1, %zmm19, %zmm19
+ vpclmulqdq $0x00, 960(%rsp), %zmm31, %zmm23
+ vpclmulqdq $0x01, 960(%rsp), %zmm31, %zmm24
+ vpclmulqdq $16, 960(%rsp), %zmm31, %zmm25
+ vpclmulqdq $0x11, 960(%rsp), %zmm31, %zmm26
+ vpxorq %zmm23, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm24, %zmm25, %zmm28
+ vpxorq %zmm26, %zmm29, %zmm29
+ vaesenc %zmm2, %zmm16, %zmm16
+ vaesenc %zmm2, %zmm17, %zmm17
+ vaesenc %zmm2, %zmm18, %zmm18
+ vaesenc %zmm2, %zmm19, %zmm19
+ vaesenc %zmm3, %zmm16, %zmm16
+ vaesenc %zmm3, %zmm17, %zmm17
+ vaesenc %zmm3, %zmm18, %zmm18
+ vaesenc %zmm3, %zmm19, %zmm19
+ cmpl $11, %r10d
+ vbroadcasti32x4 160(%r15), %zmm20
+ jl L_AES_GCM_decrypt_avx512_b_il_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 176(%r15), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ cmpl $13, %r10d
+ vbroadcasti32x4 192(%r15), %zmm20
+ jl L_AES_GCM_decrypt_avx512_b_il_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 208(%r15), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 224(%r15), %zmm20
+L_AES_GCM_decrypt_avx512_b_il_last:
+ vaesenclast %zmm20, %zmm16, %zmm16
+ vaesenclast %zmm20, %zmm17, %zmm17
+ vaesenclast %zmm20, %zmm18, %zmm18
+ vaesenclast %zmm20, %zmm19, %zmm19
+ leaq (%rdi,%r12,1), %rcx
+ leaq (%rsi,%r12,1), %rdx
+ vmovdqu64 (%rcx), %zmm21
+ vpxorq %zmm21, %zmm16, %zmm16
+ vmovdqu64 %zmm16, (%rdx)
+ vmovdqu64 64(%rcx), %zmm21
+ vpxorq %zmm21, %zmm17, %zmm17
+ vmovdqu64 %zmm17, 64(%rdx)
+ vmovdqu64 128(%rcx), %zmm21
+ vpxorq %zmm21, %zmm18, %zmm18
+ vmovdqu64 %zmm18, 128(%rdx)
+ vmovdqu64 192(%rcx), %zmm21
+ vpxorq %zmm21, %zmm19, %zmm19
+ vmovdqu64 %zmm19, 192(%rdx)
+ addl $0x100, %r12d
+ vbroadcasti32x4 L_avx512_aes_gcm_mod2_128(%rip), %zmm31
+ vpclmulqdq $0x01, %zmm27, %zmm31, %zmm23
+ vpshufd $0x4e, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm23, %zmm27, %zmm28
+ vpclmulqdq $0x01, %zmm28, %zmm31, %zmm23
+ vpshufd $0x4e, %zmm28, %zmm28
+ vpternlogq $0x96, %zmm23, %zmm28, %zmm29
+ vextracti32x4 $0x01, %zmm29, %xmm0
+ vextracti32x4 $2, %zmm29, %xmm4
+ vextracti32x4 $3, %zmm29, %xmm5
+ vpxorq %xmm0, %xmm29, %xmm6
+ vpternlogq $0x96, %xmm4, %xmm5, %xmm6
+ addl $0x200, %ebx
+ cmpl %r13d, %ebx
+ jl L_AES_GCM_decrypt_avx512_win_loop
+L_AES_GCM_decrypt_avx512_last_aes:
+ vbroadcasti32x4 1024(%rsp), %zmm20
+ vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16
+ vpshufb %zmm22, %zmm16, %zmm16
+ vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17
+ vpshufb %zmm22, %zmm17, %zmm17
+ vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18
+ vpshufb %zmm22, %zmm18, %zmm18
+ vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19
+ vpshufb %zmm22, %zmm19, %zmm19
+ vmovdqu 1024(%rsp), %xmm8
+ vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8
+ vmovdqu %xmm8, 1024(%rsp)
+ vpxorq %zmm9, %zmm16, %zmm16
+ vpxorq %zmm9, %zmm17, %zmm17
+ vpxorq %zmm9, %zmm18, %zmm18
+ vpxorq %zmm9, %zmm19, %zmm19
+ vaesenc %zmm10, %zmm16, %zmm16
+ vaesenc %zmm10, %zmm17, %zmm17
+ vaesenc %zmm10, %zmm18, %zmm18
+ vaesenc %zmm10, %zmm19, %zmm19
+ vaesenc %zmm11, %zmm16, %zmm16
+ vaesenc %zmm11, %zmm17, %zmm17
+ vaesenc %zmm11, %zmm18, %zmm18
+ vaesenc %zmm11, %zmm19, %zmm19
+ vaesenc %zmm12, %zmm16, %zmm16
+ vaesenc %zmm12, %zmm17, %zmm17
+ vaesenc %zmm12, %zmm18, %zmm18
+ vaesenc %zmm12, %zmm19, %zmm19
+ vaesenc %zmm13, %zmm16, %zmm16
+ vaesenc %zmm13, %zmm17, %zmm17
+ vaesenc %zmm13, %zmm18, %zmm18
+ vaesenc %zmm13, %zmm19, %zmm19
+ vaesenc %zmm14, %zmm16, %zmm16
+ vaesenc %zmm14, %zmm17, %zmm17
+ vaesenc %zmm14, %zmm18, %zmm18
+ vaesenc %zmm14, %zmm19, %zmm19
+ vaesenc %zmm15, %zmm16, %zmm16
+ vaesenc %zmm15, %zmm17, %zmm17
+ vaesenc %zmm15, %zmm18, %zmm18
+ vaesenc %zmm15, %zmm19, %zmm19
+ vaesenc %zmm1, %zmm16, %zmm16
+ vaesenc %zmm1, %zmm17, %zmm17
+ vaesenc %zmm1, %zmm18, %zmm18
+ vaesenc %zmm1, %zmm19, %zmm19
+ vaesenc %zmm2, %zmm16, %zmm16
+ vaesenc %zmm2, %zmm17, %zmm17
+ vaesenc %zmm2, %zmm18, %zmm18
+ vaesenc %zmm2, %zmm19, %zmm19
+ vaesenc %zmm3, %zmm16, %zmm16
+ vaesenc %zmm3, %zmm17, %zmm17
+ vaesenc %zmm3, %zmm18, %zmm18
+ vaesenc %zmm3, %zmm19, %zmm19
+ cmpl $11, %r10d
+ vbroadcasti32x4 160(%r15), %zmm20
+ jl L_AES_GCM_decrypt_avx512_l1_avx512_ctr16_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 176(%r15), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ cmpl $13, %r10d
+ vbroadcasti32x4 192(%r15), %zmm20
+ jl L_AES_GCM_decrypt_avx512_l1_avx512_ctr16_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 208(%r15), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 224(%r15), %zmm20
+L_AES_GCM_decrypt_avx512_l1_avx512_ctr16_last:
+ vaesenclast %zmm20, %zmm16, %zmm16
+ vaesenclast %zmm20, %zmm17, %zmm17
+ vaesenclast %zmm20, %zmm18, %zmm18
+ vaesenclast %zmm20, %zmm19, %zmm19
+ leaq (%rdi,%r12,1), %rcx
+ leaq (%rsi,%r12,1), %rdx
+ vmovdqu64 (%rcx), %zmm21
+ vpxorq %zmm21, %zmm16, %zmm16
+ vmovdqu64 %zmm16, (%rdx)
+ vmovdqu64 64(%rcx), %zmm21
+ vpxorq %zmm21, %zmm17, %zmm17
+ vmovdqu64 %zmm17, 64(%rdx)
+ vmovdqu64 128(%rcx), %zmm21
+ vpxorq %zmm21, %zmm18, %zmm18
+ vmovdqu64 %zmm18, 128(%rdx)
+ vmovdqu64 192(%rcx), %zmm21
+ vpxorq %zmm21, %zmm19, %zmm19
+ vmovdqu64 %zmm19, 192(%rdx)
+ addl $0x100, %r12d
+ vbroadcasti32x4 1024(%rsp), %zmm20
+ vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16
+ vpshufb %zmm22, %zmm16, %zmm16
+ vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17
+ vpshufb %zmm22, %zmm17, %zmm17
+ vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18
+ vpshufb %zmm22, %zmm18, %zmm18
+ vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19
+ vpshufb %zmm22, %zmm19, %zmm19
+ vmovdqu 1024(%rsp), %xmm8
+ vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8
+ vmovdqu %xmm8, 1024(%rsp)
+ vpxorq %zmm9, %zmm16, %zmm16
+ vpxorq %zmm9, %zmm17, %zmm17
+ vpxorq %zmm9, %zmm18, %zmm18
+ vpxorq %zmm9, %zmm19, %zmm19
+ vaesenc %zmm10, %zmm16, %zmm16
+ vaesenc %zmm10, %zmm17, %zmm17
+ vaesenc %zmm10, %zmm18, %zmm18
+ vaesenc %zmm10, %zmm19, %zmm19
+ vaesenc %zmm11, %zmm16, %zmm16
+ vaesenc %zmm11, %zmm17, %zmm17
+ vaesenc %zmm11, %zmm18, %zmm18
+ vaesenc %zmm11, %zmm19, %zmm19
+ vaesenc %zmm12, %zmm16, %zmm16
+ vaesenc %zmm12, %zmm17, %zmm17
+ vaesenc %zmm12, %zmm18, %zmm18
+ vaesenc %zmm12, %zmm19, %zmm19
+ vaesenc %zmm13, %zmm16, %zmm16
+ vaesenc %zmm13, %zmm17, %zmm17
+ vaesenc %zmm13, %zmm18, %zmm18
+ vaesenc %zmm13, %zmm19, %zmm19
+ vaesenc %zmm14, %zmm16, %zmm16
+ vaesenc %zmm14, %zmm17, %zmm17
+ vaesenc %zmm14, %zmm18, %zmm18
+ vaesenc %zmm14, %zmm19, %zmm19
+ vaesenc %zmm15, %zmm16, %zmm16
+ vaesenc %zmm15, %zmm17, %zmm17
+ vaesenc %zmm15, %zmm18, %zmm18
+ vaesenc %zmm15, %zmm19, %zmm19
+ vaesenc %zmm1, %zmm16, %zmm16
+ vaesenc %zmm1, %zmm17, %zmm17
+ vaesenc %zmm1, %zmm18, %zmm18
+ vaesenc %zmm1, %zmm19, %zmm19
+ vaesenc %zmm2, %zmm16, %zmm16
+ vaesenc %zmm2, %zmm17, %zmm17
+ vaesenc %zmm2, %zmm18, %zmm18
+ vaesenc %zmm2, %zmm19, %zmm19
+ vaesenc %zmm3, %zmm16, %zmm16
+ vaesenc %zmm3, %zmm17, %zmm17
+ vaesenc %zmm3, %zmm18, %zmm18
+ vaesenc %zmm3, %zmm19, %zmm19
+ cmpl $11, %r10d
+ vbroadcasti32x4 160(%r15), %zmm20
+ jl L_AES_GCM_decrypt_avx512_l2_avx512_ctr16_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 176(%r15), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ cmpl $13, %r10d
+ vbroadcasti32x4 192(%r15), %zmm20
+ jl L_AES_GCM_decrypt_avx512_l2_avx512_ctr16_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 208(%r15), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 224(%r15), %zmm20
+L_AES_GCM_decrypt_avx512_l2_avx512_ctr16_last:
+ vaesenclast %zmm20, %zmm16, %zmm16
+ vaesenclast %zmm20, %zmm17, %zmm17
+ vaesenclast %zmm20, %zmm18, %zmm18
+ vaesenclast %zmm20, %zmm19, %zmm19
+ leaq (%rdi,%r12,1), %rcx
+ leaq (%rsi,%r12,1), %rdx
+ vmovdqu64 (%rcx), %zmm21
+ vpxorq %zmm21, %zmm16, %zmm16
+ vmovdqu64 %zmm16, (%rdx)
+ vmovdqu64 64(%rcx), %zmm21
+ vpxorq %zmm21, %zmm17, %zmm17
+ vmovdqu64 %zmm17, 64(%rdx)
+ vmovdqu64 128(%rcx), %zmm21
+ vpxorq %zmm21, %zmm18, %zmm18
+ vmovdqu64 %zmm18, 128(%rdx)
+ vmovdqu64 192(%rcx), %zmm21
+ vpxorq %zmm21, %zmm19, %zmm19
+ vmovdqu64 %zmm19, 192(%rdx)
+ addl $0x100, %r12d
+L_AES_GCM_decrypt_avx512_no_windows:
+ vmovdqu64 192(%rsp), %zmm23
+ vshufi64x2 $27, %zmm23, %zmm23, %zmm23
+ vmovdqu64 128(%rsp), %zmm24
+ vshufi64x2 $27, %zmm24, %zmm24, %zmm24
+ vmovdqu64 64(%rsp), %zmm25
+ vshufi64x2 $27, %zmm25, %zmm25, %zmm25
+ vmovdqu64 (%rsp), %zmm26
+ vshufi64x2 $27, %zmm26, %zmm26, %zmm26
+ movl %r9d, %r13d
+ andl $0xffffff00, %r13d
+ cmpl %r13d, %ebx
+ jge L_AES_GCM_decrypt_avx512_after_256
+ # 256 bytes of input
+ leaq (%rdi,%rbx,1), %rax
+ vpxorq %zmm20, %zmm20, %zmm20
+ vinserti32x4 $0x00, %xmm6, %zmm20, %zmm20
+ vmovdqu64 (%rax), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpxorq %zmm20, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm23, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19
+ vmovdqa64 %zmm16, %zmm27
+ vpxorq %zmm17, %zmm18, %zmm28
+ vmovdqa64 %zmm19, %zmm29
+ vmovdqu64 64(%rax), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm24, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vmovdqu64 128(%rax), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm25, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vmovdqu64 192(%rax), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm26, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vpclmulqdq $0x01, %zmm27, %zmm31, %zmm21
+ vpshufd $0x4e, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm21, %zmm27, %zmm28
+ vpclmulqdq $0x01, %zmm28, %zmm31, %zmm21
+ vpshufd $0x4e, %zmm28, %zmm28
+ vpternlogq $0x96, %zmm21, %zmm28, %zmm29
+ vextracti32x4 $0x01, %zmm29, %xmm0
+ vextracti32x4 $2, %zmm29, %xmm4
+ vextracti32x4 $3, %zmm29, %xmm5
+ vpxorq %xmm0, %xmm29, %xmm6
+ vpternlogq $0x96, %xmm4, %xmm5, %xmm6
+ vbroadcasti32x4 1024(%rsp), %zmm20
+ vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16
+ vpshufb %zmm22, %zmm16, %zmm16
+ vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17
+ vpshufb %zmm22, %zmm17, %zmm17
+ vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18
+ vpshufb %zmm22, %zmm18, %zmm18
+ vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19
+ vpshufb %zmm22, %zmm19, %zmm19
+ vmovdqu 1024(%rsp), %xmm8
+ vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8
+ vmovdqu %xmm8, 1024(%rsp)
+ vpxorq %zmm9, %zmm16, %zmm16
+ vpxorq %zmm9, %zmm17, %zmm17
+ vpxorq %zmm9, %zmm18, %zmm18
+ vpxorq %zmm9, %zmm19, %zmm19
+ vaesenc %zmm10, %zmm16, %zmm16
+ vaesenc %zmm10, %zmm17, %zmm17
+ vaesenc %zmm10, %zmm18, %zmm18
+ vaesenc %zmm10, %zmm19, %zmm19
+ vaesenc %zmm11, %zmm16, %zmm16
+ vaesenc %zmm11, %zmm17, %zmm17
+ vaesenc %zmm11, %zmm18, %zmm18
+ vaesenc %zmm11, %zmm19, %zmm19
+ vaesenc %zmm12, %zmm16, %zmm16
+ vaesenc %zmm12, %zmm17, %zmm17
+ vaesenc %zmm12, %zmm18, %zmm18
+ vaesenc %zmm12, %zmm19, %zmm19
+ vaesenc %zmm13, %zmm16, %zmm16
+ vaesenc %zmm13, %zmm17, %zmm17
+ vaesenc %zmm13, %zmm18, %zmm18
+ vaesenc %zmm13, %zmm19, %zmm19
+ vaesenc %zmm14, %zmm16, %zmm16
+ vaesenc %zmm14, %zmm17, %zmm17
+ vaesenc %zmm14, %zmm18, %zmm18
+ vaesenc %zmm14, %zmm19, %zmm19
+ vaesenc %zmm15, %zmm16, %zmm16
+ vaesenc %zmm15, %zmm17, %zmm17
+ vaesenc %zmm15, %zmm18, %zmm18
+ vaesenc %zmm15, %zmm19, %zmm19
+ vaesenc %zmm1, %zmm16, %zmm16
+ vaesenc %zmm1, %zmm17, %zmm17
+ vaesenc %zmm1, %zmm18, %zmm18
+ vaesenc %zmm1, %zmm19, %zmm19
+ vaesenc %zmm2, %zmm16, %zmm16
+ vaesenc %zmm2, %zmm17, %zmm17
+ vaesenc %zmm2, %zmm18, %zmm18
+ vaesenc %zmm2, %zmm19, %zmm19
+ vaesenc %zmm3, %zmm16, %zmm16
+ vaesenc %zmm3, %zmm17, %zmm17
+ vaesenc %zmm3, %zmm18, %zmm18
+ vaesenc %zmm3, %zmm19, %zmm19
+ cmpl $11, %r10d
+ vbroadcasti32x4 160(%r15), %zmm20
+ jl L_AES_GCM_decrypt_avx512_t_avx512_ctr16_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 176(%r15), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ cmpl $13, %r10d
+ vbroadcasti32x4 192(%r15), %zmm20
+ jl L_AES_GCM_decrypt_avx512_t_avx512_ctr16_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 208(%r15), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 224(%r15), %zmm20
+L_AES_GCM_decrypt_avx512_t_avx512_ctr16_last:
+ vaesenclast %zmm20, %zmm16, %zmm16
+ vaesenclast %zmm20, %zmm17, %zmm17
+ vaesenclast %zmm20, %zmm18, %zmm18
+ vaesenclast %zmm20, %zmm19, %zmm19
+ leaq (%rdi,%rbx,1), %rcx
+ leaq (%rsi,%rbx,1), %rdx
+ vmovdqu64 (%rcx), %zmm21
+ vpxorq %zmm21, %zmm16, %zmm16
+ vmovdqu64 %zmm16, (%rdx)
+ vmovdqu64 64(%rcx), %zmm21
+ vpxorq %zmm21, %zmm17, %zmm17
+ vmovdqu64 %zmm17, 64(%rdx)
+ vmovdqu64 128(%rcx), %zmm21
+ vpxorq %zmm21, %zmm18, %zmm18
+ vmovdqu64 %zmm18, 128(%rdx)
+ vmovdqu64 192(%rcx), %zmm21
+ vpxorq %zmm21, %zmm19, %zmm19
+ vmovdqu64 %zmm19, 192(%rdx)
+ addl $0x100, %ebx
+L_AES_GCM_decrypt_avx512_after_256:
+ vmovdqu (%rsp), %xmm5
+L_AES_GCM_decrypt_avx512_done_128:
+ movl %r9d, %edx
+ cmpl %edx, %ebx
+ jge L_AES_GCM_decrypt_avx512_done_dec
+ movl %r9d, %r13d
+ andl $0xfffffff0, %r13d
+ cmpl %r13d, %ebx
+ jge L_AES_GCM_decrypt_avx512_last_block_done
+L_AES_GCM_decrypt_avx512_last_block_start:
+ vmovdqu (%rdi,%rbx,1), %xmm13
+ vmovdqa %xmm5, %xmm0
+ vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm13, %xmm1
+ vpxor %xmm6, %xmm1, %xmm1
+ vmovdqu 1024(%rsp), %xmm9
+ vpshufb L_avx512_aes_gcm_bswap_epi64(%rip), %xmm9, %xmm8
+ vpaddd L_avx512_aes_gcm_one(%rip), %xmm9, %xmm9
+ vmovdqu %xmm9, 1024(%rsp)
+ vpxor (%r15), %xmm8, %xmm8
+ vpclmulqdq $16, %xmm0, %xmm1, %xmm10
+ vaesenc 16(%r15), %xmm8, %xmm8
+ vaesenc 32(%r15), %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm0, %xmm1, %xmm11
+ vaesenc 48(%r15), %xmm8, %xmm8
+ vaesenc 64(%r15), %xmm8, %xmm8
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm12
+ vaesenc 80(%r15), %xmm8, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm1
+ vaesenc 96(%r15), %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpslldq $8, %xmm10, %xmm2
+ vpsrldq $8, %xmm10, %xmm10
+ vaesenc 112(%r15), %xmm8, %xmm8
+ vpxor %xmm12, %xmm2, %xmm2
+ vpxor %xmm10, %xmm1, %xmm3
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm0
+ vpclmulqdq $16, %xmm0, %xmm2, %xmm11
+ vaesenc 128(%r15), %xmm8, %xmm8
+ vpshufd $0x4e, %xmm2, %xmm10
+ vpxor %xmm11, %xmm10, %xmm10
+ vpclmulqdq $16, %xmm0, %xmm10, %xmm11
+ vaesenc 144(%r15), %xmm8, %xmm8
+ vpshufd $0x4e, %xmm10, %xmm10
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm3, %xmm10, %xmm6
+ cmpl $11, %r10d
+ vmovdqa 160(%r15), %xmm9
+ jl L_AES_GCM_decrypt_avx512_aesenc_gfmul_last
+ vaesenc %xmm9, %xmm8, %xmm8
+ vaesenc 176(%r15), %xmm8, %xmm8
+ cmpl $13, %r10d
+ vmovdqa 192(%r15), %xmm9
+ jl L_AES_GCM_decrypt_avx512_aesenc_gfmul_last
+ vaesenc %xmm9, %xmm8, %xmm8
+ vaesenc 208(%r15), %xmm8, %xmm8
+ vmovdqa 224(%r15), %xmm9
+L_AES_GCM_decrypt_avx512_aesenc_gfmul_last:
+ vaesenclast %xmm9, %xmm8, %xmm8
+ vmovdqa %xmm13, %xmm0
+ vpxor %xmm0, %xmm8, %xmm8
+ vmovdqu %xmm8, (%rsi,%rbx,1)
+ addl $16, %ebx
+ cmpl %r13d, %ebx
+ jl L_AES_GCM_decrypt_avx512_last_block_start
+L_AES_GCM_decrypt_avx512_last_block_done:
+ movl %r9d, %ecx
+ movl %ecx, %edx
+ andl $15, %ecx
+ jz L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_done
+ vmovdqu 1024(%rsp), %xmm4
+ vpshufb L_avx512_aes_gcm_bswap_epi64(%rip), %xmm4, %xmm4
+ vpxor (%r15), %xmm4, %xmm4
+ vaesenc 16(%r15), %xmm4, %xmm4
+ vaesenc 32(%r15), %xmm4, %xmm4
+ vaesenc 48(%r15), %xmm4, %xmm4
+ vaesenc 64(%r15), %xmm4, %xmm4
+ vaesenc 80(%r15), %xmm4, %xmm4
+ vaesenc 96(%r15), %xmm4, %xmm4
+ vaesenc 112(%r15), %xmm4, %xmm4
+ vaesenc 128(%r15), %xmm4, %xmm4
+ vaesenc 144(%r15), %xmm4, %xmm4
+ cmpl $11, %r10d
+ vmovdqa 160(%r15), %xmm9
+ jl L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_aesenc_avx_last
+ vaesenc %xmm9, %xmm4, %xmm4
+ vaesenc 176(%r15), %xmm4, %xmm4
+ cmpl $13, %r10d
+ vmovdqa 192(%r15), %xmm9
+ jl L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_aesenc_avx_last
+ vaesenc %xmm9, %xmm4, %xmm4
+ vaesenc 208(%r15), %xmm4, %xmm4
+ vmovdqa 224(%r15), %xmm9
+L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_aesenc_avx_last:
+ vaesenclast %xmm9, %xmm4, %xmm4
+ subq $32, %rsp
+ xorl %ecx, %ecx
+ vmovdqu %xmm4, (%rsp)
+ vpxor %xmm0, %xmm0, %xmm0
+ vmovdqu %xmm0, 16(%rsp)
+L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_loop:
+ movzbl (%rdi,%rbx,1), %r13d
+ movb %r13b, 16(%rsp,%rcx,1)
+ xorb (%rsp,%rcx,1), %r13b
+ movb %r13b, (%rsi,%rbx,1)
+ incl %ebx
+ incl %ecx
+ cmpl %edx, %ebx
+ jl L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_loop
+ vmovdqu 16(%rsp), %xmm4
+ addq $32, %rsp
+ vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4
+ vpxor %xmm4, %xmm6, %xmm6
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm5, %xmm9
+ vpxor %xmm5, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm6, %xmm10
+ vpxor %xmm6, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm5, %xmm6, %xmm8
+ vpclmulqdq $0x11, %xmm5, %xmm6, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm6
+L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_done:
+L_AES_GCM_decrypt_avx512_done_dec:
+ movl %r9d, %edx
+ movl %r11d, %ecx
+ shlq $3, %rdx
+ shlq $3, %rcx
+ vmovq %rdx, %xmm0
+ vmovq %rcx, %xmm1
+ vpunpcklqdq %xmm1, %xmm0, %xmm0
+ vpxor %xmm0, %xmm6, %xmm6
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm5, %xmm9
+ vpxor %xmm5, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm6, %xmm10
+ vpxor %xmm6, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm5, %xmm6, %xmm8
+ vpclmulqdq $0x11, %xmm5, %xmm6, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm6
+ vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm6, %xmm6
+ vmovdqu 1040(%rsp), %xmm0
+ vpxor %xmm6, %xmm0, %xmm0
+ cmpl $16, %r14d
+ je L_AES_GCM_decrypt_avx512_cmp_tag_16
+ subq $16, %rsp
+ xorq %rcx, %rcx
+ xorq %rbx, %rbx
+ vmovdqu %xmm0, (%rsp)
+L_AES_GCM_decrypt_avx512_cmp_tag_loop:
+ movzbl (%rsp,%rcx,1), %r13d
+ xorb (%r8,%rcx,1), %r13b
+ orb %r13b, %bl
+ incl %ecx
+ cmpl %r14d, %ecx
+ jne L_AES_GCM_decrypt_avx512_cmp_tag_loop
+ cmpb $0x00, %bl
+ sete %bl
+ addq $16, %rsp
+ xorq %rcx, %rcx
+ jmp L_AES_GCM_decrypt_avx512_cmp_tag_done
+L_AES_GCM_decrypt_avx512_cmp_tag_16:
+ vmovdqu (%r8), %xmm1
+ vpcmpeqb %xmm1, %xmm0, %xmm0
+ vpmovmskb %xmm0, %rdx
+ # %%edx == 0xFFFF then return 1 else => return 0
+ xorl %ebx, %ebx
+ cmpl $0xffff, %edx
+ sete %bl
+L_AES_GCM_decrypt_avx512_cmp_tag_done:
+ movl %ebx, (%rbp)
+ vzeroupper
+ addq $0x420, %rsp
+ popq %rbp
+ popq %r15
+ popq %r14
+ popq %rbx
+ popq %r12
+ popq %r13
+ repz retq
+#ifndef __APPLE__
+.size AES_GCM_decrypt_avx512,.-AES_GCM_decrypt_avx512
+#endif /* __APPLE__ */
+#ifdef WOLFSSL_AESGCM_STREAM
+/* AES_GCM_init_avx512
+ *
+ * Computes the GHASH key H, the encrypted initial counter block and the
+ * counter value for the first data block from an AES key schedule and an IV.
+ *
+ * Register use on entry (as read by the code below):
+ *   %rdi     base of the round keys; 16-byte keys at offsets 0..224 are read,
+ *            some with vmovdqa, so the schedule must be 16-byte aligned
+ *   %esi     compared against 11 and 13 to run 10, 12 or 14 AES rounds
+ *            (presumably the round count - confirm against the C caller)
+ *   %rdx     IV bytes (copied to %r10)
+ *   %ecx     IV length in bytes (copied to %r11d)
+ *   %r8      receives H = E(K, 0) after the bswap_mask shuffle (vmovdqa store)
+ *   %r9      receives the counter block after the bswap_epi64 shuffle and the
+ *            vpaddd of L_avx512_aes_gcm_one (vmovdqa store)
+ *   8(%rsp)  pointer that receives the encrypted initial counter block
+ *            (vmovdqa store)
+ *
+ * A 12-byte IV takes a short path that builds the counter block directly;
+ * any other length is hashed with GHASH, including a final length block.
+ * %r12 and %r13 are saved on entry and restored before returning.
+ */
+#ifndef __APPLE__
+.text
+.globl AES_GCM_init_avx512
+.type AES_GCM_init_avx512,@function
+.align 16
+AES_GCM_init_avx512:
+#else
+.section __TEXT,__text
+.globl _AES_GCM_init_avx512
+.p2align 4
+_AES_GCM_init_avx512:
+#endif /* __APPLE__ */
+ # r12 and r13 are used as scratch by the IV tail copy loop
+ pushq %r12
+ pushq %r13
+ # Keep the IV pointer and IV length; rdx and ecx are reused below
+ movq %rdx, %r10
+ movl %ecx, %r11d
+ # Stack argument: 8(%rsp) at entry, 24(%rsp) after the two pushes
+ movq 24(%rsp), %rax
+ # Reserve 16 bytes of stack; released just before the pops at the end
+ subq $16, %rsp
+ # xmm4 holds the counter block / GHASH state; start from zero
+ vpxor %xmm4, %xmm4, %xmm4
+ movl %r11d, %edx
+ cmpl $12, %edx
+ jne L_AES_GCM_init_avx512_iv_not_12
+ # Calculate values when IV is 12 bytes
+ # Set counter based on IV
+ # Counter block = 12 IV bytes followed by the dword 0x01000000, which is
+ # the byte sequence 00 00 00 01 in memory order
+ movl $0x1000000, %ecx
+ vmovq (%r10), %xmm4
+ vpinsrd $2, 8(%r10), %xmm4, %xmm4
+ vpinsrd $3, %ecx, %xmm4, %xmm4
+ # H = Encrypt X(=0) and T = Encrypt counter
+ # Two blocks go through the rounds together: xmm5 starts as round key 0
+ # (zero block XOR key 0) and xmm1 as counter block XOR round key 0
+ vmovdqa (%rdi), %xmm5
+ vpxor %xmm5, %xmm4, %xmm1
+ vmovdqa 16(%rdi), %xmm6
+ vaesenc %xmm6, %xmm5, %xmm5
+ vaesenc %xmm6, %xmm1, %xmm1
+ vmovdqa 32(%rdi), %xmm6
+ vaesenc %xmm6, %xmm5, %xmm5
+ vaesenc %xmm6, %xmm1, %xmm1
+ vmovdqa 48(%rdi), %xmm6
+ vaesenc %xmm6, %xmm5, %xmm5
+ vaesenc %xmm6, %xmm1, %xmm1
+ vmovdqa 64(%rdi), %xmm6
+ vaesenc %xmm6, %xmm5, %xmm5
+ vaesenc %xmm6, %xmm1, %xmm1
+ vmovdqa 80(%rdi), %xmm6
+ vaesenc %xmm6, %xmm5, %xmm5
+ vaesenc %xmm6, %xmm1, %xmm1
+ vmovdqa 96(%rdi), %xmm6
+ vaesenc %xmm6, %xmm5, %xmm5
+ vaesenc %xmm6, %xmm1, %xmm1
+ vmovdqa 112(%rdi), %xmm6
+ vaesenc %xmm6, %xmm5, %xmm5
+ vaesenc %xmm6, %xmm1, %xmm1
+ vmovdqa 128(%rdi), %xmm6
+ vaesenc %xmm6, %xmm5, %xmm5
+ vaesenc %xmm6, %xmm1, %xmm1
+ vmovdqa 144(%rdi), %xmm6
+ vaesenc %xmm6, %xmm5, %xmm5
+ vaesenc %xmm6, %xmm1, %xmm1
+ # The key load sits between cmpl and jl; vmovdqa leaves the flags alone.
+ # With esi below 11 the key at offset 160 is the last round key.
+ cmpl $11, %esi
+ vmovdqa 160(%rdi), %xmm6
+ jl L_AES_GCM_init_avx512_calc_iv_12_last
+ vaesenc %xmm6, %xmm5, %xmm5
+ vaesenc %xmm6, %xmm1, %xmm1
+ vmovdqa 176(%rdi), %xmm6
+ vaesenc %xmm6, %xmm5, %xmm5
+ vaesenc %xmm6, %xmm1, %xmm1
+ # With esi below 13 the key at offset 192 is the last round key,
+ # otherwise two more rounds run and the key at offset 224 is last
+ cmpl $13, %esi
+ vmovdqa 192(%rdi), %xmm6
+ jl L_AES_GCM_init_avx512_calc_iv_12_last
+ vaesenc %xmm6, %xmm5, %xmm5
+ vaesenc %xmm6, %xmm1, %xmm1
+ vmovdqa 208(%rdi), %xmm6
+ vaesenc %xmm6, %xmm5, %xmm5
+ vaesenc %xmm6, %xmm1, %xmm1
+ vmovdqa 224(%rdi), %xmm6
+L_AES_GCM_init_avx512_calc_iv_12_last:
+ vaesenclast %xmm6, %xmm5, %xmm5
+ vaesenclast %xmm6, %xmm1, %xmm1
+ # H is kept in the bswap_mask byte order used by the GHASH multiplies
+ vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5
+ # xmm15 carries the encrypted counter block to the common exit
+ vmovdqu %xmm1, %xmm15
+ jmp L_AES_GCM_init_avx512_iv_done
+L_AES_GCM_init_avx512_iv_not_12:
+ # Calculate values when IV is not 12 bytes
+ # H = Encrypt X(=0)
+ # The zero block XOR round key 0 is round key 0 itself
+ vmovdqa (%rdi), %xmm5
+ vaesenc 16(%rdi), %xmm5, %xmm5
+ vaesenc 32(%rdi), %xmm5, %xmm5
+ vaesenc 48(%rdi), %xmm5, %xmm5
+ vaesenc 64(%rdi), %xmm5, %xmm5
+ vaesenc 80(%rdi), %xmm5, %xmm5
+ vaesenc 96(%rdi), %xmm5, %xmm5
+ vaesenc 112(%rdi), %xmm5, %xmm5
+ vaesenc 128(%rdi), %xmm5, %xmm5
+ vaesenc 144(%rdi), %xmm5, %xmm5
+ cmpl $11, %esi
+ vmovdqa 160(%rdi), %xmm8
+ jl L_AES_GCM_init_avx512_calc_iv_1_aesenc_avx_last
+ vaesenc %xmm8, %xmm5, %xmm5
+ vaesenc 176(%rdi), %xmm5, %xmm5
+ cmpl $13, %esi
+ vmovdqa 192(%rdi), %xmm8
+ jl L_AES_GCM_init_avx512_calc_iv_1_aesenc_avx_last
+ vaesenc %xmm8, %xmm5, %xmm5
+ vaesenc 208(%rdi), %xmm5, %xmm5
+ vmovdqa 224(%rdi), %xmm8
+L_AES_GCM_init_avx512_calc_iv_1_aesenc_avx_last:
+ vaesenclast %xmm8, %xmm5, %xmm5
+ vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5
+ # Calc counter
+ # Initialization vector
+ # movq does not modify the flags, so je still tests the cmpl result;
+ # rcx becomes the byte offset into the IV
+ cmpl $0x00, %edx
+ movq $0x00, %rcx
+ je L_AES_GCM_init_avx512_calc_iv_done
+ cmpl $16, %edx
+ jl L_AES_GCM_init_avx512_calc_iv_lt16
+ # Round the length down to whole 16-byte blocks for the block loop
+ andl $0xfffffff0, %edx
+L_AES_GCM_init_avx512_calc_iv_16_loop:
+ # Absorb one full IV block: state ^= shuffled block, then state *= H
+ vmovdqu (%r10,%rcx,1), %xmm7
+ vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7
+ vpxor %xmm7, %xmm4, %xmm4
+ # ghash_gfmul_avx
+ # Karatsuba-style carry-less multiply of xmm4 by xmm5:
+ # xmm3 = high*high, xmm0 = low*low, xmm1 = (hi^lo)*(hi^lo)
+ vpshufd $0x4e, %xmm4, %xmm1
+ vpshufd $0x4e, %xmm5, %xmm2
+ vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3
+ vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0
+ vpxor %xmm4, %xmm1, %xmm1
+ vpxor %xmm5, %xmm2, %xmm2
+ vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1
+ # Middle term = xmm1 ^ low ^ high, split across the two product halves
+ vpxor %xmm0, %xmm1, %xmm1
+ vpxor %xmm3, %xmm1, %xmm1
+ vmovdqa %xmm0, %xmm6
+ vmovdqa %xmm3, %xmm4
+ vpslldq $8, %xmm1, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpxor %xmm2, %xmm6, %xmm6
+ vpxor %xmm1, %xmm4, %xmm4
+ # Shift the product xmm4:xmm6 left by one bit, carrying between the
+ # 32-bit lanes and from the top of xmm6 into the bottom of xmm4
+ vpsrld $31, %xmm6, %xmm0
+ vpsrld $31, %xmm4, %xmm1
+ vpslld $0x01, %xmm6, %xmm6
+ vpslld $0x01, %xmm4, %xmm4
+ vpsrldq $12, %xmm0, %xmm2
+ vpslldq $4, %xmm0, %xmm0
+ vpslldq $4, %xmm1, %xmm1
+ vpor %xmm2, %xmm4, %xmm4
+ vpor %xmm0, %xmm6, %xmm6
+ vpor %xmm1, %xmm4, %xmm4
+ # Fold the low half (xmm6) into the high half (xmm4) using XORs of
+ # left shifts by 31, 30, 25 and right shifts by 1, 2, 7
+ # (presumably reduction modulo the GHASH polynomial - confirm)
+ vpslld $31, %xmm6, %xmm0
+ vpslld $30, %xmm6, %xmm1
+ vpslld $25, %xmm6, %xmm2
+ vpxor %xmm1, %xmm0, %xmm0
+ vpxor %xmm2, %xmm0, %xmm0
+ vmovdqa %xmm0, %xmm1
+ vpsrldq $4, %xmm1, %xmm1
+ vpslldq $12, %xmm0, %xmm0
+ vpxor %xmm0, %xmm6, %xmm6
+ vpsrld $0x01, %xmm6, %xmm2
+ vpsrld $2, %xmm6, %xmm3
+ vpsrld $7, %xmm6, %xmm0
+ vpxor %xmm3, %xmm2, %xmm2
+ vpxor %xmm0, %xmm2, %xmm2
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm6, %xmm2, %xmm2
+ vpxor %xmm2, %xmm4, %xmm4
+ addl $16, %ecx
+ cmpl %edx, %ecx
+ jl L_AES_GCM_init_avx512_calc_iv_16_loop
+ # Restore the full IV length; skip the tail when no bytes remain
+ movl %r11d, %edx
+ cmpl %edx, %ecx
+ je L_AES_GCM_init_avx512_calc_iv_done
+L_AES_GCM_init_avx512_calc_iv_lt16:
+ # Copy the remaining IV bytes into a zeroed 16-byte stack buffer so the
+ # partial block is zero padded
+ subq $16, %rsp
+ vpxor %xmm7, %xmm7, %xmm7
+ xorl %r13d, %r13d
+ vmovdqu %xmm7, (%rsp)
+L_AES_GCM_init_avx512_calc_iv_loop:
+ movzbl (%r10,%rcx,1), %r12d
+ movb %r12b, (%rsp,%r13,1)
+ incl %ecx
+ incl %r13d
+ cmpl %edx, %ecx
+ jl L_AES_GCM_init_avx512_calc_iv_loop
+ vmovdqu (%rsp), %xmm7
+ addq $16, %rsp
+ vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7
+ vpxor %xmm7, %xmm4, %xmm4
+ # ghash_gfmul_avx
+ # Same multiply, one-bit shift and fold sequence as in the block loop
+ vpshufd $0x4e, %xmm4, %xmm1
+ vpshufd $0x4e, %xmm5, %xmm2
+ vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3
+ vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0
+ vpxor %xmm4, %xmm1, %xmm1
+ vpxor %xmm5, %xmm2, %xmm2
+ vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1
+ vpxor %xmm0, %xmm1, %xmm1
+ vpxor %xmm3, %xmm1, %xmm1
+ vmovdqa %xmm0, %xmm6
+ vmovdqa %xmm3, %xmm4
+ vpslldq $8, %xmm1, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpxor %xmm2, %xmm6, %xmm6
+ vpxor %xmm1, %xmm4, %xmm4
+ vpsrld $31, %xmm6, %xmm0
+ vpsrld $31, %xmm4, %xmm1
+ vpslld $0x01, %xmm6, %xmm6
+ vpslld $0x01, %xmm4, %xmm4
+ vpsrldq $12, %xmm0, %xmm2
+ vpslldq $4, %xmm0, %xmm0
+ vpslldq $4, %xmm1, %xmm1
+ vpor %xmm2, %xmm4, %xmm4
+ vpor %xmm0, %xmm6, %xmm6
+ vpor %xmm1, %xmm4, %xmm4
+ vpslld $31, %xmm6, %xmm0
+ vpslld $30, %xmm6, %xmm1
+ vpslld $25, %xmm6, %xmm2
+ vpxor %xmm1, %xmm0, %xmm0
+ vpxor %xmm2, %xmm0, %xmm0
+ vmovdqa %xmm0, %xmm1
+ vpsrldq $4, %xmm1, %xmm1
+ vpslldq $12, %xmm0, %xmm0
+ vpxor %xmm0, %xmm6, %xmm6
+ vpsrld $0x01, %xmm6, %xmm2
+ vpsrld $2, %xmm6, %xmm3
+ vpsrld $7, %xmm6, %xmm0
+ vpxor %xmm3, %xmm2, %xmm2
+ vpxor %xmm0, %xmm2, %xmm2
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm6, %xmm2, %xmm2
+ vpxor %xmm2, %xmm4, %xmm4
+L_AES_GCM_init_avx512_calc_iv_done:
+ # T = Encrypt counter
+ # Fold the IV length in bits (edx * 8) into the low 64 bits of the
+ # hash state, then multiply by H once more
+ vpxor %xmm0, %xmm0, %xmm0
+ shll $3, %edx
+ vmovq %rdx, %xmm0
+ vpxor %xmm0, %xmm4, %xmm4
+ # ghash_gfmul_avx
+ # Same multiply, one-bit shift and fold sequence as in the block loop
+ vpshufd $0x4e, %xmm4, %xmm1
+ vpshufd $0x4e, %xmm5, %xmm2
+ vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3
+ vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0
+ vpxor %xmm4, %xmm1, %xmm1
+ vpxor %xmm5, %xmm2, %xmm2
+ vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1
+ vpxor %xmm0, %xmm1, %xmm1
+ vpxor %xmm3, %xmm1, %xmm1
+ vmovdqa %xmm0, %xmm6
+ vmovdqa %xmm3, %xmm4
+ vpslldq $8, %xmm1, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpxor %xmm2, %xmm6, %xmm6
+ vpxor %xmm1, %xmm4, %xmm4
+ vpsrld $31, %xmm6, %xmm0
+ vpsrld $31, %xmm4, %xmm1
+ vpslld $0x01, %xmm6, %xmm6
+ vpslld $0x01, %xmm4, %xmm4
+ vpsrldq $12, %xmm0, %xmm2
+ vpslldq $4, %xmm0, %xmm0
+ vpslldq $4, %xmm1, %xmm1
+ vpor %xmm2, %xmm4, %xmm4
+ vpor %xmm0, %xmm6, %xmm6
+ vpor %xmm1, %xmm4, %xmm4
+ vpslld $31, %xmm6, %xmm0
+ vpslld $30, %xmm6, %xmm1
+ vpslld $25, %xmm6, %xmm2
+ vpxor %xmm1, %xmm0, %xmm0
+ vpxor %xmm2, %xmm0, %xmm0
+ vmovdqa %xmm0, %xmm1
+ vpsrldq $4, %xmm1, %xmm1
+ vpslldq $12, %xmm0, %xmm0
+ vpxor %xmm0, %xmm6, %xmm6
+ vpsrld $0x01, %xmm6, %xmm2
+ vpsrld $2, %xmm6, %xmm3
+ vpsrld $7, %xmm6, %xmm0
+ vpxor %xmm3, %xmm2, %xmm2
+ vpxor %xmm0, %xmm2, %xmm2
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm6, %xmm2, %xmm2
+ vpxor %xmm2, %xmm4, %xmm4
+ # Shuffle the hash result with bswap_mask; xmm4 is now the counter block
+ vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4
+ # Encrypt counter
+ vmovdqa (%rdi), %xmm7
+ vpxor %xmm4, %xmm7, %xmm7
+ vaesenc 16(%rdi), %xmm7, %xmm7
+ vaesenc 32(%rdi), %xmm7, %xmm7
+ vaesenc 48(%rdi), %xmm7, %xmm7
+ vaesenc 64(%rdi), %xmm7, %xmm7
+ vaesenc 80(%rdi), %xmm7, %xmm7
+ vaesenc 96(%rdi), %xmm7, %xmm7
+ vaesenc 112(%rdi), %xmm7, %xmm7
+ vaesenc 128(%rdi), %xmm7, %xmm7
+ vaesenc 144(%rdi), %xmm7, %xmm7
+ cmpl $11, %esi
+ vmovdqa 160(%rdi), %xmm8
+ jl L_AES_GCM_init_avx512_calc_iv_2_aesenc_avx_last
+ vaesenc %xmm8, %xmm7, %xmm7
+ vaesenc 176(%rdi), %xmm7, %xmm7
+ cmpl $13, %esi
+ vmovdqa 192(%rdi), %xmm8
+ jl L_AES_GCM_init_avx512_calc_iv_2_aesenc_avx_last
+ vaesenc %xmm8, %xmm7, %xmm7
+ vaesenc 208(%rdi), %xmm7, %xmm7
+ vmovdqa 224(%rdi), %xmm8
+L_AES_GCM_init_avx512_calc_iv_2_aesenc_avx_last:
+ vaesenclast %xmm8, %xmm7, %xmm7
+ vmovdqu %xmm7, %xmm15
+L_AES_GCM_init_avx512_iv_done:
+ # Common exit for both IV paths: xmm15 = encrypted counter block,
+ # xmm5 = H, xmm4 = counter block
+ vmovdqa %xmm15, (%rax)
+ # Store the counter in the bswap_epi64 layout after adding
+ # L_avx512_aes_gcm_one (presumably an increment by one - confirm)
+ vpshufb L_avx512_aes_gcm_bswap_epi64(%rip), %xmm4, %xmm4
+ vpaddd L_avx512_aes_gcm_one(%rip), %xmm4, %xmm4
+ vmovdqa %xmm5, (%r8)
+ vmovdqa %xmm4, (%r9)
+ addq $16, %rsp
+ popq %r13
+ popq %r12
+ repz retq
+#ifndef __APPLE__
+.size AES_GCM_init_avx512,.-AES_GCM_init_avx512
+#endif /* __APPLE__ */
+/* AES_GCM_aad_update_avx512
+ *
+ * Absorbs a run of 16-byte blocks into a GHASH state.
+ *
+ * Register use on entry (as read by the code below):
+ *   %rdi  input bytes, read 16 at a time with unaligned loads
+ *   %esi  input length in bytes; tested only after each block
+ *   %rdx  16-byte state, loaded and stored back with vmovdqa
+ *   %rcx  16-byte multiplier, loaded with vmovdqa (presumably the hash key
+ *         H - confirm against the C caller)
+ *
+ * The loop body always runs at least once and always reads a full 16 bytes.
+ * NOTE(review): callers presumably pass a non-zero multiple of 16 - confirm.
+ */
+#ifndef __APPLE__
+.text
+.globl AES_GCM_aad_update_avx512
+.type AES_GCM_aad_update_avx512,@function
+.align 16
+AES_GCM_aad_update_avx512:
+#else
+.section __TEXT,__text
+.globl _AES_GCM_aad_update_avx512
+.p2align 4
+_AES_GCM_aad_update_avx512:
+#endif /* __APPLE__ */
+ # Free rcx for use as the byte offset; xmm5 = state, xmm6 = multiplier
+ movq %rcx, %rax
+ vmovdqa (%rdx), %xmm5
+ vmovdqa (%rax), %xmm6
+ xorl %ecx, %ecx
+L_AES_GCM_aad_update_avx512_16_loop:
+ # state ^= shuffled input block, then state *= multiplier
+ vmovdqu (%rdi,%rcx,1), %xmm7
+ vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7
+ vpxor %xmm7, %xmm5, %xmm5
+ # ghash_gfmul_avx
+ # Karatsuba-style carry-less multiply of xmm5 by xmm6:
+ # xmm3 = high*high, xmm0 = low*low, xmm1 = (hi^lo)*(hi^lo)
+ vpshufd $0x4e, %xmm5, %xmm1
+ vpshufd $0x4e, %xmm6, %xmm2
+ vpclmulqdq $0x11, %xmm5, %xmm6, %xmm3
+ vpclmulqdq $0x00, %xmm5, %xmm6, %xmm0
+ vpxor %xmm5, %xmm1, %xmm1
+ vpxor %xmm6, %xmm2, %xmm2
+ vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1
+ # Middle term = xmm1 ^ low ^ high, split across the two product halves
+ vpxor %xmm0, %xmm1, %xmm1
+ vpxor %xmm3, %xmm1, %xmm1
+ vmovdqa %xmm0, %xmm4
+ vmovdqa %xmm3, %xmm5
+ vpslldq $8, %xmm1, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpxor %xmm2, %xmm4, %xmm4
+ vpxor %xmm1, %xmm5, %xmm5
+ # Shift the product xmm5:xmm4 left by one bit, carrying between the
+ # 32-bit lanes and from the top of xmm4 into the bottom of xmm5
+ vpsrld $31, %xmm4, %xmm0
+ vpsrld $31, %xmm5, %xmm1
+ vpslld $0x01, %xmm4, %xmm4
+ vpslld $0x01, %xmm5, %xmm5
+ vpsrldq $12, %xmm0, %xmm2
+ vpslldq $4, %xmm0, %xmm0
+ vpslldq $4, %xmm1, %xmm1
+ vpor %xmm2, %xmm5, %xmm5
+ vpor %xmm0, %xmm4, %xmm4
+ vpor %xmm1, %xmm5, %xmm5
+ # Fold the low half (xmm4) into the high half (xmm5) using XORs of
+ # left shifts by 31, 30, 25 and right shifts by 1, 2, 7
+ # (presumably reduction modulo the GHASH polynomial - confirm)
+ vpslld $31, %xmm4, %xmm0
+ vpslld $30, %xmm4, %xmm1
+ vpslld $25, %xmm4, %xmm2
+ vpxor %xmm1, %xmm0, %xmm0
+ vpxor %xmm2, %xmm0, %xmm0
+ vmovdqa %xmm0, %xmm1
+ vpsrldq $4, %xmm1, %xmm1
+ vpslldq $12, %xmm0, %xmm0
+ vpxor %xmm0, %xmm4, %xmm4
+ vpsrld $0x01, %xmm4, %xmm2
+ vpsrld $2, %xmm4, %xmm3
+ vpsrld $7, %xmm4, %xmm0
+ vpxor %xmm3, %xmm2, %xmm2
+ vpxor %xmm0, %xmm2, %xmm2
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm4, %xmm2, %xmm2
+ vpxor %xmm2, %xmm5, %xmm5
+ # Advance to the next block; signed compare of offset against length
+ addl $16, %ecx
+ cmpl %esi, %ecx
+ jl L_AES_GCM_aad_update_avx512_16_loop
+ # Write the updated state back through the original rdx pointer
+ vmovdqa %xmm5, (%rdx)
+ repz retq
+#ifndef __APPLE__
+.size AES_GCM_aad_update_avx512,.-AES_GCM_aad_update_avx512
+#endif /* __APPLE__ */
+/* AES_GCM_encrypt_block_avx512
+ *
+ * Processes one 16-byte block in counter mode: encrypts the current counter
+ * block with AES and XORs the result with 16 input bytes.
+ *
+ * Register use on entry (as read by the code below):
+ *   %rdi  base of the round keys; keys at offsets 160, 192 and 224 are read
+ *         with vmovdqa, so the schedule must be 16-byte aligned
+ *   %esi  compared against 11 and 13 to run 10, 12 or 14 AES rounds
+ *         (presumably the round count - confirm against the C caller)
+ *   %rdx  output, 16 bytes written with an unaligned store
+ *   %rcx  input, 16 bytes read with an unaligned load
+ *   %r8   16-byte counter; read, advanced by L_avx512_aes_gcm_one and
+ *         written back with unaligned accesses
+ */
+#ifndef __APPLE__
+.text
+.globl AES_GCM_encrypt_block_avx512
+.type AES_GCM_encrypt_block_avx512,@function
+.align 16
+AES_GCM_encrypt_block_avx512:
+#else
+.section __TEXT,__text
+.globl _AES_GCM_encrypt_block_avx512
+.p2align 4
+_AES_GCM_encrypt_block_avx512:
+#endif /* __APPLE__ */
+ # r10 = output pointer, r11 = input pointer
+ movq %rdx, %r10
+ movq %rcx, %r11
+ # xmm0 = current counter shuffled with bswap_epi64 for encryption;
+ # the stored counter is advanced for the next call
+ vmovdqu (%r8), %xmm1
+ vpshufb L_avx512_aes_gcm_bswap_epi64(%rip), %xmm1, %xmm0
+ vpaddd L_avx512_aes_gcm_one(%rip), %xmm1, %xmm1
+ vmovdqu %xmm1, (%r8)
+ # Initial key XOR followed by the nine rounds common to all key sizes
+ vpxor (%rdi), %xmm0, %xmm0
+ vaesenc 16(%rdi), %xmm0, %xmm0
+ vaesenc 32(%rdi), %xmm0, %xmm0
+ vaesenc 48(%rdi), %xmm0, %xmm0
+ vaesenc 64(%rdi), %xmm0, %xmm0
+ vaesenc 80(%rdi), %xmm0, %xmm0
+ vaesenc 96(%rdi), %xmm0, %xmm0
+ vaesenc 112(%rdi), %xmm0, %xmm0
+ vaesenc 128(%rdi), %xmm0, %xmm0
+ vaesenc 144(%rdi), %xmm0, %xmm0
+ # The key load sits between cmpl and jl; vmovdqa leaves the flags alone.
+ # With esi below 11 the key at offset 160 is the last round key.
+ cmpl $11, %esi
+ vmovdqa 160(%rdi), %xmm1
+ jl L_AES_GCM_encrypt_block_avx512_aesenc_block_last
+ vaesenc %xmm1, %xmm0, %xmm0
+ vaesenc 176(%rdi), %xmm0, %xmm0
+ # With esi below 13 the key at offset 192 is the last round key,
+ # otherwise two more rounds run and the key at offset 224 is last
+ cmpl $13, %esi
+ vmovdqa 192(%rdi), %xmm1
+ jl L_AES_GCM_encrypt_block_avx512_aesenc_block_last
+ vaesenc %xmm1, %xmm0, %xmm0
+ vaesenc 208(%rdi), %xmm0, %xmm0
+ vmovdqa 224(%rdi), %xmm1
+L_AES_GCM_encrypt_block_avx512_aesenc_block_last:
+ vaesenclast %xmm1, %xmm0, %xmm0
+ # XOR the encrypted counter with the input block and store the result
+ vmovdqu (%r11), %xmm1
+ vpxor %xmm1, %xmm0, %xmm0
+ vmovdqu %xmm0, (%r10)
+ # NOTE(review): the shuffled block stays in xmm0 and is not stored or
+ # read again in this function - confirm whether this shuffle is needed
+ vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
+ vzeroupper
+ repz retq
+#ifndef __APPLE__
+.size AES_GCM_encrypt_block_avx512,.-AES_GCM_encrypt_block_avx512
+#endif /* __APPLE__ */
+/* AES_GCM_ghash_block_avx512
+ *
+ * Absorbs a single 16-byte block into a GHASH state:
+ * state = (state XOR shuffled block) multiplied by the value at (%rdx).
+ *
+ * Register use on entry (as read by the code below):
+ *   %rdi  16-byte data block, read with an unaligned load
+ *   %rsi  16-byte state, loaded and stored back with vmovdqa
+ *   %rdx  16-byte multiplier, loaded with vmovdqa (presumably the hash key
+ *         H - confirm against the C caller)
+ */
+#ifndef __APPLE__
+.text
+.globl AES_GCM_ghash_block_avx512
+.type AES_GCM_ghash_block_avx512,@function
+.align 16
+AES_GCM_ghash_block_avx512:
+#else
+.section __TEXT,__text
+.globl _AES_GCM_ghash_block_avx512
+.p2align 4
+_AES_GCM_ghash_block_avx512:
+#endif /* __APPLE__ */
+ # xmm4 = state, xmm5 = multiplier, xmm7 = data block
+ vmovdqa (%rsi), %xmm4
+ vmovdqa (%rdx), %xmm5
+ vmovdqu (%rdi), %xmm7
+ vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7
+ vpxor %xmm7, %xmm4, %xmm4
+ # ghash_gfmul_avx
+ # Karatsuba-style carry-less multiply of xmm4 by xmm5:
+ # xmm3 = high*high, xmm0 = low*low, xmm1 = (hi^lo)*(hi^lo)
+ vpshufd $0x4e, %xmm4, %xmm1
+ vpshufd $0x4e, %xmm5, %xmm2
+ vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3
+ vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0
+ vpxor %xmm4, %xmm1, %xmm1
+ vpxor %xmm5, %xmm2, %xmm2
+ vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1
+ # Middle term = xmm1 ^ low ^ high, split across the two product halves
+ vpxor %xmm0, %xmm1, %xmm1
+ vpxor %xmm3, %xmm1, %xmm1
+ vmovdqa %xmm0, %xmm6
+ vmovdqa %xmm3, %xmm4
+ vpslldq $8, %xmm1, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpxor %xmm2, %xmm6, %xmm6
+ vpxor %xmm1, %xmm4, %xmm4
+ # Shift the product xmm4:xmm6 left by one bit, carrying between the
+ # 32-bit lanes and from the top of xmm6 into the bottom of xmm4
+ vpsrld $31, %xmm6, %xmm0
+ vpsrld $31, %xmm4, %xmm1
+ vpslld $0x01, %xmm6, %xmm6
+ vpslld $0x01, %xmm4, %xmm4
+ vpsrldq $12, %xmm0, %xmm2
+ vpslldq $4, %xmm0, %xmm0
+ vpslldq $4, %xmm1, %xmm1
+ vpor %xmm2, %xmm4, %xmm4
+ vpor %xmm0, %xmm6, %xmm6
+ vpor %xmm1, %xmm4, %xmm4
+ # Fold the low half (xmm6) into the high half (xmm4) using XORs of
+ # left shifts by 31, 30, 25 and right shifts by 1, 2, 7
+ # (presumably reduction modulo the GHASH polynomial - confirm)
+ vpslld $31, %xmm6, %xmm0
+ vpslld $30, %xmm6, %xmm1
+ vpslld $25, %xmm6, %xmm2
+ vpxor %xmm1, %xmm0, %xmm0
+ vpxor %xmm2, %xmm0, %xmm0
+ vmovdqa %xmm0, %xmm1
+ vpsrldq $4, %xmm1, %xmm1
+ vpslldq $12, %xmm0, %xmm0
+ vpxor %xmm0, %xmm6, %xmm6
+ vpsrld $0x01, %xmm6, %xmm2
+ vpsrld $2, %xmm6, %xmm3
+ vpsrld $7, %xmm6, %xmm0
+ vpxor %xmm3, %xmm2, %xmm2
+ vpxor %xmm0, %xmm2, %xmm2
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm6, %xmm2, %xmm2
+ vpxor %xmm2, %xmm4, %xmm4
+ # Write the updated state back
+ vmovdqa %xmm4, (%rsi)
+ vzeroupper
+ repz retq
+#ifndef __APPLE__
+.size AES_GCM_ghash_block_avx512,.-AES_GCM_ghash_block_avx512
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl AES_GCM_encrypt_update_avx512
+.type AES_GCM_encrypt_update_avx512,@function
+.align 16
+AES_GCM_encrypt_update_avx512:
+#else
+.section __TEXT,__text
+.globl _AES_GCM_encrypt_update_avx512
+.p2align 4
+_AES_GCM_encrypt_update_avx512:
+#endif /* __APPLE__ */
+ pushq %r13
+ pushq %r12
+ pushq %r14
+ pushq %r15
+ pushq %rbx
+ pushq %rbp
+ movq %rdx, %r10
+ movq %rcx, %r11
+ movq 56(%rsp), %rax
+ movq 64(%rsp), %r12
+ subq $0x410, %rsp
+ vmovdqa (%r9), %xmm6
+ vmovdqa (%rax), %xmm5
+ vpsrlq $63, %xmm5, %xmm9
+ vpsllq $0x01, %xmm5, %xmm8
+ vpslldq $8, %xmm9, %xmm9
+ vpor %xmm9, %xmm8, %xmm8
+ vpshufd $0xff, %xmm5, %xmm5
+ vpsrad $31, %xmm5, %xmm5
+ vpand L_avx512_aes_gcm_mod2_128(%rip), %xmm5, %xmm5
+ vpxor %xmm8, %xmm5, %xmm5
+ xorl %r14d, %r14d
+ cmpl $0x100, %r8d
+ jl L_AES_GCM_encrypt_update_avx512_done_128
+ vmovdqa %xmm6, %xmm2
+ # H ^ 1
+ vmovdqu %xmm5, (%rsp)
+ # H ^ 2
+ vpclmulqdq $0x00, %xmm5, %xmm5, %xmm8
+ vpclmulqdq $0x11, %xmm5, %xmm5, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm0
+ vmovdqu %xmm0, 16(%rsp)
+ # H ^ 3
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm5, %xmm9
+ vpxor %xmm5, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm0, %xmm10
+ vpxor %xmm0, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm5, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm5, %xmm0, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm1
+ vmovdqu %xmm1, 32(%rsp)
+ # H ^ 4
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm3
+ vmovdqu %xmm3, 48(%rsp)
+ # H ^ 5
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 64(%rsp)
+ # H ^ 6
+ vpclmulqdq $0x00, %xmm1, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm1, %xmm1, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 80(%rsp)
+ # H ^ 7
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm1, %xmm9
+ vpxor %xmm1, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm3, %xmm10
+ vpxor %xmm3, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm1, %xmm3, %xmm8
+ vpclmulqdq $0x11, %xmm1, %xmm3, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 96(%rsp)
+ # H ^ 8
+ vpclmulqdq $0x00, %xmm3, %xmm3, %xmm8
+ vpclmulqdq $0x11, %xmm3, %xmm3, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 112(%rsp)
+ # H ^ 9
+ vmovdqu 48(%rsp), %xmm0
+ vmovdqu 64(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 128(%rsp)
+ # H ^ 10
+ vmovdqu 64(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 144(%rsp)
+ # H ^ 11
+ vmovdqu 64(%rsp), %xmm0
+ vmovdqu 80(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 160(%rsp)
+ # H ^ 12
+ vmovdqu 80(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 176(%rsp)
+ # H ^ 13
+ vmovdqu 80(%rsp), %xmm0
+ vmovdqu 96(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 192(%rsp)
+ # H ^ 14
+ vmovdqu 96(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 208(%rsp)
+ # H ^ 15
+ vmovdqu 96(%rsp), %xmm0
+ vmovdqu 112(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 224(%rsp)
+ # H ^ 16
+ vmovdqu 112(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 240(%rsp)
+ cmpl $0x200, %r8d
+ jl L_AES_GCM_encrypt_update_avx512_no_ext
+ # H ^ 17
+ vmovdqu 112(%rsp), %xmm0
+ vmovdqu 128(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 256(%rsp)
+ # H ^ 18
+ vmovdqu 128(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 272(%rsp)
+ # H ^ 19
+ vmovdqu 128(%rsp), %xmm0
+ vmovdqu 144(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 288(%rsp)
+ # H ^ 20
+ vmovdqu 144(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 304(%rsp)
+ # H ^ 21
+ vmovdqu 144(%rsp), %xmm0
+ vmovdqu 160(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 320(%rsp)
+ # H ^ 22
+ vmovdqu 160(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 336(%rsp)
+ # H ^ 23
+ vmovdqu 160(%rsp), %xmm0
+ vmovdqu 176(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 352(%rsp)
+ # H ^ 24
+ vmovdqu 176(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 368(%rsp)
+ # H ^ 25
+ vmovdqu 176(%rsp), %xmm0
+ vmovdqu 192(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 384(%rsp)
+ # H ^ 26
+ vmovdqu 192(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 400(%rsp)
+ # H ^ 27
+ vmovdqu 192(%rsp), %xmm0
+ vmovdqu 208(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 416(%rsp)
+ # H ^ 28
+ vmovdqu 208(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 432(%rsp)
+ # H ^ 29
+ vmovdqu 208(%rsp), %xmm0
+ vmovdqu 224(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 448(%rsp)
+ # H ^ 30
+ vmovdqu 224(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 464(%rsp)
+ # H ^ 31
+ vmovdqu 224(%rsp), %xmm0
+ vmovdqu 240(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 480(%rsp)
+ # H ^ 32
+ vmovdqu 240(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 496(%rsp)
+L_AES_GCM_encrypt_update_avx512_no_ext:
+ vbroadcasti32x4 L_avx512_aes_gcm_bswap_epi64(%rip), %zmm22
+ vbroadcasti32x4 L_avx512_aes_gcm_bswap_mask(%rip), %zmm30
+ vbroadcasti32x4 L_avx512_aes_gcm_mod2_128(%rip), %zmm31
+ vbroadcasti32x4 (%rdi), %zmm9
+ vbroadcasti32x4 16(%rdi), %zmm10
+ vbroadcasti32x4 32(%rdi), %zmm11
+ vbroadcasti32x4 48(%rdi), %zmm12
+ vbroadcasti32x4 64(%rdi), %zmm13
+ vbroadcasti32x4 80(%rdi), %zmm14
+ vbroadcasti32x4 96(%rdi), %zmm15
+ vbroadcasti32x4 112(%rdi), %zmm1
+ vbroadcasti32x4 128(%rdi), %zmm2
+ vbroadcasti32x4 144(%rdi), %zmm3
+ cmpl $0x200, %r8d
+ jl L_AES_GCM_encrypt_update_avx512_no_windows
+ movl %r8d, %ebp
+ andl $0xfffffe00, %ebp
+ vmovdqu64 448(%rsp), %zmm23
+ vshufi64x2 $27, %zmm23, %zmm23, %zmm23
+ vmovdqu64 384(%rsp), %zmm24
+ vshufi64x2 $27, %zmm24, %zmm24, %zmm24
+ vmovdqu64 320(%rsp), %zmm25
+ vshufi64x2 $27, %zmm25, %zmm25, %zmm25
+ vmovdqu64 256(%rsp), %zmm26
+ vshufi64x2 $27, %zmm26, %zmm26, %zmm26
+ vmovdqu64 %zmm23, 512(%rsp)
+ vmovdqu64 %zmm24, 576(%rsp)
+ vmovdqu64 %zmm25, 640(%rsp)
+ vmovdqu64 %zmm26, 704(%rsp)
+ vmovdqu64 192(%rsp), %zmm23
+ vshufi64x2 $27, %zmm23, %zmm23, %zmm23
+ vmovdqu64 128(%rsp), %zmm24
+ vshufi64x2 $27, %zmm24, %zmm24, %zmm24
+ vmovdqu64 64(%rsp), %zmm25
+ vshufi64x2 $27, %zmm25, %zmm25, %zmm25
+ vmovdqu64 (%rsp), %zmm26
+ vshufi64x2 $27, %zmm26, %zmm26, %zmm26
+ vmovdqu64 %zmm23, 768(%rsp)
+ vmovdqu64 %zmm24, 832(%rsp)
+ vmovdqu64 %zmm25, 896(%rsp)
+ vmovdqu64 %zmm26, 960(%rsp)
+ # 512 bytes of input
+ leaq (%r10,%r14,1), %r15
+ vbroadcasti32x4 (%r12), %zmm20
+ vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16
+ vpshufb %zmm22, %zmm16, %zmm16
+ vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17
+ vpshufb %zmm22, %zmm17, %zmm17
+ vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18
+ vpshufb %zmm22, %zmm18, %zmm18
+ vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19
+ vpshufb %zmm22, %zmm19, %zmm19
+ vmovdqu (%r12), %xmm8
+ vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8
+ vmovdqu %xmm8, (%r12)
+ vpxorq %zmm9, %zmm16, %zmm16
+ vpxorq %zmm9, %zmm17, %zmm17
+ vpxorq %zmm9, %zmm18, %zmm18
+ vpxorq %zmm9, %zmm19, %zmm19
+ vaesenc %zmm10, %zmm16, %zmm16
+ vaesenc %zmm10, %zmm17, %zmm17
+ vaesenc %zmm10, %zmm18, %zmm18
+ vaesenc %zmm10, %zmm19, %zmm19
+ vaesenc %zmm11, %zmm16, %zmm16
+ vaesenc %zmm11, %zmm17, %zmm17
+ vaesenc %zmm11, %zmm18, %zmm18
+ vaesenc %zmm11, %zmm19, %zmm19
+ vaesenc %zmm12, %zmm16, %zmm16
+ vaesenc %zmm12, %zmm17, %zmm17
+ vaesenc %zmm12, %zmm18, %zmm18
+ vaesenc %zmm12, %zmm19, %zmm19
+ vaesenc %zmm13, %zmm16, %zmm16
+ vaesenc %zmm13, %zmm17, %zmm17
+ vaesenc %zmm13, %zmm18, %zmm18
+ vaesenc %zmm13, %zmm19, %zmm19
+ vaesenc %zmm14, %zmm16, %zmm16
+ vaesenc %zmm14, %zmm17, %zmm17
+ vaesenc %zmm14, %zmm18, %zmm18
+ vaesenc %zmm14, %zmm19, %zmm19
+ vaesenc %zmm15, %zmm16, %zmm16
+ vaesenc %zmm15, %zmm17, %zmm17
+ vaesenc %zmm15, %zmm18, %zmm18
+ vaesenc %zmm15, %zmm19, %zmm19
+ vaesenc %zmm1, %zmm16, %zmm16
+ vaesenc %zmm1, %zmm17, %zmm17
+ vaesenc %zmm1, %zmm18, %zmm18
+ vaesenc %zmm1, %zmm19, %zmm19
+ vaesenc %zmm2, %zmm16, %zmm16
+ vaesenc %zmm2, %zmm17, %zmm17
+ vaesenc %zmm2, %zmm18, %zmm18
+ vaesenc %zmm2, %zmm19, %zmm19
+ vaesenc %zmm3, %zmm16, %zmm16
+ vaesenc %zmm3, %zmm17, %zmm17
+ vaesenc %zmm3, %zmm18, %zmm18
+ vaesenc %zmm3, %zmm19, %zmm19
+ cmpl $11, %esi
+ vbroadcasti32x4 160(%rdi), %zmm20
+ jl L_AES_GCM_encrypt_update_avx512_p1_avx512_ctr16_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 176(%rdi), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ cmpl $13, %esi
+ vbroadcasti32x4 192(%rdi), %zmm20
+ jl L_AES_GCM_encrypt_update_avx512_p1_avx512_ctr16_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 208(%rdi), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 224(%rdi), %zmm20
+L_AES_GCM_encrypt_update_avx512_p1_avx512_ctr16_last:
+ vaesenclast %zmm20, %zmm16, %zmm16
+ vaesenclast %zmm20, %zmm17, %zmm17
+ vaesenclast %zmm20, %zmm18, %zmm18
+ vaesenclast %zmm20, %zmm19, %zmm19
+ leaq (%r11,%r14,1), %rcx
+ leaq (%r10,%r14,1), %rdx
+ vmovdqu64 (%rcx), %zmm21
+ vpxorq %zmm21, %zmm16, %zmm16
+ vmovdqu64 %zmm16, (%rdx)
+ vmovdqu64 64(%rcx), %zmm21
+ vpxorq %zmm21, %zmm17, %zmm17
+ vmovdqu64 %zmm17, 64(%rdx)
+ vmovdqu64 128(%rcx), %zmm21
+ vpxorq %zmm21, %zmm18, %zmm18
+ vmovdqu64 %zmm18, 128(%rdx)
+ vmovdqu64 192(%rcx), %zmm21
+ vpxorq %zmm21, %zmm19, %zmm19
+ vmovdqu64 %zmm19, 192(%rdx)
+ addl $0x100, %r14d
+ vbroadcasti32x4 (%r12), %zmm20
+ vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16
+ vpshufb %zmm22, %zmm16, %zmm16
+ vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17
+ vpshufb %zmm22, %zmm17, %zmm17
+ vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18
+ vpshufb %zmm22, %zmm18, %zmm18
+ vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19
+ vpshufb %zmm22, %zmm19, %zmm19
+ vmovdqu (%r12), %xmm8
+ vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8
+ vmovdqu %xmm8, (%r12)
+ vpxorq %zmm9, %zmm16, %zmm16
+ vpxorq %zmm9, %zmm17, %zmm17
+ vpxorq %zmm9, %zmm18, %zmm18
+ vpxorq %zmm9, %zmm19, %zmm19
+ vaesenc %zmm10, %zmm16, %zmm16
+ vaesenc %zmm10, %zmm17, %zmm17
+ vaesenc %zmm10, %zmm18, %zmm18
+ vaesenc %zmm10, %zmm19, %zmm19
+ vaesenc %zmm11, %zmm16, %zmm16
+ vaesenc %zmm11, %zmm17, %zmm17
+ vaesenc %zmm11, %zmm18, %zmm18
+ vaesenc %zmm11, %zmm19, %zmm19
+ vaesenc %zmm12, %zmm16, %zmm16
+ vaesenc %zmm12, %zmm17, %zmm17
+ vaesenc %zmm12, %zmm18, %zmm18
+ vaesenc %zmm12, %zmm19, %zmm19
+ vaesenc %zmm13, %zmm16, %zmm16
+ vaesenc %zmm13, %zmm17, %zmm17
+ vaesenc %zmm13, %zmm18, %zmm18
+ vaesenc %zmm13, %zmm19, %zmm19
+ vaesenc %zmm14, %zmm16, %zmm16
+ vaesenc %zmm14, %zmm17, %zmm17
+ vaesenc %zmm14, %zmm18, %zmm18
+ vaesenc %zmm14, %zmm19, %zmm19
+ vaesenc %zmm15, %zmm16, %zmm16
+ vaesenc %zmm15, %zmm17, %zmm17
+ vaesenc %zmm15, %zmm18, %zmm18
+ vaesenc %zmm15, %zmm19, %zmm19
+ vaesenc %zmm1, %zmm16, %zmm16
+ vaesenc %zmm1, %zmm17, %zmm17
+ vaesenc %zmm1, %zmm18, %zmm18
+ vaesenc %zmm1, %zmm19, %zmm19
+ vaesenc %zmm2, %zmm16, %zmm16
+ vaesenc %zmm2, %zmm17, %zmm17
+ vaesenc %zmm2, %zmm18, %zmm18
+ vaesenc %zmm2, %zmm19, %zmm19
+ vaesenc %zmm3, %zmm16, %zmm16
+ vaesenc %zmm3, %zmm17, %zmm17
+ vaesenc %zmm3, %zmm18, %zmm18
+ vaesenc %zmm3, %zmm19, %zmm19
+ cmpl $11, %esi
+ vbroadcasti32x4 160(%rdi), %zmm20
+ jl L_AES_GCM_encrypt_update_avx512_p2_avx512_ctr16_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 176(%rdi), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ cmpl $13, %esi
+ vbroadcasti32x4 192(%rdi), %zmm20
+ jl L_AES_GCM_encrypt_update_avx512_p2_avx512_ctr16_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 208(%rdi), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 224(%rdi), %zmm20
+L_AES_GCM_encrypt_update_avx512_p2_avx512_ctr16_last:
+ vaesenclast %zmm20, %zmm16, %zmm16
+ vaesenclast %zmm20, %zmm17, %zmm17
+ vaesenclast %zmm20, %zmm18, %zmm18
+ vaesenclast %zmm20, %zmm19, %zmm19
+ leaq (%r11,%r14,1), %rcx
+ leaq (%r10,%r14,1), %rdx
+ vmovdqu64 (%rcx), %zmm21
+ vpxorq %zmm21, %zmm16, %zmm16
+ vmovdqu64 %zmm16, (%rdx)
+ vmovdqu64 64(%rcx), %zmm21
+ vpxorq %zmm21, %zmm17, %zmm17
+ vmovdqu64 %zmm17, 64(%rdx)
+ vmovdqu64 128(%rcx), %zmm21
+ vpxorq %zmm21, %zmm18, %zmm18
+ vmovdqu64 %zmm18, 128(%rdx)
+ vmovdqu64 192(%rcx), %zmm21
+ vpxorq %zmm21, %zmm19, %zmm19
+ vmovdqu64 %zmm19, 192(%rdx)
+ addl $0x100, %r14d
+ cmpl %ebp, %r14d
+ jge L_AES_GCM_encrypt_update_avx512_last_win
+L_AES_GCM_encrypt_update_avx512_win_loop:
+ leaq (%r10,%r14,1), %rbx
+ vpxorq %zmm21, %zmm21, %zmm21
+ vinserti32x4 $0x00, %xmm6, %zmm21, %zmm21
+ vbroadcasti32x4 (%r12), %zmm20
+ vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16
+ vpshufb %zmm22, %zmm16, %zmm16
+ vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17
+ vpshufb %zmm22, %zmm17, %zmm17
+ vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18
+ vpshufb %zmm22, %zmm18, %zmm18
+ vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19
+ vpshufb %zmm22, %zmm19, %zmm19
+ vmovdqu (%r12), %xmm8
+ vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8
+ vmovdqu %xmm8, (%r12)
+ vpxorq %zmm9, %zmm16, %zmm16
+ vpxorq %zmm9, %zmm17, %zmm17
+ vpxorq %zmm9, %zmm18, %zmm18
+ vpxorq %zmm9, %zmm19, %zmm19
+ vmovdqu64 (%r15), %zmm31
+ vpshufb %zmm30, %zmm31, %zmm31
+ vpxorq %zmm21, %zmm31, %zmm31
+ vaesenc %zmm10, %zmm16, %zmm16
+ vaesenc %zmm10, %zmm17, %zmm17
+ vaesenc %zmm10, %zmm18, %zmm18
+ vaesenc %zmm10, %zmm19, %zmm19
+ vpclmulqdq $0x00, 512(%rsp), %zmm31, %zmm23
+ vpclmulqdq $0x01, 512(%rsp), %zmm31, %zmm24
+ vpclmulqdq $16, 512(%rsp), %zmm31, %zmm25
+ vpclmulqdq $0x11, 512(%rsp), %zmm31, %zmm26
+ vmovdqa64 %zmm23, %zmm27
+ vpxorq %zmm24, %zmm25, %zmm28
+ vmovdqa64 %zmm26, %zmm29
+ vaesenc %zmm11, %zmm16, %zmm16
+ vaesenc %zmm11, %zmm17, %zmm17
+ vaesenc %zmm11, %zmm18, %zmm18
+ vaesenc %zmm11, %zmm19, %zmm19
+ vmovdqu64 64(%r15), %zmm31
+ vpshufb %zmm30, %zmm31, %zmm31
+ vaesenc %zmm12, %zmm16, %zmm16
+ vaesenc %zmm12, %zmm17, %zmm17
+ vaesenc %zmm12, %zmm18, %zmm18
+ vaesenc %zmm12, %zmm19, %zmm19
+ vpclmulqdq $0x00, 576(%rsp), %zmm31, %zmm23
+ vpclmulqdq $0x01, 576(%rsp), %zmm31, %zmm24
+ vpclmulqdq $16, 576(%rsp), %zmm31, %zmm25
+ vpclmulqdq $0x11, 576(%rsp), %zmm31, %zmm26
+ vpxorq %zmm23, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm24, %zmm25, %zmm28
+ vpxorq %zmm26, %zmm29, %zmm29
+ vaesenc %zmm13, %zmm16, %zmm16
+ vaesenc %zmm13, %zmm17, %zmm17
+ vaesenc %zmm13, %zmm18, %zmm18
+ vaesenc %zmm13, %zmm19, %zmm19
+ vmovdqu64 128(%r15), %zmm31
+ vpshufb %zmm30, %zmm31, %zmm31
+ vaesenc %zmm14, %zmm16, %zmm16
+ vaesenc %zmm14, %zmm17, %zmm17
+ vaesenc %zmm14, %zmm18, %zmm18
+ vaesenc %zmm14, %zmm19, %zmm19
+ vpclmulqdq $0x00, 640(%rsp), %zmm31, %zmm23
+ vpclmulqdq $0x01, 640(%rsp), %zmm31, %zmm24
+ vpclmulqdq $16, 640(%rsp), %zmm31, %zmm25
+ vpclmulqdq $0x11, 640(%rsp), %zmm31, %zmm26
+ vpxorq %zmm23, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm24, %zmm25, %zmm28
+ vpxorq %zmm26, %zmm29, %zmm29
+ vaesenc %zmm15, %zmm16, %zmm16
+ vaesenc %zmm15, %zmm17, %zmm17
+ vaesenc %zmm15, %zmm18, %zmm18
+ vaesenc %zmm15, %zmm19, %zmm19
+ vmovdqu64 192(%r15), %zmm31
+ vpshufb %zmm30, %zmm31, %zmm31
+ vaesenc %zmm1, %zmm16, %zmm16
+ vaesenc %zmm1, %zmm17, %zmm17
+ vaesenc %zmm1, %zmm18, %zmm18
+ vaesenc %zmm1, %zmm19, %zmm19
+ vpclmulqdq $0x00, 704(%rsp), %zmm31, %zmm23
+ vpclmulqdq $0x01, 704(%rsp), %zmm31, %zmm24
+ vpclmulqdq $16, 704(%rsp), %zmm31, %zmm25
+ vpclmulqdq $0x11, 704(%rsp), %zmm31, %zmm26
+ vpxorq %zmm23, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm24, %zmm25, %zmm28
+ vpxorq %zmm26, %zmm29, %zmm29
+ vaesenc %zmm2, %zmm16, %zmm16
+ vaesenc %zmm2, %zmm17, %zmm17
+ vaesenc %zmm2, %zmm18, %zmm18
+ vaesenc %zmm2, %zmm19, %zmm19
+ vaesenc %zmm3, %zmm16, %zmm16
+ vaesenc %zmm3, %zmm17, %zmm17
+ vaesenc %zmm3, %zmm18, %zmm18
+ vaesenc %zmm3, %zmm19, %zmm19
+ cmpl $11, %esi
+ vbroadcasti32x4 160(%rdi), %zmm20
+ jl L_AES_GCM_encrypt_update_avx512_a_il_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 176(%rdi), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ cmpl $13, %esi
+ vbroadcasti32x4 192(%rdi), %zmm20
+ jl L_AES_GCM_encrypt_update_avx512_a_il_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 208(%rdi), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 224(%rdi), %zmm20
+L_AES_GCM_encrypt_update_avx512_a_il_last:
+ vaesenclast %zmm20, %zmm16, %zmm16
+ vaesenclast %zmm20, %zmm17, %zmm17
+ vaesenclast %zmm20, %zmm18, %zmm18
+ vaesenclast %zmm20, %zmm19, %zmm19
+ leaq (%r11,%r14,1), %rcx
+ leaq (%r10,%r14,1), %rdx
+ vmovdqu64 (%rcx), %zmm21
+ vpxorq %zmm21, %zmm16, %zmm16
+ vmovdqu64 %zmm16, (%rdx)
+ vmovdqu64 64(%rcx), %zmm21
+ vpxorq %zmm21, %zmm17, %zmm17
+ vmovdqu64 %zmm17, 64(%rdx)
+ vmovdqu64 128(%rcx), %zmm21
+ vpxorq %zmm21, %zmm18, %zmm18
+ vmovdqu64 %zmm18, 128(%rdx)
+ vmovdqu64 192(%rcx), %zmm21
+ vpxorq %zmm21, %zmm19, %zmm19
+ vmovdqu64 %zmm19, 192(%rdx)
+ addl $0x100, %r14d
+ vbroadcasti32x4 (%r12), %zmm20
+ vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16
+ vpshufb %zmm22, %zmm16, %zmm16
+ vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17
+ vpshufb %zmm22, %zmm17, %zmm17
+ vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18
+ vpshufb %zmm22, %zmm18, %zmm18
+ vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19
+ vpshufb %zmm22, %zmm19, %zmm19
+ vmovdqu (%r12), %xmm8
+ vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8
+ vmovdqu %xmm8, (%r12)
+ vpxorq %zmm9, %zmm16, %zmm16
+ vpxorq %zmm9, %zmm17, %zmm17
+ vpxorq %zmm9, %zmm18, %zmm18
+ vpxorq %zmm9, %zmm19, %zmm19
+ vmovdqu64 256(%r15), %zmm31
+ vpshufb %zmm30, %zmm31, %zmm31
+ vaesenc %zmm10, %zmm16, %zmm16
+ vaesenc %zmm10, %zmm17, %zmm17
+ vaesenc %zmm10, %zmm18, %zmm18
+ vaesenc %zmm10, %zmm19, %zmm19
+ vpclmulqdq $0x00, 768(%rsp), %zmm31, %zmm23
+ vpclmulqdq $0x01, 768(%rsp), %zmm31, %zmm24
+ vpclmulqdq $16, 768(%rsp), %zmm31, %zmm25
+ vpclmulqdq $0x11, 768(%rsp), %zmm31, %zmm26
+ vpxorq %zmm23, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm24, %zmm25, %zmm28
+ vpxorq %zmm26, %zmm29, %zmm29
+ vaesenc %zmm11, %zmm16, %zmm16
+ vaesenc %zmm11, %zmm17, %zmm17
+ vaesenc %zmm11, %zmm18, %zmm18
+ vaesenc %zmm11, %zmm19, %zmm19
+ vmovdqu64 320(%r15), %zmm31
+ vpshufb %zmm30, %zmm31, %zmm31
+ vaesenc %zmm12, %zmm16, %zmm16
+ vaesenc %zmm12, %zmm17, %zmm17
+ vaesenc %zmm12, %zmm18, %zmm18
+ vaesenc %zmm12, %zmm19, %zmm19
+ vpclmulqdq $0x00, 832(%rsp), %zmm31, %zmm23
+ vpclmulqdq $0x01, 832(%rsp), %zmm31, %zmm24
+ vpclmulqdq $16, 832(%rsp), %zmm31, %zmm25
+ vpclmulqdq $0x11, 832(%rsp), %zmm31, %zmm26
+ vpxorq %zmm23, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm24, %zmm25, %zmm28
+ vpxorq %zmm26, %zmm29, %zmm29
+ vaesenc %zmm13, %zmm16, %zmm16
+ vaesenc %zmm13, %zmm17, %zmm17
+ vaesenc %zmm13, %zmm18, %zmm18
+ vaesenc %zmm13, %zmm19, %zmm19
+ vmovdqu64 384(%r15), %zmm31
+ vpshufb %zmm30, %zmm31, %zmm31
+ vaesenc %zmm14, %zmm16, %zmm16
+ vaesenc %zmm14, %zmm17, %zmm17
+ vaesenc %zmm14, %zmm18, %zmm18
+ vaesenc %zmm14, %zmm19, %zmm19
+ vpclmulqdq $0x00, 896(%rsp), %zmm31, %zmm23
+ vpclmulqdq $0x01, 896(%rsp), %zmm31, %zmm24
+ vpclmulqdq $16, 896(%rsp), %zmm31, %zmm25
+ vpclmulqdq $0x11, 896(%rsp), %zmm31, %zmm26
+ vpxorq %zmm23, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm24, %zmm25, %zmm28
+ vpxorq %zmm26, %zmm29, %zmm29
+ vaesenc %zmm15, %zmm16, %zmm16
+ vaesenc %zmm15, %zmm17, %zmm17
+ vaesenc %zmm15, %zmm18, %zmm18
+ vaesenc %zmm15, %zmm19, %zmm19
+ vmovdqu64 448(%r15), %zmm31
+ vpshufb %zmm30, %zmm31, %zmm31
+ vaesenc %zmm1, %zmm16, %zmm16
+ vaesenc %zmm1, %zmm17, %zmm17
+ vaesenc %zmm1, %zmm18, %zmm18
+ vaesenc %zmm1, %zmm19, %zmm19
+ vpclmulqdq $0x00, 960(%rsp), %zmm31, %zmm23
+ vpclmulqdq $0x01, 960(%rsp), %zmm31, %zmm24
+ vpclmulqdq $16, 960(%rsp), %zmm31, %zmm25
+ vpclmulqdq $0x11, 960(%rsp), %zmm31, %zmm26
+ vpxorq %zmm23, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm24, %zmm25, %zmm28
+ vpxorq %zmm26, %zmm29, %zmm29
+ vaesenc %zmm2, %zmm16, %zmm16
+ vaesenc %zmm2, %zmm17, %zmm17
+ vaesenc %zmm2, %zmm18, %zmm18
+ vaesenc %zmm2, %zmm19, %zmm19
+ vaesenc %zmm3, %zmm16, %zmm16
+ vaesenc %zmm3, %zmm17, %zmm17
+ vaesenc %zmm3, %zmm18, %zmm18
+ vaesenc %zmm3, %zmm19, %zmm19
+ cmpl $11, %esi
+ vbroadcasti32x4 160(%rdi), %zmm20
+ jl L_AES_GCM_encrypt_update_avx512_b_il_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 176(%rdi), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ cmpl $13, %esi
+ vbroadcasti32x4 192(%rdi), %zmm20
+ jl L_AES_GCM_encrypt_update_avx512_b_il_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 208(%rdi), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 224(%rdi), %zmm20
+L_AES_GCM_encrypt_update_avx512_b_il_last:
+ vaesenclast %zmm20, %zmm16, %zmm16
+ vaesenclast %zmm20, %zmm17, %zmm17
+ vaesenclast %zmm20, %zmm18, %zmm18
+ vaesenclast %zmm20, %zmm19, %zmm19
+ leaq (%r11,%r14,1), %rcx
+ leaq (%r10,%r14,1), %rdx
+ vmovdqu64 (%rcx), %zmm21
+ vpxorq %zmm21, %zmm16, %zmm16
+ vmovdqu64 %zmm16, (%rdx)
+ vmovdqu64 64(%rcx), %zmm21
+ vpxorq %zmm21, %zmm17, %zmm17
+ vmovdqu64 %zmm17, 64(%rdx)
+ vmovdqu64 128(%rcx), %zmm21
+ vpxorq %zmm21, %zmm18, %zmm18
+ vmovdqu64 %zmm18, 128(%rdx)
+ vmovdqu64 192(%rcx), %zmm21
+ vpxorq %zmm21, %zmm19, %zmm19
+ vmovdqu64 %zmm19, 192(%rdx)
+ addl $0x100, %r14d
+ vbroadcasti32x4 L_avx512_aes_gcm_mod2_128(%rip), %zmm31
+ vpclmulqdq $0x01, %zmm27, %zmm31, %zmm23
+ vpshufd $0x4e, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm23, %zmm27, %zmm28
+ vpclmulqdq $0x01, %zmm28, %zmm31, %zmm23
+ vpshufd $0x4e, %zmm28, %zmm28
+ vpternlogq $0x96, %zmm23, %zmm28, %zmm29
+ vextracti32x4 $0x01, %zmm29, %xmm0
+ vextracti32x4 $2, %zmm29, %xmm4
+ vextracti32x4 $3, %zmm29, %xmm5
+ vpxorq %xmm0, %xmm29, %xmm6
+ vpternlogq $0x96, %xmm4, %xmm5, %xmm6
+ movq %rbx, %r15
+ cmpl %ebp, %r14d
+ jl L_AES_GCM_encrypt_update_avx512_win_loop
+L_AES_GCM_encrypt_update_avx512_last_win:
+ vpxorq %zmm20, %zmm20, %zmm20
+ vinserti32x4 $0x00, %xmm6, %zmm20, %zmm20
+ vmovdqu64 512(%rsp), %zmm23
+ vmovdqu64 576(%rsp), %zmm24
+ vmovdqu64 640(%rsp), %zmm25
+ vmovdqu64 704(%rsp), %zmm26
+ vmovdqu64 (%r15), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpxorq %zmm20, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm23, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19
+ vmovdqa64 %zmm16, %zmm27
+ vpxorq %zmm17, %zmm18, %zmm28
+ vmovdqa64 %zmm19, %zmm29
+ vmovdqu64 64(%r15), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm24, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vmovdqu64 128(%r15), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm25, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vmovdqu64 192(%r15), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm26, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vmovdqu64 768(%rsp), %zmm23
+ vmovdqu64 832(%rsp), %zmm24
+ vmovdqu64 896(%rsp), %zmm25
+ vmovdqu64 960(%rsp), %zmm26
+ vmovdqu64 256(%r15), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm23, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vmovdqu64 320(%r15), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm24, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vmovdqu64 384(%r15), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm25, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vmovdqu64 448(%r15), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm26, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vpclmulqdq $0x01, %zmm27, %zmm31, %zmm21
+ vpshufd $0x4e, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm21, %zmm27, %zmm28
+ vpclmulqdq $0x01, %zmm28, %zmm31, %zmm21
+ vpshufd $0x4e, %zmm28, %zmm28
+ vpternlogq $0x96, %zmm21, %zmm28, %zmm29
+ vextracti32x4 $0x01, %zmm29, %xmm0
+ vextracti32x4 $2, %zmm29, %xmm4
+ vextracti32x4 $3, %zmm29, %xmm5
+ vpxorq %xmm0, %xmm29, %xmm6
+ vpternlogq $0x96, %xmm4, %xmm5, %xmm6
+L_AES_GCM_encrypt_update_avx512_no_windows:
+ vmovdqu64 192(%rsp), %zmm23
+ vshufi64x2 $27, %zmm23, %zmm23, %zmm23
+ vmovdqu64 128(%rsp), %zmm24
+ vshufi64x2 $27, %zmm24, %zmm24, %zmm24
+ vmovdqu64 64(%rsp), %zmm25
+ vshufi64x2 $27, %zmm25, %zmm25, %zmm25
+ vmovdqu64 (%rsp), %zmm26
+ vshufi64x2 $27, %zmm26, %zmm26, %zmm26
+ movl %r8d, %r13d
+ andl $0xffffff00, %r13d
+ cmpl %r13d, %r14d
+ jge L_AES_GCM_encrypt_update_avx512_after_256
+ # 256 bytes of input
+ vbroadcasti32x4 (%r12), %zmm20
+ vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16
+ vpshufb %zmm22, %zmm16, %zmm16
+ vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17
+ vpshufb %zmm22, %zmm17, %zmm17
+ vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18
+ vpshufb %zmm22, %zmm18, %zmm18
+ vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19
+ vpshufb %zmm22, %zmm19, %zmm19
+ vmovdqu (%r12), %xmm8
+ vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8
+ vmovdqu %xmm8, (%r12)
+ vpxorq %zmm9, %zmm16, %zmm16
+ vpxorq %zmm9, %zmm17, %zmm17
+ vpxorq %zmm9, %zmm18, %zmm18
+ vpxorq %zmm9, %zmm19, %zmm19
+ vaesenc %zmm10, %zmm16, %zmm16
+ vaesenc %zmm10, %zmm17, %zmm17
+ vaesenc %zmm10, %zmm18, %zmm18
+ vaesenc %zmm10, %zmm19, %zmm19
+ vaesenc %zmm11, %zmm16, %zmm16
+ vaesenc %zmm11, %zmm17, %zmm17
+ vaesenc %zmm11, %zmm18, %zmm18
+ vaesenc %zmm11, %zmm19, %zmm19
+ vaesenc %zmm12, %zmm16, %zmm16
+ vaesenc %zmm12, %zmm17, %zmm17
+ vaesenc %zmm12, %zmm18, %zmm18
+ vaesenc %zmm12, %zmm19, %zmm19
+ vaesenc %zmm13, %zmm16, %zmm16
+ vaesenc %zmm13, %zmm17, %zmm17
+ vaesenc %zmm13, %zmm18, %zmm18
+ vaesenc %zmm13, %zmm19, %zmm19
+ vaesenc %zmm14, %zmm16, %zmm16
+ vaesenc %zmm14, %zmm17, %zmm17
+ vaesenc %zmm14, %zmm18, %zmm18
+ vaesenc %zmm14, %zmm19, %zmm19
+ vaesenc %zmm15, %zmm16, %zmm16
+ vaesenc %zmm15, %zmm17, %zmm17
+ vaesenc %zmm15, %zmm18, %zmm18
+ vaesenc %zmm15, %zmm19, %zmm19
+ vaesenc %zmm1, %zmm16, %zmm16
+ vaesenc %zmm1, %zmm17, %zmm17
+ vaesenc %zmm1, %zmm18, %zmm18
+ vaesenc %zmm1, %zmm19, %zmm19
+ vaesenc %zmm2, %zmm16, %zmm16
+ vaesenc %zmm2, %zmm17, %zmm17
+ vaesenc %zmm2, %zmm18, %zmm18
+ vaesenc %zmm2, %zmm19, %zmm19
+ vaesenc %zmm3, %zmm16, %zmm16
+ vaesenc %zmm3, %zmm17, %zmm17
+ vaesenc %zmm3, %zmm18, %zmm18
+ vaesenc %zmm3, %zmm19, %zmm19
+ cmpl $11, %esi
+ vbroadcasti32x4 160(%rdi), %zmm20
+ jl L_AES_GCM_encrypt_update_avx512_pro_avx512_ctr16_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 176(%rdi), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ cmpl $13, %esi
+ vbroadcasti32x4 192(%rdi), %zmm20
+ jl L_AES_GCM_encrypt_update_avx512_pro_avx512_ctr16_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 208(%rdi), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 224(%rdi), %zmm20
+L_AES_GCM_encrypt_update_avx512_pro_avx512_ctr16_last:
+ vaesenclast %zmm20, %zmm16, %zmm16
+ vaesenclast %zmm20, %zmm17, %zmm17
+ vaesenclast %zmm20, %zmm18, %zmm18
+ vaesenclast %zmm20, %zmm19, %zmm19
+ leaq (%r11,%r14,1), %rcx
+ leaq (%r10,%r14,1), %rdx
+ vmovdqu64 (%rcx), %zmm21
+ vpxorq %zmm21, %zmm16, %zmm16
+ vmovdqu64 %zmm16, (%rdx)
+ vmovdqu64 64(%rcx), %zmm21
+ vpxorq %zmm21, %zmm17, %zmm17
+ vmovdqu64 %zmm17, 64(%rdx)
+ vmovdqu64 128(%rcx), %zmm21
+ vpxorq %zmm21, %zmm18, %zmm18
+ vmovdqu64 %zmm18, 128(%rdx)
+ vmovdqu64 192(%rcx), %zmm21
+ vpxorq %zmm21, %zmm19, %zmm19
+ vmovdqu64 %zmm19, 192(%rdx)
+ movq %rdx, %r15
+ addl $0x100, %r14d
+ cmpl %r13d, %r14d
+ jge L_AES_GCM_encrypt_update_avx512_last_ghash
+L_AES_GCM_encrypt_update_avx512_ghash_128:
+ vbroadcasti32x4 (%r12), %zmm20
+ vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16
+ vpshufb %zmm22, %zmm16, %zmm16
+ vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17
+ vpshufb %zmm22, %zmm17, %zmm17
+ vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18
+ vpshufb %zmm22, %zmm18, %zmm18
+ vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19
+ vpshufb %zmm22, %zmm19, %zmm19
+ vmovdqu (%r12), %xmm8
+ vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8
+ vmovdqu %xmm8, (%r12)
+ vpxorq %zmm9, %zmm16, %zmm16
+ vpxorq %zmm9, %zmm17, %zmm17
+ vpxorq %zmm9, %zmm18, %zmm18
+ vpxorq %zmm9, %zmm19, %zmm19
+ vaesenc %zmm10, %zmm16, %zmm16
+ vaesenc %zmm10, %zmm17, %zmm17
+ vaesenc %zmm10, %zmm18, %zmm18
+ vaesenc %zmm10, %zmm19, %zmm19
+ vaesenc %zmm11, %zmm16, %zmm16
+ vaesenc %zmm11, %zmm17, %zmm17
+ vaesenc %zmm11, %zmm18, %zmm18
+ vaesenc %zmm11, %zmm19, %zmm19
+ vaesenc %zmm12, %zmm16, %zmm16
+ vaesenc %zmm12, %zmm17, %zmm17
+ vaesenc %zmm12, %zmm18, %zmm18
+ vaesenc %zmm12, %zmm19, %zmm19
+ vaesenc %zmm13, %zmm16, %zmm16
+ vaesenc %zmm13, %zmm17, %zmm17
+ vaesenc %zmm13, %zmm18, %zmm18
+ vaesenc %zmm13, %zmm19, %zmm19
+ vaesenc %zmm14, %zmm16, %zmm16
+ vaesenc %zmm14, %zmm17, %zmm17
+ vaesenc %zmm14, %zmm18, %zmm18
+ vaesenc %zmm14, %zmm19, %zmm19
+ vaesenc %zmm15, %zmm16, %zmm16
+ vaesenc %zmm15, %zmm17, %zmm17
+ vaesenc %zmm15, %zmm18, %zmm18
+ vaesenc %zmm15, %zmm19, %zmm19
+ vaesenc %zmm1, %zmm16, %zmm16
+ vaesenc %zmm1, %zmm17, %zmm17
+ vaesenc %zmm1, %zmm18, %zmm18
+ vaesenc %zmm1, %zmm19, %zmm19
+ vaesenc %zmm2, %zmm16, %zmm16
+ vaesenc %zmm2, %zmm17, %zmm17
+ vaesenc %zmm2, %zmm18, %zmm18
+ vaesenc %zmm2, %zmm19, %zmm19
+ vaesenc %zmm3, %zmm16, %zmm16
+ vaesenc %zmm3, %zmm17, %zmm17
+ vaesenc %zmm3, %zmm18, %zmm18
+ vaesenc %zmm3, %zmm19, %zmm19
+ cmpl $11, %esi
+ vbroadcasti32x4 160(%rdi), %zmm20
+ jl L_AES_GCM_encrypt_update_avx512_pip_avx512_ctr16_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 176(%rdi), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ cmpl $13, %esi
+ vbroadcasti32x4 192(%rdi), %zmm20
+ jl L_AES_GCM_encrypt_update_avx512_pip_avx512_ctr16_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 208(%rdi), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 224(%rdi), %zmm20
+L_AES_GCM_encrypt_update_avx512_pip_avx512_ctr16_last:
+ vaesenclast %zmm20, %zmm16, %zmm16
+ vaesenclast %zmm20, %zmm17, %zmm17
+ vaesenclast %zmm20, %zmm18, %zmm18
+ vaesenclast %zmm20, %zmm19, %zmm19
+ leaq (%r11,%r14,1), %rcx
+ leaq (%r10,%r14,1), %rdx
+ vmovdqu64 (%rcx), %zmm21
+ vpxorq %zmm21, %zmm16, %zmm16
+ vmovdqu64 %zmm16, (%rdx)
+ vmovdqu64 64(%rcx), %zmm21
+ vpxorq %zmm21, %zmm17, %zmm17
+ vmovdqu64 %zmm17, 64(%rdx)
+ vmovdqu64 128(%rcx), %zmm21
+ vpxorq %zmm21, %zmm18, %zmm18
+ vmovdqu64 %zmm18, 128(%rdx)
+ vmovdqu64 192(%rcx), %zmm21
+ vpxorq %zmm21, %zmm19, %zmm19
+ vmovdqu64 %zmm19, 192(%rdx)
+ vpxorq %zmm20, %zmm20, %zmm20
+ vinserti32x4 $0x00, %xmm6, %zmm20, %zmm20
+ vmovdqu64 (%r15), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpxorq %zmm20, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm23, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19
+ vmovdqa64 %zmm16, %zmm27
+ vpxorq %zmm17, %zmm18, %zmm28
+ vmovdqa64 %zmm19, %zmm29
+ vmovdqu64 64(%r15), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm24, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vmovdqu64 128(%r15), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm25, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vmovdqu64 192(%r15), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm26, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vpclmulqdq $0x01, %zmm27, %zmm31, %zmm21
+ vpshufd $0x4e, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm21, %zmm27, %zmm28
+ vpclmulqdq $0x01, %zmm28, %zmm31, %zmm21
+ vpshufd $0x4e, %zmm28, %zmm28
+ vpternlogq $0x96, %zmm21, %zmm28, %zmm29
+ vextracti32x4 $0x01, %zmm29, %xmm0
+ vextracti32x4 $2, %zmm29, %xmm4
+ vextracti32x4 $3, %zmm29, %xmm5
+ vpxorq %xmm0, %xmm29, %xmm6
+ vpternlogq $0x96, %xmm4, %xmm5, %xmm6
+ movq %rdx, %r15
+ addl $0x100, %r14d
+ cmpl %r13d, %r14d
+ jl L_AES_GCM_encrypt_update_avx512_ghash_128
+L_AES_GCM_encrypt_update_avx512_last_ghash:
+ vpxorq %zmm20, %zmm20, %zmm20
+ vinserti32x4 $0x00, %xmm6, %zmm20, %zmm20
+ vmovdqu64 (%r15), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpxorq %zmm20, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm23, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19
+ vmovdqa64 %zmm16, %zmm27
+ vpxorq %zmm17, %zmm18, %zmm28
+ vmovdqa64 %zmm19, %zmm29
+ vmovdqu64 64(%r15), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm24, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vmovdqu64 128(%r15), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm25, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vmovdqu64 192(%r15), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm26, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vpclmulqdq $0x01, %zmm27, %zmm31, %zmm21
+ vpshufd $0x4e, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm21, %zmm27, %zmm28
+ vpclmulqdq $0x01, %zmm28, %zmm31, %zmm21
+ vpshufd $0x4e, %zmm28, %zmm28
+ vpternlogq $0x96, %zmm21, %zmm28, %zmm29
+ vextracti32x4 $0x01, %zmm29, %xmm0
+ vextracti32x4 $2, %zmm29, %xmm4
+ vextracti32x4 $3, %zmm29, %xmm5
+ vpxorq %xmm0, %xmm29, %xmm6
+ vpternlogq $0x96, %xmm4, %xmm5, %xmm6
+L_AES_GCM_encrypt_update_avx512_after_256:
+ vmovdqu (%rsp), %xmm5
+L_AES_GCM_encrypt_update_avx512_done_128:
+ movl %r8d, %edx
+ cmpl %edx, %r14d
+ jge L_AES_GCM_encrypt_update_avx512_done_enc
+ movl %r8d, %r13d
+ andl $0xfffffff0, %r13d
+ cmpl %r13d, %r14d
+ jge L_AES_GCM_encrypt_update_avx512_last_block_done
+ vmovdqu (%r12), %xmm9
+ vpshufb L_avx512_aes_gcm_bswap_epi64(%rip), %xmm9, %xmm8
+ vpaddd L_avx512_aes_gcm_one(%rip), %xmm9, %xmm9
+ vmovdqu %xmm9, (%r12)
+ vpxor (%rdi), %xmm8, %xmm8
+ vaesenc 16(%rdi), %xmm8, %xmm8
+ vaesenc 32(%rdi), %xmm8, %xmm8
+ vaesenc 48(%rdi), %xmm8, %xmm8
+ vaesenc 64(%rdi), %xmm8, %xmm8
+ vaesenc 80(%rdi), %xmm8, %xmm8
+ vaesenc 96(%rdi), %xmm8, %xmm8
+ vaesenc 112(%rdi), %xmm8, %xmm8
+ vaesenc 128(%rdi), %xmm8, %xmm8
+ vaesenc 144(%rdi), %xmm8, %xmm8
+ cmpl $11, %esi
+ vmovdqa 160(%rdi), %xmm9
+ jl L_AES_GCM_encrypt_update_avx512_aesenc_block_last
+ vaesenc %xmm9, %xmm8, %xmm8
+ vaesenc 176(%rdi), %xmm8, %xmm8
+ cmpl $13, %esi
+ vmovdqa 192(%rdi), %xmm9
+ jl L_AES_GCM_encrypt_update_avx512_aesenc_block_last
+ vaesenc %xmm9, %xmm8, %xmm8
+ vaesenc 208(%rdi), %xmm8, %xmm8
+ vmovdqa 224(%rdi), %xmm9
+L_AES_GCM_encrypt_update_avx512_aesenc_block_last:
+ vaesenclast %xmm9, %xmm8, %xmm8
+ vmovdqu (%r11,%r14,1), %xmm9
+ vpxor %xmm9, %xmm8, %xmm8
+ vmovdqu %xmm8, (%r10,%r14,1)
+ vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8
+ vpxor %xmm8, %xmm6, %xmm6
+ addl $16, %r14d
+ cmpl %r13d, %r14d
+ jge L_AES_GCM_encrypt_update_avx512_last_block_ghash
+L_AES_GCM_encrypt_update_avx512_last_block_start:
+ vmovdqu (%r11,%r14,1), %xmm13
+ vmovdqu (%r12), %xmm9
+ vpshufb L_avx512_aes_gcm_bswap_epi64(%rip), %xmm9, %xmm8
+ vpaddd L_avx512_aes_gcm_one(%rip), %xmm9, %xmm9
+ vmovdqu %xmm9, (%r12)
+ vpxor (%rdi), %xmm8, %xmm8
+ vpclmulqdq $16, %xmm5, %xmm6, %xmm10
+ vaesenc 16(%rdi), %xmm8, %xmm8
+ vaesenc 32(%rdi), %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm5, %xmm6, %xmm11
+ vaesenc 48(%rdi), %xmm8, %xmm8
+ vaesenc 64(%rdi), %xmm8, %xmm8
+ vpclmulqdq $0x00, %xmm5, %xmm6, %xmm12
+ vaesenc 80(%rdi), %xmm8, %xmm8
+ vpclmulqdq $0x11, %xmm5, %xmm6, %xmm1
+ vaesenc 96(%rdi), %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpslldq $8, %xmm10, %xmm2
+ vpsrldq $8, %xmm10, %xmm10
+ vaesenc 112(%rdi), %xmm8, %xmm8
+ vpxor %xmm12, %xmm2, %xmm2
+ vpxor %xmm10, %xmm1, %xmm3
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm0
+ vpclmulqdq $16, %xmm0, %xmm2, %xmm11
+ vaesenc 128(%rdi), %xmm8, %xmm8
+ vpshufd $0x4e, %xmm2, %xmm10
+ vpxor %xmm11, %xmm10, %xmm10
+ vpclmulqdq $16, %xmm0, %xmm10, %xmm11
+ vaesenc 144(%rdi), %xmm8, %xmm8
+ vpshufd $0x4e, %xmm10, %xmm10
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm3, %xmm10, %xmm6
+ cmpl $11, %esi
+ vmovdqa 160(%rdi), %xmm9
+ jl L_AES_GCM_encrypt_update_avx512_aesenc_gfmul_last
+ vaesenc %xmm9, %xmm8, %xmm8
+ vaesenc 176(%rdi), %xmm8, %xmm8
+ cmpl $13, %esi
+ vmovdqa 192(%rdi), %xmm9
+ jl L_AES_GCM_encrypt_update_avx512_aesenc_gfmul_last
+ vaesenc %xmm9, %xmm8, %xmm8
+ vaesenc 208(%rdi), %xmm8, %xmm8
+ vmovdqa 224(%rdi), %xmm9
+L_AES_GCM_encrypt_update_avx512_aesenc_gfmul_last:
+ vaesenclast %xmm9, %xmm8, %xmm8
+ vmovdqa %xmm13, %xmm0
+ vpxor %xmm0, %xmm8, %xmm8
+ vmovdqu %xmm8, (%r10,%r14,1)
+ vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8
+ addl $16, %r14d
+ vpxor %xmm8, %xmm6, %xmm6
+ cmpl %r13d, %r14d
+ jl L_AES_GCM_encrypt_update_avx512_last_block_start
+L_AES_GCM_encrypt_update_avx512_last_block_ghash:
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm5, %xmm9
+ vpxor %xmm5, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm6, %xmm10
+ vpxor %xmm6, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm5, %xmm6, %xmm8
+ vpclmulqdq $0x11, %xmm5, %xmm6, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm6
+L_AES_GCM_encrypt_update_avx512_last_block_done:
+L_AES_GCM_encrypt_update_avx512_done_enc:
+ vmovdqa %xmm6, (%r9)
+ vzeroupper
+ addq $0x410, %rsp
+ popq %rbp
+ popq %rbx
+ popq %r15
+ popq %r14
+ popq %r12
+ popq %r13
+ repz retq
+#ifndef __APPLE__
+.size AES_GCM_encrypt_update_avx512,.-AES_GCM_encrypt_update_avx512
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl AES_GCM_encrypt_final_avx512
+.type AES_GCM_encrypt_final_avx512,@function
+.align 16
+AES_GCM_encrypt_final_avx512: /* Final tag step: XOR two lengths (times 8) into the 16-byte value at (rdi), multiply it by the adjusted block from (r9) with reduction, byte-shuffle, XOR with the block from the stack argument, and write edx bytes to (rsi). */
+#else
+.section __TEXT,__text
+.globl _AES_GCM_encrypt_final_avx512
+.p2align 4
+_AES_GCM_encrypt_final_avx512: /* Apple build: same entry point under the underscore-prefixed symbol name. */
+#endif /* __APPLE__ */
+ pushq %r13 /* preserve r13; r13d and r13b serve as the byte temporary in the tag copy loop */
+ movl %edx, %eax /* eax = edx (presumably argument 3): number of output bytes, tested against 16 below */
+ movl %ecx, %r10d /* r10d = ecx (presumably argument 4): first length, multiplied by 8 below */
+ movl %r8d, %r11d /* r11d = r8d (presumably argument 5): second length, multiplied by 8 below */
+ movq 16(%rsp), %r8 /* r8 = pointer passed on the stack; (rsp) holds the saved r13 and 8(rsp) the return address */
+ subq $16, %rsp /* reserve 16 bytes of scratch, used only by the partial-tag path */
+ vmovdqa (%rdi), %xmm4 /* xmm4 = 16-byte running value at (rdi); aligned load, so rdi must be 16-byte aligned */
+ vmovdqa (%r9), %xmm5 /* xmm5 = 16-byte block at (r9), presumably the hash key H (TODO confirm); must be 16-byte aligned */
+ vmovdqa (%r8), %xmm6 /* xmm6 = 16-byte block XORed into the result at the end; must be 16-byte aligned */
+ vpsrlq $63, %xmm5, %xmm8 /* xmm8 = top bit of each qword of xmm5 */
+ vpsllq $0x01, %xmm5, %xmm7 /* xmm7 = each qword of xmm5 shifted left by one bit */
+ vpslldq $8, %xmm8, %xmm8 /* move the carry bit of the low qword up into the high qword */
+ vpor %xmm8, %xmm7, %xmm7 /* xmm7 = xmm5 shifted left by one bit as a 128-bit value */
+ vpshufd $0xff, %xmm5, %xmm5 /* broadcast the top dword of xmm5 to all four dwords */
+ vpsrad $31, %xmm5, %xmm5 /* all-ones mask when bit 127 of the loaded block was set, otherwise zero */
+ vpand L_avx512_aes_gcm_mod2_128(%rip), %xmm5, %xmm5 /* keep the mod2_128 constant only when that bit was set */
+ vpxor %xmm7, %xmm5, %xmm5 /* xmm5 = shifted block with the conditional constant folded in */
+ movl %r10d, %edx /* 32-bit move zero-extends into rdx */
+ movl %r11d, %ecx /* 32-bit move zero-extends into rcx */
+ shlq $3, %rdx /* rdx = first length * 8 (presumably a byte count turned into a bit count) */
+ shlq $3, %rcx /* rcx = second length * 8 */
+ vmovq %rdx, %xmm0
+ vmovq %rcx, %xmm1
+ vpunpcklqdq %xmm1, %xmm0, %xmm0 /* xmm0 = first length * 8 in the low qword, second length * 8 in the high qword */
+ vpxor %xmm0, %xmm4, %xmm4 /* fold the length block into the running value */
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm5, %xmm8 /* swap the two qwords of xmm5 */
+ vpxor %xmm5, %xmm8, %xmm8 /* xmm8 = high qword XOR low qword of xmm5 */
+ vpshufd $0x4e, %xmm4, %xmm9 /* swap the two qwords of xmm4 */
+ vpxor %xmm4, %xmm9, %xmm9 /* xmm9 = high qword XOR low qword of xmm4 */
+ vpclmulqdq $0x00, %xmm5, %xmm4, %xmm7 /* xmm7 = carry-less product of the two low qwords */
+ vpclmulqdq $0x11, %xmm5, %xmm4, %xmm10 /* xmm10 = carry-less product of the two high qwords */
+ vpclmulqdq $0x00, %xmm9, %xmm8, %xmm8 /* xmm8 = product of the two (high XOR low) values, the Karatsuba cross product */
+ vpternlogq $0x96, %xmm7, %xmm10, %xmm8 /* imm 0x96 is a three-way XOR: xmm8 ^= xmm7 ^ xmm10, leaving the middle term */
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm9 /* xmm9 = reduction constant */
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 /* multiply the low qword of the low product by the high qword of the constant */
+ vpshufd $0x4e, %xmm7, %xmm7 /* swap the qwords of the low product */
+ vpternlogq $0x96, %xmm11, %xmm7, %xmm8 /* fold the low product into the middle term */
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 /* same multiply, now on the updated middle term */
+ vpshufd $0x4e, %xmm8, %xmm8 /* swap the qwords of the middle term */
+ vpternlogq $0x96, %xmm11, %xmm8, %xmm10 /* fold the middle term into the high product, giving the reduced 128-bit result */
+ vmovdqa %xmm10, %xmm4 /* xmm4 = reduced product */
+ vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4 /* permute bytes with the bswap mask constant (defined outside this view) */
+ vpxor %xmm6, %xmm4, %xmm0 /* xmm0 = final 16-byte tag value */
+ cmpl $16, %eax
+ je L_AES_GCM_encrypt_final_avx512_store_tag_16 /* exactly 16 bytes requested: store the whole register directly */
+ xorq %rcx, %rcx /* rcx = byte index, starting at 0 */
+ vmovdqu %xmm0, (%rsp) /* spill the tag to the scratch area so it can be copied byte by byte */
+L_AES_GCM_encrypt_final_avx512_store_tag_loop:
+ movzbl (%rsp,%rcx,1), %r13d /* load one tag byte from the scratch area */
+ movb %r13b, (%rsi,%rcx,1) /* store it to the output buffer at (rsi) */
+ incl %ecx
+ cmpl %eax, %ecx
+ jne L_AES_GCM_encrypt_final_avx512_store_tag_loop /* NOTE(review): the test runs after the copy and uses jne, so a count of 0 or above 16 would run past the 16-byte scratch; presumably callers only pass 1..15 on this path - confirm */
+ jmp L_AES_GCM_encrypt_final_avx512_store_tag_done
+L_AES_GCM_encrypt_final_avx512_store_tag_16:
+ vmovdqu %xmm0, (%rsi) /* unaligned 16-byte store of the whole tag */
+L_AES_GCM_encrypt_final_avx512_store_tag_done:
+ vzeroupper /* clear the vector register bits above bit 127 before returning */
+ addq $16, %rsp /* release the scratch area */
+ popq %r13
+ repz retq
+#ifndef __APPLE__
+.size AES_GCM_encrypt_final_avx512,.-AES_GCM_encrypt_final_avx512
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl AES_GCM_decrypt_update_avx512
+.type AES_GCM_decrypt_update_avx512,@function
+.align 16
+AES_GCM_decrypt_update_avx512:
+#else
+.section __TEXT,__text
+.globl _AES_GCM_decrypt_update_avx512
+.p2align 4
+_AES_GCM_decrypt_update_avx512:
+#endif /* __APPLE__ */
+ pushq %r13
+ pushq %r12
+ pushq %r14
+ pushq %r15
+ pushq %rbx
+ movq %rdx, %r10
+ movq %rcx, %r11
+ movq 48(%rsp), %rax
+ movq 56(%rsp), %r12
+ subq $0x410, %rsp
+ vmovdqa (%r9), %xmm6
+ vmovdqa (%rax), %xmm5
+ vpsrlq $63, %xmm5, %xmm9
+ vpsllq $0x01, %xmm5, %xmm8
+ vpslldq $8, %xmm9, %xmm9
+ vpor %xmm9, %xmm8, %xmm8
+ vpshufd $0xff, %xmm5, %xmm5
+ vpsrad $31, %xmm5, %xmm5
+ vpand L_avx512_aes_gcm_mod2_128(%rip), %xmm5, %xmm5
+ vpxor %xmm8, %xmm5, %xmm5
+ xorl %r14d, %r14d
+ cmpl $0x100, %r8d
+ jl L_AES_GCM_decrypt_update_avx512_done_128
+ vmovdqa %xmm6, %xmm2
+ # H ^ 1
+ vmovdqu %xmm5, (%rsp)
+ # H ^ 2
+ vpclmulqdq $0x00, %xmm5, %xmm5, %xmm8
+ vpclmulqdq $0x11, %xmm5, %xmm5, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm0
+ vmovdqu %xmm0, 16(%rsp)
+ # H ^ 3
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm5, %xmm9
+ vpxor %xmm5, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm0, %xmm10
+ vpxor %xmm0, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm5, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm5, %xmm0, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm1
+ vmovdqu %xmm1, 32(%rsp)
+ # H ^ 4
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm3
+ vmovdqu %xmm3, 48(%rsp)
+ # H ^ 5
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 64(%rsp)
+ # H ^ 6
+ vpclmulqdq $0x00, %xmm1, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm1, %xmm1, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 80(%rsp)
+ # H ^ 7
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm1, %xmm9
+ vpxor %xmm1, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm3, %xmm10
+ vpxor %xmm3, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm1, %xmm3, %xmm8
+ vpclmulqdq $0x11, %xmm1, %xmm3, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 96(%rsp)
+ # H ^ 8
+ vpclmulqdq $0x00, %xmm3, %xmm3, %xmm8
+ vpclmulqdq $0x11, %xmm3, %xmm3, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 112(%rsp)
+ # H ^ 9
+ vmovdqu 48(%rsp), %xmm0
+ vmovdqu 64(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 128(%rsp)
+ # H ^ 10
+ vmovdqu 64(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 144(%rsp)
+ # H ^ 11
+ vmovdqu 64(%rsp), %xmm0
+ vmovdqu 80(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 160(%rsp)
+ # H ^ 12
+ vmovdqu 80(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 176(%rsp)
+ # H ^ 13
+ vmovdqu 80(%rsp), %xmm0
+ vmovdqu 96(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 192(%rsp)
+ # H ^ 14
+ vmovdqu 96(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 208(%rsp)
+ # H ^ 15
+ vmovdqu 96(%rsp), %xmm0
+ vmovdqu 112(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 224(%rsp)
+ # H ^ 16
+ vmovdqu 112(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 240(%rsp)
+ cmpl $0x200, %r8d
+ jl L_AES_GCM_decrypt_update_avx512_no_ext
+ # H ^ 17
+ vmovdqu 112(%rsp), %xmm0
+ vmovdqu 128(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 256(%rsp)
+ # H ^ 18
+ vmovdqu 128(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 272(%rsp)
+ # H ^ 19
+ vmovdqu 128(%rsp), %xmm0
+ vmovdqu 144(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 288(%rsp)
+ # H ^ 20
+ vmovdqu 144(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 304(%rsp)
+ # H ^ 21
+ vmovdqu 144(%rsp), %xmm0
+ vmovdqu 160(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 320(%rsp)
+ # H ^ 22
+ vmovdqu 160(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 336(%rsp)
+ # H ^ 23
+ vmovdqu 160(%rsp), %xmm0
+ vmovdqu 176(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 352(%rsp)
+ # H ^ 24
+ vmovdqu 176(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 368(%rsp)
+ # H ^ 25
+ vmovdqu 176(%rsp), %xmm0
+ vmovdqu 192(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 384(%rsp)
+ # H ^ 26
+ vmovdqu 192(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 400(%rsp)
+ # H ^ 27
+ vmovdqu 192(%rsp), %xmm0
+ vmovdqu 208(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 416(%rsp)
+ # H ^ 28
+ vmovdqu 208(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 432(%rsp)
+ # H ^ 29
+ vmovdqu 208(%rsp), %xmm0
+ vmovdqu 224(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 448(%rsp)
+ # H ^ 30
+ vmovdqu 224(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 464(%rsp)
+ # H ^ 31
+ vmovdqu 224(%rsp), %xmm0
+ vmovdqu 240(%rsp), %xmm1
+ # ghash_gfmul_red_avx
+ vpshufd $0x4e, %xmm0, %xmm9
+ vpxor %xmm0, %xmm9, %xmm9
+ vpshufd $0x4e, %xmm1, %xmm10
+ vpxor %xmm1, %xmm10, %xmm10
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11
+ vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 480(%rsp)
+ # H ^ 32
+ vmovdqu 240(%rsp), %xmm0
+ vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+ vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm8, %xmm8
+ vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+ vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+ vpshufd $0x4e, %xmm9, %xmm9
+ vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+ vmovdqa %xmm11, %xmm7
+ vmovdqu %xmm7, 496(%rsp)
+L_AES_GCM_decrypt_update_avx512_no_ext:
+ vbroadcasti32x4 L_avx512_aes_gcm_bswap_epi64(%rip), %zmm22
+ vbroadcasti32x4 L_avx512_aes_gcm_bswap_mask(%rip), %zmm30
+ vbroadcasti32x4 L_avx512_aes_gcm_mod2_128(%rip), %zmm31
+ vbroadcasti32x4 (%rdi), %zmm9
+ vbroadcasti32x4 16(%rdi), %zmm10
+ vbroadcasti32x4 32(%rdi), %zmm11
+ vbroadcasti32x4 48(%rdi), %zmm12
+ vbroadcasti32x4 64(%rdi), %zmm13
+ vbroadcasti32x4 80(%rdi), %zmm14
+ vbroadcasti32x4 96(%rdi), %zmm15
+ vbroadcasti32x4 112(%rdi), %zmm1
+ vbroadcasti32x4 128(%rdi), %zmm2
+ vbroadcasti32x4 144(%rdi), %zmm3
+ cmpl $0x200, %r8d
+ jl L_AES_GCM_decrypt_update_avx512_no_windows
+ movl %r8d, %r13d
+ andl $0xfffffe00, %r13d
+ vmovdqu64 448(%rsp), %zmm23
+ vshufi64x2 $27, %zmm23, %zmm23, %zmm23
+ vmovdqu64 384(%rsp), %zmm24
+ vshufi64x2 $27, %zmm24, %zmm24, %zmm24
+ vmovdqu64 320(%rsp), %zmm25
+ vshufi64x2 $27, %zmm25, %zmm25, %zmm25
+ vmovdqu64 256(%rsp), %zmm26
+ vshufi64x2 $27, %zmm26, %zmm26, %zmm26
+ vmovdqu64 %zmm23, 512(%rsp)
+ vmovdqu64 %zmm24, 576(%rsp)
+ vmovdqu64 %zmm25, 640(%rsp)
+ vmovdqu64 %zmm26, 704(%rsp)
+ vmovdqu64 192(%rsp), %zmm23
+ vshufi64x2 $27, %zmm23, %zmm23, %zmm23
+ vmovdqu64 128(%rsp), %zmm24
+ vshufi64x2 $27, %zmm24, %zmm24, %zmm24
+ vmovdqu64 64(%rsp), %zmm25
+ vshufi64x2 $27, %zmm25, %zmm25, %zmm25
+ vmovdqu64 (%rsp), %zmm26
+ vshufi64x2 $27, %zmm26, %zmm26, %zmm26
+ vmovdqu64 %zmm23, 768(%rsp)
+ vmovdqu64 %zmm24, 832(%rsp)
+ vmovdqu64 %zmm25, 896(%rsp)
+ vmovdqu64 %zmm26, 960(%rsp)
+ # 512 bytes of input
+ xorl %r15d, %r15d
+ leaq (%r11,%r14,1), %rbx
+ vpxorq %zmm20, %zmm20, %zmm20
+ vinserti32x4 $0x00, %xmm6, %zmm20, %zmm20
+ vmovdqu64 512(%rsp), %zmm23
+ vmovdqu64 576(%rsp), %zmm24
+ vmovdqu64 640(%rsp), %zmm25
+ vmovdqu64 704(%rsp), %zmm26
+ vmovdqu64 (%rbx), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpxorq %zmm20, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm23, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19
+ vmovdqa64 %zmm16, %zmm27
+ vpxorq %zmm17, %zmm18, %zmm28
+ vmovdqa64 %zmm19, %zmm29
+ vmovdqu64 64(%rbx), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm24, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vmovdqu64 128(%rbx), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm25, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vmovdqu64 192(%rbx), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm26, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vmovdqu64 768(%rsp), %zmm23
+ vmovdqu64 832(%rsp), %zmm24
+ vmovdqu64 896(%rsp), %zmm25
+ vmovdqu64 960(%rsp), %zmm26
+ vmovdqu64 256(%rbx), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm23, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vmovdqu64 320(%rbx), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm24, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vmovdqu64 384(%rbx), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm25, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vmovdqu64 448(%rbx), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm26, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vpclmulqdq $0x01, %zmm27, %zmm31, %zmm21
+ vpshufd $0x4e, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm21, %zmm27, %zmm28
+ vpclmulqdq $0x01, %zmm28, %zmm31, %zmm21
+ vpshufd $0x4e, %zmm28, %zmm28
+ vpternlogq $0x96, %zmm21, %zmm28, %zmm29
+ vextracti32x4 $0x01, %zmm29, %xmm0
+ vextracti32x4 $2, %zmm29, %xmm4
+ vextracti32x4 $3, %zmm29, %xmm5
+ vpxorq %xmm0, %xmm29, %xmm6
+ vpternlogq $0x96, %xmm4, %xmm5, %xmm6
+ addl $0x200, %r14d
+ cmpl %r13d, %r14d
+ jge L_AES_GCM_decrypt_update_avx512_last_aes
+L_AES_GCM_decrypt_update_avx512_win_loop:
+ leaq (%r11,%r14,1), %rbx
+ vpxorq %zmm21, %zmm21, %zmm21
+ vinserti32x4 $0x00, %xmm6, %zmm21, %zmm21
+ vbroadcasti32x4 (%r12), %zmm20
+ vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16
+ vpshufb %zmm22, %zmm16, %zmm16
+ vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17
+ vpshufb %zmm22, %zmm17, %zmm17
+ vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18
+ vpshufb %zmm22, %zmm18, %zmm18
+ vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19
+ vpshufb %zmm22, %zmm19, %zmm19
+ vmovdqu (%r12), %xmm8
+ vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8
+ vmovdqu %xmm8, (%r12)
+ vpxorq %zmm9, %zmm16, %zmm16
+ vpxorq %zmm9, %zmm17, %zmm17
+ vpxorq %zmm9, %zmm18, %zmm18
+ vpxorq %zmm9, %zmm19, %zmm19
+ vmovdqu64 (%rbx), %zmm31
+ vpshufb %zmm30, %zmm31, %zmm31
+ vpxorq %zmm21, %zmm31, %zmm31
+ vaesenc %zmm10, %zmm16, %zmm16
+ vaesenc %zmm10, %zmm17, %zmm17
+ vaesenc %zmm10, %zmm18, %zmm18
+ vaesenc %zmm10, %zmm19, %zmm19
+ vpclmulqdq $0x00, 512(%rsp), %zmm31, %zmm23
+ vpclmulqdq $0x01, 512(%rsp), %zmm31, %zmm24
+ vpclmulqdq $16, 512(%rsp), %zmm31, %zmm25
+ vpclmulqdq $0x11, 512(%rsp), %zmm31, %zmm26
+ vmovdqa64 %zmm23, %zmm27
+ vpxorq %zmm24, %zmm25, %zmm28
+ vmovdqa64 %zmm26, %zmm29
+ vaesenc %zmm11, %zmm16, %zmm16
+ vaesenc %zmm11, %zmm17, %zmm17
+ vaesenc %zmm11, %zmm18, %zmm18
+ vaesenc %zmm11, %zmm19, %zmm19
+ vmovdqu64 64(%rbx), %zmm31
+ vpshufb %zmm30, %zmm31, %zmm31
+ vaesenc %zmm12, %zmm16, %zmm16
+ vaesenc %zmm12, %zmm17, %zmm17
+ vaesenc %zmm12, %zmm18, %zmm18
+ vaesenc %zmm12, %zmm19, %zmm19
+ vpclmulqdq $0x00, 576(%rsp), %zmm31, %zmm23
+ vpclmulqdq $0x01, 576(%rsp), %zmm31, %zmm24
+ vpclmulqdq $16, 576(%rsp), %zmm31, %zmm25
+ vpclmulqdq $0x11, 576(%rsp), %zmm31, %zmm26
+ vpxorq %zmm23, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm24, %zmm25, %zmm28
+ vpxorq %zmm26, %zmm29, %zmm29
+ vaesenc %zmm13, %zmm16, %zmm16
+ vaesenc %zmm13, %zmm17, %zmm17
+ vaesenc %zmm13, %zmm18, %zmm18
+ vaesenc %zmm13, %zmm19, %zmm19
+ vmovdqu64 128(%rbx), %zmm31
+ vpshufb %zmm30, %zmm31, %zmm31
+ vaesenc %zmm14, %zmm16, %zmm16
+ vaesenc %zmm14, %zmm17, %zmm17
+ vaesenc %zmm14, %zmm18, %zmm18
+ vaesenc %zmm14, %zmm19, %zmm19
+ vpclmulqdq $0x00, 640(%rsp), %zmm31, %zmm23
+ vpclmulqdq $0x01, 640(%rsp), %zmm31, %zmm24
+ vpclmulqdq $16, 640(%rsp), %zmm31, %zmm25
+ vpclmulqdq $0x11, 640(%rsp), %zmm31, %zmm26
+ vpxorq %zmm23, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm24, %zmm25, %zmm28
+ vpxorq %zmm26, %zmm29, %zmm29
+ vaesenc %zmm15, %zmm16, %zmm16
+ vaesenc %zmm15, %zmm17, %zmm17
+ vaesenc %zmm15, %zmm18, %zmm18
+ vaesenc %zmm15, %zmm19, %zmm19
+ vmovdqu64 192(%rbx), %zmm31
+ vpshufb %zmm30, %zmm31, %zmm31
+ vaesenc %zmm1, %zmm16, %zmm16
+ vaesenc %zmm1, %zmm17, %zmm17
+ vaesenc %zmm1, %zmm18, %zmm18
+ vaesenc %zmm1, %zmm19, %zmm19
+ vpclmulqdq $0x00, 704(%rsp), %zmm31, %zmm23
+ vpclmulqdq $0x01, 704(%rsp), %zmm31, %zmm24
+ vpclmulqdq $16, 704(%rsp), %zmm31, %zmm25
+ vpclmulqdq $0x11, 704(%rsp), %zmm31, %zmm26
+ vpxorq %zmm23, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm24, %zmm25, %zmm28
+ vpxorq %zmm26, %zmm29, %zmm29
+ vaesenc %zmm2, %zmm16, %zmm16
+ vaesenc %zmm2, %zmm17, %zmm17
+ vaesenc %zmm2, %zmm18, %zmm18
+ vaesenc %zmm2, %zmm19, %zmm19
+ vaesenc %zmm3, %zmm16, %zmm16
+ vaesenc %zmm3, %zmm17, %zmm17
+ vaesenc %zmm3, %zmm18, %zmm18
+ vaesenc %zmm3, %zmm19, %zmm19
+ cmpl $11, %esi
+ vbroadcasti32x4 160(%rdi), %zmm20
+ jl L_AES_GCM_decrypt_update_avx512_a_il_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 176(%rdi), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ cmpl $13, %esi
+ vbroadcasti32x4 192(%rdi), %zmm20
+ jl L_AES_GCM_decrypt_update_avx512_a_il_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 208(%rdi), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 224(%rdi), %zmm20
+L_AES_GCM_decrypt_update_avx512_a_il_last:
+ vaesenclast %zmm20, %zmm16, %zmm16
+ vaesenclast %zmm20, %zmm17, %zmm17
+ vaesenclast %zmm20, %zmm18, %zmm18
+ vaesenclast %zmm20, %zmm19, %zmm19
+ leaq (%r11,%r15,1), %rcx
+ leaq (%r10,%r15,1), %rdx
+ vmovdqu64 (%rcx), %zmm21
+ vpxorq %zmm21, %zmm16, %zmm16
+ vmovdqu64 %zmm16, (%rdx)
+ vmovdqu64 64(%rcx), %zmm21
+ vpxorq %zmm21, %zmm17, %zmm17
+ vmovdqu64 %zmm17, 64(%rdx)
+ vmovdqu64 128(%rcx), %zmm21
+ vpxorq %zmm21, %zmm18, %zmm18
+ vmovdqu64 %zmm18, 128(%rdx)
+ vmovdqu64 192(%rcx), %zmm21
+ vpxorq %zmm21, %zmm19, %zmm19
+ vmovdqu64 %zmm19, 192(%rdx)
+ addl $0x100, %r15d
+ vbroadcasti32x4 (%r12), %zmm20
+ vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16
+ vpshufb %zmm22, %zmm16, %zmm16
+ vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17
+ vpshufb %zmm22, %zmm17, %zmm17
+ vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18
+ vpshufb %zmm22, %zmm18, %zmm18
+ vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19
+ vpshufb %zmm22, %zmm19, %zmm19
+ vmovdqu (%r12), %xmm8
+ vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8
+ vmovdqu %xmm8, (%r12)
+ vpxorq %zmm9, %zmm16, %zmm16
+ vpxorq %zmm9, %zmm17, %zmm17
+ vpxorq %zmm9, %zmm18, %zmm18
+ vpxorq %zmm9, %zmm19, %zmm19
+ vmovdqu64 256(%rbx), %zmm31
+ vpshufb %zmm30, %zmm31, %zmm31
+ vaesenc %zmm10, %zmm16, %zmm16
+ vaesenc %zmm10, %zmm17, %zmm17
+ vaesenc %zmm10, %zmm18, %zmm18
+ vaesenc %zmm10, %zmm19, %zmm19
+ vpclmulqdq $0x00, 768(%rsp), %zmm31, %zmm23
+ vpclmulqdq $0x01, 768(%rsp), %zmm31, %zmm24
+ vpclmulqdq $16, 768(%rsp), %zmm31, %zmm25
+ vpclmulqdq $0x11, 768(%rsp), %zmm31, %zmm26
+ vpxorq %zmm23, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm24, %zmm25, %zmm28
+ vpxorq %zmm26, %zmm29, %zmm29
+ vaesenc %zmm11, %zmm16, %zmm16
+ vaesenc %zmm11, %zmm17, %zmm17
+ vaesenc %zmm11, %zmm18, %zmm18
+ vaesenc %zmm11, %zmm19, %zmm19
+ vmovdqu64 320(%rbx), %zmm31
+ vpshufb %zmm30, %zmm31, %zmm31
+ vaesenc %zmm12, %zmm16, %zmm16
+ vaesenc %zmm12, %zmm17, %zmm17
+ vaesenc %zmm12, %zmm18, %zmm18
+ vaesenc %zmm12, %zmm19, %zmm19
+ vpclmulqdq $0x00, 832(%rsp), %zmm31, %zmm23
+ vpclmulqdq $0x01, 832(%rsp), %zmm31, %zmm24
+ vpclmulqdq $16, 832(%rsp), %zmm31, %zmm25
+ vpclmulqdq $0x11, 832(%rsp), %zmm31, %zmm26
+ vpxorq %zmm23, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm24, %zmm25, %zmm28
+ vpxorq %zmm26, %zmm29, %zmm29
+ vaesenc %zmm13, %zmm16, %zmm16
+ vaesenc %zmm13, %zmm17, %zmm17
+ vaesenc %zmm13, %zmm18, %zmm18
+ vaesenc %zmm13, %zmm19, %zmm19
+ vmovdqu64 384(%rbx), %zmm31
+ vpshufb %zmm30, %zmm31, %zmm31
+ vaesenc %zmm14, %zmm16, %zmm16
+ vaesenc %zmm14, %zmm17, %zmm17
+ vaesenc %zmm14, %zmm18, %zmm18
+ vaesenc %zmm14, %zmm19, %zmm19
+ vpclmulqdq $0x00, 896(%rsp), %zmm31, %zmm23
+ vpclmulqdq $0x01, 896(%rsp), %zmm31, %zmm24
+ vpclmulqdq $16, 896(%rsp), %zmm31, %zmm25
+ vpclmulqdq $0x11, 896(%rsp), %zmm31, %zmm26
+ vpxorq %zmm23, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm24, %zmm25, %zmm28
+ vpxorq %zmm26, %zmm29, %zmm29
+ vaesenc %zmm15, %zmm16, %zmm16
+ vaesenc %zmm15, %zmm17, %zmm17
+ vaesenc %zmm15, %zmm18, %zmm18
+ vaesenc %zmm15, %zmm19, %zmm19
+ vmovdqu64 448(%rbx), %zmm31
+ vpshufb %zmm30, %zmm31, %zmm31
+ vaesenc %zmm1, %zmm16, %zmm16
+ vaesenc %zmm1, %zmm17, %zmm17
+ vaesenc %zmm1, %zmm18, %zmm18
+ vaesenc %zmm1, %zmm19, %zmm19
+ vpclmulqdq $0x00, 960(%rsp), %zmm31, %zmm23
+ vpclmulqdq $0x01, 960(%rsp), %zmm31, %zmm24
+ vpclmulqdq $16, 960(%rsp), %zmm31, %zmm25
+ vpclmulqdq $0x11, 960(%rsp), %zmm31, %zmm26
+ vpxorq %zmm23, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm24, %zmm25, %zmm28
+ vpxorq %zmm26, %zmm29, %zmm29
+ vaesenc %zmm2, %zmm16, %zmm16
+ vaesenc %zmm2, %zmm17, %zmm17
+ vaesenc %zmm2, %zmm18, %zmm18
+ vaesenc %zmm2, %zmm19, %zmm19
+ vaesenc %zmm3, %zmm16, %zmm16
+ vaesenc %zmm3, %zmm17, %zmm17
+ vaesenc %zmm3, %zmm18, %zmm18
+ vaesenc %zmm3, %zmm19, %zmm19
+ cmpl $11, %esi
+ vbroadcasti32x4 160(%rdi), %zmm20
+ jl L_AES_GCM_decrypt_update_avx512_b_il_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 176(%rdi), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ cmpl $13, %esi
+ vbroadcasti32x4 192(%rdi), %zmm20
+ jl L_AES_GCM_decrypt_update_avx512_b_il_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 208(%rdi), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 224(%rdi), %zmm20
+L_AES_GCM_decrypt_update_avx512_b_il_last:
+ vaesenclast %zmm20, %zmm16, %zmm16
+ vaesenclast %zmm20, %zmm17, %zmm17
+ vaesenclast %zmm20, %zmm18, %zmm18
+ vaesenclast %zmm20, %zmm19, %zmm19
+ leaq (%r11,%r15,1), %rcx
+ leaq (%r10,%r15,1), %rdx
+ vmovdqu64 (%rcx), %zmm21
+ vpxorq %zmm21, %zmm16, %zmm16
+ vmovdqu64 %zmm16, (%rdx)
+ vmovdqu64 64(%rcx), %zmm21
+ vpxorq %zmm21, %zmm17, %zmm17
+ vmovdqu64 %zmm17, 64(%rdx)
+ vmovdqu64 128(%rcx), %zmm21
+ vpxorq %zmm21, %zmm18, %zmm18
+ vmovdqu64 %zmm18, 128(%rdx)
+ vmovdqu64 192(%rcx), %zmm21
+ vpxorq %zmm21, %zmm19, %zmm19
+ vmovdqu64 %zmm19, 192(%rdx)
+ addl $0x100, %r15d
+ vbroadcasti32x4 L_avx512_aes_gcm_mod2_128(%rip), %zmm31
+ vpclmulqdq $0x01, %zmm27, %zmm31, %zmm23
+ vpshufd $0x4e, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm23, %zmm27, %zmm28
+ vpclmulqdq $0x01, %zmm28, %zmm31, %zmm23
+ vpshufd $0x4e, %zmm28, %zmm28
+ vpternlogq $0x96, %zmm23, %zmm28, %zmm29
+ vextracti32x4 $0x01, %zmm29, %xmm0
+ vextracti32x4 $2, %zmm29, %xmm4
+ vextracti32x4 $3, %zmm29, %xmm5
+ vpxorq %xmm0, %xmm29, %xmm6
+ vpternlogq $0x96, %xmm4, %xmm5, %xmm6
+ addl $0x200, %r14d
+ cmpl %r13d, %r14d
+ jl L_AES_GCM_decrypt_update_avx512_win_loop
+L_AES_GCM_decrypt_update_avx512_last_aes:
+ vbroadcasti32x4 (%r12), %zmm20
+ vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16
+ vpshufb %zmm22, %zmm16, %zmm16
+ vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17
+ vpshufb %zmm22, %zmm17, %zmm17
+ vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18
+ vpshufb %zmm22, %zmm18, %zmm18
+ vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19
+ vpshufb %zmm22, %zmm19, %zmm19
+ vmovdqu (%r12), %xmm8
+ vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8
+ vmovdqu %xmm8, (%r12)
+ vpxorq %zmm9, %zmm16, %zmm16
+ vpxorq %zmm9, %zmm17, %zmm17
+ vpxorq %zmm9, %zmm18, %zmm18
+ vpxorq %zmm9, %zmm19, %zmm19
+ vaesenc %zmm10, %zmm16, %zmm16
+ vaesenc %zmm10, %zmm17, %zmm17
+ vaesenc %zmm10, %zmm18, %zmm18
+ vaesenc %zmm10, %zmm19, %zmm19
+ vaesenc %zmm11, %zmm16, %zmm16
+ vaesenc %zmm11, %zmm17, %zmm17
+ vaesenc %zmm11, %zmm18, %zmm18
+ vaesenc %zmm11, %zmm19, %zmm19
+ vaesenc %zmm12, %zmm16, %zmm16
+ vaesenc %zmm12, %zmm17, %zmm17
+ vaesenc %zmm12, %zmm18, %zmm18
+ vaesenc %zmm12, %zmm19, %zmm19
+ vaesenc %zmm13, %zmm16, %zmm16
+ vaesenc %zmm13, %zmm17, %zmm17
+ vaesenc %zmm13, %zmm18, %zmm18
+ vaesenc %zmm13, %zmm19, %zmm19
+ vaesenc %zmm14, %zmm16, %zmm16
+ vaesenc %zmm14, %zmm17, %zmm17
+ vaesenc %zmm14, %zmm18, %zmm18
+ vaesenc %zmm14, %zmm19, %zmm19
+ vaesenc %zmm15, %zmm16, %zmm16
+ vaesenc %zmm15, %zmm17, %zmm17
+ vaesenc %zmm15, %zmm18, %zmm18
+ vaesenc %zmm15, %zmm19, %zmm19
+ vaesenc %zmm1, %zmm16, %zmm16
+ vaesenc %zmm1, %zmm17, %zmm17
+ vaesenc %zmm1, %zmm18, %zmm18
+ vaesenc %zmm1, %zmm19, %zmm19
+ vaesenc %zmm2, %zmm16, %zmm16
+ vaesenc %zmm2, %zmm17, %zmm17
+ vaesenc %zmm2, %zmm18, %zmm18
+ vaesenc %zmm2, %zmm19, %zmm19
+ vaesenc %zmm3, %zmm16, %zmm16
+ vaesenc %zmm3, %zmm17, %zmm17
+ vaesenc %zmm3, %zmm18, %zmm18
+ vaesenc %zmm3, %zmm19, %zmm19
+ cmpl $11, %esi
+ vbroadcasti32x4 160(%rdi), %zmm20
+ jl L_AES_GCM_decrypt_update_avx512_l1_avx512_ctr16_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 176(%rdi), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ cmpl $13, %esi
+ vbroadcasti32x4 192(%rdi), %zmm20
+ jl L_AES_GCM_decrypt_update_avx512_l1_avx512_ctr16_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 208(%rdi), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 224(%rdi), %zmm20
+L_AES_GCM_decrypt_update_avx512_l1_avx512_ctr16_last:
+ vaesenclast %zmm20, %zmm16, %zmm16
+ vaesenclast %zmm20, %zmm17, %zmm17
+ vaesenclast %zmm20, %zmm18, %zmm18
+ vaesenclast %zmm20, %zmm19, %zmm19
+ leaq (%r11,%r15,1), %rcx
+ leaq (%r10,%r15,1), %rdx
+ vmovdqu64 (%rcx), %zmm21
+ vpxorq %zmm21, %zmm16, %zmm16
+ vmovdqu64 %zmm16, (%rdx)
+ vmovdqu64 64(%rcx), %zmm21
+ vpxorq %zmm21, %zmm17, %zmm17
+ vmovdqu64 %zmm17, 64(%rdx)
+ vmovdqu64 128(%rcx), %zmm21
+ vpxorq %zmm21, %zmm18, %zmm18
+ vmovdqu64 %zmm18, 128(%rdx)
+ vmovdqu64 192(%rcx), %zmm21
+ vpxorq %zmm21, %zmm19, %zmm19
+ vmovdqu64 %zmm19, 192(%rdx)
+ addl $0x100, %r15d
+ vbroadcasti32x4 (%r12), %zmm20
+ vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16
+ vpshufb %zmm22, %zmm16, %zmm16
+ vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17
+ vpshufb %zmm22, %zmm17, %zmm17
+ vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18
+ vpshufb %zmm22, %zmm18, %zmm18
+ vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19
+ vpshufb %zmm22, %zmm19, %zmm19
+ vmovdqu (%r12), %xmm8
+ vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8
+ vmovdqu %xmm8, (%r12)
+ vpxorq %zmm9, %zmm16, %zmm16
+ vpxorq %zmm9, %zmm17, %zmm17
+ vpxorq %zmm9, %zmm18, %zmm18
+ vpxorq %zmm9, %zmm19, %zmm19
+ vaesenc %zmm10, %zmm16, %zmm16
+ vaesenc %zmm10, %zmm17, %zmm17
+ vaesenc %zmm10, %zmm18, %zmm18
+ vaesenc %zmm10, %zmm19, %zmm19
+ vaesenc %zmm11, %zmm16, %zmm16
+ vaesenc %zmm11, %zmm17, %zmm17
+ vaesenc %zmm11, %zmm18, %zmm18
+ vaesenc %zmm11, %zmm19, %zmm19
+ vaesenc %zmm12, %zmm16, %zmm16
+ vaesenc %zmm12, %zmm17, %zmm17
+ vaesenc %zmm12, %zmm18, %zmm18
+ vaesenc %zmm12, %zmm19, %zmm19
+ vaesenc %zmm13, %zmm16, %zmm16
+ vaesenc %zmm13, %zmm17, %zmm17
+ vaesenc %zmm13, %zmm18, %zmm18
+ vaesenc %zmm13, %zmm19, %zmm19
+ vaesenc %zmm14, %zmm16, %zmm16
+ vaesenc %zmm14, %zmm17, %zmm17
+ vaesenc %zmm14, %zmm18, %zmm18
+ vaesenc %zmm14, %zmm19, %zmm19
+ vaesenc %zmm15, %zmm16, %zmm16
+ vaesenc %zmm15, %zmm17, %zmm17
+ vaesenc %zmm15, %zmm18, %zmm18
+ vaesenc %zmm15, %zmm19, %zmm19
+ vaesenc %zmm1, %zmm16, %zmm16
+ vaesenc %zmm1, %zmm17, %zmm17
+ vaesenc %zmm1, %zmm18, %zmm18
+ vaesenc %zmm1, %zmm19, %zmm19
+ vaesenc %zmm2, %zmm16, %zmm16
+ vaesenc %zmm2, %zmm17, %zmm17
+ vaesenc %zmm2, %zmm18, %zmm18
+ vaesenc %zmm2, %zmm19, %zmm19
+ vaesenc %zmm3, %zmm16, %zmm16
+ vaesenc %zmm3, %zmm17, %zmm17
+ vaesenc %zmm3, %zmm18, %zmm18
+ vaesenc %zmm3, %zmm19, %zmm19
+ cmpl $11, %esi
+ vbroadcasti32x4 160(%rdi), %zmm20
+ jl L_AES_GCM_decrypt_update_avx512_l2_avx512_ctr16_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 176(%rdi), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ cmpl $13, %esi
+ vbroadcasti32x4 192(%rdi), %zmm20
+ jl L_AES_GCM_decrypt_update_avx512_l2_avx512_ctr16_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 208(%rdi), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 224(%rdi), %zmm20
+L_AES_GCM_decrypt_update_avx512_l2_avx512_ctr16_last:
+ vaesenclast %zmm20, %zmm16, %zmm16
+ vaesenclast %zmm20, %zmm17, %zmm17
+ vaesenclast %zmm20, %zmm18, %zmm18
+ vaesenclast %zmm20, %zmm19, %zmm19
+ leaq (%r11,%r15,1), %rcx
+ leaq (%r10,%r15,1), %rdx
+ vmovdqu64 (%rcx), %zmm21
+ vpxorq %zmm21, %zmm16, %zmm16
+ vmovdqu64 %zmm16, (%rdx)
+ vmovdqu64 64(%rcx), %zmm21
+ vpxorq %zmm21, %zmm17, %zmm17
+ vmovdqu64 %zmm17, 64(%rdx)
+ vmovdqu64 128(%rcx), %zmm21
+ vpxorq %zmm21, %zmm18, %zmm18
+ vmovdqu64 %zmm18, 128(%rdx)
+ vmovdqu64 192(%rcx), %zmm21
+ vpxorq %zmm21, %zmm19, %zmm19
+ vmovdqu64 %zmm19, 192(%rdx)
+ addl $0x100, %r15d
+L_AES_GCM_decrypt_update_avx512_no_windows:
+ vmovdqu64 192(%rsp), %zmm23
+ vshufi64x2 $27, %zmm23, %zmm23, %zmm23
+ vmovdqu64 128(%rsp), %zmm24
+ vshufi64x2 $27, %zmm24, %zmm24, %zmm24
+ vmovdqu64 64(%rsp), %zmm25
+ vshufi64x2 $27, %zmm25, %zmm25, %zmm25
+ vmovdqu64 (%rsp), %zmm26
+ vshufi64x2 $27, %zmm26, %zmm26, %zmm26
+ movl %r8d, %r13d
+ andl $0xffffff00, %r13d
+ cmpl %r13d, %r14d
+ jge L_AES_GCM_decrypt_update_avx512_after_256
+ # 256 bytes of input
+ leaq (%r11,%r14,1), %rbx
+ vpxorq %zmm20, %zmm20, %zmm20
+ vinserti32x4 $0x00, %xmm6, %zmm20, %zmm20
+ vmovdqu64 (%rbx), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpxorq %zmm20, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm23, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19
+ vmovdqa64 %zmm16, %zmm27
+ vpxorq %zmm17, %zmm18, %zmm28
+ vmovdqa64 %zmm19, %zmm29
+ vmovdqu64 64(%rbx), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm24, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vmovdqu64 128(%rbx), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm25, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vmovdqu64 192(%rbx), %zmm21
+ vpshufb %zmm30, %zmm21, %zmm21
+ vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16
+ vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17
+ vpclmulqdq $16, %zmm26, %zmm21, %zmm18
+ vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19
+ vpxorq %zmm16, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm17, %zmm18, %zmm28
+ vpxorq %zmm19, %zmm29, %zmm29
+ vpclmulqdq $0x01, %zmm27, %zmm31, %zmm21
+ vpshufd $0x4e, %zmm27, %zmm27
+ vpternlogq $0x96, %zmm21, %zmm27, %zmm28
+ vpclmulqdq $0x01, %zmm28, %zmm31, %zmm21
+ vpshufd $0x4e, %zmm28, %zmm28
+ vpternlogq $0x96, %zmm21, %zmm28, %zmm29
+ vextracti32x4 $0x01, %zmm29, %xmm0
+ vextracti32x4 $2, %zmm29, %xmm4
+ vextracti32x4 $3, %zmm29, %xmm5
+ vpxorq %xmm0, %xmm29, %xmm6
+ vpternlogq $0x96, %xmm4, %xmm5, %xmm6
+ vbroadcasti32x4 (%r12), %zmm20
+ vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16
+ vpshufb %zmm22, %zmm16, %zmm16
+ vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17
+ vpshufb %zmm22, %zmm17, %zmm17
+ vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18
+ vpshufb %zmm22, %zmm18, %zmm18
+ vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19
+ vpshufb %zmm22, %zmm19, %zmm19
+ vmovdqu (%r12), %xmm8
+ vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8
+ vmovdqu %xmm8, (%r12)
+ vpxorq %zmm9, %zmm16, %zmm16
+ vpxorq %zmm9, %zmm17, %zmm17
+ vpxorq %zmm9, %zmm18, %zmm18
+ vpxorq %zmm9, %zmm19, %zmm19
+ vaesenc %zmm10, %zmm16, %zmm16
+ vaesenc %zmm10, %zmm17, %zmm17
+ vaesenc %zmm10, %zmm18, %zmm18
+ vaesenc %zmm10, %zmm19, %zmm19
+ vaesenc %zmm11, %zmm16, %zmm16
+ vaesenc %zmm11, %zmm17, %zmm17
+ vaesenc %zmm11, %zmm18, %zmm18
+ vaesenc %zmm11, %zmm19, %zmm19
+ vaesenc %zmm12, %zmm16, %zmm16
+ vaesenc %zmm12, %zmm17, %zmm17
+ vaesenc %zmm12, %zmm18, %zmm18
+ vaesenc %zmm12, %zmm19, %zmm19
+ vaesenc %zmm13, %zmm16, %zmm16
+ vaesenc %zmm13, %zmm17, %zmm17
+ vaesenc %zmm13, %zmm18, %zmm18
+ vaesenc %zmm13, %zmm19, %zmm19
+ vaesenc %zmm14, %zmm16, %zmm16
+ vaesenc %zmm14, %zmm17, %zmm17
+ vaesenc %zmm14, %zmm18, %zmm18
+ vaesenc %zmm14, %zmm19, %zmm19
+ vaesenc %zmm15, %zmm16, %zmm16
+ vaesenc %zmm15, %zmm17, %zmm17
+ vaesenc %zmm15, %zmm18, %zmm18
+ vaesenc %zmm15, %zmm19, %zmm19
+ vaesenc %zmm1, %zmm16, %zmm16
+ vaesenc %zmm1, %zmm17, %zmm17
+ vaesenc %zmm1, %zmm18, %zmm18
+ vaesenc %zmm1, %zmm19, %zmm19
+ vaesenc %zmm2, %zmm16, %zmm16
+ vaesenc %zmm2, %zmm17, %zmm17
+ vaesenc %zmm2, %zmm18, %zmm18
+ vaesenc %zmm2, %zmm19, %zmm19
+ vaesenc %zmm3, %zmm16, %zmm16
+ vaesenc %zmm3, %zmm17, %zmm17
+ vaesenc %zmm3, %zmm18, %zmm18
+ vaesenc %zmm3, %zmm19, %zmm19
+ cmpl $11, %esi
+ vbroadcasti32x4 160(%rdi), %zmm20
+ jl L_AES_GCM_decrypt_update_avx512_t_avx512_ctr16_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 176(%rdi), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ cmpl $13, %esi
+ vbroadcasti32x4 192(%rdi), %zmm20
+ jl L_AES_GCM_decrypt_update_avx512_t_avx512_ctr16_last
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 208(%rdi), %zmm20
+ vaesenc %zmm20, %zmm16, %zmm16
+ vaesenc %zmm20, %zmm17, %zmm17
+ vaesenc %zmm20, %zmm18, %zmm18
+ vaesenc %zmm20, %zmm19, %zmm19
+ vbroadcasti32x4 224(%rdi), %zmm20
+L_AES_GCM_decrypt_update_avx512_t_avx512_ctr16_last:
+ vaesenclast %zmm20, %zmm16, %zmm16
+ vaesenclast %zmm20, %zmm17, %zmm17
+ vaesenclast %zmm20, %zmm18, %zmm18
+ vaesenclast %zmm20, %zmm19, %zmm19
+ leaq (%r11,%r14,1), %rcx
+ leaq (%r10,%r14,1), %rdx
+ vmovdqu64 (%rcx), %zmm21
+ vpxorq %zmm21, %zmm16, %zmm16
+ vmovdqu64 %zmm16, (%rdx)
+ vmovdqu64 64(%rcx), %zmm21
+ vpxorq %zmm21, %zmm17, %zmm17
+ vmovdqu64 %zmm17, 64(%rdx)
+ vmovdqu64 128(%rcx), %zmm21
+ vpxorq %zmm21, %zmm18, %zmm18
+ vmovdqu64 %zmm18, 128(%rdx)
+ vmovdqu64 192(%rcx), %zmm21
+ vpxorq %zmm21, %zmm19, %zmm19
+ vmovdqu64 %zmm19, 192(%rdx)
+ addl $0x100, %r14d
+L_AES_GCM_decrypt_update_avx512_after_256:
+ vmovdqu (%rsp), %xmm5
+L_AES_GCM_decrypt_update_avx512_done_128:
+ movl %r8d, %edx
+ cmpl %edx, %r14d
+ jge L_AES_GCM_decrypt_update_avx512_done_dec
+ movl %r8d, %r13d
+ andl $0xfffffff0, %r13d
+ cmpl %r13d, %r14d
+ jge L_AES_GCM_decrypt_update_avx512_last_block_done
+L_AES_GCM_decrypt_update_avx512_last_block_start:
+ vmovdqu (%r11,%r14,1), %xmm13
+ vmovdqa %xmm5, %xmm0
+ vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm13, %xmm1
+ vpxor %xmm6, %xmm1, %xmm1
+ vmovdqu (%r12), %xmm9
+ vpshufb L_avx512_aes_gcm_bswap_epi64(%rip), %xmm9, %xmm8
+ vpaddd L_avx512_aes_gcm_one(%rip), %xmm9, %xmm9
+ vmovdqu %xmm9, (%r12)
+ vpxor (%rdi), %xmm8, %xmm8
+ vpclmulqdq $16, %xmm0, %xmm1, %xmm10
+ vaesenc 16(%rdi), %xmm8, %xmm8
+ vaesenc 32(%rdi), %xmm8, %xmm8
+ vpclmulqdq $0x01, %xmm0, %xmm1, %xmm11
+ vaesenc 48(%rdi), %xmm8, %xmm8
+ vaesenc 64(%rdi), %xmm8, %xmm8
+ vpclmulqdq $0x00, %xmm0, %xmm1, %xmm12
+ vaesenc 80(%rdi), %xmm8, %xmm8
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm1
+ vaesenc 96(%rdi), %xmm8, %xmm8
+ vpxor %xmm11, %xmm10, %xmm10
+ vpslldq $8, %xmm10, %xmm2
+ vpsrldq $8, %xmm10, %xmm10
+ vaesenc 112(%rdi), %xmm8, %xmm8
+ vpxor %xmm12, %xmm2, %xmm2
+ vpxor %xmm10, %xmm1, %xmm3
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm0
+ vpclmulqdq $16, %xmm0, %xmm2, %xmm11
+ vaesenc 128(%rdi), %xmm8, %xmm8
+ vpshufd $0x4e, %xmm2, %xmm10
+ vpxor %xmm11, %xmm10, %xmm10
+ vpclmulqdq $16, %xmm0, %xmm10, %xmm11
+ vaesenc 144(%rdi), %xmm8, %xmm8
+ vpshufd $0x4e, %xmm10, %xmm10
+ vpxor %xmm11, %xmm10, %xmm10
+ vpxor %xmm3, %xmm10, %xmm6
+ cmpl $11, %esi
+ vmovdqa 160(%rdi), %xmm9
+ jl L_AES_GCM_decrypt_update_avx512_aesenc_gfmul_last
+ vaesenc %xmm9, %xmm8, %xmm8
+ vaesenc 176(%rdi), %xmm8, %xmm8
+ cmpl $13, %esi
+ vmovdqa 192(%rdi), %xmm9
+ jl L_AES_GCM_decrypt_update_avx512_aesenc_gfmul_last
+ vaesenc %xmm9, %xmm8, %xmm8
+ vaesenc 208(%rdi), %xmm8, %xmm8
+ vmovdqa 224(%rdi), %xmm9
+L_AES_GCM_decrypt_update_avx512_aesenc_gfmul_last:
+ vaesenclast %xmm9, %xmm8, %xmm8
+ vmovdqa %xmm13, %xmm0
+ vpxor %xmm0, %xmm8, %xmm8
+ vmovdqu %xmm8, (%r10,%r14,1)
+ addl $16, %r14d
+ cmpl %r13d, %r14d
+ jl L_AES_GCM_decrypt_update_avx512_last_block_start
+L_AES_GCM_decrypt_update_avx512_last_block_done:
+L_AES_GCM_decrypt_update_avx512_done_dec:
+ vmovdqa %xmm6, (%r9)
+ vzeroupper
+ addq $0x410, %rsp
+ popq %rbx
+ popq %r15
+ popq %r14
+ popq %r12
+ popq %r13
+ repz retq
+#ifndef __APPLE__
+.size AES_GCM_decrypt_update_avx512,.-AES_GCM_decrypt_update_avx512
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl AES_GCM_decrypt_final_avx512
+.type AES_GCM_decrypt_final_avx512,@function
+.align 16
+AES_GCM_decrypt_final_avx512:
+#else
+.section __TEXT,__text
+.globl _AES_GCM_decrypt_final_avx512
+.p2align 4
+_AES_GCM_decrypt_final_avx512:
+#endif /* __APPLE__ */
+ pushq %r13 # Final decrypt step: fold the two lengths into the hash at (%rdi), multiply by the value at (%r9), form the tag, compare it with the bytes at (%rsi) and store 1 or 0 through the second stack argument
+ pushq %rbp
+ pushq %r12 # r13, rbp and r12 are used as scratch below and popped again before returning
+ movl %edx, %eax # eax = number of tag bytes to compare
+ movl %ecx, %r10d # r10d = length that ends up in the low qword of the length block
+ movl %r8d, %r11d # r11d = length that ends up in the high qword of the length block
+ movq 32(%rsp), %r8 # first stack argument (three pushes plus the return address = 32 bytes)
+ movq 40(%rsp), %rbp # second stack argument: address the 32-bit result is written to
+ subq $16, %rsp # reserve 16 bytes of stack; released by the addq just before the pops
+ vmovdqa (%rdi), %xmm6 # xmm6 = hash state (presumably the running GHASH value); aligned load, so (%rdi) must be 16-byte aligned
+ vmovdqa (%r9), %xmm5 # xmm5 = multiplier for the final hash multiply (presumably the hash key H); aligned load
+ vmovdqa (%r8), %xmm15 # xmm15 = block XORed into the hash to form the tag (presumably the encrypted initial counter - TODO confirm); aligned load
+ vpsrlq $63, %xmm5, %xmm8 # top bit of each qword, moved down to bit 0
+ vpsllq $0x01, %xmm5, %xmm7 # each qword shifted left by one bit
+ vpslldq $8, %xmm8, %xmm8 # carry out of the low qword moved into the high qword
+ vpor %xmm8, %xmm7, %xmm7 # xmm7 = xmm5 << 1 as one 128-bit value
+ vpshufd $0xff, %xmm5, %xmm5 # broadcast the top dword of the original value
+ vpsrad $31, %xmm5, %xmm5 # all-ones if bit 127 was set, otherwise zero
+ vpand L_avx512_aes_gcm_mod2_128(%rip), %xmm5, %xmm5 # keep the mod2_128 constant only when that bit was set
+ vpxor %xmm7, %xmm5, %xmm5 # xmm5 = shifted value with the conditional constant XORed in
+ movl %r10d, %edx
+ movl %r11d, %ecx
+ shlq $3, %rdx # both lengths are multiplied by 8 (presumably bytes to bits)
+ shlq $3, %rcx
+ vmovq %rdx, %xmm0
+ vmovq %rcx, %xmm1
+ vpunpcklqdq %xmm1, %xmm0, %xmm0 # xmm0 = length block: low qword from rdx, high qword from rcx
+ vpxor %xmm0, %xmm6, %xmm6 # fold the length block into the hash state
+ # ghash_gfmul_red_avx: carry-less multiply of xmm6 by xmm5 (three-multiply Karatsuba form), then reduce with mod2_128
+ vpshufd $0x4e, %xmm5, %xmm8 # swap the two qwords of xmm5
+ vpxor %xmm5, %xmm8, %xmm8 # xmm8 = high ^ low halves of xmm5
+ vpshufd $0x4e, %xmm6, %xmm9 # swap the two qwords of xmm6
+ vpxor %xmm6, %xmm9, %xmm9 # xmm9 = high ^ low halves of xmm6
+ vpclmulqdq $0x00, %xmm5, %xmm6, %xmm7 # xmm7 = low * low
+ vpclmulqdq $0x11, %xmm5, %xmm6, %xmm10 # xmm10 = high * high
+ vpclmulqdq $0x00, %xmm9, %xmm8, %xmm8 # xmm8 = (high ^ low) * (high ^ low)
+ vpternlogq $0x96, %xmm7, %xmm10, %xmm8 # imm 0x96 is a three-way XOR: xmm8 ^= xmm7 ^ xmm10 (middle term)
+ vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm9 # xmm9 = reduction constant
+ vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 # first reduction fold of the low product
+ vpshufd $0x4e, %xmm7, %xmm7 # swap qwords of the low product
+ vpternlogq $0x96, %xmm11, %xmm7, %xmm8 # xmm8 ^= xmm11 ^ xmm7
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 # second reduction fold
+ vpshufd $0x4e, %xmm8, %xmm8 # swap qwords again
+ vpternlogq $0x96, %xmm11, %xmm8, %xmm10 # xmm10 = reduced 128-bit product
+ vmovdqa %xmm10, %xmm6
+ vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm6, %xmm6 # shuffle the hash bytes with the bswap mask
+ vpxor %xmm15, %xmm6, %xmm0 # xmm0 = computed tag
+ cmpl $16, %eax # a 16-byte tag takes the single vector compare below
+ je L_AES_GCM_decrypt_final_avx512_cmp_tag_16
+ subq $16, %rsp # scratch buffer so the computed tag can be read byte by byte
+ xorq %rcx, %rcx # rcx = byte index
+ xorq %r12, %r12 # r12b accumulates any difference
+ vmovdqu %xmm0, (%rsp)
+L_AES_GCM_decrypt_final_avx512_cmp_tag_loop:
+ movzbl (%rsp,%rcx,1), %r13d # computed tag byte
+ xorb (%rsi,%rcx,1), %r13b # XOR with the supplied tag byte: non-zero on mismatch
+ orb %r13b, %r12b # accumulate; there is no early exit, every one of the eax bytes is examined
+ incl %ecx
+ cmpl %eax, %ecx
+ jne L_AES_GCM_decrypt_final_avx512_cmp_tag_loop # NOTE(review): exits only when ecx == eax, so eax == 0 would loop until ecx wraps; presumably callers never pass 0 - confirm
+ cmpb $0x00, %r12b
+ sete %r12b # r12d = 1 when every compared byte matched, otherwise 0
+ addq $16, %rsp # drop the scratch buffer
+ xorq %rcx, %rcx
+ jmp L_AES_GCM_decrypt_final_avx512_cmp_tag_done
+L_AES_GCM_decrypt_final_avx512_cmp_tag_16:
+ vmovdqu (%rsi), %xmm1 # supplied tag (unaligned load)
+ vpcmpeqb %xmm1, %xmm0, %xmm0 # 0xff in every byte position that matches
+ vpmovmskb %xmm0, %rdx # one mask bit per byte
+ # edx == 0xFFFF (all 16 bytes equal) => result 1, otherwise result 0
+ xorl %r12d, %r12d
+ cmpl $0xffff, %edx
+ sete %r12b
+L_AES_GCM_decrypt_final_avx512_cmp_tag_done:
+ movl %r12d, (%rbp) # store the 0/1 result through the second stack argument
+ vzeroupper
+ addq $16, %rsp # release the 16 bytes reserved after the argument loads
+ popq %r12
+ popq %rbp
+ popq %r13
+ repz retq
+#ifndef __APPLE__
+.size AES_GCM_decrypt_final_avx512,.-AES_GCM_decrypt_final_avx512
+#endif /* __APPLE__ */
+#endif /* WOLFSSL_AESGCM_STREAM */
+#endif /* HAVE_INTEL_AVX512 */
#endif /* WOLFSSL_X86_64_BUILD */
#if defined(__linux__) && defined(__ELF__)
diff --git a/wolfcrypt/src/aes_gcm_asm.asm b/wolfcrypt/src/aes_gcm_asm.asm
index d222bc14478..34f68476310 100644
--- a/wolfcrypt/src/aes_gcm_asm.asm
+++ b/wolfcrypt/src/aes_gcm_asm.asm
@@ -171,10 +171,10 @@ GCM_generate_m0_aesni PROC
por xmm1, xmm5
por xmm2, xmm6
por xmm3, xmm7
- vpshufb xmm0, xmm0, xmm9
- vpshufb xmm1, xmm1, xmm9
- vpshufb xmm2, xmm2, xmm9
- vpshufb xmm3, xmm3, xmm9
+ pshufb xmm0, xmm9
+ pshufb xmm1, xmm9
+ pshufb xmm2, xmm9
+ pshufb xmm3, xmm9
movdqu OWORD PTR [rdx+256], xmm0
movdqu OWORD PTR [rdx+272], xmm1
movdqu OWORD PTR [rdx+288], xmm2
@@ -207,10 +207,10 @@ GCM_generate_m0_aesni PROC
por xmm1, xmm5
por xmm2, xmm6
por xmm3, xmm7
- vpshufb xmm0, xmm0, xmm9
- vpshufb xmm1, xmm1, xmm9
- vpshufb xmm2, xmm2, xmm9
- vpshufb xmm3, xmm3, xmm9
+ pshufb xmm0, xmm9
+ pshufb xmm1, xmm9
+ pshufb xmm2, xmm9
+ pshufb xmm3, xmm9
movdqu OWORD PTR [rdx+320], xmm0
movdqu OWORD PTR [rdx+336], xmm1
movdqu OWORD PTR [rdx+352], xmm2
@@ -243,10 +243,10 @@ GCM_generate_m0_aesni PROC
por xmm1, xmm5
por xmm2, xmm6
por xmm3, xmm7
- vpshufb xmm0, xmm0, xmm9
- vpshufb xmm1, xmm1, xmm9
- vpshufb xmm2, xmm2, xmm9
- vpshufb xmm3, xmm3, xmm9
+ pshufb xmm0, xmm9
+ pshufb xmm1, xmm9
+ pshufb xmm2, xmm9
+ pshufb xmm3, xmm9
movdqu OWORD PTR [rdx+384], xmm0
movdqu OWORD PTR [rdx+400], xmm1
movdqu OWORD PTR [rdx+416], xmm2
@@ -279,10 +279,10 @@ GCM_generate_m0_aesni PROC
por xmm1, xmm5
por xmm2, xmm6
por xmm3, xmm7
- vpshufb xmm0, xmm0, xmm9
- vpshufb xmm1, xmm1, xmm9
- vpshufb xmm2, xmm2, xmm9
- vpshufb xmm3, xmm3, xmm9
+ pshufb xmm0, xmm9
+ pshufb xmm1, xmm9
+ pshufb xmm2, xmm9
+ pshufb xmm3, xmm9
movdqu OWORD PTR [rdx+448], xmm0
movdqu OWORD PTR [rdx+464], xmm1
movdqu OWORD PTR [rdx+480], xmm2
@@ -16518,4 +16518,14153 @@ L_AES_GCM_decrypt_final_avx2_cmp_tag_done:
AES_GCM_decrypt_final_avx2 ENDP
_TEXT ENDS
ENDIF
+IFDEF HAVE_INTEL_VAES
+_DATA SEGMENT ; 32-byte constant, two 128-bit lanes with 0 and 1 in the high qword of each lane (presumably counter offsets +0/+1 - use site not visible here)
+ALIGN 16
+L_vaes_aes_gcm_inc_y0 QWORD \
+ 0000000000000000h, 0000000000000000h,
+ 0000000000000000h, 0000000000000001h
+ptr_L_vaes_aes_gcm_inc_y0 QWORD L_vaes_aes_gcm_inc_y0 ; qword holding the address of the constant above
+_DATA ENDS
+_DATA SEGMENT ; same layout with 2 and 3 in the high qwords
+ALIGN 16
+L_vaes_aes_gcm_inc_y1 QWORD \
+ 0000000000000000h, 0000000000000002h,
+ 0000000000000000h, 0000000000000003h
+ptr_L_vaes_aes_gcm_inc_y1 QWORD L_vaes_aes_gcm_inc_y1 ; address of the constant above
+_DATA ENDS
+_DATA SEGMENT ; same layout with 4 and 5 in the high qwords
+ALIGN 16
+L_vaes_aes_gcm_inc_y2 QWORD \
+ 0000000000000000h, 0000000000000004h,
+ 0000000000000000h, 0000000000000005h
+ptr_L_vaes_aes_gcm_inc_y2 QWORD L_vaes_aes_gcm_inc_y2 ; address of the constant above
+_DATA ENDS
+_DATA SEGMENT ; same layout with 6 and 7 in the high qwords
+ALIGN 16
+L_vaes_aes_gcm_inc_y3 QWORD \
+ 0000000000000000h, 0000000000000006h,
+ 0000000000000000h, 0000000000000007h
+ptr_L_vaes_aes_gcm_inc_y3 QWORD L_vaes_aes_gcm_inc_y3 ; address of the constant above
+_DATA ENDS
+_DATA SEGMENT ; vpshufb mask used by GCM_generate_m0_vaes: byte indices 15..0, i.e. reverses all 16 bytes
+ALIGN 16
+L_GCM_generate_m0_vaes_rev8 QWORD \
+ 08090a0b0c0d0e0fh, 0001020304050607h
+ptr_L_GCM_generate_m0_vaes_rev8 QWORD L_GCM_generate_m0_vaes_rev8 ; address of the mask above
+_DATA ENDS
+_DATA SEGMENT ; constant XORed in by GCM_generate_m0_vaes after a 1-bit right shift drops a set bit: 0xe1 in the top byte, all other bytes zero
+ALIGN 16
+L_GCM_generate_m0_vaes_mod2_128 QWORD \
+ 0000000000000000h, 0e100000000000000h
+ptr_L_GCM_generate_m0_vaes_mod2_128 QWORD L_GCM_generate_m0_vaes_mod2_128 ; address of the constant above
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+GCM_generate_m0_vaes PROC ; Build a 512-byte table (32 entries of 16 bytes) at [rdx] from the 16-byte value at [rcx]; no return value is set
+ sub rsp, 80 ; room to save five XMM registers (5 x 16 bytes)
+ vmovdqu OWORD PTR [rsp], xmm6 ; xmm6-xmm10 are modified below, so they are saved here and restored before ret
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu xmm9, OWORD PTR L_GCM_generate_m0_vaes_rev8 ; xmm9 = shuffle mask that reverses all 16 bytes
+ vmovdqu xmm10, OWORD PTR L_GCM_generate_m0_vaes_mod2_128 ; xmm10 = constant with 0xe1 in the top byte
+ vpxor xmm8, xmm8, xmm8
+ vmovdqu xmm0, OWORD PTR [rcx] ; xmm0 = 16-byte input, called H in the comments below
+ vmovdqu OWORD PTR [rdx], xmm8 ; entry 0 = all zero
+ vmovdqu xmm8, xmm0 ; NOTE(review): xmm8 is overwritten by the vpxor after the shift chain before it is read; this copy looks unused
+ vpshufb xmm0, xmm0, xmm9 ; byte-reverse H before shifting
+ vpsllq xmm5, xmm0, 63 ; bit 0 of each qword moved up to bit 63
+ vpsrlq xmm4, xmm0, 1 ; each qword shifted right by one bit
+ vpslldq xmm1, xmm5, 8 ; bit 0 of the whole 128-bit value now sits at bit 127
+ vpsrldq xmm5, xmm5, 8 ; bit 64 of the value now sits at bit 63 (carry into the low qword)
+ vpshufd xmm1, xmm1, 255 ; broadcast the top dword
+ vpor xmm4, xmm4, xmm5 ; xmm4 = value >> 1 across all 128 bits
+ vpsrad xmm1, xmm1, 31 ; all-ones if the bit shifted out was set, otherwise zero
+ vpand xmm1, xmm1, xmm10 ; the 0xe1 constant or zero
+ vpxor xmm1, xmm1, xmm4 ; xmm1 = H1: H shifted right once with the conditional XOR applied
+ vpsllq xmm5, xmm1, 63 ; same shift-and-conditional-XOR step applied to H1
+ vpsrlq xmm4, xmm1, 1
+ vpslldq xmm2, xmm5, 8
+ vpsrldq xmm5, xmm5, 8
+ vpshufd xmm2, xmm2, 255
+ vpor xmm4, xmm4, xmm5
+ vpsrad xmm2, xmm2, 31
+ vpand xmm2, xmm2, xmm10
+ vpxor xmm2, xmm2, xmm4 ; xmm2 = H2
+ vpsllq xmm5, xmm2, 63 ; same step applied to H2
+ vpsrlq xmm4, xmm2, 1
+ vpslldq xmm3, xmm5, 8
+ vpsrldq xmm5, xmm5, 8
+ vpshufd xmm3, xmm3, 255
+ vpor xmm4, xmm4, xmm5
+ vpsrad xmm3, xmm3, 31
+ vpand xmm3, xmm3, xmm10
+ vpxor xmm3, xmm3, xmm4 ; xmm3 = H3
+ vpshufb xmm3, xmm3, xmm9 ; put H3, H2, H1 and H back into the original byte order
+ vpshufb xmm2, xmm2, xmm9
+ vpshufb xmm1, xmm1, xmm9
+ vpshufb xmm0, xmm0, xmm9
+ vpxor xmm8, xmm3, xmm2 ; xmm8 = H3 ^ H2
+ vmovdqu OWORD PTR [rdx+16], xmm3 ; entry 1 = H3
+ vmovdqu OWORD PTR [rdx+32], xmm2 ; entry 2 = H2
+ vmovdqu OWORD PTR [rdx+48], xmm8 ; entry 3 = H3 ^ H2
+ vmovdqu OWORD PTR [rdx+64], xmm1 ; entry 4 = H1
+ vpxor xmm4, xmm3, xmm1
+ vpxor xmm5, xmm2, xmm1
+ vpxor xmm6, xmm8, xmm1
+ vmovdqu OWORD PTR [rdx+80], xmm4 ; entry 5 = H3 ^ H1
+ vmovdqu OWORD PTR [rdx+96], xmm5 ; entry 6 = H2 ^ H1
+ vmovdqu OWORD PTR [rdx+112], xmm6 ; entry 7 = H3 ^ H2 ^ H1
+ vmovdqu OWORD PTR [rdx+128], xmm0 ; entry 8 = H
+ vpxor xmm1, xmm1, xmm0 ; xmm1 = H1 ^ H from here on
+ vpxor xmm4, xmm3, xmm0
+ vpxor xmm6, xmm2, xmm0
+ vmovdqu OWORD PTR [rdx+144], xmm4 ; entry 9 = H3 ^ H
+ vmovdqu OWORD PTR [rdx+160], xmm6 ; entry 10 = H2 ^ H
+ vpxor xmm6, xmm3, xmm6
+ vmovdqu OWORD PTR [rdx+176], xmm6 ; entry 11 = H3 ^ H2 ^ H
+ vmovdqu OWORD PTR [rdx+192], xmm1 ; entry 12 = H1 ^ H
+ vpxor xmm4, xmm3, xmm1
+ vpxor xmm5, xmm2, xmm1
+ vpxor xmm6, xmm8, xmm1
+ vmovdqu OWORD PTR [rdx+208], xmm4 ; entry 13 = H3 ^ H1 ^ H
+ vmovdqu OWORD PTR [rdx+224], xmm5 ; entry 14 = H2 ^ H1 ^ H
+ vmovdqu OWORD PTR [rdx+240], xmm6 ; entry 15 = H3 ^ H2 ^ H1 ^ H
+ vmovdqu xmm0, OWORD PTR [rdx] ; second half: entries 16-31 are entries 0-15 shifted right by 4 bits, four entries per pass; reload entries 0-3
+ vmovdqu xmm1, OWORD PTR [rdx+16]
+ vmovdqu xmm2, OWORD PTR [rdx+32]
+ vmovdqu xmm3, OWORD PTR [rdx+48]
+ vpshufb xmm0, xmm0, xmm9 ; byte-reverse before shifting
+ vpshufb xmm1, xmm1, xmm9
+ vpshufb xmm2, xmm2, xmm9
+ vpshufb xmm3, xmm3, xmm9
+ vpsllq xmm4, xmm0, 60 ; low 4 bits of each qword moved to the top 4 bits
+ vpsllq xmm5, xmm1, 60
+ vpsllq xmm6, xmm2, 60
+ vpsllq xmm7, xmm3, 60
+ vpsrlq xmm0, xmm0, 4 ; each qword shifted right by 4 bits
+ vpsrlq xmm1, xmm1, 4
+ vpsrlq xmm2, xmm2, 4
+ vpsrlq xmm3, xmm3, 4
+ vpsrldq xmm4, xmm4, 8 ; carry bits from the high qword moved into the low qword; bits leaving the low end are dropped
+ vpsrldq xmm5, xmm5, 8
+ vpsrldq xmm6, xmm6, 8
+ vpsrldq xmm7, xmm7, 8
+ vpor xmm0, xmm0, xmm4 ; value >> 4 across all 128 bits
+ vpor xmm1, xmm1, xmm5
+ vpor xmm2, xmm2, xmm6
+ vpor xmm3, xmm3, xmm7
+ vpshufb xmm0, xmm0, xmm9 ; back to the original byte order
+ vpshufb xmm1, xmm1, xmm9
+ vpshufb xmm2, xmm2, xmm9
+ vpshufb xmm3, xmm3, xmm9
+ vmovdqu OWORD PTR [rdx+256], xmm0 ; entries 16-19
+ vmovdqu OWORD PTR [rdx+272], xmm1
+ vmovdqu OWORD PTR [rdx+288], xmm2
+ vmovdqu OWORD PTR [rdx+304], xmm3
+ vmovdqu xmm0, OWORD PTR [rdx+64] ; same 4-bit shift pass for entries 4-7
+ vmovdqu xmm1, OWORD PTR [rdx+80]
+ vmovdqu xmm2, OWORD PTR [rdx+96]
+ vmovdqu xmm3, OWORD PTR [rdx+112]
+ vpshufb xmm0, xmm0, xmm9
+ vpshufb xmm1, xmm1, xmm9
+ vpshufb xmm2, xmm2, xmm9
+ vpshufb xmm3, xmm3, xmm9
+ vpsllq xmm4, xmm0, 60
+ vpsllq xmm5, xmm1, 60
+ vpsllq xmm6, xmm2, 60
+ vpsllq xmm7, xmm3, 60
+ vpsrlq xmm0, xmm0, 4
+ vpsrlq xmm1, xmm1, 4
+ vpsrlq xmm2, xmm2, 4
+ vpsrlq xmm3, xmm3, 4
+ vpsrldq xmm4, xmm4, 8
+ vpsrldq xmm5, xmm5, 8
+ vpsrldq xmm6, xmm6, 8
+ vpsrldq xmm7, xmm7, 8
+ vpor xmm0, xmm0, xmm4
+ vpor xmm1, xmm1, xmm5
+ vpor xmm2, xmm2, xmm6
+ vpor xmm3, xmm3, xmm7
+ vpshufb xmm0, xmm0, xmm9
+ vpshufb xmm1, xmm1, xmm9
+ vpshufb xmm2, xmm2, xmm9
+ vpshufb xmm3, xmm3, xmm9
+ vmovdqu OWORD PTR [rdx+320], xmm0 ; entries 20-23
+ vmovdqu OWORD PTR [rdx+336], xmm1
+ vmovdqu OWORD PTR [rdx+352], xmm2
+ vmovdqu OWORD PTR [rdx+368], xmm3
+ vmovdqu xmm0, OWORD PTR [rdx+128] ; same 4-bit shift pass for entries 8-11
+ vmovdqu xmm1, OWORD PTR [rdx+144]
+ vmovdqu xmm2, OWORD PTR [rdx+160]
+ vmovdqu xmm3, OWORD PTR [rdx+176]
+ vpshufb xmm0, xmm0, xmm9
+ vpshufb xmm1, xmm1, xmm9
+ vpshufb xmm2, xmm2, xmm9
+ vpshufb xmm3, xmm3, xmm9
+ vpsllq xmm4, xmm0, 60
+ vpsllq xmm5, xmm1, 60
+ vpsllq xmm6, xmm2, 60
+ vpsllq xmm7, xmm3, 60
+ vpsrlq xmm0, xmm0, 4
+ vpsrlq xmm1, xmm1, 4
+ vpsrlq xmm2, xmm2, 4
+ vpsrlq xmm3, xmm3, 4
+ vpsrldq xmm4, xmm4, 8
+ vpsrldq xmm5, xmm5, 8
+ vpsrldq xmm6, xmm6, 8
+ vpsrldq xmm7, xmm7, 8
+ vpor xmm0, xmm0, xmm4
+ vpor xmm1, xmm1, xmm5
+ vpor xmm2, xmm2, xmm6
+ vpor xmm3, xmm3, xmm7
+ vpshufb xmm0, xmm0, xmm9
+ vpshufb xmm1, xmm1, xmm9
+ vpshufb xmm2, xmm2, xmm9
+ vpshufb xmm3, xmm3, xmm9
+ vmovdqu OWORD PTR [rdx+384], xmm0 ; entries 24-27
+ vmovdqu OWORD PTR [rdx+400], xmm1
+ vmovdqu OWORD PTR [rdx+416], xmm2
+ vmovdqu OWORD PTR [rdx+432], xmm3
+ vmovdqu xmm0, OWORD PTR [rdx+192] ; same 4-bit shift pass for entries 12-15
+ vmovdqu xmm1, OWORD PTR [rdx+208]
+ vmovdqu xmm2, OWORD PTR [rdx+224]
+ vmovdqu xmm3, OWORD PTR [rdx+240]
+ vpshufb xmm0, xmm0, xmm9
+ vpshufb xmm1, xmm1, xmm9
+ vpshufb xmm2, xmm2, xmm9
+ vpshufb xmm3, xmm3, xmm9
+ vpsllq xmm4, xmm0, 60
+ vpsllq xmm5, xmm1, 60
+ vpsllq xmm6, xmm2, 60
+ vpsllq xmm7, xmm3, 60
+ vpsrlq xmm0, xmm0, 4
+ vpsrlq xmm1, xmm1, 4
+ vpsrlq xmm2, xmm2, 4
+ vpsrlq xmm3, xmm3, 4
+ vpsrldq xmm4, xmm4, 8
+ vpsrldq xmm5, xmm5, 8
+ vpsrldq xmm6, xmm6, 8
+ vpsrldq xmm7, xmm7, 8
+ vpor xmm0, xmm0, xmm4
+ vpor xmm1, xmm1, xmm5
+ vpor xmm2, xmm2, xmm6
+ vpor xmm3, xmm3, xmm7
+ vpshufb xmm0, xmm0, xmm9
+ vpshufb xmm1, xmm1, xmm9
+ vpshufb xmm2, xmm2, xmm9
+ vpshufb xmm3, xmm3, xmm9
+ vmovdqu OWORD PTR [rdx+448], xmm0 ; entries 28-31
+ vmovdqu OWORD PTR [rdx+464], xmm1
+ vmovdqu OWORD PTR [rdx+480], xmm2
+ vmovdqu OWORD PTR [rdx+496], xmm3
+ vmovdqu xmm6, OWORD PTR [rsp] ; restore the saved xmm6-xmm10
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ add rsp, 80
+ ret
+GCM_generate_m0_vaes ENDP
+_TEXT ENDS
+_DATA SEGMENT ; 16-byte constant: low qword 0, high qword 1; AES_GCM_encrypt_vaes adds it with vpaddd to the counter block after the bswap_epi64 shuffle
+ALIGN 16
+L_vaes_aes_gcm_one QWORD \
+ 0000000000000000h, 0000000000000001h
+ptr_L_vaes_aes_gcm_one QWORD L_vaes_aes_gcm_one ; qword holding the address of the constant above
+_DATA ENDS
+_DATA SEGMENT ; the constants two..eight use the same layout with 2..8 in the high qword (presumably increments for further counter blocks - use sites not visible here)
+ALIGN 16
+L_vaes_aes_gcm_two QWORD \
+ 0000000000000000h, 0000000000000002h
+ptr_L_vaes_aes_gcm_two QWORD L_vaes_aes_gcm_two ; address of the constant above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_vaes_aes_gcm_three QWORD \
+ 0000000000000000h, 0000000000000003h
+ptr_L_vaes_aes_gcm_three QWORD L_vaes_aes_gcm_three ; address of the constant above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_vaes_aes_gcm_four QWORD \
+ 0000000000000000h, 0000000000000004h
+ptr_L_vaes_aes_gcm_four QWORD L_vaes_aes_gcm_four ; address of the constant above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_vaes_aes_gcm_five QWORD \
+ 0000000000000000h, 0000000000000005h
+ptr_L_vaes_aes_gcm_five QWORD L_vaes_aes_gcm_five ; address of the constant above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_vaes_aes_gcm_six QWORD \
+ 0000000000000000h, 0000000000000006h
+ptr_L_vaes_aes_gcm_six QWORD L_vaes_aes_gcm_six ; address of the constant above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_vaes_aes_gcm_seven QWORD \
+ 0000000000000000h, 0000000000000007h
+ptr_L_vaes_aes_gcm_seven QWORD L_vaes_aes_gcm_seven ; address of the constant above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_vaes_aes_gcm_eight QWORD \
+ 0000000000000000h, 0000000000000008h
+ptr_L_vaes_aes_gcm_eight QWORD L_vaes_aes_gcm_eight ; address of the constant above
+_DATA ENDS
+_DATA SEGMENT ; vpshufb mask that reverses the byte order inside each 64-bit half separately
+ALIGN 16
+L_vaes_aes_gcm_bswap_epi64 QWORD \
+ 0001020304050607h, 08090a0b0c0d0e0fh
+ptr_L_vaes_aes_gcm_bswap_epi64 QWORD L_vaes_aes_gcm_bswap_epi64 ; address of the mask above
+_DATA ENDS
+_DATA SEGMENT ; vpshufb mask with byte indices 15..0: reverses all 16 bytes
+ALIGN 16
+L_vaes_aes_gcm_bswap_mask QWORD \
+ 08090a0b0c0d0e0fh, 0001020304050607h
+ptr_L_vaes_aes_gcm_bswap_mask QWORD L_vaes_aes_gcm_bswap_mask ; address of the mask above
+_DATA ENDS
+_DATA SEGMENT ; low qword 1, high qword with 0xc2 in the top byte; AES_GCM_encrypt_vaes uses it as the vpclmulqdq reduction operand and as the conditional XOR when H is shifted left by one
+ALIGN 16
+L_vaes_aes_gcm_mod2_128 QWORD \
+ 0000000000000001h, 0c200000000000000h
+ptr_L_vaes_aes_gcm_mod2_128 QWORD L_vaes_aes_gcm_mod2_128 ; address of the constant above
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_encrypt_vaes PROC
+ push r13
+ push rdi
+ push rsi
+ push r12
+ push rbx
+ push r14
+ push r15
+ mov rdi, rcx
+ mov rsi, rdx
+ mov r12, r8
+ mov rax, r9
+ mov r8, QWORD PTR [rsp+96]
+ mov r9d, DWORD PTR [rsp+104]
+ mov r11d, DWORD PTR [rsp+112]
+ mov ebx, DWORD PTR [rsp+120]
+ mov r14d, DWORD PTR [rsp+128]
+ mov r15, QWORD PTR [rsp+136]
+ mov r10d, DWORD PTR [rsp+144]
+ sub rsp, 720
+ vmovdqu OWORD PTR [rsp+560], xmm6
+ vmovdqu OWORD PTR [rsp+576], xmm7
+ vmovdqu OWORD PTR [rsp+592], xmm8
+ vmovdqu OWORD PTR [rsp+608], xmm9
+ vmovdqu OWORD PTR [rsp+624], xmm10
+ vmovdqu OWORD PTR [rsp+640], xmm11
+ vmovdqu OWORD PTR [rsp+656], xmm12
+ vmovdqu OWORD PTR [rsp+672], xmm13
+ vmovdqu OWORD PTR [rsp+688], xmm14
+ vmovdqu OWORD PTR [rsp+704], xmm15
+ vpxor xmm5, xmm5, xmm5
+ vpxor xmm15, xmm15, xmm15
+ mov edx, ebx
+ cmp edx, 12
+ jne L_AES_GCM_encrypt_vaes_iv_not_12
+ ; # Calculate values when IV is 12 bytes
+ ; Set counter based on IV
+ mov ecx, 16777216
+ vmovq xmm5, QWORD PTR [rax]
+ vpinsrd xmm5, xmm5, DWORD PTR [rax+8], 2
+ vpinsrd xmm5, xmm5, ecx, 3
+ ; H = Encrypt X(=0) and T = Encrypt counter
+ vmovdqa xmm6, OWORD PTR [r15]
+ vpxor xmm1, xmm5, xmm6
+ vmovdqa xmm4, OWORD PTR [r15+16]
+ vaesenc xmm6, xmm6, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vmovdqa xmm4, OWORD PTR [r15+32]
+ vaesenc xmm6, xmm6, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vmovdqa xmm4, OWORD PTR [r15+48]
+ vaesenc xmm6, xmm6, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vmovdqa xmm4, OWORD PTR [r15+64]
+ vaesenc xmm6, xmm6, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vmovdqa xmm4, OWORD PTR [r15+80]
+ vaesenc xmm6, xmm6, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vmovdqa xmm4, OWORD PTR [r15+96]
+ vaesenc xmm6, xmm6, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vmovdqa xmm4, OWORD PTR [r15+112]
+ vaesenc xmm6, xmm6, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vmovdqa xmm4, OWORD PTR [r15+128]
+ vaesenc xmm6, xmm6, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vmovdqa xmm4, OWORD PTR [r15+144]
+ vaesenc xmm6, xmm6, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ cmp r10d, 11
+ vmovdqa xmm4, OWORD PTR [r15+160]
+ jl L_AES_GCM_encrypt_vaes_calc_iv_12_last
+ vaesenc xmm6, xmm6, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vmovdqa xmm4, OWORD PTR [r15+176]
+ vaesenc xmm6, xmm6, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ cmp r10d, 13
+ vmovdqa xmm4, OWORD PTR [r15+192]
+ jl L_AES_GCM_encrypt_vaes_calc_iv_12_last
+ vaesenc xmm6, xmm6, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vmovdqa xmm4, OWORD PTR [r15+208]
+ vaesenc xmm6, xmm6, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vmovdqa xmm4, OWORD PTR [r15+224]
+L_AES_GCM_encrypt_vaes_calc_iv_12_last:
+ vaesenclast xmm6, xmm6, xmm4
+ vaesenclast xmm1, xmm1, xmm4
+ vpshufb xmm6, xmm6, OWORD PTR L_vaes_aes_gcm_bswap_mask
+ vmovdqu OWORD PTR [rsp+528], xmm1
+ jmp L_AES_GCM_encrypt_vaes_iv_done
+L_AES_GCM_encrypt_vaes_iv_not_12:
+ ; Calculate values when IV is not 12 bytes
+ ; H = Encrypt X(=0)
+ vmovdqa xmm6, OWORD PTR [r15]
+ vaesenc xmm6, xmm6, [r15+16]
+ vaesenc xmm6, xmm6, [r15+32]
+ vaesenc xmm6, xmm6, [r15+48]
+ vaesenc xmm6, xmm6, [r15+64]
+ vaesenc xmm6, xmm6, [r15+80]
+ vaesenc xmm6, xmm6, [r15+96]
+ vaesenc xmm6, xmm6, [r15+112]
+ vaesenc xmm6, xmm6, [r15+128]
+ vaesenc xmm6, xmm6, [r15+144]
+ cmp r10d, 11
+ vmovdqa xmm8, OWORD PTR [r15+160]
+ jl L_AES_GCM_encrypt_vaes_calc_iv_1_aesenc_avx_last
+ vaesenc xmm6, xmm6, xmm8
+ vaesenc xmm6, xmm6, [r15+176]
+ cmp r10d, 13
+ vmovdqa xmm8, OWORD PTR [r15+192]
+ jl L_AES_GCM_encrypt_vaes_calc_iv_1_aesenc_avx_last
+ vaesenc xmm6, xmm6, xmm8
+ vaesenc xmm6, xmm6, [r15+208]
+ vmovdqa xmm8, OWORD PTR [r15+224]
+L_AES_GCM_encrypt_vaes_calc_iv_1_aesenc_avx_last:
+ vaesenclast xmm6, xmm6, xmm8
+ vpshufb xmm6, xmm6, OWORD PTR L_vaes_aes_gcm_bswap_mask
+ ; Calc counter
+ ; Initialization vector
+ cmp edx, 0
+ mov rcx, 0
+ je L_AES_GCM_encrypt_vaes_calc_iv_done
+ cmp edx, 16
+ jl L_AES_GCM_encrypt_vaes_calc_iv_lt16
+ and edx, 4294967280
+L_AES_GCM_encrypt_vaes_calc_iv_16_loop:
+ vmovdqu xmm7, OWORD PTR [rax+rcx]
+ vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask
+ vpxor xmm5, xmm5, xmm7
+ ; ghash_gfmul_avx
+ vpshufd xmm1, xmm5, 78
+ vpshufd xmm2, xmm6, 78
+ vpclmulqdq xmm3, xmm6, xmm5, 17
+ vpclmulqdq xmm0, xmm6, xmm5, 0
+ vpxor xmm1, xmm1, xmm5
+ vpxor xmm2, xmm2, xmm6
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vmovdqa xmm4, xmm0
+ vmovdqa xmm5, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm4, xmm4, xmm2
+ vpxor xmm5, xmm5, xmm1
+ vpsrld xmm0, xmm4, 31
+ vpsrld xmm1, xmm5, 31
+ vpslld xmm4, xmm4, 1
+ vpslld xmm5, xmm5, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm5, xmm5, xmm2
+ vpor xmm4, xmm4, xmm0
+ vpor xmm5, xmm5, xmm1
+ vpslld xmm0, xmm4, 31
+ vpslld xmm1, xmm4, 30
+ vpslld xmm2, xmm4, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm4, xmm4, xmm0
+ vpsrld xmm2, xmm4, 1
+ vpsrld xmm3, xmm4, 2
+ vpsrld xmm0, xmm4, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm4
+ vpxor xmm5, xmm5, xmm2
+ add ecx, 16
+ cmp ecx, edx
+ jl L_AES_GCM_encrypt_vaes_calc_iv_16_loop
+ mov edx, ebx
+ cmp ecx, edx
+ je L_AES_GCM_encrypt_vaes_calc_iv_done
+L_AES_GCM_encrypt_vaes_calc_iv_lt16:
+ sub rsp, 16
+ vpxor xmm7, xmm7, xmm7
+ xor ebx, ebx
+ vmovdqu OWORD PTR [rsp], xmm7
+L_AES_GCM_encrypt_vaes_calc_iv_loop:
+ movzx r13d, BYTE PTR [rax+rcx]
+ mov BYTE PTR [rsp+rbx], r13b
+ inc ecx
+ inc ebx
+ cmp ecx, edx
+ jl L_AES_GCM_encrypt_vaes_calc_iv_loop
+ vmovdqu xmm7, OWORD PTR [rsp]
+ add rsp, 16
+ vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask
+ vpxor xmm5, xmm5, xmm7
+ ; ghash_gfmul_avx
+ vpshufd xmm1, xmm5, 78
+ vpshufd xmm2, xmm6, 78
+ vpclmulqdq xmm3, xmm6, xmm5, 17
+ vpclmulqdq xmm0, xmm6, xmm5, 0
+ vpxor xmm1, xmm1, xmm5
+ vpxor xmm2, xmm2, xmm6
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vmovdqa xmm4, xmm0
+ vmovdqa xmm5, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm4, xmm4, xmm2
+ vpxor xmm5, xmm5, xmm1
+ vpsrld xmm0, xmm4, 31
+ vpsrld xmm1, xmm5, 31
+ vpslld xmm4, xmm4, 1
+ vpslld xmm5, xmm5, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm5, xmm5, xmm2
+ vpor xmm4, xmm4, xmm0
+ vpor xmm5, xmm5, xmm1
+ vpslld xmm0, xmm4, 31
+ vpslld xmm1, xmm4, 30
+ vpslld xmm2, xmm4, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm4, xmm4, xmm0
+ vpsrld xmm2, xmm4, 1
+ vpsrld xmm3, xmm4, 2
+ vpsrld xmm0, xmm4, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm4
+ vpxor xmm5, xmm5, xmm2
+L_AES_GCM_encrypt_vaes_calc_iv_done:
+ ; T = Encrypt counter
+ vpxor xmm0, xmm0, xmm0
+ shl edx, 3
+ vmovq xmm0, rdx
+ vpxor xmm5, xmm5, xmm0
+ ; ghash_gfmul_avx
+ vpshufd xmm1, xmm5, 78
+ vpshufd xmm2, xmm6, 78
+ vpclmulqdq xmm3, xmm6, xmm5, 17
+ vpclmulqdq xmm0, xmm6, xmm5, 0
+ vpxor xmm1, xmm1, xmm5
+ vpxor xmm2, xmm2, xmm6
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vmovdqa xmm4, xmm0
+ vmovdqa xmm5, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm4, xmm4, xmm2
+ vpxor xmm5, xmm5, xmm1
+ vpsrld xmm0, xmm4, 31
+ vpsrld xmm1, xmm5, 31
+ vpslld xmm4, xmm4, 1
+ vpslld xmm5, xmm5, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm5, xmm5, xmm2
+ vpor xmm4, xmm4, xmm0
+ vpor xmm5, xmm5, xmm1
+ vpslld xmm0, xmm4, 31
+ vpslld xmm1, xmm4, 30
+ vpslld xmm2, xmm4, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm4, xmm4, xmm0
+ vpsrld xmm2, xmm4, 1
+ vpsrld xmm3, xmm4, 2
+ vpsrld xmm0, xmm4, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm4
+ vpxor xmm5, xmm5, xmm2
+ vpshufb xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_bswap_mask
+ ; Encrypt counter
+ vmovdqa xmm7, OWORD PTR [r15]
+ vpxor xmm7, xmm7, xmm5
+ vaesenc xmm7, xmm7, [r15+16]
+ vaesenc xmm7, xmm7, [r15+32]
+ vaesenc xmm7, xmm7, [r15+48]
+ vaesenc xmm7, xmm7, [r15+64]
+ vaesenc xmm7, xmm7, [r15+80]
+ vaesenc xmm7, xmm7, [r15+96]
+ vaesenc xmm7, xmm7, [r15+112]
+ vaesenc xmm7, xmm7, [r15+128]
+ vaesenc xmm7, xmm7, [r15+144]
+ cmp r10d, 11
+ vmovdqa xmm8, OWORD PTR [r15+160]
+ jl L_AES_GCM_encrypt_vaes_calc_iv_2_aesenc_avx_last
+ vaesenc xmm7, xmm7, xmm8
+ vaesenc xmm7, xmm7, [r15+176]
+ cmp r10d, 13
+ vmovdqa xmm8, OWORD PTR [r15+192]
+ jl L_AES_GCM_encrypt_vaes_calc_iv_2_aesenc_avx_last
+ vaesenc xmm7, xmm7, xmm8
+ vaesenc xmm7, xmm7, [r15+208]
+ vmovdqa xmm8, OWORD PTR [r15+224]
+L_AES_GCM_encrypt_vaes_calc_iv_2_aesenc_avx_last:
+ vaesenclast xmm7, xmm7, xmm8
+ vmovdqu OWORD PTR [rsp+528], xmm7
+L_AES_GCM_encrypt_vaes_iv_done:
+ ; Additional authentication data
+ mov edx, r11d
+ cmp edx, 0
+ je L_AES_GCM_encrypt_vaes_calc_aad_done
+ xor ecx, ecx
+ cmp edx, 16
+ jl L_AES_GCM_encrypt_vaes_calc_aad_lt16
+ and edx, 4294967280
+L_AES_GCM_encrypt_vaes_calc_aad_16_loop:
+ vmovdqu xmm7, OWORD PTR [r12+rcx]
+ vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask
+ vpxor xmm15, xmm15, xmm7
+ ; ghash_gfmul_avx
+ vpshufd xmm1, xmm15, 78
+ vpshufd xmm2, xmm6, 78
+ vpclmulqdq xmm3, xmm6, xmm15, 17
+ vpclmulqdq xmm0, xmm6, xmm15, 0
+ vpxor xmm1, xmm1, xmm15
+ vpxor xmm2, xmm2, xmm6
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vmovdqa xmm4, xmm0
+ vmovdqa xmm15, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm4, xmm4, xmm2
+ vpxor xmm15, xmm15, xmm1
+ vpsrld xmm0, xmm4, 31
+ vpsrld xmm1, xmm15, 31
+ vpslld xmm4, xmm4, 1
+ vpslld xmm15, xmm15, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm15, xmm15, xmm2
+ vpor xmm4, xmm4, xmm0
+ vpor xmm15, xmm15, xmm1
+ vpslld xmm0, xmm4, 31
+ vpslld xmm1, xmm4, 30
+ vpslld xmm2, xmm4, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm4, xmm4, xmm0
+ vpsrld xmm2, xmm4, 1
+ vpsrld xmm3, xmm4, 2
+ vpsrld xmm0, xmm4, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm4
+ vpxor xmm15, xmm15, xmm2
+ add ecx, 16
+ cmp ecx, edx
+ jl L_AES_GCM_encrypt_vaes_calc_aad_16_loop
+ mov edx, r11d
+ cmp ecx, edx
+ je L_AES_GCM_encrypt_vaes_calc_aad_done
+L_AES_GCM_encrypt_vaes_calc_aad_lt16:
+ sub rsp, 16
+ vpxor xmm7, xmm7, xmm7
+ xor ebx, ebx
+ vmovdqu OWORD PTR [rsp], xmm7
+L_AES_GCM_encrypt_vaes_calc_aad_loop:
+ movzx r13d, BYTE PTR [r12+rcx]
+ mov BYTE PTR [rsp+rbx], r13b
+ inc ecx
+ inc ebx
+ cmp ecx, edx
+ jl L_AES_GCM_encrypt_vaes_calc_aad_loop
+ vmovdqu xmm7, OWORD PTR [rsp]
+ add rsp, 16
+ vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask
+ vpxor xmm15, xmm15, xmm7
+ ; ghash_gfmul_avx
+ vpshufd xmm1, xmm15, 78
+ vpshufd xmm2, xmm6, 78
+ vpclmulqdq xmm3, xmm6, xmm15, 17
+ vpclmulqdq xmm0, xmm6, xmm15, 0
+ vpxor xmm1, xmm1, xmm15
+ vpxor xmm2, xmm2, xmm6
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vmovdqa xmm4, xmm0
+ vmovdqa xmm15, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm4, xmm4, xmm2
+ vpxor xmm15, xmm15, xmm1
+ vpsrld xmm0, xmm4, 31
+ vpsrld xmm1, xmm15, 31
+ vpslld xmm4, xmm4, 1
+ vpslld xmm15, xmm15, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm15, xmm15, xmm2
+ vpor xmm4, xmm4, xmm0
+ vpor xmm15, xmm15, xmm1
+ vpslld xmm0, xmm4, 31
+ vpslld xmm1, xmm4, 30
+ vpslld xmm2, xmm4, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm4, xmm4, xmm0
+ vpsrld xmm2, xmm4, 1
+ vpsrld xmm3, xmm4, 2
+ vpsrld xmm0, xmm4, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm4
+ vpxor xmm15, xmm15, xmm2
+L_AES_GCM_encrypt_vaes_calc_aad_done:
+ ; Calculate counter and H
+ vpsrlq xmm8, xmm6, 63
+ vpsllq xmm7, xmm6, 1
+ vpslldq xmm8, xmm8, 8
+ vpor xmm7, xmm7, xmm8
+ vpshufd xmm6, xmm6, 255
+ vpsrad xmm6, xmm6, 31
+ vpshufb xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_bswap_epi64
+ vpand xmm6, xmm6, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpaddd xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_one
+ vpxor xmm6, xmm6, xmm7
+ vmovdqu OWORD PTR [rsp+512], xmm5
+ xor ebx, ebx
+ cmp r9d, 128
+ jl L_AES_GCM_encrypt_vaes_done_128
+ vmovdqa xmm2, xmm15
+ ; H ^ 1
+ vmovdqu OWORD PTR [rsp], xmm6
+ ; H ^ 2
+ vpclmulqdq xmm7, xmm6, xmm6, 0
+ vpclmulqdq xmm10, xmm6, xmm6, 17
+ vpxor xmm8, xmm8, xmm8
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm0, xmm10
+ vmovdqu OWORD PTR [rsp+16], xmm0
+ ; H ^ 3
+ ; ghash_gfmul_red_avx
+ vpclmulqdq xmm7, xmm0, xmm6, 0
+ vpclmulqdq xmm8, xmm0, xmm6, 1
+ vpclmulqdq xmm9, xmm0, xmm6, 16
+ vpclmulqdq xmm10, xmm0, xmm6, 17
+ vpxor xmm8, xmm8, xmm9
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm1, xmm10
+ vmovdqu OWORD PTR [rsp+32], xmm1
+ ; H ^ 4
+ vpclmulqdq xmm7, xmm0, xmm0, 0
+ vpclmulqdq xmm10, xmm0, xmm0, 17
+ vpxor xmm8, xmm8, xmm8
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm3, xmm10
+ vmovdqu OWORD PTR [rsp+48], xmm3
+ ; H ^ 5
+ ; ghash_gfmul_red_avx
+ vpclmulqdq xmm7, xmm1, xmm0, 0
+ vpclmulqdq xmm8, xmm1, xmm0, 1
+ vpclmulqdq xmm9, xmm1, xmm0, 16
+ vpclmulqdq xmm10, xmm1, xmm0, 17
+ vpxor xmm8, xmm8, xmm9
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+64], xmm4
+ ; H ^ 6
+ vpclmulqdq xmm7, xmm1, xmm1, 0
+ vpclmulqdq xmm10, xmm1, xmm1, 17
+ vpxor xmm8, xmm8, xmm8
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm4
+ ; H ^ 7
+ ; ghash_gfmul_red_avx
+ vpclmulqdq xmm7, xmm3, xmm1, 0
+ vpclmulqdq xmm8, xmm3, xmm1, 1
+ vpclmulqdq xmm9, xmm3, xmm1, 16
+ vpclmulqdq xmm10, xmm3, xmm1, 17
+ vpxor xmm8, xmm8, xmm9
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+96], xmm4
+ ; H ^ 8
+ vpclmulqdq xmm7, xmm3, xmm3, 0
+ vpclmulqdq xmm10, xmm3, xmm3, 17
+ vpxor xmm8, xmm8, xmm8
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+112], xmm4
+ cmp r9d, 256
+ jl L_AES_GCM_encrypt_vaes_no_ext
+ ; H ^ 9
+ vmovdqu xmm0, OWORD PTR [rsp+48]
+ vmovdqu xmm1, OWORD PTR [rsp+64]
+ ; ghash_gfmul_red_avx
+ vpclmulqdq xmm7, xmm1, xmm0, 0
+ vpclmulqdq xmm8, xmm1, xmm0, 1
+ vpclmulqdq xmm9, xmm1, xmm0, 16
+ vpclmulqdq xmm10, xmm1, xmm0, 17
+ vpxor xmm8, xmm8, xmm9
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+128], xmm4
+ ; H ^ 10
+ vmovdqu xmm0, OWORD PTR [rsp+64]
+ vpclmulqdq xmm7, xmm0, xmm0, 0
+ vpclmulqdq xmm10, xmm0, xmm0, 17
+ vpxor xmm8, xmm8, xmm8
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+144], xmm4
+ ; H ^ 11
+ vmovdqu xmm0, OWORD PTR [rsp+64]
+ vmovdqu xmm1, OWORD PTR [rsp+80]
+ ; ghash_gfmul_red_avx
+ vpclmulqdq xmm7, xmm1, xmm0, 0
+ vpclmulqdq xmm8, xmm1, xmm0, 1
+ vpclmulqdq xmm9, xmm1, xmm0, 16
+ vpclmulqdq xmm10, xmm1, xmm0, 17
+ vpxor xmm8, xmm8, xmm9
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+160], xmm4
+ ; H ^ 12
+ vmovdqu xmm0, OWORD PTR [rsp+80]
+ vpclmulqdq xmm7, xmm0, xmm0, 0
+ vpclmulqdq xmm10, xmm0, xmm0, 17
+ vpxor xmm8, xmm8, xmm8
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+176], xmm4
+ ; H ^ 13
+ vmovdqu xmm0, OWORD PTR [rsp+80]
+ vmovdqu xmm1, OWORD PTR [rsp+96]
+ ; ghash_gfmul_red_avx
+ vpclmulqdq xmm7, xmm1, xmm0, 0
+ vpclmulqdq xmm8, xmm1, xmm0, 1
+ vpclmulqdq xmm9, xmm1, xmm0, 16
+ vpclmulqdq xmm10, xmm1, xmm0, 17
+ vpxor xmm8, xmm8, xmm9
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+192], xmm4
+ ; H ^ 14
+ vmovdqu xmm0, OWORD PTR [rsp+96]
+ vpclmulqdq xmm7, xmm0, xmm0, 0
+ vpclmulqdq xmm10, xmm0, xmm0, 17
+ vpxor xmm8, xmm8, xmm8
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+208], xmm4
+ ; H ^ 15
+ vmovdqu xmm0, OWORD PTR [rsp+96]
+ vmovdqu xmm1, OWORD PTR [rsp+112]
+ ; ghash_gfmul_red_avx
+ vpclmulqdq xmm7, xmm1, xmm0, 0
+ vpclmulqdq xmm8, xmm1, xmm0, 1
+ vpclmulqdq xmm9, xmm1, xmm0, 16
+ vpclmulqdq xmm10, xmm1, xmm0, 17
+ vpxor xmm8, xmm8, xmm9
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+224], xmm4
+ ; H ^ 16
+ vmovdqu xmm0, OWORD PTR [rsp+112]
+ vpclmulqdq xmm7, xmm0, xmm0, 0
+ vpclmulqdq xmm10, xmm0, xmm0, 17
+ vpxor xmm8, xmm8, xmm8
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+240], xmm4
+ vmovdqu ymm7, YMMWORD PTR [rsp+224]
+ vpermq ymm7, ymm7, 78
+ vmovdqu ymm8, YMMWORD PTR [rsp+192]
+ vpermq ymm8, ymm8, 78
+ vmovdqu ymm9, YMMWORD PTR [rsp+160]
+ vpermq ymm9, ymm9, 78
+ vmovdqu ymm10, YMMWORD PTR [rsp+128]
+ vpermq ymm10, ymm10, 78
+ vmovdqu YMMWORD PTR [rsp+256], ymm7
+ vmovdqu YMMWORD PTR [rsp+288], ymm8
+ vmovdqu YMMWORD PTR [rsp+320], ymm9
+ vmovdqu YMMWORD PTR [rsp+352], ymm10
+ vmovdqu ymm7, YMMWORD PTR [rsp+96]
+ vpermq ymm7, ymm7, 78
+ vmovdqu ymm8, YMMWORD PTR [rsp+64]
+ vpermq ymm8, ymm8, 78
+ vmovdqu ymm9, YMMWORD PTR [rsp+32]
+ vpermq ymm9, ymm9, 78
+ vmovdqu ymm10, YMMWORD PTR [rsp]
+ vpermq ymm10, ymm10, 78
+ vmovdqu YMMWORD PTR [rsp+384], ymm7
+ vmovdqu YMMWORD PTR [rsp+416], ymm8
+ vmovdqu YMMWORD PTR [rsp+448], ymm9
+ vmovdqu YMMWORD PTR [rsp+480], ymm10
+L_AES_GCM_encrypt_vaes_no_ext:
+ vbroadcasti128 ymm14, ptr_L_vaes_aes_gcm_mod2_128
+ cmp r9d, 256
+ jl L_AES_GCM_encrypt_vaes_after_256
+ mov r13d, r9d
+ and r13d, 4294967040
+L_AES_GCM_encrypt_vaes_loop_256:
+ ; 256 bytes of input
+ lea rcx, QWORD PTR [rsi+rbx]
+ mov QWORD PTR [rsp+544], rcx
+ vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64
+ vbroadcasti128 ymm4, [rsp+512]
+ vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0
+ vpshufb ymm0, ymm0, ymm6
+ vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1
+ vpshufb ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y2
+ vpshufb ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y3
+ vpshufb ymm3, ymm3, ymm6
+ vmovdqu xmm7, OWORD PTR [rsp+512]
+ vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight
+ vmovdqu OWORD PTR [rsp+512], xmm7
+ vbroadcasti128 ymm4, [r15]
+ vpxor ymm0, ymm0, ymm4
+ vpxor ymm1, ymm1, ymm4
+ vpxor ymm2, ymm2, ymm4
+ vpxor ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+16]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+32]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+48]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+64]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+80]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+96]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+112]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+128]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+144]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ cmp r10d, 11
+ vbroadcasti128 ymm4, [r15+160]
+ jl L_AES_GCM_encrypt_vaes_p1_vaes_ctr8_last
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+176]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ cmp r10d, 13
+ vbroadcasti128 ymm4, [r15+192]
+ jl L_AES_GCM_encrypt_vaes_p1_vaes_ctr8_last
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+208]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+224]
+L_AES_GCM_encrypt_vaes_p1_vaes_ctr8_last:
+ vaesenclast ymm0, ymm0, ymm4
+ vaesenclast ymm1, ymm1, ymm4
+ vaesenclast ymm2, ymm2, ymm4
+ vaesenclast ymm3, ymm3, ymm4
+ lea rcx, QWORD PTR [rdi+rbx]
+ lea rdx, QWORD PTR [rsi+rbx]
+ vmovdqu ymm5, YMMWORD PTR [rcx]
+ vpxor ymm0, ymm0, ymm5
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ vmovdqu ymm5, YMMWORD PTR [rcx+32]
+ vpxor ymm1, ymm1, ymm5
+ vmovdqu YMMWORD PTR [rdx+32], ymm1
+ vmovdqu ymm5, YMMWORD PTR [rcx+64]
+ vpxor ymm2, ymm2, ymm5
+ vmovdqu YMMWORD PTR [rdx+64], ymm2
+ vmovdqu ymm5, YMMWORD PTR [rcx+96]
+ vpxor ymm3, ymm3, ymm5
+ vmovdqu YMMWORD PTR [rdx+96], ymm3
+ add ebx, 128
+ vbroadcasti128 ymm4, [rsp+512]
+ vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0
+ vpshufb ymm0, ymm0, ymm6
+ vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1
+ vpshufb ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y2
+ vpshufb ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y3
+ vpshufb ymm3, ymm3, ymm6
+ vmovdqu xmm7, OWORD PTR [rsp+512]
+ vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight
+ vmovdqu OWORD PTR [rsp+512], xmm7
+ vbroadcasti128 ymm4, [r15]
+ vpxor ymm0, ymm0, ymm4
+ vpxor ymm1, ymm1, ymm4
+ vpxor ymm2, ymm2, ymm4
+ vpxor ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+16]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+32]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+48]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+64]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+80]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+96]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+112]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+128]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+144]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ cmp r10d, 11
+ vbroadcasti128 ymm4, [r15+160]
+ jl L_AES_GCM_encrypt_vaes_p2_vaes_ctr8_last
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+176]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ cmp r10d, 13
+ vbroadcasti128 ymm4, [r15+192]
+ jl L_AES_GCM_encrypt_vaes_p2_vaes_ctr8_last
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+208]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+224]
+L_AES_GCM_encrypt_vaes_p2_vaes_ctr8_last:
+ vaesenclast ymm0, ymm0, ymm4
+ vaesenclast ymm1, ymm1, ymm4
+ vaesenclast ymm2, ymm2, ymm4
+ vaesenclast ymm3, ymm3, ymm4
+ lea rcx, QWORD PTR [rdi+rbx]
+ lea rdx, QWORD PTR [rsi+rbx]
+ vmovdqu ymm5, YMMWORD PTR [rcx]
+ vpxor ymm0, ymm0, ymm5
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ vmovdqu ymm5, YMMWORD PTR [rcx+32]
+ vpxor ymm1, ymm1, ymm5
+ vmovdqu YMMWORD PTR [rdx+32], ymm1
+ vmovdqu ymm5, YMMWORD PTR [rcx+64]
+ vpxor ymm2, ymm2, ymm5
+ vmovdqu YMMWORD PTR [rdx+64], ymm2
+ vmovdqu ymm5, YMMWORD PTR [rcx+96]
+ vpxor ymm3, ymm3, ymm5
+ vmovdqu YMMWORD PTR [rdx+96], ymm3
+ add ebx, 128
+ vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_mask
+ mov rcx, QWORD PTR [rsp+544]
+ vpxor ymm4, ymm4, ymm4
+ vinserti128 ymm4, ymm4, xmm15, 0
+ vmovdqu ymm7, YMMWORD PTR [rsp+256]
+ vmovdqu ymm8, YMMWORD PTR [rsp+288]
+ vmovdqu ymm9, YMMWORD PTR [rsp+320]
+ vmovdqu ymm10, YMMWORD PTR [rsp+352]
+ vmovdqu ymm5, YMMWORD PTR [rcx]
+ vpshufb ymm5, ymm5, ymm6
+ vpxor ymm5, ymm5, ymm4
+ vpclmulqdq ymm0, ymm5, ymm7, 0
+ vpclmulqdq ymm1, ymm5, ymm7, 1
+ vpclmulqdq ymm2, ymm5, ymm7, 16
+ vpclmulqdq ymm3, ymm5, ymm7, 17
+ vmovdqa ymm11, ymm0
+ vpxor ymm12, ymm2, ymm1
+ vmovdqa ymm13, ymm3
+ vmovdqu ymm5, YMMWORD PTR [rcx+32]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm8, 0
+ vpclmulqdq ymm1, ymm5, ymm8, 1
+ vpclmulqdq ymm2, ymm5, ymm8, 16
+ vpclmulqdq ymm3, ymm5, ymm8, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vmovdqu ymm5, YMMWORD PTR [rcx+64]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm9, 0
+ vpclmulqdq ymm1, ymm5, ymm9, 1
+ vpclmulqdq ymm2, ymm5, ymm9, 16
+ vpclmulqdq ymm3, ymm5, ymm9, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vmovdqu ymm5, YMMWORD PTR [rcx+96]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm10, 0
+ vpclmulqdq ymm1, ymm5, ymm10, 1
+ vpclmulqdq ymm2, ymm5, ymm10, 16
+ vpclmulqdq ymm3, ymm5, ymm10, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vmovdqu ymm7, YMMWORD PTR [rsp+384]
+ vmovdqu ymm8, YMMWORD PTR [rsp+416]
+ vmovdqu ymm9, YMMWORD PTR [rsp+448]
+ vmovdqu ymm10, YMMWORD PTR [rsp+480]
+ vmovdqu ymm5, YMMWORD PTR [rcx+128]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm7, 0
+ vpclmulqdq ymm1, ymm5, ymm7, 1
+ vpclmulqdq ymm2, ymm5, ymm7, 16
+ vpclmulqdq ymm3, ymm5, ymm7, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vmovdqu ymm5, YMMWORD PTR [rcx+160]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm8, 0
+ vpclmulqdq ymm1, ymm5, ymm8, 1
+ vpclmulqdq ymm2, ymm5, ymm8, 16
+ vpclmulqdq ymm3, ymm5, ymm8, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vmovdqu ymm5, YMMWORD PTR [rcx+192]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm9, 0
+ vpclmulqdq ymm1, ymm5, ymm9, 1
+ vpclmulqdq ymm2, ymm5, ymm9, 16
+ vpclmulqdq ymm3, ymm5, ymm9, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vmovdqu ymm5, YMMWORD PTR [rcx+224]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm10, 0
+ vpclmulqdq ymm1, ymm5, ymm10, 1
+ vpclmulqdq ymm2, ymm5, ymm10, 16
+ vpclmulqdq ymm3, ymm5, ymm10, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vpclmulqdq ymm5, ymm14, ymm11, 1
+ vpshufd ymm11, ymm11, 78
+ vpxor ymm12, ymm12, ymm5
+ vpxor ymm12, ymm12, ymm11
+ vpclmulqdq ymm5, ymm14, ymm12, 1
+ vpshufd ymm12, ymm12, 78
+ vpxor ymm13, ymm13, ymm5
+ vpxor ymm13, ymm13, ymm12
+ vextracti128 xmm0, ymm13, 1
+ vpxor xmm15, xmm13, xmm0
+ cmp ebx, r13d
+ jl L_AES_GCM_encrypt_vaes_loop_256
+L_AES_GCM_encrypt_vaes_after_256:
+ mov r13d, r9d
+ and r13d, 4294967168
+ cmp ebx, r13d
+ jge L_AES_GCM_encrypt_vaes_after_128
+ ; 128 bytes of input
+ lea rcx, QWORD PTR [rsi+rbx]
+ mov QWORD PTR [rsp+544], rcx
+ vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64
+ vbroadcasti128 ymm4, [rsp+512]
+ vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0
+ vpshufb ymm0, ymm0, ymm6
+ vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1
+ vpshufb ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y2
+ vpshufb ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y3
+ vpshufb ymm3, ymm3, ymm6
+ vmovdqu xmm7, OWORD PTR [rsp+512]
+ vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight
+ vmovdqu OWORD PTR [rsp+512], xmm7
+ vbroadcasti128 ymm4, [r15]
+ vpxor ymm0, ymm0, ymm4
+ vpxor ymm1, ymm1, ymm4
+ vpxor ymm2, ymm2, ymm4
+ vpxor ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+16]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+32]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+48]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+64]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+80]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+96]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+112]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+128]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+144]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ cmp r10d, 11
+ vbroadcasti128 ymm4, [r15+160]
+ jl L_AES_GCM_encrypt_vaes_8_vaes_ctr8_last
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+176]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ cmp r10d, 13
+ vbroadcasti128 ymm4, [r15+192]
+ jl L_AES_GCM_encrypt_vaes_8_vaes_ctr8_last
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+208]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+224]
+L_AES_GCM_encrypt_vaes_8_vaes_ctr8_last:
+ vaesenclast ymm0, ymm0, ymm4
+ vaesenclast ymm1, ymm1, ymm4
+ vaesenclast ymm2, ymm2, ymm4
+ vaesenclast ymm3, ymm3, ymm4
+ lea rcx, QWORD PTR [rdi+rbx]
+ lea rdx, QWORD PTR [rsi+rbx]
+ vmovdqu ymm5, YMMWORD PTR [rcx]
+ vpxor ymm0, ymm0, ymm5
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ vmovdqu ymm5, YMMWORD PTR [rcx+32]
+ vpxor ymm1, ymm1, ymm5
+ vmovdqu YMMWORD PTR [rdx+32], ymm1
+ vmovdqu ymm5, YMMWORD PTR [rcx+64]
+ vpxor ymm2, ymm2, ymm5
+ vmovdqu YMMWORD PTR [rdx+64], ymm2
+ vmovdqu ymm5, YMMWORD PTR [rcx+96]
+ vpxor ymm3, ymm3, ymm5
+ vmovdqu YMMWORD PTR [rdx+96], ymm3
+ add ebx, 128
+ vmovdqu ymm7, YMMWORD PTR [rsp+96]
+ vpermq ymm7, ymm7, 78
+ vmovdqu ymm8, YMMWORD PTR [rsp+64]
+ vpermq ymm8, ymm8, 78
+ vmovdqu ymm9, YMMWORD PTR [rsp+32]
+ vpermq ymm9, ymm9, 78
+ vmovdqu ymm10, YMMWORD PTR [rsp]
+ vpermq ymm10, ymm10, 78
+ vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_mask
+ mov rcx, QWORD PTR [rsp+544]
+ vpxor ymm4, ymm4, ymm4
+ vinserti128 ymm4, ymm4, xmm15, 0
+ vmovdqu ymm5, YMMWORD PTR [rcx]
+ vpshufb ymm5, ymm5, ymm6
+ vpxor ymm5, ymm5, ymm4
+ vpclmulqdq ymm0, ymm5, ymm7, 0
+ vpclmulqdq ymm1, ymm5, ymm7, 1
+ vpclmulqdq ymm2, ymm5, ymm7, 16
+ vpclmulqdq ymm3, ymm5, ymm7, 17
+ vmovdqa ymm11, ymm0
+ vpxor ymm12, ymm2, ymm1
+ vmovdqa ymm13, ymm3
+ vmovdqu ymm5, YMMWORD PTR [rcx+32]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm8, 0
+ vpclmulqdq ymm1, ymm5, ymm8, 1
+ vpclmulqdq ymm2, ymm5, ymm8, 16
+ vpclmulqdq ymm3, ymm5, ymm8, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vmovdqu ymm5, YMMWORD PTR [rcx+64]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm9, 0
+ vpclmulqdq ymm1, ymm5, ymm9, 1
+ vpclmulqdq ymm2, ymm5, ymm9, 16
+ vpclmulqdq ymm3, ymm5, ymm9, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vmovdqu ymm5, YMMWORD PTR [rcx+96]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm10, 0
+ vpclmulqdq ymm1, ymm5, ymm10, 1
+ vpclmulqdq ymm2, ymm5, ymm10, 16
+ vpclmulqdq ymm3, ymm5, ymm10, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vpclmulqdq ymm5, ymm14, ymm11, 1
+ vpshufd ymm11, ymm11, 78
+ vpxor ymm12, ymm12, ymm5
+ vpxor ymm12, ymm12, ymm11
+ vpclmulqdq ymm5, ymm14, ymm12, 1
+ vpshufd ymm12, ymm12, 78
+ vpxor ymm13, ymm13, ymm5
+ vpxor ymm13, ymm13, ymm12
+ vextracti128 xmm0, ymm13, 1
+ vpxor xmm15, xmm13, xmm0
+L_AES_GCM_encrypt_vaes_after_128:
+ vmovdqu xmm6, OWORD PTR [rsp]
+L_AES_GCM_encrypt_vaes_done_128:
+ mov edx, r9d
+ cmp ebx, edx
+ jge L_AES_GCM_encrypt_vaes_done_enc
+ mov r13d, r9d
+ and r13d, 4294967280
+ cmp ebx, r13d
+ jge L_AES_GCM_encrypt_vaes_last_block_done
+ vmovdqu xmm8, OWORD PTR [rsp+512]
+ vpshufb xmm7, xmm8, OWORD PTR L_vaes_aes_gcm_bswap_epi64
+ vpaddd xmm8, xmm8, OWORD PTR L_vaes_aes_gcm_one
+ vmovdqu OWORD PTR [rsp+512], xmm8
+ vpxor xmm7, xmm7, [r15]
+ vaesenc xmm7, xmm7, [r15+16]
+ vaesenc xmm7, xmm7, [r15+32]
+ vaesenc xmm7, xmm7, [r15+48]
+ vaesenc xmm7, xmm7, [r15+64]
+ vaesenc xmm7, xmm7, [r15+80]
+ vaesenc xmm7, xmm7, [r15+96]
+ vaesenc xmm7, xmm7, [r15+112]
+ vaesenc xmm7, xmm7, [r15+128]
+ vaesenc xmm7, xmm7, [r15+144]
+ cmp r10d, 11
+ vmovdqa xmm8, OWORD PTR [r15+160]
+ jl L_AES_GCM_encrypt_vaes_aesenc_block_last
+ vaesenc xmm7, xmm7, xmm8
+ vaesenc xmm7, xmm7, [r15+176]
+ cmp r10d, 13
+ vmovdqa xmm8, OWORD PTR [r15+192]
+ jl L_AES_GCM_encrypt_vaes_aesenc_block_last
+ vaesenc xmm7, xmm7, xmm8
+ vaesenc xmm7, xmm7, [r15+208]
+ vmovdqa xmm8, OWORD PTR [r15+224]
+L_AES_GCM_encrypt_vaes_aesenc_block_last:
+ vaesenclast xmm7, xmm7, xmm8
+ vmovdqu xmm8, OWORD PTR [rdi+rbx]
+ vpxor xmm7, xmm7, xmm8
+ vmovdqu OWORD PTR [rsi+rbx], xmm7
+ vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask
+ vpxor xmm15, xmm15, xmm7
+ add ebx, 16
+ cmp ebx, r13d
+ jge L_AES_GCM_encrypt_vaes_last_block_ghash
+L_AES_GCM_encrypt_vaes_last_block_start:
+ vmovdqu xmm12, OWORD PTR [rdi+rbx]
+ vmovdqu xmm8, OWORD PTR [rsp+512]
+ vpshufb xmm7, xmm8, OWORD PTR L_vaes_aes_gcm_bswap_epi64
+ vpaddd xmm8, xmm8, OWORD PTR L_vaes_aes_gcm_one
+ vmovdqu OWORD PTR [rsp+512], xmm8
+ vpxor xmm7, xmm7, [r15]
+ vpclmulqdq xmm9, xmm15, xmm6, 16
+ vaesenc xmm7, xmm7, [r15+16]
+ vaesenc xmm7, xmm7, [r15+32]
+ vpclmulqdq xmm10, xmm15, xmm6, 1
+ vaesenc xmm7, xmm7, [r15+48]
+ vaesenc xmm7, xmm7, [r15+64]
+ vpclmulqdq xmm11, xmm15, xmm6, 0
+ vaesenc xmm7, xmm7, [r15+80]
+ vpclmulqdq xmm1, xmm15, xmm6, 17
+ vaesenc xmm7, xmm7, [r15+96]
+ vpxor xmm9, xmm9, xmm10
+ vpslldq xmm2, xmm9, 8
+ vpsrldq xmm9, xmm9, 8
+ vaesenc xmm7, xmm7, [r15+112]
+ vpxor xmm2, xmm2, xmm11
+ vpxor xmm3, xmm1, xmm9
+ vmovdqa xmm0, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm10, xmm2, xmm0, 16
+ vaesenc xmm7, xmm7, [r15+128]
+ vpshufd xmm9, xmm2, 78
+ vpxor xmm9, xmm9, xmm10
+ vpclmulqdq xmm10, xmm9, xmm0, 16
+ vaesenc xmm7, xmm7, [r15+144]
+ vpshufd xmm9, xmm9, 78
+ vpxor xmm9, xmm9, xmm10
+ vpxor xmm15, xmm9, xmm3
+ cmp r10d, 11
+ vmovdqa xmm8, OWORD PTR [r15+160]
+ jl L_AES_GCM_encrypt_vaes_aesenc_gfmul_last
+ vaesenc xmm7, xmm7, xmm8
+ vaesenc xmm7, xmm7, [r15+176]
+ cmp r10d, 13
+ vmovdqa xmm8, OWORD PTR [r15+192]
+ jl L_AES_GCM_encrypt_vaes_aesenc_gfmul_last
+ vaesenc xmm7, xmm7, xmm8
+ vaesenc xmm7, xmm7, [r15+208]
+ vmovdqa xmm8, OWORD PTR [r15+224]
+L_AES_GCM_encrypt_vaes_aesenc_gfmul_last:
+ vaesenclast xmm7, xmm7, xmm8
+ vmovdqa xmm0, xmm12
+ vpxor xmm7, xmm7, xmm0
+ vmovdqu OWORD PTR [rsi+rbx], xmm7
+ vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask
+ add ebx, 16
+ vpxor xmm15, xmm15, xmm7
+ cmp ebx, r13d
+ jl L_AES_GCM_encrypt_vaes_last_block_start
+L_AES_GCM_encrypt_vaes_last_block_ghash:
+ ; ghash_gfmul_red_avx
+ vpclmulqdq xmm7, xmm15, xmm6, 0
+ vpclmulqdq xmm8, xmm15, xmm6, 1
+ vpclmulqdq xmm9, xmm15, xmm6, 16
+ vpclmulqdq xmm10, xmm15, xmm6, 17
+ vpxor xmm8, xmm8, xmm9
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm15, xmm10
+L_AES_GCM_encrypt_vaes_last_block_done:
+ mov ecx, r9d
+ mov edx, ecx
+ and ecx, 15
+ jz L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_done
+ vmovdqu xmm5, OWORD PTR [rsp+512]
+ vpshufb xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_bswap_epi64
+ vpxor xmm5, xmm5, [r15]
+ vaesenc xmm5, xmm5, [r15+16]
+ vaesenc xmm5, xmm5, [r15+32]
+ vaesenc xmm5, xmm5, [r15+48]
+ vaesenc xmm5, xmm5, [r15+64]
+ vaesenc xmm5, xmm5, [r15+80]
+ vaesenc xmm5, xmm5, [r15+96]
+ vaesenc xmm5, xmm5, [r15+112]
+ vaesenc xmm5, xmm5, [r15+128]
+ vaesenc xmm5, xmm5, [r15+144]
+ cmp r10d, 11
+ vmovdqa xmm8, OWORD PTR [r15+160]
+ jl L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_aesenc_avx_last
+ vaesenc xmm5, xmm5, xmm8
+ vaesenc xmm5, xmm5, [r15+176]
+ cmp r10d, 13
+ vmovdqa xmm8, OWORD PTR [r15+192]
+ jl L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_aesenc_avx_last
+ vaesenc xmm5, xmm5, xmm8
+ vaesenc xmm5, xmm5, [r15+208]
+ vmovdqa xmm8, OWORD PTR [r15+224]
+L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_aesenc_avx_last:
+ vaesenclast xmm5, xmm5, xmm8
+ sub rsp, 16
+ xor ecx, ecx
+ vmovdqu OWORD PTR [rsp], xmm5
+L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_loop:
+ movzx r13d, BYTE PTR [rdi+rbx]
+ xor r13b, BYTE PTR [rsp+rcx]
+ mov BYTE PTR [rsi+rbx], r13b
+ mov BYTE PTR [rsp+rcx], r13b
+ inc ebx
+ inc ecx
+ cmp ebx, edx
+ jl L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_loop
+ xor r13, r13
+ cmp ecx, 16
+ je L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_finish_enc
+L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_byte_loop:
+ mov BYTE PTR [rsp+rcx], r13b
+ inc ecx
+ cmp ecx, 16
+ jl L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_byte_loop
+L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_finish_enc:
+ vmovdqu xmm5, OWORD PTR [rsp]
+ add rsp, 16
+ vpshufb xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_bswap_mask
+ vpxor xmm15, xmm15, xmm5
+ ; ghash_gfmul_red_avx
+ vpclmulqdq xmm7, xmm15, xmm6, 0
+ vpclmulqdq xmm8, xmm15, xmm6, 1
+ vpclmulqdq xmm9, xmm15, xmm6, 16
+ vpclmulqdq xmm10, xmm15, xmm6, 17
+ vpxor xmm8, xmm8, xmm9
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm15, xmm10
+L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_done:
+L_AES_GCM_encrypt_vaes_done_enc:
+ mov edx, r9d
+ mov ecx, r11d
+ shl rdx, 3
+ shl rcx, 3
+ vmovq xmm0, rdx
+ vmovq xmm1, rcx
+ vpunpcklqdq xmm0, xmm0, xmm1
+ vpxor xmm15, xmm15, xmm0
+ ; ghash_gfmul_red_avx
+ vpclmulqdq xmm7, xmm15, xmm6, 0
+ vpclmulqdq xmm8, xmm15, xmm6, 1
+ vpclmulqdq xmm9, xmm15, xmm6, 16
+ vpclmulqdq xmm10, xmm15, xmm6, 17
+ vpxor xmm8, xmm8, xmm9
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm15, xmm10
+ vpshufb xmm15, xmm15, OWORD PTR L_vaes_aes_gcm_bswap_mask
+ vmovdqu xmm0, OWORD PTR [rsp+528]
+ vpxor xmm0, xmm0, xmm15
+ cmp r14d, 16
+ je L_AES_GCM_encrypt_vaes_store_tag_16
+ xor rcx, rcx
+ vmovdqu OWORD PTR [rsp], xmm0
+L_AES_GCM_encrypt_vaes_store_tag_loop:
+ movzx r13d, BYTE PTR [rsp+rcx]
+ mov BYTE PTR [r8+rcx], r13b
+ inc ecx
+ cmp ecx, r14d
+ jne L_AES_GCM_encrypt_vaes_store_tag_loop
+ jmp L_AES_GCM_encrypt_vaes_store_tag_done
+L_AES_GCM_encrypt_vaes_store_tag_16:
+ vmovdqu OWORD PTR [r8], xmm0
+L_AES_GCM_encrypt_vaes_store_tag_done:
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp+560]
+ vmovdqu xmm7, OWORD PTR [rsp+576]
+ vmovdqu xmm8, OWORD PTR [rsp+592]
+ vmovdqu xmm9, OWORD PTR [rsp+608]
+ vmovdqu xmm10, OWORD PTR [rsp+624]
+ vmovdqu xmm11, OWORD PTR [rsp+640]
+ vmovdqu xmm12, OWORD PTR [rsp+656]
+ vmovdqu xmm13, OWORD PTR [rsp+672]
+ vmovdqu xmm14, OWORD PTR [rsp+688]
+ vmovdqu xmm15, OWORD PTR [rsp+704]
+ add rsp, 720
+ pop r15
+ pop r14
+ pop rbx
+ pop r12
+ pop rsi
+ pop rdi
+ pop r13
+ ret
+AES_GCM_encrypt_vaes ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_decrypt_vaes PROC
+ push r13
+ push rdi
+ push rsi
+ push r12
+ push rbx
+ push r14
+ push r15
+ push rbp
+ mov rdi, rcx
+ mov rsi, rdx
+ mov r12, r8
+ mov rax, r9
+ mov r8, QWORD PTR [rsp+104]
+ mov r9d, DWORD PTR [rsp+112]
+ mov r11d, DWORD PTR [rsp+120]
+ mov ebx, DWORD PTR [rsp+128]
+ mov r14d, DWORD PTR [rsp+136]
+ mov r15, QWORD PTR [rsp+144]
+ mov r10d, DWORD PTR [rsp+152]
+ mov rbp, QWORD PTR [rsp+160]
+ sub rsp, 704
+ vmovdqu OWORD PTR [rsp+544], xmm6
+ vmovdqu OWORD PTR [rsp+560], xmm7
+ vmovdqu OWORD PTR [rsp+576], xmm8
+ vmovdqu OWORD PTR [rsp+592], xmm9
+ vmovdqu OWORD PTR [rsp+608], xmm10
+ vmovdqu OWORD PTR [rsp+624], xmm11
+ vmovdqu OWORD PTR [rsp+640], xmm12
+ vmovdqu OWORD PTR [rsp+656], xmm13
+ vmovdqu OWORD PTR [rsp+672], xmm14
+ vmovdqu OWORD PTR [rsp+688], xmm15
+ vpxor xmm5, xmm5, xmm5
+ vpxor xmm15, xmm15, xmm15
+ cmp ebx, 12
+ mov edx, ebx
+ jne L_AES_GCM_decrypt_vaes_iv_not_12
+ ; Calculate values when IV is 12 bytes
+ ; Set counter based on IV
+ mov ecx, 16777216
+ vmovq xmm5, QWORD PTR [rax]
+ vpinsrd xmm5, xmm5, DWORD PTR [rax+8], 2
+ vpinsrd xmm5, xmm5, ecx, 3
+ ; H = Encrypt X(=0) and T = Encrypt counter
+ vmovdqa xmm6, OWORD PTR [r15]
+ vpxor xmm1, xmm5, xmm6
+ vmovdqa xmm4, OWORD PTR [r15+16]
+ vaesenc xmm6, xmm6, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vmovdqa xmm4, OWORD PTR [r15+32]
+ vaesenc xmm6, xmm6, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vmovdqa xmm4, OWORD PTR [r15+48]
+ vaesenc xmm6, xmm6, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vmovdqa xmm4, OWORD PTR [r15+64]
+ vaesenc xmm6, xmm6, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vmovdqa xmm4, OWORD PTR [r15+80]
+ vaesenc xmm6, xmm6, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vmovdqa xmm4, OWORD PTR [r15+96]
+ vaesenc xmm6, xmm6, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vmovdqa xmm4, OWORD PTR [r15+112]
+ vaesenc xmm6, xmm6, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vmovdqa xmm4, OWORD PTR [r15+128]
+ vaesenc xmm6, xmm6, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vmovdqa xmm4, OWORD PTR [r15+144]
+ vaesenc xmm6, xmm6, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ cmp r10d, 11
+ vmovdqa xmm4, OWORD PTR [r15+160]
+ jl L_AES_GCM_decrypt_vaes_calc_iv_12_last
+ vaesenc xmm6, xmm6, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vmovdqa xmm4, OWORD PTR [r15+176]
+ vaesenc xmm6, xmm6, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ cmp r10d, 13
+ vmovdqa xmm4, OWORD PTR [r15+192]
+ jl L_AES_GCM_decrypt_vaes_calc_iv_12_last
+ vaesenc xmm6, xmm6, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vmovdqa xmm4, OWORD PTR [r15+208]
+ vaesenc xmm6, xmm6, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vmovdqa xmm4, OWORD PTR [r15+224]
+L_AES_GCM_decrypt_vaes_calc_iv_12_last:
+ vaesenclast xmm6, xmm6, xmm4
+ vaesenclast xmm1, xmm1, xmm4
+ vpshufb xmm6, xmm6, OWORD PTR L_vaes_aes_gcm_bswap_mask
+ vmovdqu OWORD PTR [rsp+528], xmm1
+ jmp L_AES_GCM_decrypt_vaes_iv_done
+L_AES_GCM_decrypt_vaes_iv_not_12:
+ ; Calculate values when IV is not 12 bytes
+ ; H = Encrypt X(=0)
+ vmovdqa xmm6, OWORD PTR [r15]
+ vaesenc xmm6, xmm6, [r15+16]
+ vaesenc xmm6, xmm6, [r15+32]
+ vaesenc xmm6, xmm6, [r15+48]
+ vaesenc xmm6, xmm6, [r15+64]
+ vaesenc xmm6, xmm6, [r15+80]
+ vaesenc xmm6, xmm6, [r15+96]
+ vaesenc xmm6, xmm6, [r15+112]
+ vaesenc xmm6, xmm6, [r15+128]
+ vaesenc xmm6, xmm6, [r15+144]
+ cmp r10d, 11
+ vmovdqa xmm8, OWORD PTR [r15+160]
+ jl L_AES_GCM_decrypt_vaes_calc_iv_1_aesenc_avx_last
+ vaesenc xmm6, xmm6, xmm8
+ vaesenc xmm6, xmm6, [r15+176]
+ cmp r10d, 13
+ vmovdqa xmm8, OWORD PTR [r15+192]
+ jl L_AES_GCM_decrypt_vaes_calc_iv_1_aesenc_avx_last
+ vaesenc xmm6, xmm6, xmm8
+ vaesenc xmm6, xmm6, [r15+208]
+ vmovdqa xmm8, OWORD PTR [r15+224]
+L_AES_GCM_decrypt_vaes_calc_iv_1_aesenc_avx_last:
+ vaesenclast xmm6, xmm6, xmm8
+ vpshufb xmm6, xmm6, OWORD PTR L_vaes_aes_gcm_bswap_mask
+ ; Calc counter
+ ; Initialization vector
+ cmp edx, 0
+ mov rcx, 0
+ je L_AES_GCM_decrypt_vaes_calc_iv_done
+ cmp edx, 16
+ jl L_AES_GCM_decrypt_vaes_calc_iv_lt16
+ and edx, 4294967280
+L_AES_GCM_decrypt_vaes_calc_iv_16_loop:
+ vmovdqu xmm7, OWORD PTR [rax+rcx]
+ vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask
+ vpxor xmm5, xmm5, xmm7
+ ; ghash_gfmul_avx
+ vpshufd xmm1, xmm5, 78
+ vpshufd xmm2, xmm6, 78
+ vpclmulqdq xmm3, xmm6, xmm5, 17
+ vpclmulqdq xmm0, xmm6, xmm5, 0
+ vpxor xmm1, xmm1, xmm5
+ vpxor xmm2, xmm2, xmm6
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vmovdqa xmm4, xmm0
+ vmovdqa xmm5, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm4, xmm4, xmm2
+ vpxor xmm5, xmm5, xmm1
+ vpsrld xmm0, xmm4, 31
+ vpsrld xmm1, xmm5, 31
+ vpslld xmm4, xmm4, 1
+ vpslld xmm5, xmm5, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm5, xmm5, xmm2
+ vpor xmm4, xmm4, xmm0
+ vpor xmm5, xmm5, xmm1
+ vpslld xmm0, xmm4, 31
+ vpslld xmm1, xmm4, 30
+ vpslld xmm2, xmm4, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm4, xmm4, xmm0
+ vpsrld xmm2, xmm4, 1
+ vpsrld xmm3, xmm4, 2
+ vpsrld xmm0, xmm4, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm4
+ vpxor xmm5, xmm5, xmm2
+ add ecx, 16
+ cmp ecx, edx
+ jl L_AES_GCM_decrypt_vaes_calc_iv_16_loop
+ mov edx, ebx
+ cmp ecx, edx
+ je L_AES_GCM_decrypt_vaes_calc_iv_done
+L_AES_GCM_decrypt_vaes_calc_iv_lt16:
+ sub rsp, 16
+ vpxor xmm7, xmm7, xmm7
+ xor ebx, ebx
+ vmovdqu OWORD PTR [rsp], xmm7
+L_AES_GCM_decrypt_vaes_calc_iv_loop:
+ movzx r13d, BYTE PTR [rax+rcx]
+ mov BYTE PTR [rsp+rbx], r13b
+ inc ecx
+ inc ebx
+ cmp ecx, edx
+ jl L_AES_GCM_decrypt_vaes_calc_iv_loop
+ vmovdqu xmm7, OWORD PTR [rsp]
+ add rsp, 16
+ vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask
+ vpxor xmm5, xmm5, xmm7
+ ; ghash_gfmul_avx
+ vpshufd xmm1, xmm5, 78
+ vpshufd xmm2, xmm6, 78
+ vpclmulqdq xmm3, xmm6, xmm5, 17
+ vpclmulqdq xmm0, xmm6, xmm5, 0
+ vpxor xmm1, xmm1, xmm5
+ vpxor xmm2, xmm2, xmm6
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vmovdqa xmm4, xmm0
+ vmovdqa xmm5, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm4, xmm4, xmm2
+ vpxor xmm5, xmm5, xmm1
+ vpsrld xmm0, xmm4, 31
+ vpsrld xmm1, xmm5, 31
+ vpslld xmm4, xmm4, 1
+ vpslld xmm5, xmm5, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm5, xmm5, xmm2
+ vpor xmm4, xmm4, xmm0
+ vpor xmm5, xmm5, xmm1
+ vpslld xmm0, xmm4, 31
+ vpslld xmm1, xmm4, 30
+ vpslld xmm2, xmm4, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm4, xmm4, xmm0
+ vpsrld xmm2, xmm4, 1
+ vpsrld xmm3, xmm4, 2
+ vpsrld xmm0, xmm4, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm4
+ vpxor xmm5, xmm5, xmm2
+L_AES_GCM_decrypt_vaes_calc_iv_done:
+ ; T = Encrypt counter
+ vpxor xmm0, xmm0, xmm0
+ shl edx, 3
+ vmovq xmm0, rdx
+ vpxor xmm5, xmm5, xmm0
+ ; ghash_gfmul_avx
+ vpshufd xmm1, xmm5, 78
+ vpshufd xmm2, xmm6, 78
+ vpclmulqdq xmm3, xmm6, xmm5, 17
+ vpclmulqdq xmm0, xmm6, xmm5, 0
+ vpxor xmm1, xmm1, xmm5
+ vpxor xmm2, xmm2, xmm6
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vmovdqa xmm4, xmm0
+ vmovdqa xmm5, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm4, xmm4, xmm2
+ vpxor xmm5, xmm5, xmm1
+ vpsrld xmm0, xmm4, 31
+ vpsrld xmm1, xmm5, 31
+ vpslld xmm4, xmm4, 1
+ vpslld xmm5, xmm5, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm5, xmm5, xmm2
+ vpor xmm4, xmm4, xmm0
+ vpor xmm5, xmm5, xmm1
+ vpslld xmm0, xmm4, 31
+ vpslld xmm1, xmm4, 30
+ vpslld xmm2, xmm4, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm4, xmm4, xmm0
+ vpsrld xmm2, xmm4, 1
+ vpsrld xmm3, xmm4, 2
+ vpsrld xmm0, xmm4, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm4
+ vpxor xmm5, xmm5, xmm2
+ vpshufb xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_bswap_mask
+ ; Encrypt counter
+ vmovdqa xmm7, OWORD PTR [r15]
+ vpxor xmm7, xmm7, xmm5
+ vaesenc xmm7, xmm7, [r15+16]
+ vaesenc xmm7, xmm7, [r15+32]
+ vaesenc xmm7, xmm7, [r15+48]
+ vaesenc xmm7, xmm7, [r15+64]
+ vaesenc xmm7, xmm7, [r15+80]
+ vaesenc xmm7, xmm7, [r15+96]
+ vaesenc xmm7, xmm7, [r15+112]
+ vaesenc xmm7, xmm7, [r15+128]
+ vaesenc xmm7, xmm7, [r15+144]
+ cmp r10d, 11
+ vmovdqa xmm8, OWORD PTR [r15+160]
+ jl L_AES_GCM_decrypt_vaes_calc_iv_2_aesenc_avx_last
+ vaesenc xmm7, xmm7, xmm8
+ vaesenc xmm7, xmm7, [r15+176]
+ cmp r10d, 13
+ vmovdqa xmm8, OWORD PTR [r15+192]
+ jl L_AES_GCM_decrypt_vaes_calc_iv_2_aesenc_avx_last
+ vaesenc xmm7, xmm7, xmm8
+ vaesenc xmm7, xmm7, [r15+208]
+ vmovdqa xmm8, OWORD PTR [r15+224]
+L_AES_GCM_decrypt_vaes_calc_iv_2_aesenc_avx_last:
+ vaesenclast xmm7, xmm7, xmm8
+ vmovdqu OWORD PTR [rsp+528], xmm7
+L_AES_GCM_decrypt_vaes_iv_done:
+ ; Additional authentication data
+ mov edx, r11d
+ cmp edx, 0
+ je L_AES_GCM_decrypt_vaes_calc_aad_done
+ xor ecx, ecx
+ cmp edx, 16
+ jl L_AES_GCM_decrypt_vaes_calc_aad_lt16
+ and edx, 4294967280
+L_AES_GCM_decrypt_vaes_calc_aad_16_loop:
+ vmovdqu xmm7, OWORD PTR [r12+rcx]
+ vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask
+ vpxor xmm15, xmm15, xmm7
+ ; ghash_gfmul_avx
+ vpshufd xmm1, xmm15, 78
+ vpshufd xmm2, xmm6, 78
+ vpclmulqdq xmm3, xmm6, xmm15, 17
+ vpclmulqdq xmm0, xmm6, xmm15, 0
+ vpxor xmm1, xmm1, xmm15
+ vpxor xmm2, xmm2, xmm6
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vmovdqa xmm4, xmm0
+ vmovdqa xmm15, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm4, xmm4, xmm2
+ vpxor xmm15, xmm15, xmm1
+ vpsrld xmm0, xmm4, 31
+ vpsrld xmm1, xmm15, 31
+ vpslld xmm4, xmm4, 1
+ vpslld xmm15, xmm15, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm15, xmm15, xmm2
+ vpor xmm4, xmm4, xmm0
+ vpor xmm15, xmm15, xmm1
+ vpslld xmm0, xmm4, 31
+ vpslld xmm1, xmm4, 30
+ vpslld xmm2, xmm4, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm4, xmm4, xmm0
+ vpsrld xmm2, xmm4, 1
+ vpsrld xmm3, xmm4, 2
+ vpsrld xmm0, xmm4, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm4
+ vpxor xmm15, xmm15, xmm2
+ add ecx, 16
+ cmp ecx, edx
+ jl L_AES_GCM_decrypt_vaes_calc_aad_16_loop
+ mov edx, r11d
+ cmp ecx, edx
+ je L_AES_GCM_decrypt_vaes_calc_aad_done
+L_AES_GCM_decrypt_vaes_calc_aad_lt16:
+ sub rsp, 16
+ vpxor xmm7, xmm7, xmm7
+ xor ebx, ebx
+ vmovdqu OWORD PTR [rsp], xmm7
+L_AES_GCM_decrypt_vaes_calc_aad_loop:
+ movzx r13d, BYTE PTR [r12+rcx]
+ mov BYTE PTR [rsp+rbx], r13b
+ inc ecx
+ inc ebx
+ cmp ecx, edx
+ jl L_AES_GCM_decrypt_vaes_calc_aad_loop
+ vmovdqu xmm7, OWORD PTR [rsp]
+ add rsp, 16
+ vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask
+ vpxor xmm15, xmm15, xmm7
+ ; ghash_gfmul_avx
+ vpshufd xmm1, xmm15, 78
+ vpshufd xmm2, xmm6, 78
+ vpclmulqdq xmm3, xmm6, xmm15, 17
+ vpclmulqdq xmm0, xmm6, xmm15, 0
+ vpxor xmm1, xmm1, xmm15
+ vpxor xmm2, xmm2, xmm6
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vmovdqa xmm4, xmm0
+ vmovdqa xmm15, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm4, xmm4, xmm2
+ vpxor xmm15, xmm15, xmm1
+ vpsrld xmm0, xmm4, 31
+ vpsrld xmm1, xmm15, 31
+ vpslld xmm4, xmm4, 1
+ vpslld xmm15, xmm15, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm15, xmm15, xmm2
+ vpor xmm4, xmm4, xmm0
+ vpor xmm15, xmm15, xmm1
+ vpslld xmm0, xmm4, 31
+ vpslld xmm1, xmm4, 30
+ vpslld xmm2, xmm4, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm4, xmm4, xmm0
+ vpsrld xmm2, xmm4, 1
+ vpsrld xmm3, xmm4, 2
+ vpsrld xmm0, xmm4, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm4
+ vpxor xmm15, xmm15, xmm2
+L_AES_GCM_decrypt_vaes_calc_aad_done:
+ ; Calculate counter and H
+ vpsrlq xmm8, xmm6, 63
+ vpsllq xmm7, xmm6, 1
+ vpslldq xmm8, xmm8, 8
+ vpor xmm7, xmm7, xmm8
+ vpshufd xmm6, xmm6, 255
+ vpsrad xmm6, xmm6, 31
+ vpshufb xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_bswap_epi64
+ vpand xmm6, xmm6, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpaddd xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_one
+ vpxor xmm6, xmm6, xmm7
+ vmovdqu OWORD PTR [rsp+512], xmm5
+ xor ebx, ebx
+ cmp r9d, 128
+ jl L_AES_GCM_decrypt_vaes_done_128
+ vmovdqa xmm2, xmm15
+ ; H ^ 1
+ vmovdqu OWORD PTR [rsp], xmm6
+ ; H ^ 2
+ vpclmulqdq xmm7, xmm6, xmm6, 0
+ vpclmulqdq xmm10, xmm6, xmm6, 17
+ vpxor xmm8, xmm8, xmm8
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm0, xmm10
+ vmovdqu OWORD PTR [rsp+16], xmm0
+ ; H ^ 3
+ ; ghash_gfmul_red_avx
+ vpclmulqdq xmm7, xmm0, xmm6, 0
+ vpclmulqdq xmm8, xmm0, xmm6, 1
+ vpclmulqdq xmm9, xmm0, xmm6, 16
+ vpclmulqdq xmm10, xmm0, xmm6, 17
+ vpxor xmm8, xmm8, xmm9
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm1, xmm10
+ vmovdqu OWORD PTR [rsp+32], xmm1
+ ; H ^ 4
+ vpclmulqdq xmm7, xmm0, xmm0, 0
+ vpclmulqdq xmm10, xmm0, xmm0, 17
+ vpxor xmm8, xmm8, xmm8
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm3, xmm10
+ vmovdqu OWORD PTR [rsp+48], xmm3
+ ; H ^ 5
+ ; ghash_gfmul_red_avx
+ vpclmulqdq xmm7, xmm1, xmm0, 0
+ vpclmulqdq xmm8, xmm1, xmm0, 1
+ vpclmulqdq xmm9, xmm1, xmm0, 16
+ vpclmulqdq xmm10, xmm1, xmm0, 17
+ vpxor xmm8, xmm8, xmm9
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+64], xmm4
+ ; H ^ 6
+ vpclmulqdq xmm7, xmm1, xmm1, 0
+ vpclmulqdq xmm10, xmm1, xmm1, 17
+ vpxor xmm8, xmm8, xmm8
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm4
+ ; H ^ 7
+ ; ghash_gfmul_red_avx
+ vpclmulqdq xmm7, xmm3, xmm1, 0
+ vpclmulqdq xmm8, xmm3, xmm1, 1
+ vpclmulqdq xmm9, xmm3, xmm1, 16
+ vpclmulqdq xmm10, xmm3, xmm1, 17
+ vpxor xmm8, xmm8, xmm9
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+96], xmm4
+ ; H ^ 8
+ vpclmulqdq xmm7, xmm3, xmm3, 0
+ vpclmulqdq xmm10, xmm3, xmm3, 17
+ vpxor xmm8, xmm8, xmm8
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+112], xmm4
+ cmp r9d, 256
+ jl L_AES_GCM_decrypt_vaes_no_ext
+ ; H ^ 9
+ vmovdqu xmm0, OWORD PTR [rsp+48]
+ vmovdqu xmm1, OWORD PTR [rsp+64]
+ ; ghash_gfmul_red_avx
+ vpclmulqdq xmm7, xmm1, xmm0, 0
+ vpclmulqdq xmm8, xmm1, xmm0, 1
+ vpclmulqdq xmm9, xmm1, xmm0, 16
+ vpclmulqdq xmm10, xmm1, xmm0, 17
+ vpxor xmm8, xmm8, xmm9
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+128], xmm4
+ ; H ^ 10
+ vmovdqu xmm0, OWORD PTR [rsp+64]
+ vpclmulqdq xmm7, xmm0, xmm0, 0
+ vpclmulqdq xmm10, xmm0, xmm0, 17
+ vpxor xmm8, xmm8, xmm8
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+144], xmm4
+ ; H ^ 11
+ vmovdqu xmm0, OWORD PTR [rsp+64]
+ vmovdqu xmm1, OWORD PTR [rsp+80]
+ ; ghash_gfmul_red_avx
+ vpclmulqdq xmm7, xmm1, xmm0, 0
+ vpclmulqdq xmm8, xmm1, xmm0, 1
+ vpclmulqdq xmm9, xmm1, xmm0, 16
+ vpclmulqdq xmm10, xmm1, xmm0, 17
+ vpxor xmm8, xmm8, xmm9
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+160], xmm4
+ ; H ^ 12
+ vmovdqu xmm0, OWORD PTR [rsp+80]
+ vpclmulqdq xmm7, xmm0, xmm0, 0
+ vpclmulqdq xmm10, xmm0, xmm0, 17
+ vpxor xmm8, xmm8, xmm8
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+176], xmm4
+ ; H ^ 13
+ vmovdqu xmm0, OWORD PTR [rsp+80]
+ vmovdqu xmm1, OWORD PTR [rsp+96]
+ ; ghash_gfmul_red_avx
+ vpclmulqdq xmm7, xmm1, xmm0, 0
+ vpclmulqdq xmm8, xmm1, xmm0, 1
+ vpclmulqdq xmm9, xmm1, xmm0, 16
+ vpclmulqdq xmm10, xmm1, xmm0, 17
+ vpxor xmm8, xmm8, xmm9
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+192], xmm4
+ ; H ^ 14
+ vmovdqu xmm0, OWORD PTR [rsp+96]
+ vpclmulqdq xmm7, xmm0, xmm0, 0
+ vpclmulqdq xmm10, xmm0, xmm0, 17
+ vpxor xmm8, xmm8, xmm8
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+208], xmm4
+ ; H ^ 15
+ vmovdqu xmm0, OWORD PTR [rsp+96]
+ vmovdqu xmm1, OWORD PTR [rsp+112]
+ ; ghash_gfmul_red_avx
+ vpclmulqdq xmm7, xmm1, xmm0, 0
+ vpclmulqdq xmm8, xmm1, xmm0, 1
+ vpclmulqdq xmm9, xmm1, xmm0, 16
+ vpclmulqdq xmm10, xmm1, xmm0, 17
+ vpxor xmm8, xmm8, xmm9
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+224], xmm4
+ ; H ^ 16
+ vmovdqu xmm0, OWORD PTR [rsp+112]
+ vpclmulqdq xmm7, xmm0, xmm0, 0
+ vpclmulqdq xmm10, xmm0, xmm0, 17
+ vpxor xmm8, xmm8, xmm8
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+240], xmm4
+ vmovdqu ymm7, YMMWORD PTR [rsp+224]
+ vpermq ymm7, ymm7, 78
+ vmovdqu ymm8, YMMWORD PTR [rsp+192]
+ vpermq ymm8, ymm8, 78
+ vmovdqu ymm9, YMMWORD PTR [rsp+160]
+ vpermq ymm9, ymm9, 78
+ vmovdqu ymm10, YMMWORD PTR [rsp+128]
+ vpermq ymm10, ymm10, 78
+ vmovdqu YMMWORD PTR [rsp+256], ymm7
+ vmovdqu YMMWORD PTR [rsp+288], ymm8
+ vmovdqu YMMWORD PTR [rsp+320], ymm9
+ vmovdqu YMMWORD PTR [rsp+352], ymm10
+ vmovdqu ymm7, YMMWORD PTR [rsp+96]
+ vpermq ymm7, ymm7, 78
+ vmovdqu ymm8, YMMWORD PTR [rsp+64]
+ vpermq ymm8, ymm8, 78
+ vmovdqu ymm9, YMMWORD PTR [rsp+32]
+ vpermq ymm9, ymm9, 78
+ vmovdqu ymm10, YMMWORD PTR [rsp]
+ vpermq ymm10, ymm10, 78
+ vmovdqu YMMWORD PTR [rsp+384], ymm7
+ vmovdqu YMMWORD PTR [rsp+416], ymm8
+ vmovdqu YMMWORD PTR [rsp+448], ymm9
+ vmovdqu YMMWORD PTR [rsp+480], ymm10
+L_AES_GCM_decrypt_vaes_no_ext:
+ vbroadcasti128 ymm14, ptr_L_vaes_aes_gcm_mod2_128
+ cmp r9d, 256
+ jl L_AES_GCM_decrypt_vaes_after_256
+ mov r13d, r9d
+ and r13d, 4294967040
+L_AES_GCM_decrypt_vaes_loop_256:
+ ; 256 bytes of input
+ lea rax, QWORD PTR [rdi+rbx]
+ vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_mask
+ vpxor ymm4, ymm4, ymm4
+ vinserti128 ymm4, ymm4, xmm15, 0
+ vmovdqu ymm7, YMMWORD PTR [rsp+256]
+ vmovdqu ymm8, YMMWORD PTR [rsp+288]
+ vmovdqu ymm9, YMMWORD PTR [rsp+320]
+ vmovdqu ymm10, YMMWORD PTR [rsp+352]
+ vmovdqu ymm5, YMMWORD PTR [rax]
+ vpshufb ymm5, ymm5, ymm6
+ vpxor ymm5, ymm5, ymm4
+ vpclmulqdq ymm0, ymm5, ymm7, 0
+ vpclmulqdq ymm1, ymm5, ymm7, 1
+ vpclmulqdq ymm2, ymm5, ymm7, 16
+ vpclmulqdq ymm3, ymm5, ymm7, 17
+ vmovdqa ymm11, ymm0
+ vpxor ymm12, ymm2, ymm1
+ vmovdqa ymm13, ymm3
+ vmovdqu ymm5, YMMWORD PTR [rax+32]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm8, 0
+ vpclmulqdq ymm1, ymm5, ymm8, 1
+ vpclmulqdq ymm2, ymm5, ymm8, 16
+ vpclmulqdq ymm3, ymm5, ymm8, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vmovdqu ymm5, YMMWORD PTR [rax+64]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm9, 0
+ vpclmulqdq ymm1, ymm5, ymm9, 1
+ vpclmulqdq ymm2, ymm5, ymm9, 16
+ vpclmulqdq ymm3, ymm5, ymm9, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vmovdqu ymm5, YMMWORD PTR [rax+96]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm10, 0
+ vpclmulqdq ymm1, ymm5, ymm10, 1
+ vpclmulqdq ymm2, ymm5, ymm10, 16
+ vpclmulqdq ymm3, ymm5, ymm10, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vmovdqu ymm7, YMMWORD PTR [rsp+384]
+ vmovdqu ymm8, YMMWORD PTR [rsp+416]
+ vmovdqu ymm9, YMMWORD PTR [rsp+448]
+ vmovdqu ymm10, YMMWORD PTR [rsp+480]
+ vmovdqu ymm5, YMMWORD PTR [rax+128]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm7, 0
+ vpclmulqdq ymm1, ymm5, ymm7, 1
+ vpclmulqdq ymm2, ymm5, ymm7, 16
+ vpclmulqdq ymm3, ymm5, ymm7, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vmovdqu ymm5, YMMWORD PTR [rax+160]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm8, 0
+ vpclmulqdq ymm1, ymm5, ymm8, 1
+ vpclmulqdq ymm2, ymm5, ymm8, 16
+ vpclmulqdq ymm3, ymm5, ymm8, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vmovdqu ymm5, YMMWORD PTR [rax+192]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm9, 0
+ vpclmulqdq ymm1, ymm5, ymm9, 1
+ vpclmulqdq ymm2, ymm5, ymm9, 16
+ vpclmulqdq ymm3, ymm5, ymm9, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vmovdqu ymm5, YMMWORD PTR [rax+224]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm10, 0
+ vpclmulqdq ymm1, ymm5, ymm10, 1
+ vpclmulqdq ymm2, ymm5, ymm10, 16
+ vpclmulqdq ymm3, ymm5, ymm10, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vpclmulqdq ymm5, ymm14, ymm11, 1
+ vpshufd ymm11, ymm11, 78
+ vpxor ymm12, ymm12, ymm5
+ vpxor ymm12, ymm12, ymm11
+ vpclmulqdq ymm5, ymm14, ymm12, 1
+ vpshufd ymm12, ymm12, 78
+ vpxor ymm13, ymm13, ymm5
+ vpxor ymm13, ymm13, ymm12
+ vextracti128 xmm0, ymm13, 1
+ vpxor xmm15, xmm13, xmm0
+ vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64
+ vbroadcasti128 ymm4, [rsp+512]
+ vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0
+ vpshufb ymm0, ymm0, ymm6
+ vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1
+ vpshufb ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y2
+ vpshufb ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y3
+ vpshufb ymm3, ymm3, ymm6
+ vmovdqu xmm7, OWORD PTR [rsp+512]
+ vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight
+ vmovdqu OWORD PTR [rsp+512], xmm7
+ vbroadcasti128 ymm4, [r15]
+ vpxor ymm0, ymm0, ymm4
+ vpxor ymm1, ymm1, ymm4
+ vpxor ymm2, ymm2, ymm4
+ vpxor ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+16]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+32]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+48]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+64]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+80]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+96]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+112]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+128]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+144]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ cmp r10d, 11
+ vbroadcasti128 ymm4, [r15+160]
+ jl L_AES_GCM_decrypt_vaes_l1_vaes_ctr8_last
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+176]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ cmp r10d, 13
+ vbroadcasti128 ymm4, [r15+192]
+ jl L_AES_GCM_decrypt_vaes_l1_vaes_ctr8_last
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+208]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+224]
+L_AES_GCM_decrypt_vaes_l1_vaes_ctr8_last:
+ vaesenclast ymm0, ymm0, ymm4
+ vaesenclast ymm1, ymm1, ymm4
+ vaesenclast ymm2, ymm2, ymm4
+ vaesenclast ymm3, ymm3, ymm4
+ lea rcx, QWORD PTR [rdi+rbx]
+ lea rdx, QWORD PTR [rsi+rbx]
+ vmovdqu ymm5, YMMWORD PTR [rcx]
+ vpxor ymm0, ymm0, ymm5
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ vmovdqu ymm5, YMMWORD PTR [rcx+32]
+ vpxor ymm1, ymm1, ymm5
+ vmovdqu YMMWORD PTR [rdx+32], ymm1
+ vmovdqu ymm5, YMMWORD PTR [rcx+64]
+ vpxor ymm2, ymm2, ymm5
+ vmovdqu YMMWORD PTR [rdx+64], ymm2
+ vmovdqu ymm5, YMMWORD PTR [rcx+96]
+ vpxor ymm3, ymm3, ymm5
+ vmovdqu YMMWORD PTR [rdx+96], ymm3
+ add ebx, 128
+ vbroadcasti128 ymm4, [rsp+512]
+ vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0
+ vpshufb ymm0, ymm0, ymm6
+ vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1
+ vpshufb ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y2
+ vpshufb ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y3
+ vpshufb ymm3, ymm3, ymm6
+ vmovdqu xmm7, OWORD PTR [rsp+512]
+ vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight
+ vmovdqu OWORD PTR [rsp+512], xmm7
+ vbroadcasti128 ymm4, [r15]
+ vpxor ymm0, ymm0, ymm4
+ vpxor ymm1, ymm1, ymm4
+ vpxor ymm2, ymm2, ymm4
+ vpxor ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+16]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+32]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+48]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+64]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+80]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+96]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+112]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+128]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+144]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ cmp r10d, 11
+ vbroadcasti128 ymm4, [r15+160]
+ jl L_AES_GCM_decrypt_vaes_l2_vaes_ctr8_last
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+176]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ cmp r10d, 13
+ vbroadcasti128 ymm4, [r15+192]
+ jl L_AES_GCM_decrypt_vaes_l2_vaes_ctr8_last
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+208]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+224]
+L_AES_GCM_decrypt_vaes_l2_vaes_ctr8_last:
+ vaesenclast ymm0, ymm0, ymm4
+ vaesenclast ymm1, ymm1, ymm4
+ vaesenclast ymm2, ymm2, ymm4
+ vaesenclast ymm3, ymm3, ymm4
+ lea rcx, QWORD PTR [rdi+rbx]
+ lea rdx, QWORD PTR [rsi+rbx]
+ vmovdqu ymm5, YMMWORD PTR [rcx]
+ vpxor ymm0, ymm0, ymm5
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ vmovdqu ymm5, YMMWORD PTR [rcx+32]
+ vpxor ymm1, ymm1, ymm5
+ vmovdqu YMMWORD PTR [rdx+32], ymm1
+ vmovdqu ymm5, YMMWORD PTR [rcx+64]
+ vpxor ymm2, ymm2, ymm5
+ vmovdqu YMMWORD PTR [rdx+64], ymm2
+ vmovdqu ymm5, YMMWORD PTR [rcx+96]
+ vpxor ymm3, ymm3, ymm5
+ vmovdqu YMMWORD PTR [rdx+96], ymm3
+ add ebx, 128
+ cmp ebx, r13d
+ jl L_AES_GCM_decrypt_vaes_loop_256
+L_AES_GCM_decrypt_vaes_after_256:
+ vmovdqu ymm7, YMMWORD PTR [rsp+96]
+ vpermq ymm7, ymm7, 78
+ vmovdqu ymm8, YMMWORD PTR [rsp+64]
+ vpermq ymm8, ymm8, 78
+ vmovdqu ymm9, YMMWORD PTR [rsp+32]
+ vpermq ymm9, ymm9, 78
+ vmovdqu ymm10, YMMWORD PTR [rsp]
+ vpermq ymm10, ymm10, 78
+ mov r13d, r9d
+ and r13d, 4294967168
+ cmp ebx, r13d
+ jge L_AES_GCM_decrypt_vaes_after_128
+ ; 128 bytes of input
+ lea rax, QWORD PTR [rdi+rbx]
+ vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_mask
+ vpxor ymm4, ymm4, ymm4
+ vinserti128 ymm4, ymm4, xmm15, 0
+ vmovdqu ymm5, YMMWORD PTR [rax]
+ vpshufb ymm5, ymm5, ymm6
+ vpxor ymm5, ymm5, ymm4
+ vpclmulqdq ymm0, ymm5, ymm7, 0
+ vpclmulqdq ymm1, ymm5, ymm7, 1
+ vpclmulqdq ymm2, ymm5, ymm7, 16
+ vpclmulqdq ymm3, ymm5, ymm7, 17
+ vmovdqa ymm11, ymm0
+ vpxor ymm12, ymm2, ymm1
+ vmovdqa ymm13, ymm3
+ vmovdqu ymm5, YMMWORD PTR [rax+32]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm8, 0
+ vpclmulqdq ymm1, ymm5, ymm8, 1
+ vpclmulqdq ymm2, ymm5, ymm8, 16
+ vpclmulqdq ymm3, ymm5, ymm8, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vmovdqu ymm5, YMMWORD PTR [rax+64]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm9, 0
+ vpclmulqdq ymm1, ymm5, ymm9, 1
+ vpclmulqdq ymm2, ymm5, ymm9, 16
+ vpclmulqdq ymm3, ymm5, ymm9, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vmovdqu ymm5, YMMWORD PTR [rax+96]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm10, 0
+ vpclmulqdq ymm1, ymm5, ymm10, 1
+ vpclmulqdq ymm2, ymm5, ymm10, 16
+ vpclmulqdq ymm3, ymm5, ymm10, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vpclmulqdq ymm5, ymm14, ymm11, 1
+ vpshufd ymm11, ymm11, 78
+ vpxor ymm12, ymm12, ymm5
+ vpxor ymm12, ymm12, ymm11
+ vpclmulqdq ymm5, ymm14, ymm12, 1
+ vpshufd ymm12, ymm12, 78
+ vpxor ymm13, ymm13, ymm5
+ vpxor ymm13, ymm13, ymm12
+ vextracti128 xmm0, ymm13, 1
+ vpxor xmm15, xmm13, xmm0
+ vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64
+ vbroadcasti128 ymm4, [rsp+512]
+ vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0
+ vpshufb ymm0, ymm0, ymm6
+ vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1
+ vpshufb ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y2
+ vpshufb ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y3
+ vpshufb ymm3, ymm3, ymm6
+ vmovdqu xmm7, OWORD PTR [rsp+512]
+ vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight
+ vmovdqu OWORD PTR [rsp+512], xmm7
+ vbroadcasti128 ymm4, [r15]
+ vpxor ymm0, ymm0, ymm4
+ vpxor ymm1, ymm1, ymm4
+ vpxor ymm2, ymm2, ymm4
+ vpxor ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+16]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+32]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+48]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+64]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+80]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+96]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+112]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+128]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+144]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ cmp r10d, 11
+ vbroadcasti128 ymm4, [r15+160]
+ jl L_AES_GCM_decrypt_vaes_t_vaes_ctr8_last
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+176]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ cmp r10d, 13
+ vbroadcasti128 ymm4, [r15+192]
+ jl L_AES_GCM_decrypt_vaes_t_vaes_ctr8_last
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+208]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [r15+224]
+L_AES_GCM_decrypt_vaes_t_vaes_ctr8_last:
+ vaesenclast ymm0, ymm0, ymm4
+ vaesenclast ymm1, ymm1, ymm4
+ vaesenclast ymm2, ymm2, ymm4
+ vaesenclast ymm3, ymm3, ymm4
+ lea rcx, QWORD PTR [rdi+rbx]
+ lea rdx, QWORD PTR [rsi+rbx]
+ vmovdqu ymm5, YMMWORD PTR [rcx]
+ vpxor ymm0, ymm0, ymm5
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ vmovdqu ymm5, YMMWORD PTR [rcx+32]
+ vpxor ymm1, ymm1, ymm5
+ vmovdqu YMMWORD PTR [rdx+32], ymm1
+ vmovdqu ymm5, YMMWORD PTR [rcx+64]
+ vpxor ymm2, ymm2, ymm5
+ vmovdqu YMMWORD PTR [rdx+64], ymm2
+ vmovdqu ymm5, YMMWORD PTR [rcx+96]
+ vpxor ymm3, ymm3, ymm5
+ vmovdqu YMMWORD PTR [rdx+96], ymm3
+ add ebx, 128
+L_AES_GCM_decrypt_vaes_after_128:
+ vmovdqu xmm6, OWORD PTR [rsp]
+L_AES_GCM_decrypt_vaes_done_128:
+ mov edx, r9d
+ cmp ebx, edx
+ jge L_AES_GCM_decrypt_vaes_done_dec
+ mov r13d, r9d
+ and r13d, 4294967280
+ cmp ebx, r13d
+ jge L_AES_GCM_decrypt_vaes_last_block_done
+L_AES_GCM_decrypt_vaes_last_block_start:
+ vmovdqu xmm12, OWORD PTR [rdi+rbx]
+ vmovdqa xmm0, xmm6
+ vpshufb xmm1, xmm12, OWORD PTR L_vaes_aes_gcm_bswap_mask
+ vpxor xmm1, xmm1, xmm15
+ vmovdqu xmm8, OWORD PTR [rsp+512]
+ vpshufb xmm7, xmm8, OWORD PTR L_vaes_aes_gcm_bswap_epi64
+ vpaddd xmm8, xmm8, OWORD PTR L_vaes_aes_gcm_one
+ vmovdqu OWORD PTR [rsp+512], xmm8
+ vpxor xmm7, xmm7, [r15]
+ vpclmulqdq xmm9, xmm1, xmm0, 16
+ vaesenc xmm7, xmm7, [r15+16]
+ vaesenc xmm7, xmm7, [r15+32]
+ vpclmulqdq xmm10, xmm1, xmm0, 1
+ vaesenc xmm7, xmm7, [r15+48]
+ vaesenc xmm7, xmm7, [r15+64]
+ vpclmulqdq xmm11, xmm1, xmm0, 0
+ vaesenc xmm7, xmm7, [r15+80]
+ vpclmulqdq xmm1, xmm1, xmm0, 17
+ vaesenc xmm7, xmm7, [r15+96]
+ vpxor xmm9, xmm9, xmm10
+ vpslldq xmm2, xmm9, 8
+ vpsrldq xmm9, xmm9, 8
+ vaesenc xmm7, xmm7, [r15+112]
+ vpxor xmm2, xmm2, xmm11
+ vpxor xmm3, xmm1, xmm9
+ vmovdqa xmm0, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm10, xmm2, xmm0, 16
+ vaesenc xmm7, xmm7, [r15+128]
+ vpshufd xmm9, xmm2, 78
+ vpxor xmm9, xmm9, xmm10
+ vpclmulqdq xmm10, xmm9, xmm0, 16
+ vaesenc xmm7, xmm7, [r15+144]
+ vpshufd xmm9, xmm9, 78
+ vpxor xmm9, xmm9, xmm10
+ vpxor xmm15, xmm9, xmm3
+ cmp r10d, 11
+ vmovdqa xmm8, OWORD PTR [r15+160]
+ jl L_AES_GCM_decrypt_vaes_aesenc_gfmul_last
+ vaesenc xmm7, xmm7, xmm8
+ vaesenc xmm7, xmm7, [r15+176]
+ cmp r10d, 13
+ vmovdqa xmm8, OWORD PTR [r15+192]
+ jl L_AES_GCM_decrypt_vaes_aesenc_gfmul_last
+ vaesenc xmm7, xmm7, xmm8
+ vaesenc xmm7, xmm7, [r15+208]
+ vmovdqa xmm8, OWORD PTR [r15+224]
+L_AES_GCM_decrypt_vaes_aesenc_gfmul_last:
+ vaesenclast xmm7, xmm7, xmm8
+ vmovdqa xmm0, xmm12
+ vpxor xmm7, xmm7, xmm0
+ vmovdqu OWORD PTR [rsi+rbx], xmm7
+ add ebx, 16
+ cmp ebx, r13d
+ jl L_AES_GCM_decrypt_vaes_last_block_start
+L_AES_GCM_decrypt_vaes_last_block_done:
+ mov ecx, r9d
+ mov edx, ecx
+ and ecx, 15
+ jz L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_done
+ vmovdqu xmm5, OWORD PTR [rsp+512]
+ vpshufb xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_bswap_epi64
+ vpxor xmm5, xmm5, [r15]
+ vaesenc xmm5, xmm5, [r15+16]
+ vaesenc xmm5, xmm5, [r15+32]
+ vaesenc xmm5, xmm5, [r15+48]
+ vaesenc xmm5, xmm5, [r15+64]
+ vaesenc xmm5, xmm5, [r15+80]
+ vaesenc xmm5, xmm5, [r15+96]
+ vaesenc xmm5, xmm5, [r15+112]
+ vaesenc xmm5, xmm5, [r15+128]
+ vaesenc xmm5, xmm5, [r15+144]
+ cmp r10d, 11
+ vmovdqa xmm8, OWORD PTR [r15+160]
+ jl L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_aesenc_avx_last
+ vaesenc xmm5, xmm5, xmm8
+ vaesenc xmm5, xmm5, [r15+176]
+ cmp r10d, 13
+ vmovdqa xmm8, OWORD PTR [r15+192]
+ jl L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_aesenc_avx_last
+ vaesenc xmm5, xmm5, xmm8
+ vaesenc xmm5, xmm5, [r15+208]
+ vmovdqa xmm8, OWORD PTR [r15+224]
+L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_aesenc_avx_last:
+ vaesenclast xmm5, xmm5, xmm8
+ sub rsp, 32
+ xor ecx, ecx
+ vmovdqu OWORD PTR [rsp], xmm5
+ vpxor xmm0, xmm0, xmm0
+ vmovdqu OWORD PTR [rsp+16], xmm0
+L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_loop:
+ movzx r13d, BYTE PTR [rdi+rbx]
+ mov BYTE PTR [rsp+rcx+16], r13b
+ xor r13b, BYTE PTR [rsp+rcx]
+ mov BYTE PTR [rsi+rbx], r13b
+ inc ebx
+ inc ecx
+ cmp ebx, edx
+ jl L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_loop
+ vmovdqu xmm5, OWORD PTR [rsp+16]
+ add rsp, 32
+ vpshufb xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_bswap_mask
+ vpxor xmm15, xmm15, xmm5
+ ; ghash_gfmul_red_avx
+ vpclmulqdq xmm7, xmm15, xmm6, 0
+ vpclmulqdq xmm8, xmm15, xmm6, 1
+ vpclmulqdq xmm9, xmm15, xmm6, 16
+ vpclmulqdq xmm10, xmm15, xmm6, 17
+ vpxor xmm8, xmm8, xmm9
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm15, xmm10
+L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_done:
+L_AES_GCM_decrypt_vaes_done_dec:
+ mov edx, r9d
+ mov ecx, r11d
+ shl rdx, 3
+ shl rcx, 3
+ vmovq xmm0, rdx
+ vmovq xmm1, rcx
+ vpunpcklqdq xmm0, xmm0, xmm1
+ vpxor xmm15, xmm15, xmm0
+ ; ghash_gfmul_red_avx
+ vpclmulqdq xmm7, xmm15, xmm6, 0
+ vpclmulqdq xmm8, xmm15, xmm6, 1
+ vpclmulqdq xmm9, xmm15, xmm6, 16
+ vpclmulqdq xmm10, xmm15, xmm6, 17
+ vpxor xmm8, xmm8, xmm9
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm15, xmm10
+ vpshufb xmm15, xmm15, OWORD PTR L_vaes_aes_gcm_bswap_mask
+ vmovdqu xmm0, OWORD PTR [rsp+528]
+ vpxor xmm0, xmm0, xmm15
+ cmp r14d, 16
+ je L_AES_GCM_decrypt_vaes_cmp_tag_16
+ sub rsp, 16
+ xor rcx, rcx
+ xor rbx, rbx
+ vmovdqu OWORD PTR [rsp], xmm0
+L_AES_GCM_decrypt_vaes_cmp_tag_loop:
+ movzx r13d, BYTE PTR [rsp+rcx]
+ xor r13b, BYTE PTR [r8+rcx]
+ or bl, r13b
+ inc ecx
+ cmp ecx, r14d
+ jne L_AES_GCM_decrypt_vaes_cmp_tag_loop
+ cmp bl, 0
+ sete bl
+ add rsp, 16
+ xor rcx, rcx
+ jmp L_AES_GCM_decrypt_vaes_cmp_tag_done
+L_AES_GCM_decrypt_vaes_cmp_tag_16:
+ vmovdqu xmm1, OWORD PTR [r8]
+ vpcmpeqb xmm0, xmm0, xmm1
+ vpmovmskb rdx, xmm0
+ ; %%edx == 0xFFFF then return 1 else => return 0
+ xor ebx, ebx
+ cmp edx, 65535
+ sete bl
+L_AES_GCM_decrypt_vaes_cmp_tag_done:
+ mov DWORD PTR [rbp], ebx
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp+544]
+ vmovdqu xmm7, OWORD PTR [rsp+560]
+ vmovdqu xmm8, OWORD PTR [rsp+576]
+ vmovdqu xmm9, OWORD PTR [rsp+592]
+ vmovdqu xmm10, OWORD PTR [rsp+608]
+ vmovdqu xmm11, OWORD PTR [rsp+624]
+ vmovdqu xmm12, OWORD PTR [rsp+640]
+ vmovdqu xmm13, OWORD PTR [rsp+656]
+ vmovdqu xmm14, OWORD PTR [rsp+672]
+ vmovdqu xmm15, OWORD PTR [rsp+688]
+ add rsp, 704
+ pop rbp
+ pop r15
+ pop r14
+ pop rbx
+ pop r12
+ pop rsi
+ pop rdi
+ pop r13
+ ret
+AES_GCM_decrypt_vaes ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+; AES_GCM_init_vaes
+;
+; Derives the GCM hash key H and the initial counter values from an expanded
+; AES key and an IV.
+;
+; Register/stack inputs as read by the code below (layout is consistent with
+; the Microsoft x64 calling convention - TODO confirm against the C prototype):
+;   rcx        -> base of the AES round keys (16 bytes each, read with vmovdqa,
+;                 so the pointer must be 16-byte aligned)
+;   edx        -> round count; compared against 11 and 13 to choose between
+;                 10, 12 or 14 rounds
+;   r8         -> pointer to the IV bytes
+;   r9d        -> IV length in bytes
+;   [rsp+72]   -> (after the four pushes) pointer that receives H (xmm5)
+;   [rsp+80]   -> pointer that receives the working counter block (xmm4)
+;   [rsp+88]   -> pointer that receives the encrypted initial counter (xmm15)
+; All three output pointers are written with vmovdqa (16-byte aligned stores).
+;
+; Preserves rdi, rsi, r12, r13 and xmm6, xmm7, xmm8, xmm15.
+AES_GCM_init_vaes PROC
+ push rdi
+ push rsi
+ push r12
+ push r13
+ ; Move the arguments into the registers used by the rest of the routine.
+ mov rdi, rcx
+ mov rsi, rdx
+ mov r10, r8
+ mov r11d, r9d
+ ; 72 = 4 pushes (32) + return address (8) + 32 bytes before the 5th argument.
+ mov rax, QWORD PTR [rsp+72]
+ mov r8, QWORD PTR [rsp+80]
+ mov r9, QWORD PTR [rsp+88]
+ ; Save the xmm registers this routine overwrites; [rsp+0..15] is left unused.
+ sub rsp, 80
+ vmovdqu OWORD PTR [rsp+16], xmm6
+ vmovdqu OWORD PTR [rsp+32], xmm7
+ vmovdqu OWORD PTR [rsp+48], xmm8
+ vmovdqu OWORD PTR [rsp+64], xmm15
+ ; xmm4 is the counter / GHASH accumulator; start from zero.
+ vpxor xmm4, xmm4, xmm4
+ mov edx, r11d
+ cmp edx, 12
+ jne L_AES_GCM_init_vaes_iv_not_12
+ ; # Calculate values when IV is 12 bytes
+ ; Set counter based on IV
+ ; 16777216 = 0x01000000: placed in the top dword so the last byte in memory
+ ; order is 1, i.e. counter block = IV || 00 00 00 01.
+ mov ecx, 16777216
+ vmovq xmm4, QWORD PTR [r10]
+ vpinsrd xmm4, xmm4, DWORD PTR [r10+8], 2
+ vpinsrd xmm4, xmm4, ecx, 3
+ ; H = Encrypt X(=0) and T = Encrypt counter
+ ; xmm5 starts as round key 0 (= 0 XOR key), xmm1 as counter XOR round key 0;
+ ; both blocks go through the same rounds in parallel.
+ vmovdqa xmm5, OWORD PTR [rdi]
+ vpxor xmm1, xmm4, xmm5
+ vmovdqa xmm6, OWORD PTR [rdi+16]
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ vmovdqa xmm6, OWORD PTR [rdi+32]
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ vmovdqa xmm6, OWORD PTR [rdi+48]
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ vmovdqa xmm6, OWORD PTR [rdi+64]
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ vmovdqa xmm6, OWORD PTR [rdi+80]
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ vmovdqa xmm6, OWORD PTR [rdi+96]
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ vmovdqa xmm6, OWORD PTR [rdi+112]
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ vmovdqa xmm6, OWORD PTR [rdi+128]
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ vmovdqa xmm6, OWORD PTR [rdi+144]
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ ; The next round key is loaded before the branch so that xmm6 always holds
+ ; the final round key when the "last" label is reached (vmovdqa leaves the
+ ; flags set by cmp untouched).
+ cmp esi, 11
+ vmovdqa xmm6, OWORD PTR [rdi+160]
+ jl L_AES_GCM_init_vaes_calc_iv_12_last
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ vmovdqa xmm6, OWORD PTR [rdi+176]
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ cmp esi, 13
+ vmovdqa xmm6, OWORD PTR [rdi+192]
+ jl L_AES_GCM_init_vaes_calc_iv_12_last
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ vmovdqa xmm6, OWORD PTR [rdi+208]
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ vmovdqa xmm6, OWORD PTR [rdi+224]
+L_AES_GCM_init_vaes_calc_iv_12_last:
+ vaesenclast xmm5, xmm5, xmm6
+ vaesenclast xmm1, xmm1, xmm6
+ ; H is kept in the byte order selected by L_vaes_aes_gcm_bswap_mask
+ ; (constant defined outside this chunk; presumably a full 16-byte reversal).
+ vpshufb xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_bswap_mask
+ ; xmm15 = encrypted initial counter block.
+ vmovdqu xmm15, xmm1
+ jmp L_AES_GCM_init_vaes_iv_done
+L_AES_GCM_init_vaes_iv_not_12:
+ ; Calculate values when IV is not 12 bytes
+ ; H = Encrypt X(=0)
+ vmovdqa xmm5, OWORD PTR [rdi]
+ vaesenc xmm5, xmm5, [rdi+16]
+ vaesenc xmm5, xmm5, [rdi+32]
+ vaesenc xmm5, xmm5, [rdi+48]
+ vaesenc xmm5, xmm5, [rdi+64]
+ vaesenc xmm5, xmm5, [rdi+80]
+ vaesenc xmm5, xmm5, [rdi+96]
+ vaesenc xmm5, xmm5, [rdi+112]
+ vaesenc xmm5, xmm5, [rdi+128]
+ vaesenc xmm5, xmm5, [rdi+144]
+ cmp esi, 11
+ vmovdqa xmm8, OWORD PTR [rdi+160]
+ jl L_AES_GCM_init_vaes_calc_iv_1_aesenc_avx_last
+ vaesenc xmm5, xmm5, xmm8
+ vaesenc xmm5, xmm5, [rdi+176]
+ cmp esi, 13
+ vmovdqa xmm8, OWORD PTR [rdi+192]
+ jl L_AES_GCM_init_vaes_calc_iv_1_aesenc_avx_last
+ vaesenc xmm5, xmm5, xmm8
+ vaesenc xmm5, xmm5, [rdi+208]
+ vmovdqa xmm8, OWORD PTR [rdi+224]
+L_AES_GCM_init_vaes_calc_iv_1_aesenc_avx_last:
+ vaesenclast xmm5, xmm5, xmm8
+ vpshufb xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_bswap_mask
+ ; Calc counter
+ ; Initialization vector
+ ; rcx is the byte offset into the IV. "mov" does not modify flags, so the
+ ; je below still tests the result of "cmp edx, 0" (empty IV).
+ cmp edx, 0
+ mov rcx, 0
+ je L_AES_GCM_init_vaes_calc_iv_done
+ cmp edx, 16
+ jl L_AES_GCM_init_vaes_calc_iv_lt16
+ ; 4294967280 = 0xFFFFFFF0: round the length down to a multiple of 16.
+ and edx, 4294967280
+L_AES_GCM_init_vaes_calc_iv_16_loop:
+ ; Absorb one full 16-byte IV block into the accumulator xmm4.
+ vmovdqu xmm7, OWORD PTR [r10+rcx]
+ vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask
+ vpxor xmm4, xmm4, xmm7
+ ; ghash_gfmul_avx
+ ; xmm4 = xmm4 * xmm5 (H), carry-less, then reduced back to 128 bits.
+ ; Karatsuba: three 64x64 multiplies - high halves (imm 17), low halves
+ ; (imm 0) and (hi^lo)*(hi^lo); vpshufd 78 swaps the two 64-bit halves.
+ vpshufd xmm1, xmm4, 78
+ vpshufd xmm2, xmm5, 78
+ vpclmulqdq xmm3, xmm5, xmm4, 17
+ vpclmulqdq xmm0, xmm5, xmm4, 0
+ vpxor xmm1, xmm1, xmm4
+ vpxor xmm2, xmm2, xmm5
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ ; Assemble the 256-bit product: xmm6 = low 128 bits, xmm4 = high 128 bits.
+ vmovdqa xmm6, xmm0
+ vmovdqa xmm4, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm6, xmm6, xmm2
+ vpxor xmm4, xmm4, xmm1
+ ; Shift the 256-bit value xmm4:xmm6 left by one bit; the per-dword carry
+ ; bits (>> 31) are moved up one dword with byte shifts and OR-ed back in.
+ vpsrld xmm0, xmm6, 31
+ vpsrld xmm1, xmm4, 31
+ vpslld xmm6, xmm6, 1
+ vpslld xmm4, xmm4, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm4, xmm4, xmm2
+ vpor xmm6, xmm6, xmm0
+ vpor xmm4, xmm4, xmm1
+ ; Reduction: fold the low half (xmm6) into the high half (xmm4) using the
+ ; shift sets 31/30/25 and 1/2/7; the reduced result ends up in xmm4.
+ vpslld xmm0, xmm6, 31
+ vpslld xmm1, xmm6, 30
+ vpslld xmm2, xmm6, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm6, xmm6, xmm0
+ vpsrld xmm2, xmm6, 1
+ vpsrld xmm3, xmm6, 2
+ vpsrld xmm0, xmm6, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm6
+ vpxor xmm4, xmm4, xmm2
+ add ecx, 16
+ cmp ecx, edx
+ jl L_AES_GCM_init_vaes_calc_iv_16_loop
+ ; Restore the full IV length; done if it was an exact multiple of 16.
+ mov edx, r11d
+ cmp ecx, edx
+ je L_AES_GCM_init_vaes_calc_iv_done
+L_AES_GCM_init_vaes_calc_iv_lt16:
+ ; Copy the remaining (fewer than 16) IV bytes into a zeroed 16-byte stack
+ ; buffer so the final partial block is zero-padded.
+ sub rsp, 16
+ vpxor xmm7, xmm7, xmm7
+ xor r13d, r13d
+ vmovdqu OWORD PTR [rsp], xmm7
+L_AES_GCM_init_vaes_calc_iv_loop:
+ movzx r12d, BYTE PTR [r10+rcx]
+ mov BYTE PTR [rsp+r13], r12b
+ inc ecx
+ inc r13d
+ cmp ecx, edx
+ jl L_AES_GCM_init_vaes_calc_iv_loop
+ vmovdqu xmm7, OWORD PTR [rsp]
+ add rsp, 16
+ vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask
+ vpxor xmm4, xmm4, xmm7
+ ; ghash_gfmul_avx
+ ; Same multiply-and-reduce sequence as in the 16-byte loop: xmm4 *= H.
+ vpshufd xmm1, xmm4, 78
+ vpshufd xmm2, xmm5, 78
+ vpclmulqdq xmm3, xmm5, xmm4, 17
+ vpclmulqdq xmm0, xmm5, xmm4, 0
+ vpxor xmm1, xmm1, xmm4
+ vpxor xmm2, xmm2, xmm5
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vmovdqa xmm6, xmm0
+ vmovdqa xmm4, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm6, xmm6, xmm2
+ vpxor xmm4, xmm4, xmm1
+ vpsrld xmm0, xmm6, 31
+ vpsrld xmm1, xmm4, 31
+ vpslld xmm6, xmm6, 1
+ vpslld xmm4, xmm4, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm4, xmm4, xmm2
+ vpor xmm6, xmm6, xmm0
+ vpor xmm4, xmm4, xmm1
+ vpslld xmm0, xmm6, 31
+ vpslld xmm1, xmm6, 30
+ vpslld xmm2, xmm6, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm6, xmm6, xmm0
+ vpsrld xmm2, xmm6, 1
+ vpsrld xmm3, xmm6, 2
+ vpsrld xmm0, xmm6, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm6
+ vpxor xmm4, xmm4, xmm2
+L_AES_GCM_init_vaes_calc_iv_done:
+ ; T = Encrypt counter
+ ; Final GHASH block: the IV length in bits (bytes << 3) in the low qword.
+ ; The 32-bit shl zero-extends into rdx, so the upper qword bits are zero.
+ vpxor xmm0, xmm0, xmm0
+ shl edx, 3
+ vmovq xmm0, rdx
+ vpxor xmm4, xmm4, xmm0
+ ; ghash_gfmul_avx
+ ; Same multiply-and-reduce sequence as in the 16-byte loop: xmm4 *= H.
+ vpshufd xmm1, xmm4, 78
+ vpshufd xmm2, xmm5, 78
+ vpclmulqdq xmm3, xmm5, xmm4, 17
+ vpclmulqdq xmm0, xmm5, xmm4, 0
+ vpxor xmm1, xmm1, xmm4
+ vpxor xmm2, xmm2, xmm5
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vmovdqa xmm6, xmm0
+ vmovdqa xmm4, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm6, xmm6, xmm2
+ vpxor xmm4, xmm4, xmm1
+ vpsrld xmm0, xmm6, 31
+ vpsrld xmm1, xmm4, 31
+ vpslld xmm6, xmm6, 1
+ vpslld xmm4, xmm4, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm4, xmm4, xmm2
+ vpor xmm6, xmm6, xmm0
+ vpor xmm4, xmm4, xmm1
+ vpslld xmm0, xmm6, 31
+ vpslld xmm1, xmm6, 30
+ vpslld xmm2, xmm6, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm6, xmm6, xmm0
+ vpsrld xmm2, xmm6, 1
+ vpsrld xmm3, xmm6, 2
+ vpsrld xmm0, xmm6, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm6
+ vpxor xmm4, xmm4, xmm2
+ ; Undo the byte shuffle: xmm4 is now the derived counter block.
+ vpshufb xmm4, xmm4, OWORD PTR L_vaes_aes_gcm_bswap_mask
+ ; Encrypt counter
+ vmovdqa xmm7, OWORD PTR [rdi]
+ vpxor xmm7, xmm7, xmm4
+ vaesenc xmm7, xmm7, [rdi+16]
+ vaesenc xmm7, xmm7, [rdi+32]
+ vaesenc xmm7, xmm7, [rdi+48]
+ vaesenc xmm7, xmm7, [rdi+64]
+ vaesenc xmm7, xmm7, [rdi+80]
+ vaesenc xmm7, xmm7, [rdi+96]
+ vaesenc xmm7, xmm7, [rdi+112]
+ vaesenc xmm7, xmm7, [rdi+128]
+ vaesenc xmm7, xmm7, [rdi+144]
+ cmp esi, 11
+ vmovdqa xmm8, OWORD PTR [rdi+160]
+ jl L_AES_GCM_init_vaes_calc_iv_2_aesenc_avx_last
+ vaesenc xmm7, xmm7, xmm8
+ vaesenc xmm7, xmm7, [rdi+176]
+ cmp esi, 13
+ vmovdqa xmm8, OWORD PTR [rdi+192]
+ jl L_AES_GCM_init_vaes_calc_iv_2_aesenc_avx_last
+ vaesenc xmm7, xmm7, xmm8
+ vaesenc xmm7, xmm7, [rdi+208]
+ vmovdqa xmm8, OWORD PTR [rdi+224]
+L_AES_GCM_init_vaes_calc_iv_2_aesenc_avx_last:
+ vaesenclast xmm7, xmm7, xmm8
+ vmovdqu xmm15, xmm7
+L_AES_GCM_init_vaes_iv_done:
+ ; Write the three outputs. The counter block is shuffled with
+ ; L_vaes_aes_gcm_bswap_epi64 and L_vaes_aes_gcm_one is added before it is
+ ; stored (both constants are defined outside this chunk; presumably this
+ ; advances the counter by one in the working byte order - TODO confirm).
+ vmovdqa OWORD PTR [r9], xmm15
+ vpshufb xmm4, xmm4, OWORD PTR L_vaes_aes_gcm_bswap_epi64
+ vpaddd xmm4, xmm4, OWORD PTR L_vaes_aes_gcm_one
+ vmovdqa OWORD PTR [rax], xmm5
+ vmovdqa OWORD PTR [r8], xmm4
+ ; Restore the saved xmm and general-purpose registers.
+ vmovdqu xmm6, OWORD PTR [rsp+16]
+ vmovdqu xmm7, OWORD PTR [rsp+32]
+ vmovdqu xmm8, OWORD PTR [rsp+48]
+ vmovdqu xmm15, OWORD PTR [rsp+64]
+ add rsp, 80
+ pop r13
+ pop r12
+ pop rsi
+ pop rdi
+ ret
+AES_GCM_init_vaes ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+; AES_GCM_aad_update_vaes
+;
+; Folds a run of 16-byte blocks into a running GHASH value.
+;
+; Inputs as read by the code below:
+;   rcx -> pointer to the input data (read 16 bytes at a time, unaligned)
+;   edx -> number of input bytes; compared (signed) with the running offset
+;   r8  -> pointer to the 16-byte running hash; read at entry and written
+;          back at exit with vmovdqa (16-byte aligned)
+;   r9  -> pointer to the 16-byte hash key H (vmovdqa, 16-byte aligned)
+;
+; The loop test is at the bottom, so one block is always processed; the
+; caller is presumably expected to pass a non-zero multiple of 16 bytes
+; (TODO confirm against the C caller).
+; Preserves xmm6 and xmm7.
+AES_GCM_aad_update_vaes PROC
+ mov rax, rcx
+ sub rsp, 32
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ ; xmm5 = running hash, xmm6 = H, ecx = byte offset into the input.
+ vmovdqa xmm5, OWORD PTR [r8]
+ vmovdqa xmm6, OWORD PTR [r9]
+ xor ecx, ecx
+L_AES_GCM_aad_update_vaes_16_loop:
+ ; Load the next block, byte-shuffle it and XOR it into the running hash.
+ vmovdqu xmm7, OWORD PTR [rax+rcx]
+ vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask
+ vpxor xmm5, xmm5, xmm7
+ ; ghash_gfmul_avx
+ ; xmm5 = xmm5 * xmm6 (H), carry-less, then reduced back to 128 bits.
+ ; Karatsuba: high halves (imm 17), low halves (imm 0), and (hi^lo)*(hi^lo).
+ vpshufd xmm1, xmm5, 78
+ vpshufd xmm2, xmm6, 78
+ vpclmulqdq xmm3, xmm6, xmm5, 17
+ vpclmulqdq xmm0, xmm6, xmm5, 0
+ vpxor xmm1, xmm1, xmm5
+ vpxor xmm2, xmm2, xmm6
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ ; 256-bit product: xmm4 = low 128 bits, xmm5 = high 128 bits.
+ vmovdqa xmm4, xmm0
+ vmovdqa xmm5, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm4, xmm4, xmm2
+ vpxor xmm5, xmm5, xmm1
+ ; Shift the 256-bit value xmm5:xmm4 left by one bit, propagating the
+ ; per-dword carry bits with byte shifts.
+ vpsrld xmm0, xmm4, 31
+ vpsrld xmm1, xmm5, 31
+ vpslld xmm4, xmm4, 1
+ vpslld xmm5, xmm5, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm5, xmm5, xmm2
+ vpor xmm4, xmm4, xmm0
+ vpor xmm5, xmm5, xmm1
+ ; Reduction: fold the low half (xmm4) into the high half (xmm5) using the
+ ; shift sets 31/30/25 and 1/2/7; the reduced result ends up in xmm5.
+ vpslld xmm0, xmm4, 31
+ vpslld xmm1, xmm4, 30
+ vpslld xmm2, xmm4, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm4, xmm4, xmm0
+ vpsrld xmm2, xmm4, 1
+ vpsrld xmm3, xmm4, 2
+ vpsrld xmm0, xmm4, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm4
+ vpxor xmm5, xmm5, xmm2
+ add ecx, 16
+ cmp ecx, edx
+ jl L_AES_GCM_aad_update_vaes_16_loop
+ ; Store the updated hash and restore the saved registers.
+ vmovdqa OWORD PTR [r8], xmm5
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ add rsp, 32
+ ret
+AES_GCM_aad_update_vaes ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+; AES_GCM_encrypt_block_vaes
+;
+; Encrypts one 16-byte block in counter mode and advances the counter.
+;
+; Inputs as read by the code below:
+;   rcx      -> base of the AES round keys (16 bytes each)
+;   edx      -> round count; compared against 11 and 13 to choose between
+;               10, 12 or 14 rounds
+;   r8       -> output pointer (16 bytes written, unaligned store)
+;   r9       -> input pointer (16 bytes read, unaligned load)
+;   [rsp+40] -> pointer to the 16-byte counter block, updated in place
+;               (40 = return address (8) + 32 bytes before the 5th argument,
+;               consistent with the Microsoft x64 calling convention)
+;
+; Uses only rax, r10, r11, xmm0 and xmm1, so nothing is saved on the stack.
+AES_GCM_encrypt_block_vaes PROC
+ mov r10, r8
+ mov r11, r9
+ mov rax, QWORD PTR [rsp+40]
+ ; xmm0 = current counter shuffled with L_vaes_aes_gcm_bswap_epi64 (the block
+ ; to encrypt); the stored counter is then advanced by L_vaes_aes_gcm_one.
+ ; Both constants are defined outside this chunk.
+ vmovdqu xmm1, OWORD PTR [rax]
+ vpshufb xmm0, xmm1, OWORD PTR L_vaes_aes_gcm_bswap_epi64
+ vpaddd xmm1, xmm1, OWORD PTR L_vaes_aes_gcm_one
+ vmovdqu OWORD PTR [rax], xmm1
+ ; AES rounds: initial key XOR followed by nine unconditional rounds.
+ vpxor xmm0, xmm0, [rcx]
+ vaesenc xmm0, xmm0, [rcx+16]
+ vaesenc xmm0, xmm0, [rcx+32]
+ vaesenc xmm0, xmm0, [rcx+48]
+ vaesenc xmm0, xmm0, [rcx+64]
+ vaesenc xmm0, xmm0, [rcx+80]
+ vaesenc xmm0, xmm0, [rcx+96]
+ vaesenc xmm0, xmm0, [rcx+112]
+ vaesenc xmm0, xmm0, [rcx+128]
+ vaesenc xmm0, xmm0, [rcx+144]
+ ; The next round key is loaded before each branch so xmm1 always holds the
+ ; final round key at the "last" label (vmovdqa does not modify flags).
+ cmp edx, 11
+ vmovdqa xmm1, OWORD PTR [rcx+160]
+ jl L_AES_GCM_encrypt_block_vaes_aesenc_block_last
+ vaesenc xmm0, xmm0, xmm1
+ vaesenc xmm0, xmm0, [rcx+176]
+ cmp edx, 13
+ vmovdqa xmm1, OWORD PTR [rcx+192]
+ jl L_AES_GCM_encrypt_block_vaes_aesenc_block_last
+ vaesenc xmm0, xmm0, xmm1
+ vaesenc xmm0, xmm0, [rcx+208]
+ vmovdqa xmm1, OWORD PTR [rcx+224]
+L_AES_GCM_encrypt_block_vaes_aesenc_block_last:
+ vaesenclast xmm0, xmm0, xmm1
+ ; XOR the keystream block with the input and store the result.
+ vmovdqu xmm1, OWORD PTR [r11]
+ vpxor xmm0, xmm0, xmm1
+ vmovdqu OWORD PTR [r10], xmm0
+ ; NOTE(review): the shuffled output block is left in xmm0 and is not stored
+ ; anywhere in this procedure - confirm whether any caller relies on it.
+ vpshufb xmm0, xmm0, OWORD PTR L_vaes_aes_gcm_bswap_mask
+ vzeroupper
+ ret
+AES_GCM_encrypt_block_vaes ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+; AES_GCM_ghash_block_vaes
+;
+; Folds a single 16-byte block into a running GHASH value.
+;
+; Inputs as read by the code below:
+;   rcx -> pointer to the 16-byte data block (unaligned load)
+;   rdx -> pointer to the 16-byte running hash; read and written back with
+;          vmovdqa (16-byte aligned)
+;   r8  -> pointer to the 16-byte hash key H (vmovdqa, 16-byte aligned)
+;
+; Preserves xmm6 and xmm7.
+AES_GCM_ghash_block_vaes PROC
+ sub rsp, 32
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ ; xmm4 = running hash, xmm5 = H, xmm7 = byte-shuffled data block.
+ vmovdqa xmm4, OWORD PTR [rdx]
+ vmovdqa xmm5, OWORD PTR [r8]
+ vmovdqu xmm7, OWORD PTR [rcx]
+ vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask
+ vpxor xmm4, xmm4, xmm7
+ ; ghash_gfmul_avx
+ ; xmm4 = xmm4 * xmm5 (H), carry-less, then reduced back to 128 bits.
+ ; Karatsuba: high halves (imm 17), low halves (imm 0), and (hi^lo)*(hi^lo).
+ vpshufd xmm1, xmm4, 78
+ vpshufd xmm2, xmm5, 78
+ vpclmulqdq xmm3, xmm5, xmm4, 17
+ vpclmulqdq xmm0, xmm5, xmm4, 0
+ vpxor xmm1, xmm1, xmm4
+ vpxor xmm2, xmm2, xmm5
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ ; 256-bit product: xmm6 = low 128 bits, xmm4 = high 128 bits.
+ vmovdqa xmm6, xmm0
+ vmovdqa xmm4, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm6, xmm6, xmm2
+ vpxor xmm4, xmm4, xmm1
+ ; Shift the 256-bit value xmm4:xmm6 left by one bit, propagating the
+ ; per-dword carry bits with byte shifts.
+ vpsrld xmm0, xmm6, 31
+ vpsrld xmm1, xmm4, 31
+ vpslld xmm6, xmm6, 1
+ vpslld xmm4, xmm4, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm4, xmm4, xmm2
+ vpor xmm6, xmm6, xmm0
+ vpor xmm4, xmm4, xmm1
+ ; Reduction: fold the low half (xmm6) into the high half (xmm4) using the
+ ; shift sets 31/30/25 and 1/2/7; the reduced result ends up in xmm4.
+ vpslld xmm0, xmm6, 31
+ vpslld xmm1, xmm6, 30
+ vpslld xmm2, xmm6, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm6, xmm6, xmm0
+ vpsrld xmm2, xmm6, 1
+ vpsrld xmm3, xmm6, 2
+ vpsrld xmm0, xmm6, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm6
+ vpxor xmm4, xmm4, xmm2
+ ; Store the updated hash and restore the saved registers.
+ vmovdqa OWORD PTR [rdx], xmm4
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ add rsp, 32
+ ret
+AES_GCM_ghash_block_vaes ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_encrypt_update_vaes PROC
+ push r13
+ push r12
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbx
+ mov rax, rcx
+ mov r10, r8
+ mov r8d, edx
+ mov r11, r9
+ mov r9d, DWORD PTR [rsp+96]
+ mov r12, QWORD PTR [rsp+104]
+ mov r14, QWORD PTR [rsp+112]
+ mov r15, QWORD PTR [rsp+120]
+ sub rsp, 688
+ vmovdqu OWORD PTR [rsp+528], xmm6
+ vmovdqu OWORD PTR [rsp+544], xmm7
+ vmovdqu OWORD PTR [rsp+560], xmm8
+ vmovdqu OWORD PTR [rsp+576], xmm9
+ vmovdqu OWORD PTR [rsp+592], xmm10
+ vmovdqu OWORD PTR [rsp+608], xmm11
+ vmovdqu OWORD PTR [rsp+624], xmm12
+ vmovdqu OWORD PTR [rsp+640], xmm13
+ vmovdqu OWORD PTR [rsp+656], xmm14
+ vmovdqu OWORD PTR [rsp+672], xmm15
+ vmovdqa xmm15, OWORD PTR [r12]
+ vmovdqa xmm6, OWORD PTR [r14]
+ vpsrlq xmm8, xmm6, 63
+ vpsllq xmm7, xmm6, 1
+ vpslldq xmm8, xmm8, 8
+ vpor xmm7, xmm7, xmm8
+ vpshufd xmm6, xmm6, 255
+ vpsrad xmm6, xmm6, 31
+ vpand xmm6, xmm6, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpxor xmm6, xmm6, xmm7
+ xor edi, edi
+ cmp r9d, 128
+ jl L_AES_GCM_encrypt_update_vaes_done_128
+ vmovdqa xmm2, xmm15
+ ; H ^ 1
+ vmovdqu OWORD PTR [rsp], xmm6
+ ; H ^ 2
+ vpclmulqdq xmm7, xmm6, xmm6, 0
+ vpclmulqdq xmm10, xmm6, xmm6, 17
+ vpxor xmm8, xmm8, xmm8
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm0, xmm10
+ vmovdqu OWORD PTR [rsp+16], xmm0
+ ; H ^ 3
+ ; ghash_gfmul_red_avx
+ vpclmulqdq xmm7, xmm0, xmm6, 0
+ vpclmulqdq xmm8, xmm0, xmm6, 1
+ vpclmulqdq xmm9, xmm0, xmm6, 16
+ vpclmulqdq xmm10, xmm0, xmm6, 17
+ vpxor xmm8, xmm8, xmm9
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm1, xmm10
+ vmovdqu OWORD PTR [rsp+32], xmm1
+ ; H ^ 4
+ vpclmulqdq xmm7, xmm0, xmm0, 0
+ vpclmulqdq xmm10, xmm0, xmm0, 17
+ vpxor xmm8, xmm8, xmm8
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm3, xmm10
+ vmovdqu OWORD PTR [rsp+48], xmm3
+ ; H ^ 5
+ ; ghash_gfmul_red_avx
+ vpclmulqdq xmm7, xmm1, xmm0, 0
+ vpclmulqdq xmm8, xmm1, xmm0, 1
+ vpclmulqdq xmm9, xmm1, xmm0, 16
+ vpclmulqdq xmm10, xmm1, xmm0, 17
+ vpxor xmm8, xmm8, xmm9
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+64], xmm4
+ ; H ^ 6
+ vpclmulqdq xmm7, xmm1, xmm1, 0
+ vpclmulqdq xmm10, xmm1, xmm1, 17
+ vpxor xmm8, xmm8, xmm8
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm4
+ ; H ^ 7
+ ; ghash_gfmul_red_avx
+ vpclmulqdq xmm7, xmm3, xmm1, 0
+ vpclmulqdq xmm8, xmm3, xmm1, 1
+ vpclmulqdq xmm9, xmm3, xmm1, 16
+ vpclmulqdq xmm10, xmm3, xmm1, 17
+ vpxor xmm8, xmm8, xmm9
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+96], xmm4
+ ; H ^ 8
+ vpclmulqdq xmm7, xmm3, xmm3, 0
+ vpclmulqdq xmm10, xmm3, xmm3, 17
+ vpxor xmm8, xmm8, xmm8
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+112], xmm4
+ cmp r9d, 256
+ jl L_AES_GCM_encrypt_update_vaes_no_ext
+ ; H ^ 9
+ vmovdqu xmm0, OWORD PTR [rsp+48]
+ vmovdqu xmm1, OWORD PTR [rsp+64]
+ ; ghash_gfmul_red_avx
+ vpclmulqdq xmm7, xmm1, xmm0, 0
+ vpclmulqdq xmm8, xmm1, xmm0, 1
+ vpclmulqdq xmm9, xmm1, xmm0, 16
+ vpclmulqdq xmm10, xmm1, xmm0, 17
+ vpxor xmm8, xmm8, xmm9
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+128], xmm4
+ ; H ^ 10
+ vmovdqu xmm0, OWORD PTR [rsp+64]
+ vpclmulqdq xmm7, xmm0, xmm0, 0
+ vpclmulqdq xmm10, xmm0, xmm0, 17
+ vpxor xmm8, xmm8, xmm8
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+144], xmm4
+ ; H ^ 11
+ vmovdqu xmm0, OWORD PTR [rsp+64]
+ vmovdqu xmm1, OWORD PTR [rsp+80]
+ ; ghash_gfmul_red_avx
+ vpclmulqdq xmm7, xmm1, xmm0, 0
+ vpclmulqdq xmm8, xmm1, xmm0, 1
+ vpclmulqdq xmm9, xmm1, xmm0, 16
+ vpclmulqdq xmm10, xmm1, xmm0, 17
+ vpxor xmm8, xmm8, xmm9
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+160], xmm4
+ ; H ^ 12
+ vmovdqu xmm0, OWORD PTR [rsp+80]
+ vpclmulqdq xmm7, xmm0, xmm0, 0
+ vpclmulqdq xmm10, xmm0, xmm0, 17
+ vpxor xmm8, xmm8, xmm8
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+176], xmm4
+ ; H ^ 13
+ vmovdqu xmm0, OWORD PTR [rsp+80]
+ vmovdqu xmm1, OWORD PTR [rsp+96]
+ ; ghash_gfmul_red_avx
+ vpclmulqdq xmm7, xmm1, xmm0, 0
+ vpclmulqdq xmm8, xmm1, xmm0, 1
+ vpclmulqdq xmm9, xmm1, xmm0, 16
+ vpclmulqdq xmm10, xmm1, xmm0, 17
+ vpxor xmm8, xmm8, xmm9
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+192], xmm4
+ ; H ^ 14
+ vmovdqu xmm0, OWORD PTR [rsp+96]
+ vpclmulqdq xmm7, xmm0, xmm0, 0
+ vpclmulqdq xmm10, xmm0, xmm0, 17
+ vpxor xmm8, xmm8, xmm8
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+208], xmm4
+ ; H ^ 15
+ vmovdqu xmm0, OWORD PTR [rsp+96]
+ vmovdqu xmm1, OWORD PTR [rsp+112]
+ ; ghash_gfmul_red_avx
+ vpclmulqdq xmm7, xmm1, xmm0, 0
+ vpclmulqdq xmm8, xmm1, xmm0, 1
+ vpclmulqdq xmm9, xmm1, xmm0, 16
+ vpclmulqdq xmm10, xmm1, xmm0, 17
+ vpxor xmm8, xmm8, xmm9
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+224], xmm4
+ ; H ^ 16
+ vmovdqu xmm0, OWORD PTR [rsp+112]
+ vpclmulqdq xmm7, xmm0, xmm0, 0
+ vpclmulqdq xmm10, xmm0, xmm0, 17
+ vpxor xmm8, xmm8, xmm8
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+240], xmm4
+ vmovdqu ymm7, YMMWORD PTR [rsp+224]
+ vpermq ymm7, ymm7, 78
+ vmovdqu ymm8, YMMWORD PTR [rsp+192]
+ vpermq ymm8, ymm8, 78
+ vmovdqu ymm9, YMMWORD PTR [rsp+160]
+ vpermq ymm9, ymm9, 78
+ vmovdqu ymm10, YMMWORD PTR [rsp+128]
+ vpermq ymm10, ymm10, 78
+ vmovdqu YMMWORD PTR [rsp+256], ymm7
+ vmovdqu YMMWORD PTR [rsp+288], ymm8
+ vmovdqu YMMWORD PTR [rsp+320], ymm9
+ vmovdqu YMMWORD PTR [rsp+352], ymm10
+ vmovdqu ymm7, YMMWORD PTR [rsp+96]
+ vpermq ymm7, ymm7, 78
+ vmovdqu ymm8, YMMWORD PTR [rsp+64]
+ vpermq ymm8, ymm8, 78
+ vmovdqu ymm9, YMMWORD PTR [rsp+32]
+ vpermq ymm9, ymm9, 78
+ vmovdqu ymm10, YMMWORD PTR [rsp]
+ vpermq ymm10, ymm10, 78
+ vmovdqu YMMWORD PTR [rsp+384], ymm7
+ vmovdqu YMMWORD PTR [rsp+416], ymm8
+ vmovdqu YMMWORD PTR [rsp+448], ymm9
+ vmovdqu YMMWORD PTR [rsp+480], ymm10
+L_AES_GCM_encrypt_update_vaes_no_ext:
+ vbroadcasti128 ymm14, ptr_L_vaes_aes_gcm_mod2_128
+ cmp r9d, 256
+ jl L_AES_GCM_encrypt_update_vaes_after_256
+ mov r13d, r9d
+ and r13d, 4294967040
+L_AES_GCM_encrypt_update_vaes_loop_256:
+ ; 256 bytes of input
+ lea rsi, QWORD PTR [r10+rdi]
+ vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64
+ vbroadcasti128 ymm4, [r15]
+ vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0
+ vpshufb ymm0, ymm0, ymm6
+ vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1
+ vpshufb ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y2
+ vpshufb ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y3
+ vpshufb ymm3, ymm3, ymm6
+ vmovdqu xmm7, OWORD PTR [r15]
+ vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight
+ vmovdqu OWORD PTR [r15], xmm7
+ vbroadcasti128 ymm4, [rax]
+ vpxor ymm0, ymm0, ymm4
+ vpxor ymm1, ymm1, ymm4
+ vpxor ymm2, ymm2, ymm4
+ vpxor ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+16]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+32]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+48]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+64]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+80]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+96]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+112]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+128]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+144]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ cmp r8d, 11
+ vbroadcasti128 ymm4, [rax+160]
+ jl L_AES_GCM_encrypt_update_vaes_p1_vaes_ctr8_last
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+176]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ cmp r8d, 13
+ vbroadcasti128 ymm4, [rax+192]
+ jl L_AES_GCM_encrypt_update_vaes_p1_vaes_ctr8_last
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+208]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+224]
+L_AES_GCM_encrypt_update_vaes_p1_vaes_ctr8_last:
+ vaesenclast ymm0, ymm0, ymm4
+ vaesenclast ymm1, ymm1, ymm4
+ vaesenclast ymm2, ymm2, ymm4
+ vaesenclast ymm3, ymm3, ymm4
+ lea rcx, QWORD PTR [r11+rdi]
+ lea rdx, QWORD PTR [r10+rdi]
+ vmovdqu ymm5, YMMWORD PTR [rcx]
+ vpxor ymm0, ymm0, ymm5
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ vmovdqu ymm5, YMMWORD PTR [rcx+32]
+ vpxor ymm1, ymm1, ymm5
+ vmovdqu YMMWORD PTR [rdx+32], ymm1
+ vmovdqu ymm5, YMMWORD PTR [rcx+64]
+ vpxor ymm2, ymm2, ymm5
+ vmovdqu YMMWORD PTR [rdx+64], ymm2
+ vmovdqu ymm5, YMMWORD PTR [rcx+96]
+ vpxor ymm3, ymm3, ymm5
+ vmovdqu YMMWORD PTR [rdx+96], ymm3
+ add edi, 128
+ vbroadcasti128 ymm4, [r15]
+ vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0
+ vpshufb ymm0, ymm0, ymm6
+ vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1
+ vpshufb ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y2
+ vpshufb ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y3
+ vpshufb ymm3, ymm3, ymm6
+ vmovdqu xmm7, OWORD PTR [r15]
+ vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight
+ vmovdqu OWORD PTR [r15], xmm7
+ vbroadcasti128 ymm4, [rax]
+ vpxor ymm0, ymm0, ymm4
+ vpxor ymm1, ymm1, ymm4
+ vpxor ymm2, ymm2, ymm4
+ vpxor ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+16]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+32]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+48]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+64]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+80]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+96]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+112]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+128]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+144]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ cmp r8d, 11
+ vbroadcasti128 ymm4, [rax+160]
+ jl L_AES_GCM_encrypt_update_vaes_p2_vaes_ctr8_last
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+176]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ cmp r8d, 13
+ vbroadcasti128 ymm4, [rax+192]
+ jl L_AES_GCM_encrypt_update_vaes_p2_vaes_ctr8_last
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+208]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+224]
+L_AES_GCM_encrypt_update_vaes_p2_vaes_ctr8_last:
+ vaesenclast ymm0, ymm0, ymm4
+ vaesenclast ymm1, ymm1, ymm4
+ vaesenclast ymm2, ymm2, ymm4
+ vaesenclast ymm3, ymm3, ymm4
+ lea rcx, QWORD PTR [r11+rdi]
+ lea rdx, QWORD PTR [r10+rdi]
+ vmovdqu ymm5, YMMWORD PTR [rcx]
+ vpxor ymm0, ymm0, ymm5
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ vmovdqu ymm5, YMMWORD PTR [rcx+32]
+ vpxor ymm1, ymm1, ymm5
+ vmovdqu YMMWORD PTR [rdx+32], ymm1
+ vmovdqu ymm5, YMMWORD PTR [rcx+64]
+ vpxor ymm2, ymm2, ymm5
+ vmovdqu YMMWORD PTR [rdx+64], ymm2
+ vmovdqu ymm5, YMMWORD PTR [rcx+96]
+ vpxor ymm3, ymm3, ymm5
+ vmovdqu YMMWORD PTR [rdx+96], ymm3
+ add edi, 128
+ vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_mask
+ vpxor ymm4, ymm4, ymm4
+ vinserti128 ymm4, ymm4, xmm15, 0
+ vmovdqu ymm7, YMMWORD PTR [rsp+256]
+ vmovdqu ymm8, YMMWORD PTR [rsp+288]
+ vmovdqu ymm9, YMMWORD PTR [rsp+320]
+ vmovdqu ymm10, YMMWORD PTR [rsp+352]
+ vmovdqu ymm5, YMMWORD PTR [rsi]
+ vpshufb ymm5, ymm5, ymm6
+ vpxor ymm5, ymm5, ymm4
+ vpclmulqdq ymm0, ymm5, ymm7, 0
+ vpclmulqdq ymm1, ymm5, ymm7, 1
+ vpclmulqdq ymm2, ymm5, ymm7, 16
+ vpclmulqdq ymm3, ymm5, ymm7, 17
+ vmovdqa ymm11, ymm0
+ vpxor ymm12, ymm2, ymm1
+ vmovdqa ymm13, ymm3
+ vmovdqu ymm5, YMMWORD PTR [rsi+32]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm8, 0
+ vpclmulqdq ymm1, ymm5, ymm8, 1
+ vpclmulqdq ymm2, ymm5, ymm8, 16
+ vpclmulqdq ymm3, ymm5, ymm8, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vmovdqu ymm5, YMMWORD PTR [rsi+64]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm9, 0
+ vpclmulqdq ymm1, ymm5, ymm9, 1
+ vpclmulqdq ymm2, ymm5, ymm9, 16
+ vpclmulqdq ymm3, ymm5, ymm9, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vmovdqu ymm5, YMMWORD PTR [rsi+96]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm10, 0
+ vpclmulqdq ymm1, ymm5, ymm10, 1
+ vpclmulqdq ymm2, ymm5, ymm10, 16
+ vpclmulqdq ymm3, ymm5, ymm10, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vmovdqu ymm7, YMMWORD PTR [rsp+384]
+ vmovdqu ymm8, YMMWORD PTR [rsp+416]
+ vmovdqu ymm9, YMMWORD PTR [rsp+448]
+ vmovdqu ymm10, YMMWORD PTR [rsp+480]
+ vmovdqu ymm5, YMMWORD PTR [rsi+128]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm7, 0
+ vpclmulqdq ymm1, ymm5, ymm7, 1
+ vpclmulqdq ymm2, ymm5, ymm7, 16
+ vpclmulqdq ymm3, ymm5, ymm7, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vmovdqu ymm5, YMMWORD PTR [rsi+160]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm8, 0
+ vpclmulqdq ymm1, ymm5, ymm8, 1
+ vpclmulqdq ymm2, ymm5, ymm8, 16
+ vpclmulqdq ymm3, ymm5, ymm8, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vmovdqu ymm5, YMMWORD PTR [rsi+192]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm9, 0
+ vpclmulqdq ymm1, ymm5, ymm9, 1
+ vpclmulqdq ymm2, ymm5, ymm9, 16
+ vpclmulqdq ymm3, ymm5, ymm9, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vmovdqu ymm5, YMMWORD PTR [rsi+224]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm10, 0
+ vpclmulqdq ymm1, ymm5, ymm10, 1
+ vpclmulqdq ymm2, ymm5, ymm10, 16
+ vpclmulqdq ymm3, ymm5, ymm10, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vpclmulqdq ymm5, ymm14, ymm11, 1
+ vpshufd ymm11, ymm11, 78
+ vpxor ymm12, ymm12, ymm5
+ vpxor ymm12, ymm12, ymm11
+ vpclmulqdq ymm5, ymm14, ymm12, 1
+ vpshufd ymm12, ymm12, 78
+ vpxor ymm13, ymm13, ymm5
+ vpxor ymm13, ymm13, ymm12
+ vextracti128 xmm0, ymm13, 1
+ vpxor xmm15, xmm13, xmm0
+ cmp edi, r13d
+ jl L_AES_GCM_encrypt_update_vaes_loop_256
+L_AES_GCM_encrypt_update_vaes_after_256:
+ mov r13d, r9d
+ and r13d, 4294967168
+ cmp edi, r13d
+ jge L_AES_GCM_encrypt_update_vaes_after_128
+ ; 128 bytes of input
+ lea rsi, QWORD PTR [r10+rdi]
+ vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64
+ vbroadcasti128 ymm4, [r15]
+ vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0
+ vpshufb ymm0, ymm0, ymm6
+ vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1
+ vpshufb ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y2
+ vpshufb ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y3
+ vpshufb ymm3, ymm3, ymm6
+ vmovdqu xmm7, OWORD PTR [r15]
+ vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight
+ vmovdqu OWORD PTR [r15], xmm7
+ vbroadcasti128 ymm4, [rax]
+ vpxor ymm0, ymm0, ymm4
+ vpxor ymm1, ymm1, ymm4
+ vpxor ymm2, ymm2, ymm4
+ vpxor ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+16]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+32]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+48]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+64]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+80]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+96]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+112]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+128]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+144]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ cmp r8d, 11
+ vbroadcasti128 ymm4, [rax+160]
+ jl L_AES_GCM_encrypt_update_vaes_8_vaes_ctr8_last
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+176]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ cmp r8d, 13
+ vbroadcasti128 ymm4, [rax+192]
+ jl L_AES_GCM_encrypt_update_vaes_8_vaes_ctr8_last
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+208]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+224]
+L_AES_GCM_encrypt_update_vaes_8_vaes_ctr8_last:
+ vaesenclast ymm0, ymm0, ymm4
+ vaesenclast ymm1, ymm1, ymm4
+ vaesenclast ymm2, ymm2, ymm4
+ vaesenclast ymm3, ymm3, ymm4
+ lea rcx, QWORD PTR [r11+rdi]
+ lea rdx, QWORD PTR [r10+rdi]
+ vmovdqu ymm5, YMMWORD PTR [rcx]
+ vpxor ymm0, ymm0, ymm5
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ vmovdqu ymm5, YMMWORD PTR [rcx+32]
+ vpxor ymm1, ymm1, ymm5
+ vmovdqu YMMWORD PTR [rdx+32], ymm1
+ vmovdqu ymm5, YMMWORD PTR [rcx+64]
+ vpxor ymm2, ymm2, ymm5
+ vmovdqu YMMWORD PTR [rdx+64], ymm2
+ vmovdqu ymm5, YMMWORD PTR [rcx+96]
+ vpxor ymm3, ymm3, ymm5
+ vmovdqu YMMWORD PTR [rdx+96], ymm3
+ add edi, 128
+ vmovdqu ymm7, YMMWORD PTR [rsp+96]
+ vpermq ymm7, ymm7, 78
+ vmovdqu ymm8, YMMWORD PTR [rsp+64]
+ vpermq ymm8, ymm8, 78
+ vmovdqu ymm9, YMMWORD PTR [rsp+32]
+ vpermq ymm9, ymm9, 78
+ vmovdqu ymm10, YMMWORD PTR [rsp]
+ vpermq ymm10, ymm10, 78
+ vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_mask
+ vpxor ymm4, ymm4, ymm4
+ vinserti128 ymm4, ymm4, xmm15, 0
+ vmovdqu ymm5, YMMWORD PTR [rsi]
+ vpshufb ymm5, ymm5, ymm6
+ vpxor ymm5, ymm5, ymm4
+ vpclmulqdq ymm0, ymm5, ymm7, 0
+ vpclmulqdq ymm1, ymm5, ymm7, 1
+ vpclmulqdq ymm2, ymm5, ymm7, 16
+ vpclmulqdq ymm3, ymm5, ymm7, 17
+ vmovdqa ymm11, ymm0
+ vpxor ymm12, ymm2, ymm1
+ vmovdqa ymm13, ymm3
+ vmovdqu ymm5, YMMWORD PTR [rsi+32]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm8, 0
+ vpclmulqdq ymm1, ymm5, ymm8, 1
+ vpclmulqdq ymm2, ymm5, ymm8, 16
+ vpclmulqdq ymm3, ymm5, ymm8, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vmovdqu ymm5, YMMWORD PTR [rsi+64]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm9, 0
+ vpclmulqdq ymm1, ymm5, ymm9, 1
+ vpclmulqdq ymm2, ymm5, ymm9, 16
+ vpclmulqdq ymm3, ymm5, ymm9, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vmovdqu ymm5, YMMWORD PTR [rsi+96]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm10, 0
+ vpclmulqdq ymm1, ymm5, ymm10, 1
+ vpclmulqdq ymm2, ymm5, ymm10, 16
+ vpclmulqdq ymm3, ymm5, ymm10, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vpclmulqdq ymm5, ymm14, ymm11, 1
+ vpshufd ymm11, ymm11, 78
+ vpxor ymm12, ymm12, ymm5
+ vpxor ymm12, ymm12, ymm11
+ vpclmulqdq ymm5, ymm14, ymm12, 1
+ vpshufd ymm12, ymm12, 78
+ vpxor ymm13, ymm13, ymm5
+ vpxor ymm13, ymm13, ymm12
+ vextracti128 xmm0, ymm13, 1
+ vpxor xmm15, xmm13, xmm0
+L_AES_GCM_encrypt_update_vaes_after_128:
+ vmovdqu xmm6, OWORD PTR [rsp]
+L_AES_GCM_encrypt_update_vaes_done_128:
+ mov edx, r9d
+ cmp edi, edx
+ jge L_AES_GCM_encrypt_update_vaes_done_enc
+ mov r13d, r9d
+ and r13d, 4294967280
+ cmp edi, r13d
+ jge L_AES_GCM_encrypt_update_vaes_last_block_done
+ vmovdqu xmm8, OWORD PTR [r15]
+ vpshufb xmm7, xmm8, OWORD PTR L_vaes_aes_gcm_bswap_epi64
+ vpaddd xmm8, xmm8, OWORD PTR L_vaes_aes_gcm_one
+ vmovdqu OWORD PTR [r15], xmm8
+ vpxor xmm7, xmm7, [rax]
+ vaesenc xmm7, xmm7, [rax+16]
+ vaesenc xmm7, xmm7, [rax+32]
+ vaesenc xmm7, xmm7, [rax+48]
+ vaesenc xmm7, xmm7, [rax+64]
+ vaesenc xmm7, xmm7, [rax+80]
+ vaesenc xmm7, xmm7, [rax+96]
+ vaesenc xmm7, xmm7, [rax+112]
+ vaesenc xmm7, xmm7, [rax+128]
+ vaesenc xmm7, xmm7, [rax+144]
+ cmp r8d, 11
+ vmovdqa xmm8, OWORD PTR [rax+160]
+ jl L_AES_GCM_encrypt_update_vaes_aesenc_block_last
+ vaesenc xmm7, xmm7, xmm8
+ vaesenc xmm7, xmm7, [rax+176]
+ cmp r8d, 13
+ vmovdqa xmm8, OWORD PTR [rax+192]
+ jl L_AES_GCM_encrypt_update_vaes_aesenc_block_last
+ vaesenc xmm7, xmm7, xmm8
+ vaesenc xmm7, xmm7, [rax+208]
+ vmovdqa xmm8, OWORD PTR [rax+224]
+L_AES_GCM_encrypt_update_vaes_aesenc_block_last:
+ vaesenclast xmm7, xmm7, xmm8
+ vmovdqu xmm8, OWORD PTR [r11+rdi]
+ vpxor xmm7, xmm7, xmm8
+ vmovdqu OWORD PTR [r10+rdi], xmm7
+ vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask
+ vpxor xmm15, xmm15, xmm7
+ add edi, 16
+ cmp edi, r13d
+ jge L_AES_GCM_encrypt_update_vaes_last_block_ghash
+L_AES_GCM_encrypt_update_vaes_last_block_start:
+ vmovdqu xmm12, OWORD PTR [r11+rdi]
+ vmovdqu xmm8, OWORD PTR [r15]
+ vpshufb xmm7, xmm8, OWORD PTR L_vaes_aes_gcm_bswap_epi64
+ vpaddd xmm8, xmm8, OWORD PTR L_vaes_aes_gcm_one
+ vmovdqu OWORD PTR [r15], xmm8
+ vpxor xmm7, xmm7, [rax]
+ vpclmulqdq xmm9, xmm15, xmm6, 16
+ vaesenc xmm7, xmm7, [rax+16]
+ vaesenc xmm7, xmm7, [rax+32]
+ vpclmulqdq xmm10, xmm15, xmm6, 1
+ vaesenc xmm7, xmm7, [rax+48]
+ vaesenc xmm7, xmm7, [rax+64]
+ vpclmulqdq xmm11, xmm15, xmm6, 0
+ vaesenc xmm7, xmm7, [rax+80]
+ vpclmulqdq xmm1, xmm15, xmm6, 17
+ vaesenc xmm7, xmm7, [rax+96]
+ vpxor xmm9, xmm9, xmm10
+ vpslldq xmm2, xmm9, 8
+ vpsrldq xmm9, xmm9, 8
+ vaesenc xmm7, xmm7, [rax+112]
+ vpxor xmm2, xmm2, xmm11
+ vpxor xmm3, xmm1, xmm9
+ vmovdqa xmm0, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm10, xmm2, xmm0, 16
+ vaesenc xmm7, xmm7, [rax+128]
+ vpshufd xmm9, xmm2, 78
+ vpxor xmm9, xmm9, xmm10
+ vpclmulqdq xmm10, xmm9, xmm0, 16
+ vaesenc xmm7, xmm7, [rax+144]
+ vpshufd xmm9, xmm9, 78
+ vpxor xmm9, xmm9, xmm10
+ vpxor xmm15, xmm9, xmm3
+ cmp r8d, 11
+ vmovdqa xmm8, OWORD PTR [rax+160]
+ jl L_AES_GCM_encrypt_update_vaes_aesenc_gfmul_last
+ vaesenc xmm7, xmm7, xmm8
+ vaesenc xmm7, xmm7, [rax+176]
+ cmp r8d, 13
+ vmovdqa xmm8, OWORD PTR [rax+192]
+ jl L_AES_GCM_encrypt_update_vaes_aesenc_gfmul_last
+ vaesenc xmm7, xmm7, xmm8
+ vaesenc xmm7, xmm7, [rax+208]
+ vmovdqa xmm8, OWORD PTR [rax+224]
+L_AES_GCM_encrypt_update_vaes_aesenc_gfmul_last:
+ vaesenclast xmm7, xmm7, xmm8
+ vmovdqa xmm0, xmm12
+ vpxor xmm7, xmm7, xmm0
+ vmovdqu OWORD PTR [r10+rdi], xmm7
+ vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask
+ add edi, 16
+ vpxor xmm15, xmm15, xmm7
+ cmp edi, r13d
+ jl L_AES_GCM_encrypt_update_vaes_last_block_start
+L_AES_GCM_encrypt_update_vaes_last_block_ghash:
+ ; ghash_gfmul_red_avx
+ vpclmulqdq xmm7, xmm15, xmm6, 0
+ vpclmulqdq xmm8, xmm15, xmm6, 1
+ vpclmulqdq xmm9, xmm15, xmm6, 16
+ vpclmulqdq xmm10, xmm15, xmm6, 17
+ vpxor xmm8, xmm8, xmm9
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm15, xmm10
+L_AES_GCM_encrypt_update_vaes_last_block_done:
+L_AES_GCM_encrypt_update_vaes_done_enc:
+ vmovdqa OWORD PTR [r12], xmm15
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp+528]
+ vmovdqu xmm7, OWORD PTR [rsp+544]
+ vmovdqu xmm8, OWORD PTR [rsp+560]
+ vmovdqu xmm9, OWORD PTR [rsp+576]
+ vmovdqu xmm10, OWORD PTR [rsp+592]
+ vmovdqu xmm11, OWORD PTR [rsp+608]
+ vmovdqu xmm12, OWORD PTR [rsp+624]
+ vmovdqu xmm13, OWORD PTR [rsp+640]
+ vmovdqu xmm14, OWORD PTR [rsp+656]
+ vmovdqu xmm15, OWORD PTR [rsp+672]
+ add rsp, 688
+ pop rbx
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r12
+ pop r13
+ ret
+AES_GCM_encrypt_update_vaes ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_encrypt_final_vaes PROC ; Finish a GCM tag: mix two bit-lengths into the 16-byte state at arg1, multiply by the block at arg6, XOR with the block at arg7, store r8d bytes at arg2
+ push r13 ; save nonvolatile GPRs that are overwritten below (r13 is the byte-copy temporary)
+ push r12 ; r12 will hold the arg6 pointer
+ push r14 ; r14 will hold the arg7 pointer
+ mov rax, rcx ; rax = arg1: pointer to the 16-byte running state (loaded at the vmovdqa below)
+ mov r10d, r9d ; r10d = arg4: 32-bit count, later scaled by 8 (presumably ciphertext byte length - confirm against caller)
+ mov r9, rdx ; r9 = arg2: destination pointer for the output bytes
+ mov r11d, DWORD PTR [rsp+64] ; r11d = arg5 (32-bit); offset 64 = 3 pushes (24) + return address (8) + 32-byte home space
+ mov r12, QWORD PTR [rsp+72] ; r12 = arg6: pointer to a 16-byte block (presumably the hash key H - confirm)
+ mov r14, QWORD PTR [rsp+80] ; r14 = arg7: pointer to a 16-byte block XORed into the result (presumably the encrypted initial counter - confirm)
+ sub rsp, 144 ; frame: 16 bytes of scratch at [rsp] + 8 * 16 bytes of xmm save area at [rsp+16]
+ vmovdqu OWORD PTR [rsp+16], xmm6 ; save xmm6..xmm13 (nonvolatile in the Microsoft x64 convention); unaligned stores, so rsp alignment does not matter
+ vmovdqu OWORD PTR [rsp+32], xmm7
+ vmovdqu OWORD PTR [rsp+48], xmm8
+ vmovdqu OWORD PTR [rsp+64], xmm9
+ vmovdqu OWORD PTR [rsp+80], xmm10
+ vmovdqu OWORD PTR [rsp+96], xmm11
+ vmovdqu OWORD PTR [rsp+112], xmm12 ; NOTE(review): xmm12 is not referenced anywhere else in this procedure
+ vmovdqu OWORD PTR [rsp+128], xmm13 ; NOTE(review): xmm13 is not referenced anywhere else in this procedure
+ vmovdqa xmm4, OWORD PTR [rax] ; xmm4 = running state; aligned load, so arg1 must be 16-byte aligned
+ vmovdqa xmm5, OWORD PTR [r12] ; xmm5 = block at arg6; aligned load, so arg6 must be 16-byte aligned
+ vmovdqa xmm6, OWORD PTR [r14] ; xmm6 = block at arg7; aligned load, so arg7 must be 16-byte aligned
+ vpsrlq xmm8, xmm5, 63 ; xmm8 = top bit of each 64-bit half, moved to bit 0
+ vpsllq xmm7, xmm5, 1 ; xmm7 = each 64-bit half shifted left by one
+ vpslldq xmm8, xmm8, 8 ; move the low half's carry bit up into the high half (the high half's carry is dropped here)
+ vpor xmm7, xmm7, xmm8 ; xmm7 = xmm5 shifted left by one bit as a single 128-bit value
+ vpshufd xmm5, xmm5, 255 ; broadcast the most significant dword to all four lanes
+ vpsrad xmm5, xmm5, 31 ; arithmetic shift: all-ones if bit 127 was set, otherwise zero
+ vpand xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_mod2_128 ; keep the L_vaes_aes_gcm_mod2_128 constant only when bit 127 was set
+ vpxor xmm5, xmm5, xmm7 ; xmm5 = (xmm5 << 1) with the constant folded in for the shifted-out bit
+ mov edx, r10d ; 32-bit move zero-extends arg4 into rdx
+ mov ecx, r11d ; 32-bit move zero-extends arg5 into rcx
+ shl rdx, 3 ; rdx = arg4 * 8 (64-bit result, so no truncation)
+ shl rcx, 3 ; rcx = arg5 * 8 (64-bit result, so no truncation)
+ vmovq xmm0, rdx ; low qword of xmm0 = arg4 * 8
+ vmovq xmm1, rcx ; low qword of xmm1 = arg5 * 8
+ vpunpcklqdq xmm0, xmm0, xmm1 ; xmm0 = { low: arg4 * 8, high: arg5 * 8 }
+ vpxor xmm4, xmm4, xmm0 ; XOR the two scaled lengths into the running state
+ ; ghash_gfmul_red_avx: xmm4 = carry-less product of xmm4 and xmm5, folded back to 128 bits with L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm7, xmm4, xmm5, 0 ; xmm7 = low(xmm4) * low(xmm5)
+ vpclmulqdq xmm8, xmm4, xmm5, 1 ; xmm8 = high(xmm4) * low(xmm5)
+ vpclmulqdq xmm9, xmm4, xmm5, 16 ; xmm9 = low(xmm4) * high(xmm5)
+ vpclmulqdq xmm10, xmm4, xmm5, 17 ; xmm10 = high(xmm4) * high(xmm5)
+ vpxor xmm8, xmm8, xmm9 ; xmm8 = sum of the two cross products (middle term)
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 ; xmm9 = reduction constant (aligned load from the data segment)
+ vpclmulqdq xmm11, xmm9, xmm7, 1 ; first fold: high(constant) * low(low product)
+ vpshufd xmm7, xmm7, 78 ; swap the 64-bit halves of the low product
+ vpxor xmm8, xmm8, xmm11 ; accumulate the fold into the middle term
+ vpxor xmm8, xmm8, xmm7 ; accumulate the swapped low product into the middle term
+ vpclmulqdq xmm11, xmm9, xmm8, 1 ; second fold: high(constant) * low(middle term)
+ vpshufd xmm8, xmm8, 78 ; swap the 64-bit halves of the middle term
+ vpxor xmm10, xmm10, xmm11 ; accumulate the fold into the high product
+ vpxor xmm10, xmm10, xmm8 ; xmm10 = reduced 128-bit result
+ vmovdqa xmm4, xmm10 ; xmm4 = reduced product
+ vpshufb xmm4, xmm4, OWORD PTR L_vaes_aes_gcm_bswap_mask ; permute bytes with L_vaes_aes_gcm_bswap_mask (a byte reversal, judging by the mask name - confirm)
+ vpxor xmm0, xmm4, xmm6 ; xmm0 = final 16-byte value: permuted product XOR block from arg7
+ cmp r8d, 16 ; arg3 (r8d) = number of output bytes requested
+ je L_AES_GCM_encrypt_final_vaes_store_tag_16 ; exactly 16 bytes: take the single-store path
+ xor rcx, rcx ; rcx = 0: byte index for the partial copy
+ vmovdqu OWORD PTR [rsp], xmm0 ; spill the value to the 16-byte scratch slot so it can be read byte by byte
+L_AES_GCM_encrypt_final_vaes_store_tag_loop: ; NOTE(review): body runs before the test, so r8d == 0 never matches and r8d > 16 reads past the 16-byte scratch slot - confirm callers restrict the length
+ movzx r13d, BYTE PTR [rsp+rcx] ; load one byte from the scratch slot
+ mov BYTE PTR [r9+rcx], r13b ; store it to the output buffer
+ inc ecx ; next index (32-bit write also clears the upper half of rcx)
+ cmp ecx, r8d ; copied r8d bytes yet?
+ jne L_AES_GCM_encrypt_final_vaes_store_tag_loop ; no: keep copying
+ jmp L_AES_GCM_encrypt_final_vaes_store_tag_done ; partial copy finished, skip the 16-byte store
+L_AES_GCM_encrypt_final_vaes_store_tag_16:
+ vmovdqu OWORD PTR [r9], xmm0 ; store all 16 bytes at once; unaligned store, so arg2 needs no alignment
+L_AES_GCM_encrypt_final_vaes_store_tag_done:
+ vzeroupper ; zero the upper halves of all ymm registers before returning
+ vmovdqu xmm6, OWORD PTR [rsp+16] ; restore xmm6..xmm13 from the save area
+ vmovdqu xmm7, OWORD PTR [rsp+32]
+ vmovdqu xmm8, OWORD PTR [rsp+48]
+ vmovdqu xmm9, OWORD PTR [rsp+64]
+ vmovdqu xmm10, OWORD PTR [rsp+80]
+ vmovdqu xmm11, OWORD PTR [rsp+96]
+ vmovdqu xmm12, OWORD PTR [rsp+112]
+ vmovdqu xmm13, OWORD PTR [rsp+128]
+ add rsp, 144 ; release the scratch slot and the xmm save area
+ pop r14 ; restore GPRs in reverse push order
+ pop r12
+ pop r13
+ ret ; no return value is set up; all results are written through arg2
+AES_GCM_encrypt_final_vaes ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_decrypt_update_vaes PROC
+ push r13
+ push r12
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbx
+ mov rax, rcx
+ mov r10, r8
+ mov r8d, edx
+ mov r11, r9
+ mov r9d, DWORD PTR [rsp+96]
+ mov r12, QWORD PTR [rsp+104]
+ mov r14, QWORD PTR [rsp+112]
+ mov r15, QWORD PTR [rsp+120]
+ sub rsp, 688
+ vmovdqu OWORD PTR [rsp+528], xmm6
+ vmovdqu OWORD PTR [rsp+544], xmm7
+ vmovdqu OWORD PTR [rsp+560], xmm8
+ vmovdqu OWORD PTR [rsp+576], xmm9
+ vmovdqu OWORD PTR [rsp+592], xmm10
+ vmovdqu OWORD PTR [rsp+608], xmm11
+ vmovdqu OWORD PTR [rsp+624], xmm12
+ vmovdqu OWORD PTR [rsp+640], xmm13
+ vmovdqu OWORD PTR [rsp+656], xmm14
+ vmovdqu OWORD PTR [rsp+672], xmm15
+ vmovdqa xmm15, OWORD PTR [r12]
+ vmovdqa xmm6, OWORD PTR [r14]
+ vpsrlq xmm8, xmm6, 63
+ vpsllq xmm7, xmm6, 1
+ vpslldq xmm8, xmm8, 8
+ vpor xmm7, xmm7, xmm8
+ vpshufd xmm6, xmm6, 255
+ vpsrad xmm6, xmm6, 31
+ vpand xmm6, xmm6, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpxor xmm6, xmm6, xmm7
+ xor edi, edi
+ cmp r9d, 128
+ jl L_AES_GCM_decrypt_update_vaes_done_128
+ vmovdqa xmm2, xmm15
+ ; H ^ 1
+ vmovdqu OWORD PTR [rsp], xmm6
+ ; H ^ 2
+ vpclmulqdq xmm7, xmm6, xmm6, 0
+ vpclmulqdq xmm10, xmm6, xmm6, 17
+ vpxor xmm8, xmm8, xmm8
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm0, xmm10
+ vmovdqu OWORD PTR [rsp+16], xmm0
+ ; H ^ 3
+ ; ghash_gfmul_red_avx
+ vpclmulqdq xmm7, xmm0, xmm6, 0
+ vpclmulqdq xmm8, xmm0, xmm6, 1
+ vpclmulqdq xmm9, xmm0, xmm6, 16
+ vpclmulqdq xmm10, xmm0, xmm6, 17
+ vpxor xmm8, xmm8, xmm9
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm1, xmm10
+ vmovdqu OWORD PTR [rsp+32], xmm1
+ ; H ^ 4
+ vpclmulqdq xmm7, xmm0, xmm0, 0
+ vpclmulqdq xmm10, xmm0, xmm0, 17
+ vpxor xmm8, xmm8, xmm8
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm3, xmm10
+ vmovdqu OWORD PTR [rsp+48], xmm3
+ ; H ^ 5
+ ; ghash_gfmul_red_avx
+ vpclmulqdq xmm7, xmm1, xmm0, 0
+ vpclmulqdq xmm8, xmm1, xmm0, 1
+ vpclmulqdq xmm9, xmm1, xmm0, 16
+ vpclmulqdq xmm10, xmm1, xmm0, 17
+ vpxor xmm8, xmm8, xmm9
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+64], xmm4
+ ; H ^ 6
+ vpclmulqdq xmm7, xmm1, xmm1, 0
+ vpclmulqdq xmm10, xmm1, xmm1, 17
+ vpxor xmm8, xmm8, xmm8
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm4
+ ; H ^ 7
+ ; ghash_gfmul_red_avx
+ vpclmulqdq xmm7, xmm3, xmm1, 0
+ vpclmulqdq xmm8, xmm3, xmm1, 1
+ vpclmulqdq xmm9, xmm3, xmm1, 16
+ vpclmulqdq xmm10, xmm3, xmm1, 17
+ vpxor xmm8, xmm8, xmm9
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+96], xmm4
+ ; H ^ 8
+ vpclmulqdq xmm7, xmm3, xmm3, 0
+ vpclmulqdq xmm10, xmm3, xmm3, 17
+ vpxor xmm8, xmm8, xmm8
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+112], xmm4
+ cmp r9d, 256
+ jl L_AES_GCM_decrypt_update_vaes_no_ext
+ ; H ^ 9
+ vmovdqu xmm0, OWORD PTR [rsp+48]
+ vmovdqu xmm1, OWORD PTR [rsp+64]
+ ; ghash_gfmul_red_avx
+ vpclmulqdq xmm7, xmm1, xmm0, 0
+ vpclmulqdq xmm8, xmm1, xmm0, 1
+ vpclmulqdq xmm9, xmm1, xmm0, 16
+ vpclmulqdq xmm10, xmm1, xmm0, 17
+ vpxor xmm8, xmm8, xmm9
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+128], xmm4
+ ; H ^ 10
+ vmovdqu xmm0, OWORD PTR [rsp+64]
+ vpclmulqdq xmm7, xmm0, xmm0, 0
+ vpclmulqdq xmm10, xmm0, xmm0, 17
+ vpxor xmm8, xmm8, xmm8
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+144], xmm4
+ ; H ^ 11
+ vmovdqu xmm0, OWORD PTR [rsp+64]
+ vmovdqu xmm1, OWORD PTR [rsp+80]
+ ; ghash_gfmul_red_avx
+ vpclmulqdq xmm7, xmm1, xmm0, 0
+ vpclmulqdq xmm8, xmm1, xmm0, 1
+ vpclmulqdq xmm9, xmm1, xmm0, 16
+ vpclmulqdq xmm10, xmm1, xmm0, 17
+ vpxor xmm8, xmm8, xmm9
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+160], xmm4
+ ; H ^ 12
+ vmovdqu xmm0, OWORD PTR [rsp+80]
+ vpclmulqdq xmm7, xmm0, xmm0, 0
+ vpclmulqdq xmm10, xmm0, xmm0, 17
+ vpxor xmm8, xmm8, xmm8
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+176], xmm4
+ ; H ^ 13
+ vmovdqu xmm0, OWORD PTR [rsp+80]
+ vmovdqu xmm1, OWORD PTR [rsp+96]
+ ; ghash_gfmul_red_avx
+ vpclmulqdq xmm7, xmm1, xmm0, 0
+ vpclmulqdq xmm8, xmm1, xmm0, 1
+ vpclmulqdq xmm9, xmm1, xmm0, 16
+ vpclmulqdq xmm10, xmm1, xmm0, 17
+ vpxor xmm8, xmm8, xmm9
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+192], xmm4
+ ; H ^ 14
+ vmovdqu xmm0, OWORD PTR [rsp+96]
+ vpclmulqdq xmm7, xmm0, xmm0, 0
+ vpclmulqdq xmm10, xmm0, xmm0, 17
+ vpxor xmm8, xmm8, xmm8
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+208], xmm4
+ ; H ^ 15
+ vmovdqu xmm0, OWORD PTR [rsp+96]
+ vmovdqu xmm1, OWORD PTR [rsp+112]
+ ; ghash_gfmul_red_avx
+ vpclmulqdq xmm7, xmm1, xmm0, 0
+ vpclmulqdq xmm8, xmm1, xmm0, 1
+ vpclmulqdq xmm9, xmm1, xmm0, 16
+ vpclmulqdq xmm10, xmm1, xmm0, 17
+ vpxor xmm8, xmm8, xmm9
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+224], xmm4
+ ; H ^ 16
+ vmovdqu xmm0, OWORD PTR [rsp+112]
+ vpclmulqdq xmm7, xmm0, xmm0, 0
+ vpclmulqdq xmm10, xmm0, xmm0, 17
+ vpxor xmm8, xmm8, xmm8
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ vmovdqu OWORD PTR [rsp+240], xmm4
+ vmovdqu ymm7, YMMWORD PTR [rsp+224]
+ vpermq ymm7, ymm7, 78
+ vmovdqu ymm8, YMMWORD PTR [rsp+192]
+ vpermq ymm8, ymm8, 78
+ vmovdqu ymm9, YMMWORD PTR [rsp+160]
+ vpermq ymm9, ymm9, 78
+ vmovdqu ymm10, YMMWORD PTR [rsp+128]
+ vpermq ymm10, ymm10, 78
+ vmovdqu YMMWORD PTR [rsp+256], ymm7
+ vmovdqu YMMWORD PTR [rsp+288], ymm8
+ vmovdqu YMMWORD PTR [rsp+320], ymm9
+ vmovdqu YMMWORD PTR [rsp+352], ymm10
+ vmovdqu ymm7, YMMWORD PTR [rsp+96]
+ vpermq ymm7, ymm7, 78
+ vmovdqu ymm8, YMMWORD PTR [rsp+64]
+ vpermq ymm8, ymm8, 78
+ vmovdqu ymm9, YMMWORD PTR [rsp+32]
+ vpermq ymm9, ymm9, 78
+ vmovdqu ymm10, YMMWORD PTR [rsp]
+ vpermq ymm10, ymm10, 78
+ vmovdqu YMMWORD PTR [rsp+384], ymm7
+ vmovdqu YMMWORD PTR [rsp+416], ymm8
+ vmovdqu YMMWORD PTR [rsp+448], ymm9
+ vmovdqu YMMWORD PTR [rsp+480], ymm10
+L_AES_GCM_decrypt_update_vaes_no_ext:
+ vbroadcasti128 ymm14, ptr_L_vaes_aes_gcm_mod2_128
+ cmp r9d, 256
+ jl L_AES_GCM_decrypt_update_vaes_after_256
+ mov r13d, r9d
+ and r13d, 4294967040
+L_AES_GCM_decrypt_update_vaes_loop_256:
+ ; 256 bytes of input
+ lea rbx, QWORD PTR [r11+rdi]
+ vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_mask
+ vpxor ymm4, ymm4, ymm4
+ vinserti128 ymm4, ymm4, xmm15, 0
+ vmovdqu ymm7, YMMWORD PTR [rsp+256]
+ vmovdqu ymm8, YMMWORD PTR [rsp+288]
+ vmovdqu ymm9, YMMWORD PTR [rsp+320]
+ vmovdqu ymm10, YMMWORD PTR [rsp+352]
+ vmovdqu ymm5, YMMWORD PTR [rbx]
+ vpshufb ymm5, ymm5, ymm6
+ vpxor ymm5, ymm5, ymm4
+ vpclmulqdq ymm0, ymm5, ymm7, 0
+ vpclmulqdq ymm1, ymm5, ymm7, 1
+ vpclmulqdq ymm2, ymm5, ymm7, 16
+ vpclmulqdq ymm3, ymm5, ymm7, 17
+ vmovdqa ymm11, ymm0
+ vpxor ymm12, ymm2, ymm1
+ vmovdqa ymm13, ymm3
+ vmovdqu ymm5, YMMWORD PTR [rbx+32]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm8, 0
+ vpclmulqdq ymm1, ymm5, ymm8, 1
+ vpclmulqdq ymm2, ymm5, ymm8, 16
+ vpclmulqdq ymm3, ymm5, ymm8, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vmovdqu ymm5, YMMWORD PTR [rbx+64]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm9, 0
+ vpclmulqdq ymm1, ymm5, ymm9, 1
+ vpclmulqdq ymm2, ymm5, ymm9, 16
+ vpclmulqdq ymm3, ymm5, ymm9, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vmovdqu ymm5, YMMWORD PTR [rbx+96]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm10, 0
+ vpclmulqdq ymm1, ymm5, ymm10, 1
+ vpclmulqdq ymm2, ymm5, ymm10, 16
+ vpclmulqdq ymm3, ymm5, ymm10, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vmovdqu ymm7, YMMWORD PTR [rsp+384]
+ vmovdqu ymm8, YMMWORD PTR [rsp+416]
+ vmovdqu ymm9, YMMWORD PTR [rsp+448]
+ vmovdqu ymm10, YMMWORD PTR [rsp+480]
+ vmovdqu ymm5, YMMWORD PTR [rbx+128]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm7, 0
+ vpclmulqdq ymm1, ymm5, ymm7, 1
+ vpclmulqdq ymm2, ymm5, ymm7, 16
+ vpclmulqdq ymm3, ymm5, ymm7, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vmovdqu ymm5, YMMWORD PTR [rbx+160]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm8, 0
+ vpclmulqdq ymm1, ymm5, ymm8, 1
+ vpclmulqdq ymm2, ymm5, ymm8, 16
+ vpclmulqdq ymm3, ymm5, ymm8, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vmovdqu ymm5, YMMWORD PTR [rbx+192]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm9, 0
+ vpclmulqdq ymm1, ymm5, ymm9, 1
+ vpclmulqdq ymm2, ymm5, ymm9, 16
+ vpclmulqdq ymm3, ymm5, ymm9, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vmovdqu ymm5, YMMWORD PTR [rbx+224]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm10, 0
+ vpclmulqdq ymm1, ymm5, ymm10, 1
+ vpclmulqdq ymm2, ymm5, ymm10, 16
+ vpclmulqdq ymm3, ymm5, ymm10, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vpclmulqdq ymm5, ymm14, ymm11, 1
+ vpshufd ymm11, ymm11, 78
+ vpxor ymm12, ymm12, ymm5
+ vpxor ymm12, ymm12, ymm11
+ vpclmulqdq ymm5, ymm14, ymm12, 1
+ vpshufd ymm12, ymm12, 78
+ vpxor ymm13, ymm13, ymm5
+ vpxor ymm13, ymm13, ymm12
+ vextracti128 xmm0, ymm13, 1
+ vpxor xmm15, xmm13, xmm0
+ vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64
+ vbroadcasti128 ymm4, [r15]
+ vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0
+ vpshufb ymm0, ymm0, ymm6
+ vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1
+ vpshufb ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y2
+ vpshufb ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y3
+ vpshufb ymm3, ymm3, ymm6
+ vmovdqu xmm7, OWORD PTR [r15]
+ vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight
+ vmovdqu OWORD PTR [r15], xmm7
+ vbroadcasti128 ymm4, [rax]
+ vpxor ymm0, ymm0, ymm4
+ vpxor ymm1, ymm1, ymm4
+ vpxor ymm2, ymm2, ymm4
+ vpxor ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+16]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+32]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+48]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+64]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+80]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+96]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+112]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+128]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+144]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ cmp r8d, 11
+ vbroadcasti128 ymm4, [rax+160]
+ jl L_AES_GCM_decrypt_update_vaes_l1_vaes_ctr8_last
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+176]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ cmp r8d, 13
+ vbroadcasti128 ymm4, [rax+192]
+ jl L_AES_GCM_decrypt_update_vaes_l1_vaes_ctr8_last
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+208]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+224]
+L_AES_GCM_decrypt_update_vaes_l1_vaes_ctr8_last:
+ vaesenclast ymm0, ymm0, ymm4
+ vaesenclast ymm1, ymm1, ymm4
+ vaesenclast ymm2, ymm2, ymm4
+ vaesenclast ymm3, ymm3, ymm4
+ lea rcx, QWORD PTR [r11+rdi]
+ lea rdx, QWORD PTR [r10+rdi]
+ vmovdqu ymm5, YMMWORD PTR [rcx]
+ vpxor ymm0, ymm0, ymm5
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ vmovdqu ymm5, YMMWORD PTR [rcx+32]
+ vpxor ymm1, ymm1, ymm5
+ vmovdqu YMMWORD PTR [rdx+32], ymm1
+ vmovdqu ymm5, YMMWORD PTR [rcx+64]
+ vpxor ymm2, ymm2, ymm5
+ vmovdqu YMMWORD PTR [rdx+64], ymm2
+ vmovdqu ymm5, YMMWORD PTR [rcx+96]
+ vpxor ymm3, ymm3, ymm5
+ vmovdqu YMMWORD PTR [rdx+96], ymm3
+ add edi, 128
+ vbroadcasti128 ymm4, [r15]
+ vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0
+ vpshufb ymm0, ymm0, ymm6
+ vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1
+ vpshufb ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y2
+ vpshufb ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y3
+ vpshufb ymm3, ymm3, ymm6
+ vmovdqu xmm7, OWORD PTR [r15]
+ vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight
+ vmovdqu OWORD PTR [r15], xmm7
+ vbroadcasti128 ymm4, [rax]
+ vpxor ymm0, ymm0, ymm4
+ vpxor ymm1, ymm1, ymm4
+ vpxor ymm2, ymm2, ymm4
+ vpxor ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+16]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+32]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+48]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+64]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+80]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+96]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+112]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+128]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+144]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ cmp r8d, 11
+ vbroadcasti128 ymm4, [rax+160]
+ jl L_AES_GCM_decrypt_update_vaes_l2_vaes_ctr8_last
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+176]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ cmp r8d, 13
+ vbroadcasti128 ymm4, [rax+192]
+ jl L_AES_GCM_decrypt_update_vaes_l2_vaes_ctr8_last
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+208]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+224]
+L_AES_GCM_decrypt_update_vaes_l2_vaes_ctr8_last:
+ vaesenclast ymm0, ymm0, ymm4
+ vaesenclast ymm1, ymm1, ymm4
+ vaesenclast ymm2, ymm2, ymm4
+ vaesenclast ymm3, ymm3, ymm4
+ lea rcx, QWORD PTR [r11+rdi]
+ lea rdx, QWORD PTR [r10+rdi]
+ vmovdqu ymm5, YMMWORD PTR [rcx]
+ vpxor ymm0, ymm0, ymm5
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ vmovdqu ymm5, YMMWORD PTR [rcx+32]
+ vpxor ymm1, ymm1, ymm5
+ vmovdqu YMMWORD PTR [rdx+32], ymm1
+ vmovdqu ymm5, YMMWORD PTR [rcx+64]
+ vpxor ymm2, ymm2, ymm5
+ vmovdqu YMMWORD PTR [rdx+64], ymm2
+ vmovdqu ymm5, YMMWORD PTR [rcx+96]
+ vpxor ymm3, ymm3, ymm5
+ vmovdqu YMMWORD PTR [rdx+96], ymm3
+ add edi, 128
+ cmp edi, r13d
+ jl L_AES_GCM_decrypt_update_vaes_loop_256
+L_AES_GCM_decrypt_update_vaes_after_256:
+ vmovdqu ymm7, YMMWORD PTR [rsp+96]
+ vpermq ymm7, ymm7, 78
+ vmovdqu ymm8, YMMWORD PTR [rsp+64]
+ vpermq ymm8, ymm8, 78
+ vmovdqu ymm9, YMMWORD PTR [rsp+32]
+ vpermq ymm9, ymm9, 78
+ vmovdqu ymm10, YMMWORD PTR [rsp]
+ vpermq ymm10, ymm10, 78
+ mov r13d, r9d
+ and r13d, 4294967168
+ cmp edi, r13d
+ jge L_AES_GCM_decrypt_update_vaes_after_128
+ ; 128 bytes of input
+ lea rbx, QWORD PTR [r11+rdi]
+ vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_mask
+ vpxor ymm4, ymm4, ymm4
+ vinserti128 ymm4, ymm4, xmm15, 0
+ vmovdqu ymm5, YMMWORD PTR [rbx]
+ vpshufb ymm5, ymm5, ymm6
+ vpxor ymm5, ymm5, ymm4
+ vpclmulqdq ymm0, ymm5, ymm7, 0
+ vpclmulqdq ymm1, ymm5, ymm7, 1
+ vpclmulqdq ymm2, ymm5, ymm7, 16
+ vpclmulqdq ymm3, ymm5, ymm7, 17
+ vmovdqa ymm11, ymm0
+ vpxor ymm12, ymm2, ymm1
+ vmovdqa ymm13, ymm3
+ vmovdqu ymm5, YMMWORD PTR [rbx+32]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm8, 0
+ vpclmulqdq ymm1, ymm5, ymm8, 1
+ vpclmulqdq ymm2, ymm5, ymm8, 16
+ vpclmulqdq ymm3, ymm5, ymm8, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vmovdqu ymm5, YMMWORD PTR [rbx+64]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm9, 0
+ vpclmulqdq ymm1, ymm5, ymm9, 1
+ vpclmulqdq ymm2, ymm5, ymm9, 16
+ vpclmulqdq ymm3, ymm5, ymm9, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vmovdqu ymm5, YMMWORD PTR [rbx+96]
+ vpshufb ymm5, ymm5, ymm6
+ vpclmulqdq ymm0, ymm5, ymm10, 0
+ vpclmulqdq ymm1, ymm5, ymm10, 1
+ vpclmulqdq ymm2, ymm5, ymm10, 16
+ vpclmulqdq ymm3, ymm5, ymm10, 17
+ vpxor ymm11, ymm11, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm12, ymm12, ymm2
+ vpxor ymm13, ymm13, ymm3
+ vpclmulqdq ymm5, ymm14, ymm11, 1
+ vpshufd ymm11, ymm11, 78
+ vpxor ymm12, ymm12, ymm5
+ vpxor ymm12, ymm12, ymm11
+ vpclmulqdq ymm5, ymm14, ymm12, 1
+ vpshufd ymm12, ymm12, 78
+ vpxor ymm13, ymm13, ymm5
+ vpxor ymm13, ymm13, ymm12
+ vextracti128 xmm0, ymm13, 1
+ vpxor xmm15, xmm13, xmm0
+ vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64
+ vbroadcasti128 ymm4, [r15]
+ vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0
+ vpshufb ymm0, ymm0, ymm6
+ vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1
+ vpshufb ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y2
+ vpshufb ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y3
+ vpshufb ymm3, ymm3, ymm6
+ vmovdqu xmm7, OWORD PTR [r15]
+ vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight
+ vmovdqu OWORD PTR [r15], xmm7
+ vbroadcasti128 ymm4, [rax]
+ vpxor ymm0, ymm0, ymm4
+ vpxor ymm1, ymm1, ymm4
+ vpxor ymm2, ymm2, ymm4
+ vpxor ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+16]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+32]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+48]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+64]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+80]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+96]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+112]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+128]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+144]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ cmp r8d, 11
+ vbroadcasti128 ymm4, [rax+160]
+ jl L_AES_GCM_decrypt_update_vaes_t_vaes_ctr8_last
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+176]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ cmp r8d, 13
+ vbroadcasti128 ymm4, [rax+192]
+ jl L_AES_GCM_decrypt_update_vaes_t_vaes_ctr8_last
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+208]
+ vaesenc ymm0, ymm0, ymm4
+ vaesenc ymm1, ymm1, ymm4
+ vaesenc ymm2, ymm2, ymm4
+ vaesenc ymm3, ymm3, ymm4
+ vbroadcasti128 ymm4, [rax+224]
+L_AES_GCM_decrypt_update_vaes_t_vaes_ctr8_last:
+ vaesenclast ymm0, ymm0, ymm4
+ vaesenclast ymm1, ymm1, ymm4
+ vaesenclast ymm2, ymm2, ymm4
+ vaesenclast ymm3, ymm3, ymm4
+ lea rcx, QWORD PTR [r11+rdi]
+ lea rdx, QWORD PTR [r10+rdi]
+ vmovdqu ymm5, YMMWORD PTR [rcx]
+ vpxor ymm0, ymm0, ymm5
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ vmovdqu ymm5, YMMWORD PTR [rcx+32]
+ vpxor ymm1, ymm1, ymm5
+ vmovdqu YMMWORD PTR [rdx+32], ymm1
+ vmovdqu ymm5, YMMWORD PTR [rcx+64]
+ vpxor ymm2, ymm2, ymm5
+ vmovdqu YMMWORD PTR [rdx+64], ymm2
+ vmovdqu ymm5, YMMWORD PTR [rcx+96]
+ vpxor ymm3, ymm3, ymm5
+ vmovdqu YMMWORD PTR [rdx+96], ymm3
+ add edi, 128
+L_AES_GCM_decrypt_update_vaes_after_128:
+ vmovdqu xmm6, OWORD PTR [rsp]
+L_AES_GCM_decrypt_update_vaes_done_128:
+ mov edx, r9d
+ cmp edi, edx
+ jge L_AES_GCM_decrypt_update_vaes_done_dec
+ mov r13d, r9d
+ and r13d, 4294967280
+ cmp edi, r13d
+ jge L_AES_GCM_decrypt_update_vaes_last_block_done
+L_AES_GCM_decrypt_update_vaes_last_block_start:
+ vmovdqu xmm12, OWORD PTR [r11+rdi]
+ vmovdqa xmm0, xmm6
+ vpshufb xmm1, xmm12, OWORD PTR L_vaes_aes_gcm_bswap_mask
+ vpxor xmm1, xmm1, xmm15
+ vmovdqu xmm8, OWORD PTR [r15]
+ vpshufb xmm7, xmm8, OWORD PTR L_vaes_aes_gcm_bswap_epi64
+ vpaddd xmm8, xmm8, OWORD PTR L_vaes_aes_gcm_one
+ vmovdqu OWORD PTR [r15], xmm8
+ vpxor xmm7, xmm7, [rax]
+ vpclmulqdq xmm9, xmm1, xmm0, 16
+ vaesenc xmm7, xmm7, [rax+16]
+ vaesenc xmm7, xmm7, [rax+32]
+ vpclmulqdq xmm10, xmm1, xmm0, 1
+ vaesenc xmm7, xmm7, [rax+48]
+ vaesenc xmm7, xmm7, [rax+64]
+ vpclmulqdq xmm11, xmm1, xmm0, 0
+ vaesenc xmm7, xmm7, [rax+80]
+ vpclmulqdq xmm1, xmm1, xmm0, 17
+ vaesenc xmm7, xmm7, [rax+96]
+ vpxor xmm9, xmm9, xmm10
+ vpslldq xmm2, xmm9, 8
+ vpsrldq xmm9, xmm9, 8
+ vaesenc xmm7, xmm7, [rax+112]
+ vpxor xmm2, xmm2, xmm11
+ vpxor xmm3, xmm1, xmm9
+ vmovdqa xmm0, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm10, xmm2, xmm0, 16
+ vaesenc xmm7, xmm7, [rax+128]
+ vpshufd xmm9, xmm2, 78
+ vpxor xmm9, xmm9, xmm10
+ vpclmulqdq xmm10, xmm9, xmm0, 16
+ vaesenc xmm7, xmm7, [rax+144]
+ vpshufd xmm9, xmm9, 78
+ vpxor xmm9, xmm9, xmm10
+ vpxor xmm15, xmm9, xmm3
+ cmp r8d, 11
+ vmovdqa xmm8, OWORD PTR [rax+160]
+ jl L_AES_GCM_decrypt_update_vaes_aesenc_gfmul_last
+ vaesenc xmm7, xmm7, xmm8
+ vaesenc xmm7, xmm7, [rax+176]
+ cmp r8d, 13
+ vmovdqa xmm8, OWORD PTR [rax+192]
+ jl L_AES_GCM_decrypt_update_vaes_aesenc_gfmul_last
+ vaesenc xmm7, xmm7, xmm8
+ vaesenc xmm7, xmm7, [rax+208]
+ vmovdqa xmm8, OWORD PTR [rax+224]
+L_AES_GCM_decrypt_update_vaes_aesenc_gfmul_last:
+ vaesenclast xmm7, xmm7, xmm8
+ vmovdqa xmm0, xmm12
+ vpxor xmm7, xmm7, xmm0
+ vmovdqu OWORD PTR [r10+rdi], xmm7
+ add edi, 16
+ cmp edi, r13d
+ jl L_AES_GCM_decrypt_update_vaes_last_block_start
+L_AES_GCM_decrypt_update_vaes_last_block_done:
+L_AES_GCM_decrypt_update_vaes_done_dec:
+ vmovdqa OWORD PTR [r12], xmm15
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp+528]
+ vmovdqu xmm7, OWORD PTR [rsp+544]
+ vmovdqu xmm8, OWORD PTR [rsp+560]
+ vmovdqu xmm9, OWORD PTR [rsp+576]
+ vmovdqu xmm10, OWORD PTR [rsp+592]
+ vmovdqu xmm11, OWORD PTR [rsp+608]
+ vmovdqu xmm12, OWORD PTR [rsp+624]
+ vmovdqu xmm13, OWORD PTR [rsp+640]
+ vmovdqu xmm14, OWORD PTR [rsp+656]
+ vmovdqu xmm15, OWORD PTR [rsp+672]
+ add rsp, 688
+ pop rbx
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r12
+ pop r13
+ ret
+AES_GCM_decrypt_update_vaes ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_decrypt_final_vaes PROC ; Finish a GCM decrypt: fold the length block into the hash, form the tag, compare it with the caller's tag, store 1 (match) or 0 through arg8
+ push r13
+ push r12
+ push r14
+ push rbp
+ push r15 ; five pushes (40 bytes) + return address + 32-byte home area put arg5 at [rsp+80] (Microsoft x64 convention)
+ mov rax, rcx ; arg1: pointer to the 16-byte running hash value (loaded into xmm6 below)
+ mov r10d, r9d ; arg4: first 32-bit length (presumably the ciphertext byte count - confirm against the C prototype)
+ mov r9, rdx ; arg2: pointer to the tag supplied by the caller
+ mov r11d, DWORD PTR [rsp+80] ; arg5: second 32-bit length (presumably the AAD byte count - confirm against the C prototype)
+ mov r12, QWORD PTR [rsp+88] ; arg6: pointer to the 16-byte hash multiplier (presumably the GHASH key H)
+ mov r14, QWORD PTR [rsp+96] ; arg7: pointer to the 16-byte block XORed into the hash to form the tag
+ mov rbp, QWORD PTR [rsp+104] ; arg8: pointer to the 32-bit result flag
+ sub rsp, 160 ; frame for the non-volatile xmm saves below
+ vmovdqu OWORD PTR [rsp+16], xmm6
+ vmovdqu OWORD PTR [rsp+32], xmm7
+ vmovdqu OWORD PTR [rsp+48], xmm8
+ vmovdqu OWORD PTR [rsp+64], xmm9
+ vmovdqu OWORD PTR [rsp+80], xmm10
+ vmovdqu OWORD PTR [rsp+96], xmm11
+ vmovdqu OWORD PTR [rsp+112], xmm12
+ vmovdqu OWORD PTR [rsp+128], xmm13
+ vmovdqu OWORD PTR [rsp+144], xmm15 ; xmm14 is never written in this routine, so it is not saved
+ vmovdqa xmm6, OWORD PTR [rax] ; aligned load: the arg1 buffer must be 16-byte aligned
+ vmovdqa xmm5, OWORD PTR [r12] ; aligned load: the arg6 buffer must be 16-byte aligned
+ vmovdqa xmm15, OWORD PTR [r14] ; aligned load: the arg7 buffer must be 16-byte aligned
+ vpsrlq xmm8, xmm5, 63 ; bit shifted out of each qword by the left shift below
+ vpsllq xmm7, xmm5, 1 ; each qword shifted left by one
+ vpslldq xmm8, xmm8, 8 ; carry of the low qword moves into the high qword
+ vpor xmm7, xmm7, xmm8 ; xmm7 = multiplier << 1 as a 128-bit value
+ vpshufd xmm5, xmm5, 255 ; broadcast the top dword of the multiplier
+ vpsrad xmm5, xmm5, 31 ; all-ones when bit 127 of the multiplier was set, else zero
+ vpand xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_mod2_128 ; keep the reduction constant only when bit 127 was shifted out
+ vpxor xmm5, xmm5, xmm7 ; xmm5 = doubled multiplier with conditional reduction
+ mov edx, r10d ; 32-bit move zero-extends arg4 into rdx
+ mov ecx, r11d ; 32-bit move zero-extends arg5 into rcx
+ shl rdx, 3 ; multiply by 8 (byte count -> bit count)
+ shl rcx, 3 ; multiply by 8 (byte count -> bit count)
+ vmovq xmm0, rdx
+ vmovq xmm1, rcx
+ vpunpcklqdq xmm0, xmm0, xmm1 ; xmm0 = { low qword: arg4*8, high qword: arg5*8 }
+ vpxor xmm6, xmm6, xmm0 ; fold the length block into the running hash
+ ; ghash_gfmul_red_avx
+ vpclmulqdq xmm7, xmm6, xmm5, 0 ; low(xmm6) x low(xmm5)
+ vpclmulqdq xmm8, xmm6, xmm5, 1 ; high(xmm6) x low(xmm5)
+ vpclmulqdq xmm9, xmm6, xmm5, 16 ; low(xmm6) x high(xmm5)
+ vpclmulqdq xmm10, xmm6, xmm5, 17 ; high(xmm6) x high(xmm5)
+ vpxor xmm8, xmm8, xmm9 ; combined middle terms
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 ; reduction constant
+ vpclmulqdq xmm11, xmm9, xmm7, 1 ; first fold: high(constant) x low(low product)
+ vpshufd xmm7, xmm7, 78 ; swap the qwords of the low product
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7 ; middle terms now include the folded low product
+ vpclmulqdq xmm11, xmm9, xmm8, 1 ; second fold: high(constant) x low(middle)
+ vpshufd xmm8, xmm8, 78 ; swap the qwords of the middle value
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8 ; xmm10 = reduced 128-bit product
+ vmovdqa xmm6, xmm10
+ vpshufb xmm6, xmm6, OWORD PTR L_vaes_aes_gcm_bswap_mask ; reorder the hash bytes with the bswap mask
+ vpxor xmm0, xmm6, xmm15 ; xmm0 = computed tag (hash XOR the arg7 block)
+ cmp r8d, 16 ; arg3: tag length in bytes; 16 takes the vector compare path
+ je L_AES_GCM_decrypt_final_vaes_cmp_tag_16
+ sub rsp, 16 ; 16-byte temporary holding the computed tag for byte access
+ xor rcx, rcx ; byte index = 0
+ xor r15, r15 ; accumulated difference = 0
+ vmovdqu OWORD PTR [rsp], xmm0
+L_AES_GCM_decrypt_final_vaes_cmp_tag_loop:
+ movzx r13d, BYTE PTR [rsp+rcx] ; computed tag byte
+ xor r13b, BYTE PTR [r9+rcx] ; difference from the caller's tag byte
+ or r15b, r13b ; accumulate differences; the loop has no early exit on mismatch
+ inc ecx
+ cmp ecx, r8d ; NOTE(review): bound is tested after the first byte and only 16 bytes were spilled - assumes 1 <= r8d <= 16, confirm callers enforce it
+ jne L_AES_GCM_decrypt_final_vaes_cmp_tag_loop
+ cmp r15b, 0
+ sete r15b ; r15 = 1 when every compared byte matched, else 0 (upper bits were zeroed above)
+ add rsp, 16 ; release the temporary
+ xor rcx, rcx
+ jmp L_AES_GCM_decrypt_final_vaes_cmp_tag_done
+L_AES_GCM_decrypt_final_vaes_cmp_tag_16:
+ vmovdqu xmm1, OWORD PTR [r9] ; caller's 16-byte tag (unaligned load)
+ vpcmpeqb xmm0, xmm0, xmm1 ; 0xFF in every byte position that matches
+ vpmovmskb rdx, xmm0 ; one mask bit per byte
+ ; edx == 0xFFFF (all 16 bytes equal) => result 1, otherwise result 0
+ xor r15d, r15d
+ cmp edx, 65535
+ sete r15b
+L_AES_GCM_decrypt_final_vaes_cmp_tag_done:
+ mov DWORD PTR [rbp], r15d ; store the result flag through arg8
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp+16] ; restore the saved xmm registers
+ vmovdqu xmm7, OWORD PTR [rsp+32]
+ vmovdqu xmm8, OWORD PTR [rsp+48]
+ vmovdqu xmm9, OWORD PTR [rsp+64]
+ vmovdqu xmm10, OWORD PTR [rsp+80]
+ vmovdqu xmm11, OWORD PTR [rsp+96]
+ vmovdqu xmm12, OWORD PTR [rsp+112]
+ vmovdqu xmm13, OWORD PTR [rsp+128]
+ vmovdqu xmm15, OWORD PTR [rsp+144]
+ add rsp, 160
+ pop r15 ; pops mirror the pushes at entry in reverse order
+ pop rbp
+ pop r14
+ pop r12
+ pop r13
+ ret
+AES_GCM_decrypt_final_vaes ENDP
+_TEXT ENDS
+ENDIF
+IFDEF HAVE_INTEL_AVX512
+_DATA SEGMENT
+ALIGN 16 ; inc_z0: four 16-byte lanes, second qword of each lane = 0, 1, 2, 3 (64 bytes, 16-byte aligned only)
+L_avx512_aes_gcm_inc_z0 QWORD \
+ 0000000000000000h, 0000000000000000h,
+ 0000000000000000h, 0000000000000001h,
+ 0000000000000000h, 0000000000000002h,
+ 0000000000000000h, 0000000000000003h
+ptr_L_avx512_aes_gcm_inc_z0 QWORD L_avx512_aes_gcm_inc_z0 ; qword holding the address of the table above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16 ; inc_z1: same layout, second qword of each lane = 4, 5, 6, 7
+L_avx512_aes_gcm_inc_z1 QWORD \
+ 0000000000000000h, 0000000000000004h,
+ 0000000000000000h, 0000000000000005h,
+ 0000000000000000h, 0000000000000006h,
+ 0000000000000000h, 0000000000000007h
+ptr_L_avx512_aes_gcm_inc_z1 QWORD L_avx512_aes_gcm_inc_z1 ; qword holding the address of the table above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16 ; inc_z2: same layout, second qword of each lane = 8, 9, 10, 11
+L_avx512_aes_gcm_inc_z2 QWORD \
+ 0000000000000000h, 0000000000000008h,
+ 0000000000000000h, 0000000000000009h,
+ 0000000000000000h, 000000000000000ah,
+ 0000000000000000h, 000000000000000bh
+ptr_L_avx512_aes_gcm_inc_z2 QWORD L_avx512_aes_gcm_inc_z2 ; qword holding the address of the table above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16 ; inc_z3: same layout, second qword of each lane = 12, 13, 14, 15
+L_avx512_aes_gcm_inc_z3 QWORD \
+ 0000000000000000h, 000000000000000ch,
+ 0000000000000000h, 000000000000000dh,
+ 0000000000000000h, 000000000000000eh,
+ 0000000000000000h, 000000000000000fh
+ptr_L_avx512_aes_gcm_inc_z3 QWORD L_avx512_aes_gcm_inc_z3 ; qword holding the address of the table above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16 ; sixteen: one 16-byte value, first qword 0, second qword 16 (presumably the counter step after the four inc_z tables - confirm at the use site)
+L_avx512_aes_gcm_sixteen QWORD \
+ 0000000000000000h, 0000000000000010h
+ptr_L_avx512_aes_gcm_sixteen QWORD L_avx512_aes_gcm_sixteen ; qword holding the address of the constant above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16 ; rev8: vpshufb control that reverses all 16 bytes of a register (used by GCM_generate_m0_avx512)
+L_GCM_generate_m0_avx512_rev8 QWORD \
+ 08090a0b0c0d0e0fh, 0001020304050607h
+ptr_L_GCM_generate_m0_avx512_rev8 QWORD L_GCM_generate_m0_avx512_rev8 ; qword holding the address of the mask above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16 ; mod2_128: first qword 0, second qword 0xe1 in its top byte; XORed in by GCM_generate_m0_avx512 when a bit is shifted out
+L_GCM_generate_m0_avx512_mod2_128 QWORD \
+ 0000000000000000h, 0e100000000000000h
+ptr_L_GCM_generate_m0_avx512_mod2_128 QWORD L_GCM_generate_m0_avx512_mod2_128 ; qword holding the address of the constant above
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+GCM_generate_m0_avx512 PROC ; Fill a 512-byte table at [rdx] (32 entries of 16 bytes) derived from the 16-byte value at [rcx]; entries 0-15 are XOR combinations, entries 16-31 are those values shifted by 4 bits
+ sub rsp, 80 ; room to preserve xmm6-xmm10, which this routine overwrites
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu xmm9, OWORD PTR L_GCM_generate_m0_avx512_rev8 ; xmm9 = 16-byte reversal shuffle mask
+ vmovdqu xmm10, OWORD PTR L_GCM_generate_m0_avx512_mod2_128 ; xmm10 = reduction constant (0xe1 in the top byte)
+ vpxor xmm8, xmm8, xmm8
+ vmovdqu xmm0, OWORD PTR [rcx] ; xmm0 = input block (arg1)
+ vmovdqu OWORD PTR [rdx], xmm8 ; entry 0 = all zero
+ vmovdqu xmm8, xmm0 ; NOTE(review): this copy is never read; xmm8 is overwritten by the first vpxor xmm8 below
+ vpshufb xmm0, xmm0, xmm9 ; byte-reverse so the qword shifts below act on one 128-bit integer
+ vpsllq xmm5, xmm0, 63 ; step 1: xmm1 = (xmm0 >> 1 over 128 bits) XOR constant if bit 0 was set
+ vpsrlq xmm4, xmm0, 1
+ vpslldq xmm1, xmm5, 8 ; bit 0 of the low qword lands in bit 127
+ vpsrldq xmm5, xmm5, 8 ; bit 0 of the high qword becomes bit 63 of the low qword
+ vpshufd xmm1, xmm1, 255
+ vpor xmm4, xmm4, xmm5 ; xmm4 = 128-bit right shift by one
+ vpsrad xmm1, xmm1, 31 ; all-ones when the shifted-out bit was 1
+ vpand xmm1, xmm1, xmm10
+ vpxor xmm1, xmm1, xmm4
+ vpsllq xmm5, xmm1, 63 ; step 2: xmm2 = same operation applied to xmm1
+ vpsrlq xmm4, xmm1, 1
+ vpslldq xmm2, xmm5, 8
+ vpsrldq xmm5, xmm5, 8
+ vpshufd xmm2, xmm2, 255
+ vpor xmm4, xmm4, xmm5
+ vpsrad xmm2, xmm2, 31
+ vpand xmm2, xmm2, xmm10
+ vpxor xmm2, xmm2, xmm4
+ vpsllq xmm5, xmm2, 63 ; step 3: xmm3 = same operation applied to xmm2
+ vpsrlq xmm4, xmm2, 1
+ vpslldq xmm3, xmm5, 8
+ vpsrldq xmm5, xmm5, 8
+ vpshufd xmm3, xmm3, 255
+ vpor xmm4, xmm4, xmm5
+ vpsrad xmm3, xmm3, 31
+ vpand xmm3, xmm3, xmm10
+ vpxor xmm3, xmm3, xmm4
+ vpshufb xmm3, xmm3, xmm9 ; reverse all four values back to the input byte order
+ vpshufb xmm2, xmm2, xmm9
+ vpshufb xmm1, xmm1, xmm9
+ vpshufb xmm0, xmm0, xmm9 ; xmm0 is the unmodified input again
+ vpxor xmm8, xmm3, xmm2
+ vmovdqu OWORD PTR [rdx+16], xmm3 ; entry 1 = xmm3
+ vmovdqu OWORD PTR [rdx+32], xmm2 ; entry 2 = xmm2
+ vmovdqu OWORD PTR [rdx+48], xmm8 ; entry 3 = xmm3 ^ xmm2
+ vmovdqu OWORD PTR [rdx+64], xmm1 ; entry 4 = xmm1
+ vpxor xmm4, xmm3, xmm1
+ vpxor xmm5, xmm2, xmm1
+ vpxor xmm6, xmm8, xmm1
+ vmovdqu OWORD PTR [rdx+80], xmm4 ; entry 5 = xmm3 ^ xmm1
+ vmovdqu OWORD PTR [rdx+96], xmm5 ; entry 6 = xmm2 ^ xmm1
+ vmovdqu OWORD PTR [rdx+112], xmm6 ; entry 7 = xmm3 ^ xmm2 ^ xmm1
+ vmovdqu OWORD PTR [rdx+128], xmm0 ; entry 8 = input
+ vpxor xmm1, xmm1, xmm0 ; xmm1 now holds (step-1 value ^ input)
+ vpxor xmm4, xmm3, xmm0
+ vpxor xmm6, xmm2, xmm0
+ vmovdqu OWORD PTR [rdx+144], xmm4 ; entry 9 = xmm3 ^ input
+ vmovdqu OWORD PTR [rdx+160], xmm6 ; entry 10 = xmm2 ^ input
+ vpxor xmm6, xmm3, xmm6
+ vmovdqu OWORD PTR [rdx+176], xmm6 ; entry 11 = xmm3 ^ xmm2 ^ input
+ vmovdqu OWORD PTR [rdx+192], xmm1 ; entry 12 = step-1 value ^ input
+ vpxor xmm4, xmm3, xmm1
+ vpxor xmm5, xmm2, xmm1
+ vpxor xmm6, xmm8, xmm1
+ vmovdqu OWORD PTR [rdx+208], xmm4 ; entry 13
+ vmovdqu OWORD PTR [rdx+224], xmm5 ; entry 14
+ vmovdqu OWORD PTR [rdx+240], xmm6 ; entry 15 = XOR of all four values
+ vmovdqu xmm0, OWORD PTR [rdx] ; second half: entries 0-3 -> entries 16-19
+ vmovdqu xmm1, OWORD PTR [rdx+16]
+ vmovdqu xmm2, OWORD PTR [rdx+32]
+ vmovdqu xmm3, OWORD PTR [rdx+48]
+ vpshufb xmm0, xmm0, xmm9 ; byte-reverse before shifting
+ vpshufb xmm1, xmm1, xmm9
+ vpshufb xmm2, xmm2, xmm9
+ vpshufb xmm3, xmm3, xmm9
+ vpsllq xmm4, xmm0, 60 ; low 4 bits of each qword moved to its top
+ vpsllq xmm5, xmm1, 60
+ vpsllq xmm6, xmm2, 60
+ vpsllq xmm7, xmm3, 60
+ vpsrlq xmm0, xmm0, 4
+ vpsrlq xmm1, xmm1, 4
+ vpsrlq xmm2, xmm2, 4
+ vpsrlq xmm3, xmm3, 4
+ vpsrldq xmm4, xmm4, 8 ; carry from the high qword into the low qword; the low qword's 4 bits are dropped
+ vpsrldq xmm5, xmm5, 8
+ vpsrldq xmm6, xmm6, 8
+ vpsrldq xmm7, xmm7, 8
+ vpor xmm0, xmm0, xmm4 ; 128-bit right shift by 4, no reduction applied
+ vpor xmm1, xmm1, xmm5
+ vpor xmm2, xmm2, xmm6
+ vpor xmm3, xmm3, xmm7
+ vpshufb xmm0, xmm0, xmm9 ; back to the stored byte order
+ vpshufb xmm1, xmm1, xmm9
+ vpshufb xmm2, xmm2, xmm9
+ vpshufb xmm3, xmm3, xmm9
+ vmovdqu OWORD PTR [rdx+256], xmm0 ; entries 16-19
+ vmovdqu OWORD PTR [rdx+272], xmm1
+ vmovdqu OWORD PTR [rdx+288], xmm2
+ vmovdqu OWORD PTR [rdx+304], xmm3
+ vmovdqu xmm0, OWORD PTR [rdx+64] ; entries 4-7 -> entries 20-23, same shift-by-4 sequence
+ vmovdqu xmm1, OWORD PTR [rdx+80]
+ vmovdqu xmm2, OWORD PTR [rdx+96]
+ vmovdqu xmm3, OWORD PTR [rdx+112]
+ vpshufb xmm0, xmm0, xmm9
+ vpshufb xmm1, xmm1, xmm9
+ vpshufb xmm2, xmm2, xmm9
+ vpshufb xmm3, xmm3, xmm9
+ vpsllq xmm4, xmm0, 60
+ vpsllq xmm5, xmm1, 60
+ vpsllq xmm6, xmm2, 60
+ vpsllq xmm7, xmm3, 60
+ vpsrlq xmm0, xmm0, 4
+ vpsrlq xmm1, xmm1, 4
+ vpsrlq xmm2, xmm2, 4
+ vpsrlq xmm3, xmm3, 4
+ vpsrldq xmm4, xmm4, 8
+ vpsrldq xmm5, xmm5, 8
+ vpsrldq xmm6, xmm6, 8
+ vpsrldq xmm7, xmm7, 8
+ vpor xmm0, xmm0, xmm4
+ vpor xmm1, xmm1, xmm5
+ vpor xmm2, xmm2, xmm6
+ vpor xmm3, xmm3, xmm7
+ vpshufb xmm0, xmm0, xmm9
+ vpshufb xmm1, xmm1, xmm9
+ vpshufb xmm2, xmm2, xmm9
+ vpshufb xmm3, xmm3, xmm9
+ vmovdqu OWORD PTR [rdx+320], xmm0 ; entries 20-23
+ vmovdqu OWORD PTR [rdx+336], xmm1
+ vmovdqu OWORD PTR [rdx+352], xmm2
+ vmovdqu OWORD PTR [rdx+368], xmm3
+ vmovdqu xmm0, OWORD PTR [rdx+128] ; entries 8-11 -> entries 24-27, same shift-by-4 sequence
+ vmovdqu xmm1, OWORD PTR [rdx+144]
+ vmovdqu xmm2, OWORD PTR [rdx+160]
+ vmovdqu xmm3, OWORD PTR [rdx+176]
+ vpshufb xmm0, xmm0, xmm9
+ vpshufb xmm1, xmm1, xmm9
+ vpshufb xmm2, xmm2, xmm9
+ vpshufb xmm3, xmm3, xmm9
+ vpsllq xmm4, xmm0, 60
+ vpsllq xmm5, xmm1, 60
+ vpsllq xmm6, xmm2, 60
+ vpsllq xmm7, xmm3, 60
+ vpsrlq xmm0, xmm0, 4
+ vpsrlq xmm1, xmm1, 4
+ vpsrlq xmm2, xmm2, 4
+ vpsrlq xmm3, xmm3, 4
+ vpsrldq xmm4, xmm4, 8
+ vpsrldq xmm5, xmm5, 8
+ vpsrldq xmm6, xmm6, 8
+ vpsrldq xmm7, xmm7, 8
+ vpor xmm0, xmm0, xmm4
+ vpor xmm1, xmm1, xmm5
+ vpor xmm2, xmm2, xmm6
+ vpor xmm3, xmm3, xmm7
+ vpshufb xmm0, xmm0, xmm9
+ vpshufb xmm1, xmm1, xmm9
+ vpshufb xmm2, xmm2, xmm9
+ vpshufb xmm3, xmm3, xmm9
+ vmovdqu OWORD PTR [rdx+384], xmm0 ; entries 24-27
+ vmovdqu OWORD PTR [rdx+400], xmm1
+ vmovdqu OWORD PTR [rdx+416], xmm2
+ vmovdqu OWORD PTR [rdx+432], xmm3
+ vmovdqu xmm0, OWORD PTR [rdx+192] ; entries 12-15 -> entries 28-31, same shift-by-4 sequence
+ vmovdqu xmm1, OWORD PTR [rdx+208]
+ vmovdqu xmm2, OWORD PTR [rdx+224]
+ vmovdqu xmm3, OWORD PTR [rdx+240]
+ vpshufb xmm0, xmm0, xmm9
+ vpshufb xmm1, xmm1, xmm9
+ vpshufb xmm2, xmm2, xmm9
+ vpshufb xmm3, xmm3, xmm9
+ vpsllq xmm4, xmm0, 60
+ vpsllq xmm5, xmm1, 60
+ vpsllq xmm6, xmm2, 60
+ vpsllq xmm7, xmm3, 60
+ vpsrlq xmm0, xmm0, 4
+ vpsrlq xmm1, xmm1, 4
+ vpsrlq xmm2, xmm2, 4
+ vpsrlq xmm3, xmm3, 4
+ vpsrldq xmm4, xmm4, 8
+ vpsrldq xmm5, xmm5, 8
+ vpsrldq xmm6, xmm6, 8
+ vpsrldq xmm7, xmm7, 8
+ vpor xmm0, xmm0, xmm4
+ vpor xmm1, xmm1, xmm5
+ vpor xmm2, xmm2, xmm6
+ vpor xmm3, xmm3, xmm7
+ vpshufb xmm0, xmm0, xmm9
+ vpshufb xmm1, xmm1, xmm9
+ vpshufb xmm2, xmm2, xmm9
+ vpshufb xmm3, xmm3, xmm9
+ vmovdqu OWORD PTR [rdx+448], xmm0 ; entries 28-31
+ vmovdqu OWORD PTR [rdx+464], xmm1
+ vmovdqu OWORD PTR [rdx+480], xmm2
+ vmovdqu OWORD PTR [rdx+496], xmm3
+ vmovdqu xmm6, OWORD PTR [rsp] ; restore the saved xmm registers
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ add rsp, 80
+ ret
+GCM_generate_m0_avx512 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16 ; one .. eight: 16-byte values with first qword 0 and second qword = 1 .. 8
+L_avx512_aes_gcm_one QWORD \
+ 0000000000000000h, 0000000000000001h
+ptr_L_avx512_aes_gcm_one QWORD L_avx512_aes_gcm_one ; qword holding the address of the constant above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16 ; second qword = 2
+L_avx512_aes_gcm_two QWORD \
+ 0000000000000000h, 0000000000000002h
+ptr_L_avx512_aes_gcm_two QWORD L_avx512_aes_gcm_two ; qword holding the address of the constant above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16 ; second qword = 3
+L_avx512_aes_gcm_three QWORD \
+ 0000000000000000h, 0000000000000003h
+ptr_L_avx512_aes_gcm_three QWORD L_avx512_aes_gcm_three ; qword holding the address of the constant above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16 ; second qword = 4
+L_avx512_aes_gcm_four QWORD \
+ 0000000000000000h, 0000000000000004h
+ptr_L_avx512_aes_gcm_four QWORD L_avx512_aes_gcm_four ; qword holding the address of the constant above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16 ; second qword = 5
+L_avx512_aes_gcm_five QWORD \
+ 0000000000000000h, 0000000000000005h
+ptr_L_avx512_aes_gcm_five QWORD L_avx512_aes_gcm_five ; qword holding the address of the constant above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16 ; second qword = 6
+L_avx512_aes_gcm_six QWORD \
+ 0000000000000000h, 0000000000000006h
+ptr_L_avx512_aes_gcm_six QWORD L_avx512_aes_gcm_six ; qword holding the address of the constant above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16 ; second qword = 7
+L_avx512_aes_gcm_seven QWORD \
+ 0000000000000000h, 0000000000000007h
+ptr_L_avx512_aes_gcm_seven QWORD L_avx512_aes_gcm_seven ; qword holding the address of the constant above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16 ; second qword = 8
+L_avx512_aes_gcm_eight QWORD \
+ 0000000000000000h, 0000000000000008h
+ptr_L_avx512_aes_gcm_eight QWORD L_avx512_aes_gcm_eight ; qword holding the address of the constant above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16 ; bswap_epi64: vpshufb control that reverses the bytes inside each qword separately
+L_avx512_aes_gcm_bswap_epi64 QWORD \
+ 0001020304050607h, 08090a0b0c0d0e0fh
+ptr_L_avx512_aes_gcm_bswap_epi64 QWORD L_avx512_aes_gcm_bswap_epi64 ; qword holding the address of the mask above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16 ; bswap_mask: vpshufb control that reverses all 16 bytes (same value as L_GCM_generate_m0_avx512_rev8)
+L_avx512_aes_gcm_bswap_mask QWORD \
+ 08090a0b0c0d0e0fh, 0001020304050607h
+ptr_L_avx512_aes_gcm_bswap_mask QWORD L_avx512_aes_gcm_bswap_mask ; qword holding the address of the mask above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16 ; mod2_128: first qword 1, second qword 0xc2 in its top byte (presumably the GHASH reduction constant for the avx512 routines - confirm at the use sites)
+L_avx512_aes_gcm_mod2_128 QWORD \
+ 0000000000000001h, 0c200000000000000h
+ptr_L_avx512_aes_gcm_mod2_128 QWORD L_avx512_aes_gcm_mod2_128 ; qword holding the address of the constant above
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_encrypt_avx512 PROC
+ push r13
+ push rdi
+ push rsi
+ push r12
+ push rbx
+ push r14
+ push r15
+ mov rdi, rcx
+ mov rsi, rdx
+ mov r12, r8
+ mov rax, r9
+ mov r8, QWORD PTR [rsp+96]
+ mov r9d, DWORD PTR [rsp+104]
+ mov r11d, DWORD PTR [rsp+112]
+ mov ebx, DWORD PTR [rsp+120]
+ mov r14d, DWORD PTR [rsp+128]
+ mov r15, QWORD PTR [rsp+136]
+ mov r10d, DWORD PTR [rsp+144]
+ sub rsp, 1248
+ vmovdqu OWORD PTR [rsp+1088], xmm6
+ vmovdqu OWORD PTR [rsp+1104], xmm7
+ vmovdqu OWORD PTR [rsp+1120], xmm8
+ vmovdqu OWORD PTR [rsp+1136], xmm9
+ vmovdqu OWORD PTR [rsp+1152], xmm10
+ vmovdqu OWORD PTR [rsp+1168], xmm11
+ vmovdqu OWORD PTR [rsp+1184], xmm12
+ vmovdqu OWORD PTR [rsp+1200], xmm13
+ vmovdqu OWORD PTR [rsp+1216], xmm14
+ vmovdqu OWORD PTR [rsp+1232], xmm15
+ vpxor xmm4, xmm4, xmm4
+ vpxor xmm6, xmm6, xmm6
+ mov edx, ebx
+ cmp edx, 12
+ jne L_AES_GCM_encrypt_avx512_iv_not_12
+ ; # Calculate values when IV is 12 bytes
+ ; Set counter based on IV
+ mov ecx, 16777216
+ vmovq xmm4, QWORD PTR [rax]
+ vpinsrd xmm4, xmm4, DWORD PTR [rax+8], 2
+ vpinsrd xmm4, xmm4, ecx, 3
+ ; H = Encrypt X(=0) and T = Encrypt counter
+ vmovdqa xmm5, OWORD PTR [r15]
+ vpxor xmm1, xmm4, xmm5
+ vmovdqa xmm7, OWORD PTR [r15+16]
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vmovdqa xmm7, OWORD PTR [r15+32]
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vmovdqa xmm7, OWORD PTR [r15+48]
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vmovdqa xmm7, OWORD PTR [r15+64]
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vmovdqa xmm7, OWORD PTR [r15+80]
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vmovdqa xmm7, OWORD PTR [r15+96]
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vmovdqa xmm7, OWORD PTR [r15+112]
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vmovdqa xmm7, OWORD PTR [r15+128]
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vmovdqa xmm7, OWORD PTR [r15+144]
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ cmp r10d, 11
+ vmovdqa xmm7, OWORD PTR [r15+160]
+ jl L_AES_GCM_encrypt_avx512_calc_iv_12_last
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vmovdqa xmm7, OWORD PTR [r15+176]
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ cmp r10d, 13
+ vmovdqa xmm7, OWORD PTR [r15+192]
+ jl L_AES_GCM_encrypt_avx512_calc_iv_12_last
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vmovdqa xmm7, OWORD PTR [r15+208]
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vmovdqa xmm7, OWORD PTR [r15+224]
+L_AES_GCM_encrypt_avx512_calc_iv_12_last:
+ vaesenclast xmm5, xmm5, xmm7
+ vaesenclast xmm1, xmm1, xmm7
+ vpshufb xmm5, xmm5, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ vmovdqu OWORD PTR [rsp+1040], xmm1
+ jmp L_AES_GCM_encrypt_avx512_iv_done
+L_AES_GCM_encrypt_avx512_iv_not_12:
+ ; Calculate values when IV is not 12 bytes
+ ; H = Encrypt X(=0)
+ vmovdqa xmm5, OWORD PTR [r15]
+ vaesenc xmm5, xmm5, [r15+16]
+ vaesenc xmm5, xmm5, [r15+32]
+ vaesenc xmm5, xmm5, [r15+48]
+ vaesenc xmm5, xmm5, [r15+64]
+ vaesenc xmm5, xmm5, [r15+80]
+ vaesenc xmm5, xmm5, [r15+96]
+ vaesenc xmm5, xmm5, [r15+112]
+ vaesenc xmm5, xmm5, [r15+128]
+ vaesenc xmm5, xmm5, [r15+144]
+ cmp r10d, 11
+ vmovdqa xmm9, OWORD PTR [r15+160]
+ jl L_AES_GCM_encrypt_avx512_calc_iv_1_aesenc_avx_last
+ vaesenc xmm5, xmm5, xmm9
+ vaesenc xmm5, xmm5, [r15+176]
+ cmp r10d, 13
+ vmovdqa xmm9, OWORD PTR [r15+192]
+ jl L_AES_GCM_encrypt_avx512_calc_iv_1_aesenc_avx_last
+ vaesenc xmm5, xmm5, xmm9
+ vaesenc xmm5, xmm5, [r15+208]
+ vmovdqa xmm9, OWORD PTR [r15+224]
+L_AES_GCM_encrypt_avx512_calc_iv_1_aesenc_avx_last:
+ vaesenclast xmm5, xmm5, xmm9
+ vpshufb xmm5, xmm5, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ ; Calc counter
+ ; Initialization vector
+ cmp edx, 0
+ mov rcx, 0
+ je L_AES_GCM_encrypt_avx512_calc_iv_done
+ cmp edx, 16
+ jl L_AES_GCM_encrypt_avx512_calc_iv_lt16
+ and edx, 4294967280
+L_AES_GCM_encrypt_avx512_calc_iv_16_loop:
+ vmovdqu xmm8, OWORD PTR [rax+rcx]
+ vpshufb xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ vpxor xmm4, xmm4, xmm8
+ ; ghash_gfmul_avx
+ vpshufd xmm1, xmm4, 78
+ vpshufd xmm2, xmm5, 78
+ vpclmulqdq xmm3, xmm5, xmm4, 17
+ vpclmulqdq xmm0, xmm5, xmm4, 0
+ vpxor xmm1, xmm1, xmm4
+ vpxor xmm2, xmm2, xmm5
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vmovdqa xmm7, xmm0
+ vmovdqa xmm4, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm7, xmm7, xmm2
+ vpxor xmm4, xmm4, xmm1
+ vpsrld xmm0, xmm7, 31
+ vpsrld xmm1, xmm4, 31
+ vpslld xmm7, xmm7, 1
+ vpslld xmm4, xmm4, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm4, xmm4, xmm2
+ vpor xmm7, xmm7, xmm0
+ vpor xmm4, xmm4, xmm1
+ vpslld xmm0, xmm7, 31
+ vpslld xmm1, xmm7, 30
+ vpslld xmm2, xmm7, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm7, xmm7, xmm0
+ vpsrld xmm2, xmm7, 1
+ vpsrld xmm3, xmm7, 2
+ vpsrld xmm0, xmm7, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm4, xmm4, xmm2
+ add ecx, 16
+ cmp ecx, edx
+ jl L_AES_GCM_encrypt_avx512_calc_iv_16_loop
+ mov edx, ebx
+ cmp ecx, edx
+ je L_AES_GCM_encrypt_avx512_calc_iv_done
+L_AES_GCM_encrypt_avx512_calc_iv_lt16:
+ sub rsp, 16
+ vpxor xmm8, xmm8, xmm8
+ xor ebx, ebx
+ vmovdqu OWORD PTR [rsp], xmm8
+L_AES_GCM_encrypt_avx512_calc_iv_loop:
+ movzx r13d, BYTE PTR [rax+rcx]
+ mov BYTE PTR [rsp+rbx], r13b
+ inc ecx
+ inc ebx
+ cmp ecx, edx
+ jl L_AES_GCM_encrypt_avx512_calc_iv_loop
+ vmovdqu xmm8, OWORD PTR [rsp]
+ add rsp, 16
+ vpshufb xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ vpxor xmm4, xmm4, xmm8
+ ; ghash_gfmul_avx
+ vpshufd xmm1, xmm4, 78
+ vpshufd xmm2, xmm5, 78
+ vpclmulqdq xmm3, xmm5, xmm4, 17
+ vpclmulqdq xmm0, xmm5, xmm4, 0
+ vpxor xmm1, xmm1, xmm4
+ vpxor xmm2, xmm2, xmm5
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vmovdqa xmm7, xmm0
+ vmovdqa xmm4, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm7, xmm7, xmm2
+ vpxor xmm4, xmm4, xmm1
+ vpsrld xmm0, xmm7, 31
+ vpsrld xmm1, xmm4, 31
+ vpslld xmm7, xmm7, 1
+ vpslld xmm4, xmm4, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm4, xmm4, xmm2
+ vpor xmm7, xmm7, xmm0
+ vpor xmm4, xmm4, xmm1
+ vpslld xmm0, xmm7, 31
+ vpslld xmm1, xmm7, 30
+ vpslld xmm2, xmm7, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm7, xmm7, xmm0
+ vpsrld xmm2, xmm7, 1
+ vpsrld xmm3, xmm7, 2
+ vpsrld xmm0, xmm7, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm4, xmm4, xmm2
+L_AES_GCM_encrypt_avx512_calc_iv_done:
+ ; T = Encrypt counter
+ vpxor xmm0, xmm0, xmm0
+ shl edx, 3
+ vmovq xmm0, rdx
+ vpxor xmm4, xmm4, xmm0
+ ; ghash_gfmul_avx
+ vpshufd xmm1, xmm4, 78
+ vpshufd xmm2, xmm5, 78
+ vpclmulqdq xmm3, xmm5, xmm4, 17
+ vpclmulqdq xmm0, xmm5, xmm4, 0
+ vpxor xmm1, xmm1, xmm4
+ vpxor xmm2, xmm2, xmm5
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vmovdqa xmm7, xmm0
+ vmovdqa xmm4, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm7, xmm7, xmm2
+ vpxor xmm4, xmm4, xmm1
+ vpsrld xmm0, xmm7, 31
+ vpsrld xmm1, xmm4, 31
+ vpslld xmm7, xmm7, 1
+ vpslld xmm4, xmm4, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm4, xmm4, xmm2
+ vpor xmm7, xmm7, xmm0
+ vpor xmm4, xmm4, xmm1
+ vpslld xmm0, xmm7, 31
+ vpslld xmm1, xmm7, 30
+ vpslld xmm2, xmm7, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm7, xmm7, xmm0
+ vpsrld xmm2, xmm7, 1
+ vpsrld xmm3, xmm7, 2
+ vpsrld xmm0, xmm7, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm4, xmm4, xmm2
+ vpshufb xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ ; Encrypt counter
+ vmovdqa xmm8, OWORD PTR [r15]
+ vpxor xmm8, xmm8, xmm4
+ vaesenc xmm8, xmm8, [r15+16]
+ vaesenc xmm8, xmm8, [r15+32]
+ vaesenc xmm8, xmm8, [r15+48]
+ vaesenc xmm8, xmm8, [r15+64]
+ vaesenc xmm8, xmm8, [r15+80]
+ vaesenc xmm8, xmm8, [r15+96]
+ vaesenc xmm8, xmm8, [r15+112]
+ vaesenc xmm8, xmm8, [r15+128]
+ vaesenc xmm8, xmm8, [r15+144]
+ cmp r10d, 11
+ vmovdqa xmm9, OWORD PTR [r15+160]
+ jl L_AES_GCM_encrypt_avx512_calc_iv_2_aesenc_avx_last
+ vaesenc xmm8, xmm8, xmm9
+ vaesenc xmm8, xmm8, [r15+176]
+ cmp r10d, 13
+ vmovdqa xmm9, OWORD PTR [r15+192]
+ jl L_AES_GCM_encrypt_avx512_calc_iv_2_aesenc_avx_last
+ vaesenc xmm8, xmm8, xmm9
+ vaesenc xmm8, xmm8, [r15+208]
+ vmovdqa xmm9, OWORD PTR [r15+224]
+L_AES_GCM_encrypt_avx512_calc_iv_2_aesenc_avx_last:
+ vaesenclast xmm8, xmm8, xmm9
+ vmovdqu OWORD PTR [rsp+1040], xmm8
+L_AES_GCM_encrypt_avx512_iv_done:
+ ; Additional authentication data
+ mov edx, r11d
+ cmp edx, 0
+ je L_AES_GCM_encrypt_avx512_calc_aad_done
+ xor ecx, ecx
+ cmp edx, 16
+ jl L_AES_GCM_encrypt_avx512_calc_aad_lt16
+ and edx, 4294967280
+L_AES_GCM_encrypt_avx512_calc_aad_16_loop:
+ vmovdqu xmm8, OWORD PTR [r12+rcx]
+ vpshufb xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ vpxor xmm6, xmm6, xmm8
+ ; ghash_gfmul_avx
+ vpshufd xmm1, xmm6, 78
+ vpshufd xmm2, xmm5, 78
+ vpclmulqdq xmm3, xmm5, xmm6, 17
+ vpclmulqdq xmm0, xmm5, xmm6, 0
+ vpxor xmm1, xmm1, xmm6
+ vpxor xmm2, xmm2, xmm5
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vmovdqa xmm7, xmm0
+ vmovdqa xmm6, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm7, xmm7, xmm2
+ vpxor xmm6, xmm6, xmm1
+ vpsrld xmm0, xmm7, 31
+ vpsrld xmm1, xmm6, 31
+ vpslld xmm7, xmm7, 1
+ vpslld xmm6, xmm6, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm6, xmm6, xmm2
+ vpor xmm7, xmm7, xmm0
+ vpor xmm6, xmm6, xmm1
+ vpslld xmm0, xmm7, 31
+ vpslld xmm1, xmm7, 30
+ vpslld xmm2, xmm7, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm7, xmm7, xmm0
+ vpsrld xmm2, xmm7, 1
+ vpsrld xmm3, xmm7, 2
+ vpsrld xmm0, xmm7, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm6, xmm6, xmm2
+ add ecx, 16
+ cmp ecx, edx
+ jl L_AES_GCM_encrypt_avx512_calc_aad_16_loop
+ mov edx, r11d
+ cmp ecx, edx
+ je L_AES_GCM_encrypt_avx512_calc_aad_done
+L_AES_GCM_encrypt_avx512_calc_aad_lt16:
+ sub rsp, 16
+ vpxor xmm8, xmm8, xmm8
+ xor ebx, ebx
+ vmovdqu OWORD PTR [rsp], xmm8
+L_AES_GCM_encrypt_avx512_calc_aad_loop:
+ movzx r13d, BYTE PTR [r12+rcx]
+ mov BYTE PTR [rsp+rbx], r13b
+ inc ecx
+ inc ebx
+ cmp ecx, edx
+ jl L_AES_GCM_encrypt_avx512_calc_aad_loop
+ vmovdqu xmm8, OWORD PTR [rsp]
+ add rsp, 16
+ vpshufb xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ vpxor xmm6, xmm6, xmm8
+ ; ghash_gfmul_avx
+ vpshufd xmm1, xmm6, 78
+ vpshufd xmm2, xmm5, 78
+ vpclmulqdq xmm3, xmm5, xmm6, 17
+ vpclmulqdq xmm0, xmm5, xmm6, 0
+ vpxor xmm1, xmm1, xmm6
+ vpxor xmm2, xmm2, xmm5
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vmovdqa xmm7, xmm0
+ vmovdqa xmm6, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm7, xmm7, xmm2
+ vpxor xmm6, xmm6, xmm1
+ vpsrld xmm0, xmm7, 31
+ vpsrld xmm1, xmm6, 31
+ vpslld xmm7, xmm7, 1
+ vpslld xmm6, xmm6, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm6, xmm6, xmm2
+ vpor xmm7, xmm7, xmm0
+ vpor xmm6, xmm6, xmm1
+ vpslld xmm0, xmm7, 31
+ vpslld xmm1, xmm7, 30
+ vpslld xmm2, xmm7, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm7, xmm7, xmm0
+ vpsrld xmm2, xmm7, 1
+ vpsrld xmm3, xmm7, 2
+ vpsrld xmm0, xmm7, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm6, xmm6, xmm2
+L_AES_GCM_encrypt_avx512_calc_aad_done:
+ ; Calculate counter and H
+ vpsrlq xmm9, xmm5, 63
+ vpsllq xmm8, xmm5, 1
+ vpslldq xmm9, xmm9, 8
+ vpor xmm8, xmm8, xmm9
+ vpshufd xmm5, xmm5, 255
+ vpsrad xmm5, xmm5, 31
+ vpshufb xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_bswap_epi64
+ vpand xmm5, xmm5, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpaddd xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_one
+ vpxor xmm5, xmm5, xmm8
+ vmovdqu OWORD PTR [rsp+1024], xmm4
+ xor ebx, ebx
+ cmp r9d, 256
+ jl L_AES_GCM_encrypt_avx512_done_128
+ vmovdqa xmm2, xmm6
+ ; H ^ 1
+ vmovdqu OWORD PTR [rsp], xmm5
+ ; H ^ 2
+ vpclmulqdq xmm8, xmm5, xmm5, 0
+ vpclmulqdq xmm11, xmm5, xmm5, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm0, xmm11
+ vmovdqu OWORD PTR [rsp+16], xmm0
+ ; H ^ 3
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm5, 78
+ vpxor xmm9, xmm9, xmm5
+ vpshufd xmm10, xmm0, 78
+ vpxor xmm10, xmm10, xmm0
+ vpclmulqdq xmm8, xmm0, xmm5, 0
+ vpclmulqdq xmm11, xmm0, xmm5, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm1, xmm11
+ vmovdqu OWORD PTR [rsp+32], xmm1
+ ; H ^ 4
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm3, xmm11
+ vmovdqu OWORD PTR [rsp+48], xmm3
+ ; H ^ 5
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+64], xmm7
+ ; H ^ 6
+ vpclmulqdq xmm8, xmm1, xmm1, 0
+ vpclmulqdq xmm11, xmm1, xmm1, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+80], xmm7
+ ; H ^ 7
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm1, 78
+ vpxor xmm9, xmm9, xmm1
+ vpshufd xmm10, xmm3, 78
+ vpxor xmm10, xmm10, xmm3
+ vpclmulqdq xmm8, xmm3, xmm1, 0
+ vpclmulqdq xmm11, xmm3, xmm1, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm7
+ ; H ^ 8
+ vpclmulqdq xmm8, xmm3, xmm3, 0
+ vpclmulqdq xmm11, xmm3, xmm3, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+112], xmm7
+ ; H ^ 9
+ vmovdqu xmm0, OWORD PTR [rsp+48]
+ vmovdqu xmm1, OWORD PTR [rsp+64]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+128], xmm7
+ ; H ^ 10
+ vmovdqu xmm0, OWORD PTR [rsp+64]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+144], xmm7
+ ; H ^ 11
+ vmovdqu xmm0, OWORD PTR [rsp+64]
+ vmovdqu xmm1, OWORD PTR [rsp+80]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+160], xmm7
+ ; H ^ 12
+ vmovdqu xmm0, OWORD PTR [rsp+80]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+176], xmm7
+ ; H ^ 13
+ vmovdqu xmm0, OWORD PTR [rsp+80]
+ vmovdqu xmm1, OWORD PTR [rsp+96]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+192], xmm7
+ ; H ^ 14
+ vmovdqu xmm0, OWORD PTR [rsp+96]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+208], xmm7
+ ; H ^ 15
+ vmovdqu xmm0, OWORD PTR [rsp+96]
+ vmovdqu xmm1, OWORD PTR [rsp+112]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+224], xmm7
+ ; H ^ 16
+ vmovdqu xmm0, OWORD PTR [rsp+112]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+240], xmm7
+ cmp r9d, 512
+ jl L_AES_GCM_encrypt_avx512_no_ext
+ ; H ^ 17
+ vmovdqu xmm0, OWORD PTR [rsp+112]
+ vmovdqu xmm1, OWORD PTR [rsp+128]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+256], xmm7
+ ; H ^ 18
+ vmovdqu xmm0, OWORD PTR [rsp+128]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+272], xmm7
+ ; H ^ 19
+ vmovdqu xmm0, OWORD PTR [rsp+128]
+ vmovdqu xmm1, OWORD PTR [rsp+144]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+288], xmm7
+ ; H ^ 20
+ vmovdqu xmm0, OWORD PTR [rsp+144]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+304], xmm7
+ ; H ^ 21
+ vmovdqu xmm0, OWORD PTR [rsp+144]
+ vmovdqu xmm1, OWORD PTR [rsp+160]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+320], xmm7
+ ; H ^ 22
+ vmovdqu xmm0, OWORD PTR [rsp+160]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+336], xmm7
+ ; H ^ 23
+ vmovdqu xmm0, OWORD PTR [rsp+160]
+ vmovdqu xmm1, OWORD PTR [rsp+176]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+352], xmm7
+ ; H ^ 24
+ vmovdqu xmm0, OWORD PTR [rsp+176]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+368], xmm7
+ ; H ^ 25
+ vmovdqu xmm0, OWORD PTR [rsp+176]
+ vmovdqu xmm1, OWORD PTR [rsp+192]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+384], xmm7
+ ; H ^ 26
+ vmovdqu xmm0, OWORD PTR [rsp+192]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+400], xmm7
+ ; H ^ 27
+ vmovdqu xmm0, OWORD PTR [rsp+192]
+ vmovdqu xmm1, OWORD PTR [rsp+208]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+416], xmm7
+ ; H ^ 28
+ vmovdqu xmm0, OWORD PTR [rsp+208]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+432], xmm7
+ ; H ^ 29
+ vmovdqu xmm0, OWORD PTR [rsp+208]
+ vmovdqu xmm1, OWORD PTR [rsp+224]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+448], xmm7
+ ; H ^ 30
+ vmovdqu xmm0, OWORD PTR [rsp+224]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+464], xmm7
+ ; H ^ 31
+ vmovdqu xmm0, OWORD PTR [rsp+224]
+ vmovdqu xmm1, OWORD PTR [rsp+240]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+480], xmm7
+ ; H ^ 32
+ vmovdqu xmm0, OWORD PTR [rsp+240]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+496], xmm7
+L_AES_GCM_encrypt_avx512_no_ext:
+ vbroadcasti32x4 zmm22, ptr_L_avx512_aes_gcm_bswap_epi64
+ vbroadcasti32x4 zmm30, ptr_L_avx512_aes_gcm_bswap_mask
+ vbroadcasti32x4 zmm31, ptr_L_avx512_aes_gcm_mod2_128
+ vbroadcasti32x4 zmm9, [r15]
+ vbroadcasti32x4 zmm10, [r15+16]
+ vbroadcasti32x4 zmm11, [r15+32]
+ vbroadcasti32x4 zmm12, [r15+48]
+ vbroadcasti32x4 zmm13, [r15+64]
+ vbroadcasti32x4 zmm14, [r15+80]
+ vbroadcasti32x4 zmm15, [r15+96]
+ vbroadcasti32x4 zmm1, [r15+112]
+ vbroadcasti32x4 zmm2, [r15+128]
+ vbroadcasti32x4 zmm3, [r15+144]
+ cmp r9d, 512
+ jl L_AES_GCM_encrypt_avx512_no_windows
+ mov r13d, r9d
+ and r13d, 4294966784
+ vmovdqu64 zmm23, [rsp+448]
+ vshufi64x2 zmm23, zmm23, zmm23, 27
+ vmovdqu64 zmm24, [rsp+384]
+ vshufi64x2 zmm24, zmm24, zmm24, 27
+ vmovdqu64 zmm25, [rsp+320]
+ vshufi64x2 zmm25, zmm25, zmm25, 27
+ vmovdqu64 zmm26, [rsp+256]
+ vshufi64x2 zmm26, zmm26, zmm26, 27
+ vmovdqu64 [rsp+512], zmm23
+ vmovdqu64 [rsp+576], zmm24
+ vmovdqu64 [rsp+640], zmm25
+ vmovdqu64 [rsp+704], zmm26
+ vmovdqu64 zmm23, [rsp+192]
+ vshufi64x2 zmm23, zmm23, zmm23, 27
+ vmovdqu64 zmm24, [rsp+128]
+ vshufi64x2 zmm24, zmm24, zmm24, 27
+ vmovdqu64 zmm25, [rsp+64]
+ vshufi64x2 zmm25, zmm25, zmm25, 27
+ vmovdqu64 zmm26, [rsp]
+ vshufi64x2 zmm26, zmm26, zmm26, 27
+ vmovdqu64 [rsp+768], zmm23
+ vmovdqu64 [rsp+832], zmm24
+ vmovdqu64 [rsp+896], zmm25
+ vmovdqu64 [rsp+960], zmm26
+ ; 512 bytes of input
+ lea rcx, QWORD PTR [rsi+rbx]
+ mov QWORD PTR [rsp+1056], rcx
+ vbroadcasti32x4 zmm20, [rsp+1024]
+ vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
+ vpshufb zmm16, zmm16, zmm22
+ vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
+ vpshufb zmm17, zmm17, zmm22
+ vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2
+ vpshufb zmm18, zmm18, zmm22
+ vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3
+ vpshufb zmm19, zmm19, zmm22
+ vmovdqu xmm8, OWORD PTR [rsp+1024]
+ vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen
+ vmovdqu OWORD PTR [rsp+1024], xmm8
+ vpxorq zmm16, zmm16, zmm9
+ vpxorq zmm17, zmm17, zmm9
+ vpxorq zmm18, zmm18, zmm9
+ vpxorq zmm19, zmm19, zmm9
+ vaesenc zmm16, zmm16, zmm10
+ vaesenc zmm17, zmm17, zmm10
+ vaesenc zmm18, zmm18, zmm10
+ vaesenc zmm19, zmm19, zmm10
+ vaesenc zmm16, zmm16, zmm11
+ vaesenc zmm17, zmm17, zmm11
+ vaesenc zmm18, zmm18, zmm11
+ vaesenc zmm19, zmm19, zmm11
+ vaesenc zmm16, zmm16, zmm12
+ vaesenc zmm17, zmm17, zmm12
+ vaesenc zmm18, zmm18, zmm12
+ vaesenc zmm19, zmm19, zmm12
+ vaesenc zmm16, zmm16, zmm13
+ vaesenc zmm17, zmm17, zmm13
+ vaesenc zmm18, zmm18, zmm13
+ vaesenc zmm19, zmm19, zmm13
+ vaesenc zmm16, zmm16, zmm14
+ vaesenc zmm17, zmm17, zmm14
+ vaesenc zmm18, zmm18, zmm14
+ vaesenc zmm19, zmm19, zmm14
+ vaesenc zmm16, zmm16, zmm15
+ vaesenc zmm17, zmm17, zmm15
+ vaesenc zmm18, zmm18, zmm15
+ vaesenc zmm19, zmm19, zmm15
+ vaesenc zmm16, zmm16, zmm1
+ vaesenc zmm17, zmm17, zmm1
+ vaesenc zmm18, zmm18, zmm1
+ vaesenc zmm19, zmm19, zmm1
+ vaesenc zmm16, zmm16, zmm2
+ vaesenc zmm17, zmm17, zmm2
+ vaesenc zmm18, zmm18, zmm2
+ vaesenc zmm19, zmm19, zmm2
+ vaesenc zmm16, zmm16, zmm3
+ vaesenc zmm17, zmm17, zmm3
+ vaesenc zmm18, zmm18, zmm3
+ vaesenc zmm19, zmm19, zmm3
+ cmp r10d, 11
+ vbroadcasti32x4 zmm20, [r15+160]
+ jl L_AES_GCM_encrypt_avx512_p1_avx512_ctr16_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [r15+176]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ cmp r10d, 13
+ vbroadcasti32x4 zmm20, [r15+192]
+ jl L_AES_GCM_encrypt_avx512_p1_avx512_ctr16_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [r15+208]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [r15+224]
+L_AES_GCM_encrypt_avx512_p1_avx512_ctr16_last:
+ vaesenclast zmm16, zmm16, zmm20
+ vaesenclast zmm17, zmm17, zmm20
+ vaesenclast zmm18, zmm18, zmm20
+ vaesenclast zmm19, zmm19, zmm20
+ lea rcx, QWORD PTR [rdi+rbx]
+ lea rdx, QWORD PTR [rsi+rbx]
+ vmovdqu64 zmm21, [rcx]
+ vpxorq zmm16, zmm16, zmm21
+ vmovdqu64 [rdx], zmm16
+ vmovdqu64 zmm21, [rcx+64]
+ vpxorq zmm17, zmm17, zmm21
+ vmovdqu64 [rdx+64], zmm17
+ vmovdqu64 zmm21, [rcx+128]
+ vpxorq zmm18, zmm18, zmm21
+ vmovdqu64 [rdx+128], zmm18
+ vmovdqu64 zmm21, [rcx+192]
+ vpxorq zmm19, zmm19, zmm21
+ vmovdqu64 [rdx+192], zmm19
+ add ebx, 256
+ vbroadcasti32x4 zmm20, [rsp+1024]
+ vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
+ vpshufb zmm16, zmm16, zmm22
+ vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
+ vpshufb zmm17, zmm17, zmm22
+ vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2
+ vpshufb zmm18, zmm18, zmm22
+ vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3
+ vpshufb zmm19, zmm19, zmm22
+ vmovdqu xmm8, OWORD PTR [rsp+1024]
+ vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen
+ vmovdqu OWORD PTR [rsp+1024], xmm8
+ vpxorq zmm16, zmm16, zmm9
+ vpxorq zmm17, zmm17, zmm9
+ vpxorq zmm18, zmm18, zmm9
+ vpxorq zmm19, zmm19, zmm9
+ vaesenc zmm16, zmm16, zmm10
+ vaesenc zmm17, zmm17, zmm10
+ vaesenc zmm18, zmm18, zmm10
+ vaesenc zmm19, zmm19, zmm10
+ vaesenc zmm16, zmm16, zmm11
+ vaesenc zmm17, zmm17, zmm11
+ vaesenc zmm18, zmm18, zmm11
+ vaesenc zmm19, zmm19, zmm11
+ vaesenc zmm16, zmm16, zmm12
+ vaesenc zmm17, zmm17, zmm12
+ vaesenc zmm18, zmm18, zmm12
+ vaesenc zmm19, zmm19, zmm12
+ vaesenc zmm16, zmm16, zmm13
+ vaesenc zmm17, zmm17, zmm13
+ vaesenc zmm18, zmm18, zmm13
+ vaesenc zmm19, zmm19, zmm13
+ vaesenc zmm16, zmm16, zmm14
+ vaesenc zmm17, zmm17, zmm14
+ vaesenc zmm18, zmm18, zmm14
+ vaesenc zmm19, zmm19, zmm14
+ vaesenc zmm16, zmm16, zmm15
+ vaesenc zmm17, zmm17, zmm15
+ vaesenc zmm18, zmm18, zmm15
+ vaesenc zmm19, zmm19, zmm15
+ vaesenc zmm16, zmm16, zmm1
+ vaesenc zmm17, zmm17, zmm1
+ vaesenc zmm18, zmm18, zmm1
+ vaesenc zmm19, zmm19, zmm1
+ vaesenc zmm16, zmm16, zmm2
+ vaesenc zmm17, zmm17, zmm2
+ vaesenc zmm18, zmm18, zmm2
+ vaesenc zmm19, zmm19, zmm2
+ vaesenc zmm16, zmm16, zmm3
+ vaesenc zmm17, zmm17, zmm3
+ vaesenc zmm18, zmm18, zmm3
+ vaesenc zmm19, zmm19, zmm3
+ cmp r10d, 11
+ vbroadcasti32x4 zmm20, [r15+160]
+ jl L_AES_GCM_encrypt_avx512_p2_avx512_ctr16_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [r15+176]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ cmp r10d, 13
+ vbroadcasti32x4 zmm20, [r15+192]
+ jl L_AES_GCM_encrypt_avx512_p2_avx512_ctr16_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [r15+208]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [r15+224]
+L_AES_GCM_encrypt_avx512_p2_avx512_ctr16_last:
+ vaesenclast zmm16, zmm16, zmm20
+ vaesenclast zmm17, zmm17, zmm20
+ vaesenclast zmm18, zmm18, zmm20
+ vaesenclast zmm19, zmm19, zmm20
+ lea rcx, QWORD PTR [rdi+rbx]
+ lea rdx, QWORD PTR [rsi+rbx]
+ vmovdqu64 zmm21, [rcx]
+ vpxorq zmm16, zmm16, zmm21
+ vmovdqu64 [rdx], zmm16
+ vmovdqu64 zmm21, [rcx+64]
+ vpxorq zmm17, zmm17, zmm21
+ vmovdqu64 [rdx+64], zmm17
+ vmovdqu64 zmm21, [rcx+128]
+ vpxorq zmm18, zmm18, zmm21
+ vmovdqu64 [rdx+128], zmm18
+ vmovdqu64 zmm21, [rcx+192]
+ vpxorq zmm19, zmm19, zmm21
+ vmovdqu64 [rdx+192], zmm19
+ add ebx, 256
+ cmp ebx, r13d
+ jge L_AES_GCM_encrypt_avx512_last_win
+L_AES_GCM_encrypt_avx512_win_loop:
+ lea rcx, QWORD PTR [rsi+rbx]
+ mov QWORD PTR [rsp+1072], rcx
+ mov r12, QWORD PTR [rsp+1056]
+ vpxorq zmm21, zmm21, zmm21
+ vinserti32x4 zmm21, zmm21, xmm6, 0
+ vbroadcasti32x4 zmm20, [rsp+1024]
+ vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
+ vpshufb zmm16, zmm16, zmm22
+ vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
+ vpshufb zmm17, zmm17, zmm22
+ vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2
+ vpshufb zmm18, zmm18, zmm22
+ vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3
+ vpshufb zmm19, zmm19, zmm22
+ vmovdqu xmm8, OWORD PTR [rsp+1024]
+ vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen
+ vmovdqu OWORD PTR [rsp+1024], xmm8
+ vpxorq zmm16, zmm16, zmm9
+ vpxorq zmm17, zmm17, zmm9
+ vpxorq zmm18, zmm18, zmm9
+ vpxorq zmm19, zmm19, zmm9
+ vmovdqu64 zmm31, [r12]
+ vpshufb zmm31, zmm31, zmm30
+ vpxorq zmm31, zmm31, zmm21
+ vaesenc zmm16, zmm16, zmm10
+ vaesenc zmm17, zmm17, zmm10
+ vaesenc zmm18, zmm18, zmm10
+ vaesenc zmm19, zmm19, zmm10
+ vpclmulqdq zmm23, zmm31, [rsp+512], 0
+ vpclmulqdq zmm24, zmm31, [rsp+512], 1
+ vpclmulqdq zmm25, zmm31, [rsp+512], 16
+ vpclmulqdq zmm26, zmm31, [rsp+512], 17
+ vmovdqa64 zmm27, zmm23
+ vpxorq zmm28, zmm25, zmm24
+ vmovdqa64 zmm29, zmm26
+ vaesenc zmm16, zmm16, zmm11
+ vaesenc zmm17, zmm17, zmm11
+ vaesenc zmm18, zmm18, zmm11
+ vaesenc zmm19, zmm19, zmm11
+ vmovdqu64 zmm31, [r12+64]
+ vpshufb zmm31, zmm31, zmm30
+ vaesenc zmm16, zmm16, zmm12
+ vaesenc zmm17, zmm17, zmm12
+ vaesenc zmm18, zmm18, zmm12
+ vaesenc zmm19, zmm19, zmm12
+ vpclmulqdq zmm23, zmm31, [rsp+576], 0
+ vpclmulqdq zmm24, zmm31, [rsp+576], 1
+ vpclmulqdq zmm25, zmm31, [rsp+576], 16
+ vpclmulqdq zmm26, zmm31, [rsp+576], 17
+ vpxorq zmm27, zmm27, zmm23
+ vpternlogq zmm28, zmm25, zmm24, 150
+ vpxorq zmm29, zmm29, zmm26
+ vaesenc zmm16, zmm16, zmm13
+ vaesenc zmm17, zmm17, zmm13
+ vaesenc zmm18, zmm18, zmm13
+ vaesenc zmm19, zmm19, zmm13
+ vmovdqu64 zmm31, [r12+128]
+ vpshufb zmm31, zmm31, zmm30
+ vaesenc zmm16, zmm16, zmm14
+ vaesenc zmm17, zmm17, zmm14
+ vaesenc zmm18, zmm18, zmm14
+ vaesenc zmm19, zmm19, zmm14
+ vpclmulqdq zmm23, zmm31, [rsp+640], 0
+ vpclmulqdq zmm24, zmm31, [rsp+640], 1
+ vpclmulqdq zmm25, zmm31, [rsp+640], 16
+ vpclmulqdq zmm26, zmm31, [rsp+640], 17
+ vpxorq zmm27, zmm27, zmm23
+ vpternlogq zmm28, zmm25, zmm24, 150
+ vpxorq zmm29, zmm29, zmm26
+ vaesenc zmm16, zmm16, zmm15
+ vaesenc zmm17, zmm17, zmm15
+ vaesenc zmm18, zmm18, zmm15
+ vaesenc zmm19, zmm19, zmm15
+ vmovdqu64 zmm31, [r12+192]
+ vpshufb zmm31, zmm31, zmm30
+ vaesenc zmm16, zmm16, zmm1
+ vaesenc zmm17, zmm17, zmm1
+ vaesenc zmm18, zmm18, zmm1
+ vaesenc zmm19, zmm19, zmm1
+ vpclmulqdq zmm23, zmm31, [rsp+704], 0
+ vpclmulqdq zmm24, zmm31, [rsp+704], 1
+ vpclmulqdq zmm25, zmm31, [rsp+704], 16
+ vpclmulqdq zmm26, zmm31, [rsp+704], 17
+ vpxorq zmm27, zmm27, zmm23
+ vpternlogq zmm28, zmm25, zmm24, 150
+ vpxorq zmm29, zmm29, zmm26
+ vaesenc zmm16, zmm16, zmm2
+ vaesenc zmm17, zmm17, zmm2
+ vaesenc zmm18, zmm18, zmm2
+ vaesenc zmm19, zmm19, zmm2
+ vaesenc zmm16, zmm16, zmm3
+ vaesenc zmm17, zmm17, zmm3
+ vaesenc zmm18, zmm18, zmm3
+ vaesenc zmm19, zmm19, zmm3
+ cmp r10d, 11
+ vbroadcasti32x4 zmm20, [r15+160]
+ jl L_AES_GCM_encrypt_avx512_a_il_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [r15+176]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ cmp r10d, 13
+ vbroadcasti32x4 zmm20, [r15+192]
+ jl L_AES_GCM_encrypt_avx512_a_il_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [r15+208]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [r15+224]
+L_AES_GCM_encrypt_avx512_a_il_last:
+ vaesenclast zmm16, zmm16, zmm20
+ vaesenclast zmm17, zmm17, zmm20
+ vaesenclast zmm18, zmm18, zmm20
+ vaesenclast zmm19, zmm19, zmm20
+ lea rcx, QWORD PTR [rdi+rbx]
+ lea rdx, QWORD PTR [rsi+rbx]
+ vmovdqu64 zmm21, [rcx]
+ vpxorq zmm16, zmm16, zmm21
+ vmovdqu64 [rdx], zmm16
+ vmovdqu64 zmm21, [rcx+64]
+ vpxorq zmm17, zmm17, zmm21
+ vmovdqu64 [rdx+64], zmm17
+ vmovdqu64 zmm21, [rcx+128]
+ vpxorq zmm18, zmm18, zmm21
+ vmovdqu64 [rdx+128], zmm18
+ vmovdqu64 zmm21, [rcx+192]
+ vpxorq zmm19, zmm19, zmm21
+ vmovdqu64 [rdx+192], zmm19
+ add ebx, 256
+ vbroadcasti32x4 zmm20, [rsp+1024]
+ vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
+ vpshufb zmm16, zmm16, zmm22
+ vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
+ vpshufb zmm17, zmm17, zmm22
+ vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2
+ vpshufb zmm18, zmm18, zmm22
+ vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3
+ vpshufb zmm19, zmm19, zmm22
+ vmovdqu xmm8, OWORD PTR [rsp+1024]
+ vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen
+ vmovdqu OWORD PTR [rsp+1024], xmm8
+ vpxorq zmm16, zmm16, zmm9
+ vpxorq zmm17, zmm17, zmm9
+ vpxorq zmm18, zmm18, zmm9
+ vpxorq zmm19, zmm19, zmm9
+ vmovdqu64 zmm31, [r12+256]
+ vpshufb zmm31, zmm31, zmm30
+ vaesenc zmm16, zmm16, zmm10
+ vaesenc zmm17, zmm17, zmm10
+ vaesenc zmm18, zmm18, zmm10
+ vaesenc zmm19, zmm19, zmm10
+ vpclmulqdq zmm23, zmm31, [rsp+768], 0
+ vpclmulqdq zmm24, zmm31, [rsp+768], 1
+ vpclmulqdq zmm25, zmm31, [rsp+768], 16
+ vpclmulqdq zmm26, zmm31, [rsp+768], 17
+ vpxorq zmm27, zmm27, zmm23
+ vpternlogq zmm28, zmm25, zmm24, 150
+ vpxorq zmm29, zmm29, zmm26
+ vaesenc zmm16, zmm16, zmm11
+ vaesenc zmm17, zmm17, zmm11
+ vaesenc zmm18, zmm18, zmm11
+ vaesenc zmm19, zmm19, zmm11
+ vmovdqu64 zmm31, [r12+320]
+ vpshufb zmm31, zmm31, zmm30
+ vaesenc zmm16, zmm16, zmm12
+ vaesenc zmm17, zmm17, zmm12
+ vaesenc zmm18, zmm18, zmm12
+ vaesenc zmm19, zmm19, zmm12
+ vpclmulqdq zmm23, zmm31, [rsp+832], 0
+ vpclmulqdq zmm24, zmm31, [rsp+832], 1
+ vpclmulqdq zmm25, zmm31, [rsp+832], 16
+ vpclmulqdq zmm26, zmm31, [rsp+832], 17
+ vpxorq zmm27, zmm27, zmm23
+ vpternlogq zmm28, zmm25, zmm24, 150
+ vpxorq zmm29, zmm29, zmm26
+ vaesenc zmm16, zmm16, zmm13
+ vaesenc zmm17, zmm17, zmm13
+ vaesenc zmm18, zmm18, zmm13
+ vaesenc zmm19, zmm19, zmm13
+ vmovdqu64 zmm31, [r12+384]
+ vpshufb zmm31, zmm31, zmm30
+ vaesenc zmm16, zmm16, zmm14
+ vaesenc zmm17, zmm17, zmm14
+ vaesenc zmm18, zmm18, zmm14
+ vaesenc zmm19, zmm19, zmm14
+ vpclmulqdq zmm23, zmm31, [rsp+896], 0
+ vpclmulqdq zmm24, zmm31, [rsp+896], 1
+ vpclmulqdq zmm25, zmm31, [rsp+896], 16
+ vpclmulqdq zmm26, zmm31, [rsp+896], 17
+ vpxorq zmm27, zmm27, zmm23
+ vpternlogq zmm28, zmm25, zmm24, 150
+ vpxorq zmm29, zmm29, zmm26
+ vaesenc zmm16, zmm16, zmm15
+ vaesenc zmm17, zmm17, zmm15
+ vaesenc zmm18, zmm18, zmm15
+ vaesenc zmm19, zmm19, zmm15
+ vmovdqu64 zmm31, [r12+448]
+ vpshufb zmm31, zmm31, zmm30
+ vaesenc zmm16, zmm16, zmm1
+ vaesenc zmm17, zmm17, zmm1
+ vaesenc zmm18, zmm18, zmm1
+ vaesenc zmm19, zmm19, zmm1
+ vpclmulqdq zmm23, zmm31, [rsp+960], 0
+ vpclmulqdq zmm24, zmm31, [rsp+960], 1
+ vpclmulqdq zmm25, zmm31, [rsp+960], 16
+ vpclmulqdq zmm26, zmm31, [rsp+960], 17
+ vpxorq zmm27, zmm27, zmm23
+ vpternlogq zmm28, zmm25, zmm24, 150
+ vpxorq zmm29, zmm29, zmm26
+ vaesenc zmm16, zmm16, zmm2
+ vaesenc zmm17, zmm17, zmm2
+ vaesenc zmm18, zmm18, zmm2
+ vaesenc zmm19, zmm19, zmm2
+ vaesenc zmm16, zmm16, zmm3
+ vaesenc zmm17, zmm17, zmm3
+ vaesenc zmm18, zmm18, zmm3
+ vaesenc zmm19, zmm19, zmm3
+ cmp r10d, 11
+ vbroadcasti32x4 zmm20, [r15+160]
+ jl L_AES_GCM_encrypt_avx512_b_il_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [r15+176]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ cmp r10d, 13
+ vbroadcasti32x4 zmm20, [r15+192]
+ jl L_AES_GCM_encrypt_avx512_b_il_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [r15+208]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [r15+224]
+L_AES_GCM_encrypt_avx512_b_il_last:
+ vaesenclast zmm16, zmm16, zmm20
+ vaesenclast zmm17, zmm17, zmm20
+ vaesenclast zmm18, zmm18, zmm20
+ vaesenclast zmm19, zmm19, zmm20
+ lea rcx, QWORD PTR [rdi+rbx]
+ lea rdx, QWORD PTR [rsi+rbx]
+ vmovdqu64 zmm21, [rcx]
+ vpxorq zmm16, zmm16, zmm21
+ vmovdqu64 [rdx], zmm16
+ vmovdqu64 zmm21, [rcx+64]
+ vpxorq zmm17, zmm17, zmm21
+ vmovdqu64 [rdx+64], zmm17
+ vmovdqu64 zmm21, [rcx+128]
+ vpxorq zmm18, zmm18, zmm21
+ vmovdqu64 [rdx+128], zmm18
+ vmovdqu64 zmm21, [rcx+192]
+ vpxorq zmm19, zmm19, zmm21
+ vmovdqu64 [rdx+192], zmm19
+ add ebx, 256
+ vbroadcasti32x4 zmm31, ptr_L_avx512_aes_gcm_mod2_128
+ vpclmulqdq zmm23, zmm31, zmm27, 1
+ vpshufd zmm27, zmm27, 78
+ vpternlogq zmm28, zmm27, zmm23, 150
+ vpclmulqdq zmm23, zmm31, zmm28, 1
+ vpshufd zmm28, zmm28, 78
+ vpternlogq zmm29, zmm28, zmm23, 150
+ vextracti32x4 xmm0, zmm29, 1
+ vextracti32x4 xmm4, zmm29, 2
+ vextracti32x4 xmm5, zmm29, 3
+ vpxorq xmm6, xmm29, xmm0
+ vpternlogq xmm6, xmm5, xmm4, 150
+ mov rcx, QWORD PTR [rsp+1072]
+ mov QWORD PTR [rsp+1056], rcx
+ cmp ebx, r13d
+ jl L_AES_GCM_encrypt_avx512_win_loop
+L_AES_GCM_encrypt_avx512_last_win:
+ mov rcx, QWORD PTR [rsp+1056]
+ vpxorq zmm20, zmm20, zmm20
+ vinserti32x4 zmm20, zmm20, xmm6, 0
+ vmovdqu64 zmm23, [rsp+512]
+ vmovdqu64 zmm24, [rsp+576]
+ vmovdqu64 zmm25, [rsp+640]
+ vmovdqu64 zmm26, [rsp+704]
+ vmovdqu64 zmm21, [rcx]
+ vpshufb zmm21, zmm21, zmm30
+ vpxorq zmm21, zmm21, zmm20
+ vpclmulqdq zmm16, zmm21, zmm23, 0
+ vpclmulqdq zmm17, zmm21, zmm23, 1
+ vpclmulqdq zmm18, zmm21, zmm23, 16
+ vpclmulqdq zmm19, zmm21, zmm23, 17
+ vmovdqa64 zmm27, zmm16
+ vpxorq zmm28, zmm18, zmm17
+ vmovdqa64 zmm29, zmm19
+ vmovdqu64 zmm21, [rcx+64]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm24, 0
+ vpclmulqdq zmm17, zmm21, zmm24, 1
+ vpclmulqdq zmm18, zmm21, zmm24, 16
+ vpclmulqdq zmm19, zmm21, zmm24, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vmovdqu64 zmm21, [rcx+128]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm25, 0
+ vpclmulqdq zmm17, zmm21, zmm25, 1
+ vpclmulqdq zmm18, zmm21, zmm25, 16
+ vpclmulqdq zmm19, zmm21, zmm25, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vmovdqu64 zmm21, [rcx+192]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm26, 0
+ vpclmulqdq zmm17, zmm21, zmm26, 1
+ vpclmulqdq zmm18, zmm21, zmm26, 16
+ vpclmulqdq zmm19, zmm21, zmm26, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vmovdqu64 zmm23, [rsp+768]
+ vmovdqu64 zmm24, [rsp+832]
+ vmovdqu64 zmm25, [rsp+896]
+ vmovdqu64 zmm26, [rsp+960]
+ vmovdqu64 zmm21, [rcx+256]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm23, 0
+ vpclmulqdq zmm17, zmm21, zmm23, 1
+ vpclmulqdq zmm18, zmm21, zmm23, 16
+ vpclmulqdq zmm19, zmm21, zmm23, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vmovdqu64 zmm21, [rcx+320]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm24, 0
+ vpclmulqdq zmm17, zmm21, zmm24, 1
+ vpclmulqdq zmm18, zmm21, zmm24, 16
+ vpclmulqdq zmm19, zmm21, zmm24, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vmovdqu64 zmm21, [rcx+384]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm25, 0
+ vpclmulqdq zmm17, zmm21, zmm25, 1
+ vpclmulqdq zmm18, zmm21, zmm25, 16
+ vpclmulqdq zmm19, zmm21, zmm25, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vmovdqu64 zmm21, [rcx+448]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm26, 0
+ vpclmulqdq zmm17, zmm21, zmm26, 1
+ vpclmulqdq zmm18, zmm21, zmm26, 16
+ vpclmulqdq zmm19, zmm21, zmm26, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vpclmulqdq zmm21, zmm31, zmm27, 1
+ vpshufd zmm27, zmm27, 78
+ vpternlogq zmm28, zmm27, zmm21, 150
+ vpclmulqdq zmm21, zmm31, zmm28, 1
+ vpshufd zmm28, zmm28, 78
+ vpternlogq zmm29, zmm28, zmm21, 150
+ vextracti32x4 xmm0, zmm29, 1
+ vextracti32x4 xmm4, zmm29, 2
+ vextracti32x4 xmm5, zmm29, 3
+ vpxorq xmm6, xmm29, xmm0
+ vpternlogq xmm6, xmm5, xmm4, 150
+L_AES_GCM_encrypt_avx512_no_windows:
+ vmovdqu64 zmm23, [rsp+192]
+ vshufi64x2 zmm23, zmm23, zmm23, 27
+ vmovdqu64 zmm24, [rsp+128]
+ vshufi64x2 zmm24, zmm24, zmm24, 27
+ vmovdqu64 zmm25, [rsp+64]
+ vshufi64x2 zmm25, zmm25, zmm25, 27
+ vmovdqu64 zmm26, [rsp]
+ vshufi64x2 zmm26, zmm26, zmm26, 27
+ mov r13d, r9d
+ and r13d, 4294967040
+ cmp ebx, r13d
+ jge L_AES_GCM_encrypt_avx512_after_256
+ ; 256 bytes of input
+ vbroadcasti32x4 zmm20, [rsp+1024]
+ vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
+ vpshufb zmm16, zmm16, zmm22
+ vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
+ vpshufb zmm17, zmm17, zmm22
+ vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2
+ vpshufb zmm18, zmm18, zmm22
+ vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3
+ vpshufb zmm19, zmm19, zmm22
+ vmovdqu xmm8, OWORD PTR [rsp+1024]
+ vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen
+ vmovdqu OWORD PTR [rsp+1024], xmm8
+ vpxorq zmm16, zmm16, zmm9
+ vpxorq zmm17, zmm17, zmm9
+ vpxorq zmm18, zmm18, zmm9
+ vpxorq zmm19, zmm19, zmm9
+ vaesenc zmm16, zmm16, zmm10
+ vaesenc zmm17, zmm17, zmm10
+ vaesenc zmm18, zmm18, zmm10
+ vaesenc zmm19, zmm19, zmm10
+ vaesenc zmm16, zmm16, zmm11
+ vaesenc zmm17, zmm17, zmm11
+ vaesenc zmm18, zmm18, zmm11
+ vaesenc zmm19, zmm19, zmm11
+ vaesenc zmm16, zmm16, zmm12
+ vaesenc zmm17, zmm17, zmm12
+ vaesenc zmm18, zmm18, zmm12
+ vaesenc zmm19, zmm19, zmm12
+ vaesenc zmm16, zmm16, zmm13
+ vaesenc zmm17, zmm17, zmm13
+ vaesenc zmm18, zmm18, zmm13
+ vaesenc zmm19, zmm19, zmm13
+ vaesenc zmm16, zmm16, zmm14
+ vaesenc zmm17, zmm17, zmm14
+ vaesenc zmm18, zmm18, zmm14
+ vaesenc zmm19, zmm19, zmm14
+ vaesenc zmm16, zmm16, zmm15
+ vaesenc zmm17, zmm17, zmm15
+ vaesenc zmm18, zmm18, zmm15
+ vaesenc zmm19, zmm19, zmm15
+ vaesenc zmm16, zmm16, zmm1
+ vaesenc zmm17, zmm17, zmm1
+ vaesenc zmm18, zmm18, zmm1
+ vaesenc zmm19, zmm19, zmm1
+ vaesenc zmm16, zmm16, zmm2
+ vaesenc zmm17, zmm17, zmm2
+ vaesenc zmm18, zmm18, zmm2
+ vaesenc zmm19, zmm19, zmm2
+ vaesenc zmm16, zmm16, zmm3
+ vaesenc zmm17, zmm17, zmm3
+ vaesenc zmm18, zmm18, zmm3
+ vaesenc zmm19, zmm19, zmm3
+ cmp r10d, 11
+ vbroadcasti32x4 zmm20, [r15+160]
+ jl L_AES_GCM_encrypt_avx512_pro_avx512_ctr16_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [r15+176]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ cmp r10d, 13
+ vbroadcasti32x4 zmm20, [r15+192]
+ jl L_AES_GCM_encrypt_avx512_pro_avx512_ctr16_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [r15+208]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [r15+224]
+L_AES_GCM_encrypt_avx512_pro_avx512_ctr16_last:
+ vaesenclast zmm16, zmm16, zmm20
+ vaesenclast zmm17, zmm17, zmm20
+ vaesenclast zmm18, zmm18, zmm20
+ vaesenclast zmm19, zmm19, zmm20
+ lea rcx, QWORD PTR [rdi+rbx]
+ lea rdx, QWORD PTR [rsi+rbx]
+ vmovdqu64 zmm21, [rcx]
+ vpxorq zmm16, zmm16, zmm21
+ vmovdqu64 [rdx], zmm16
+ vmovdqu64 zmm21, [rcx+64]
+ vpxorq zmm17, zmm17, zmm21
+ vmovdqu64 [rdx+64], zmm17
+ vmovdqu64 zmm21, [rcx+128]
+ vpxorq zmm18, zmm18, zmm21
+ vmovdqu64 [rdx+128], zmm18
+ vmovdqu64 zmm21, [rcx+192]
+ vpxorq zmm19, zmm19, zmm21
+ vmovdqu64 [rdx+192], zmm19
+ mov QWORD PTR [rsp+1056], rdx
+ add ebx, 256
+ cmp ebx, r13d
+ jge L_AES_GCM_encrypt_avx512_last_ghash
+L_AES_GCM_encrypt_avx512_ghash_128:
+ vbroadcasti32x4 zmm20, [rsp+1024]
+ vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
+ vpshufb zmm16, zmm16, zmm22
+ vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
+ vpshufb zmm17, zmm17, zmm22
+ vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2
+ vpshufb zmm18, zmm18, zmm22
+ vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3
+ vpshufb zmm19, zmm19, zmm22
+ vmovdqu xmm8, OWORD PTR [rsp+1024]
+ vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen
+ vmovdqu OWORD PTR [rsp+1024], xmm8
+ vpxorq zmm16, zmm16, zmm9
+ vpxorq zmm17, zmm17, zmm9
+ vpxorq zmm18, zmm18, zmm9
+ vpxorq zmm19, zmm19, zmm9
+ vaesenc zmm16, zmm16, zmm10
+ vaesenc zmm17, zmm17, zmm10
+ vaesenc zmm18, zmm18, zmm10
+ vaesenc zmm19, zmm19, zmm10
+ vaesenc zmm16, zmm16, zmm11
+ vaesenc zmm17, zmm17, zmm11
+ vaesenc zmm18, zmm18, zmm11
+ vaesenc zmm19, zmm19, zmm11
+ vaesenc zmm16, zmm16, zmm12
+ vaesenc zmm17, zmm17, zmm12
+ vaesenc zmm18, zmm18, zmm12
+ vaesenc zmm19, zmm19, zmm12
+ vaesenc zmm16, zmm16, zmm13
+ vaesenc zmm17, zmm17, zmm13
+ vaesenc zmm18, zmm18, zmm13
+ vaesenc zmm19, zmm19, zmm13
+ vaesenc zmm16, zmm16, zmm14
+ vaesenc zmm17, zmm17, zmm14
+ vaesenc zmm18, zmm18, zmm14
+ vaesenc zmm19, zmm19, zmm14
+ vaesenc zmm16, zmm16, zmm15
+ vaesenc zmm17, zmm17, zmm15
+ vaesenc zmm18, zmm18, zmm15
+ vaesenc zmm19, zmm19, zmm15
+ vaesenc zmm16, zmm16, zmm1
+ vaesenc zmm17, zmm17, zmm1
+ vaesenc zmm18, zmm18, zmm1
+ vaesenc zmm19, zmm19, zmm1
+ vaesenc zmm16, zmm16, zmm2
+ vaesenc zmm17, zmm17, zmm2
+ vaesenc zmm18, zmm18, zmm2
+ vaesenc zmm19, zmm19, zmm2
+ vaesenc zmm16, zmm16, zmm3
+ vaesenc zmm17, zmm17, zmm3
+ vaesenc zmm18, zmm18, zmm3
+ vaesenc zmm19, zmm19, zmm3
+ cmp r10d, 11
+ vbroadcasti32x4 zmm20, [r15+160]
+ jl L_AES_GCM_encrypt_avx512_pip_avx512_ctr16_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [r15+176]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ cmp r10d, 13
+ vbroadcasti32x4 zmm20, [r15+192]
+ jl L_AES_GCM_encrypt_avx512_pip_avx512_ctr16_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [r15+208]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [r15+224]
+L_AES_GCM_encrypt_avx512_pip_avx512_ctr16_last:
+ vaesenclast zmm16, zmm16, zmm20
+ vaesenclast zmm17, zmm17, zmm20
+ vaesenclast zmm18, zmm18, zmm20
+ vaesenclast zmm19, zmm19, zmm20
+ lea rcx, QWORD PTR [rdi+rbx]
+ lea rdx, QWORD PTR [rsi+rbx]
+ vmovdqu64 zmm21, [rcx]
+ vpxorq zmm16, zmm16, zmm21
+ vmovdqu64 [rdx], zmm16
+ vmovdqu64 zmm21, [rcx+64]
+ vpxorq zmm17, zmm17, zmm21
+ vmovdqu64 [rdx+64], zmm17
+ vmovdqu64 zmm21, [rcx+128]
+ vpxorq zmm18, zmm18, zmm21
+ vmovdqu64 [rdx+128], zmm18
+ vmovdqu64 zmm21, [rcx+192]
+ vpxorq zmm19, zmm19, zmm21
+ vmovdqu64 [rdx+192], zmm19
+ mov rcx, QWORD PTR [rsp+1056]
+ vpxorq zmm20, zmm20, zmm20
+ vinserti32x4 zmm20, zmm20, xmm6, 0
+ vmovdqu64 zmm21, [rcx]
+ vpshufb zmm21, zmm21, zmm30
+ vpxorq zmm21, zmm21, zmm20
+ vpclmulqdq zmm16, zmm21, zmm23, 0
+ vpclmulqdq zmm17, zmm21, zmm23, 1
+ vpclmulqdq zmm18, zmm21, zmm23, 16
+ vpclmulqdq zmm19, zmm21, zmm23, 17
+ vmovdqa64 zmm27, zmm16
+ vpxorq zmm28, zmm18, zmm17
+ vmovdqa64 zmm29, zmm19
+ vmovdqu64 zmm21, [rcx+64]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm24, 0
+ vpclmulqdq zmm17, zmm21, zmm24, 1
+ vpclmulqdq zmm18, zmm21, zmm24, 16
+ vpclmulqdq zmm19, zmm21, zmm24, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vmovdqu64 zmm21, [rcx+128]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm25, 0
+ vpclmulqdq zmm17, zmm21, zmm25, 1
+ vpclmulqdq zmm18, zmm21, zmm25, 16
+ vpclmulqdq zmm19, zmm21, zmm25, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vmovdqu64 zmm21, [rcx+192]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm26, 0
+ vpclmulqdq zmm17, zmm21, zmm26, 1
+ vpclmulqdq zmm18, zmm21, zmm26, 16
+ vpclmulqdq zmm19, zmm21, zmm26, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vpclmulqdq zmm21, zmm31, zmm27, 1
+ vpshufd zmm27, zmm27, 78
+ vpternlogq zmm28, zmm27, zmm21, 150
+ vpclmulqdq zmm21, zmm31, zmm28, 1
+ vpshufd zmm28, zmm28, 78
+ vpternlogq zmm29, zmm28, zmm21, 150
+ vextracti32x4 xmm0, zmm29, 1
+ vextracti32x4 xmm4, zmm29, 2
+ vextracti32x4 xmm5, zmm29, 3
+ vpxorq xmm6, xmm29, xmm0
+ vpternlogq xmm6, xmm5, xmm4, 150
+ mov QWORD PTR [rsp+1056], rdx
+ add ebx, 256
+ cmp ebx, r13d
+ jl L_AES_GCM_encrypt_avx512_ghash_128
+L_AES_GCM_encrypt_avx512_last_ghash:
+ mov rcx, QWORD PTR [rsp+1056]
+ vpxorq zmm20, zmm20, zmm20
+ vinserti32x4 zmm20, zmm20, xmm6, 0
+ vmovdqu64 zmm21, [rcx]
+ vpshufb zmm21, zmm21, zmm30
+ vpxorq zmm21, zmm21, zmm20
+ vpclmulqdq zmm16, zmm21, zmm23, 0
+ vpclmulqdq zmm17, zmm21, zmm23, 1
+ vpclmulqdq zmm18, zmm21, zmm23, 16
+ vpclmulqdq zmm19, zmm21, zmm23, 17
+ vmovdqa64 zmm27, zmm16
+ vpxorq zmm28, zmm18, zmm17
+ vmovdqa64 zmm29, zmm19
+ vmovdqu64 zmm21, [rcx+64]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm24, 0
+ vpclmulqdq zmm17, zmm21, zmm24, 1
+ vpclmulqdq zmm18, zmm21, zmm24, 16
+ vpclmulqdq zmm19, zmm21, zmm24, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vmovdqu64 zmm21, [rcx+128]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm25, 0
+ vpclmulqdq zmm17, zmm21, zmm25, 1
+ vpclmulqdq zmm18, zmm21, zmm25, 16
+ vpclmulqdq zmm19, zmm21, zmm25, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vmovdqu64 zmm21, [rcx+192]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm26, 0
+ vpclmulqdq zmm17, zmm21, zmm26, 1
+ vpclmulqdq zmm18, zmm21, zmm26, 16
+ vpclmulqdq zmm19, zmm21, zmm26, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vpclmulqdq zmm21, zmm31, zmm27, 1
+ vpshufd zmm27, zmm27, 78
+ vpternlogq zmm28, zmm27, zmm21, 150
+ vpclmulqdq zmm21, zmm31, zmm28, 1
+ vpshufd zmm28, zmm28, 78
+ vpternlogq zmm29, zmm28, zmm21, 150
+ vextracti32x4 xmm0, zmm29, 1
+ vextracti32x4 xmm4, zmm29, 2
+ vextracti32x4 xmm5, zmm29, 3
+ vpxorq xmm6, xmm29, xmm0
+ vpternlogq xmm6, xmm5, xmm4, 150
+L_AES_GCM_encrypt_avx512_after_256:
+ vmovdqu xmm5, OWORD PTR [rsp]
+L_AES_GCM_encrypt_avx512_done_128:
+ mov edx, r9d
+ cmp ebx, edx
+ jge L_AES_GCM_encrypt_avx512_done_enc
+ mov r13d, r9d
+ and r13d, 4294967280
+ cmp ebx, r13d
+ jge L_AES_GCM_encrypt_avx512_last_block_done
+ vmovdqu xmm9, OWORD PTR [rsp+1024]
+ vpshufb xmm8, xmm9, OWORD PTR L_avx512_aes_gcm_bswap_epi64
+ vpaddd xmm9, xmm9, OWORD PTR L_avx512_aes_gcm_one
+ vmovdqu OWORD PTR [rsp+1024], xmm9
+ vpxor xmm8, xmm8, [r15]
+ vaesenc xmm8, xmm8, [r15+16]
+ vaesenc xmm8, xmm8, [r15+32]
+ vaesenc xmm8, xmm8, [r15+48]
+ vaesenc xmm8, xmm8, [r15+64]
+ vaesenc xmm8, xmm8, [r15+80]
+ vaesenc xmm8, xmm8, [r15+96]
+ vaesenc xmm8, xmm8, [r15+112]
+ vaesenc xmm8, xmm8, [r15+128]
+ vaesenc xmm8, xmm8, [r15+144]
+ cmp r10d, 11
+ vmovdqa xmm9, OWORD PTR [r15+160]
+ jl L_AES_GCM_encrypt_avx512_aesenc_block_last
+ vaesenc xmm8, xmm8, xmm9
+ vaesenc xmm8, xmm8, [r15+176]
+ cmp r10d, 13
+ vmovdqa xmm9, OWORD PTR [r15+192]
+ jl L_AES_GCM_encrypt_avx512_aesenc_block_last
+ vaesenc xmm8, xmm8, xmm9
+ vaesenc xmm8, xmm8, [r15+208]
+ vmovdqa xmm9, OWORD PTR [r15+224]
+L_AES_GCM_encrypt_avx512_aesenc_block_last:
+ vaesenclast xmm8, xmm8, xmm9
+ vmovdqu xmm9, OWORD PTR [rdi+rbx]
+ vpxor xmm8, xmm8, xmm9
+ vmovdqu OWORD PTR [rsi+rbx], xmm8
+ vpshufb xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ vpxor xmm6, xmm6, xmm8
+ add ebx, 16
+ cmp ebx, r13d
+ jge L_AES_GCM_encrypt_avx512_last_block_ghash
+L_AES_GCM_encrypt_avx512_last_block_start:
+ vmovdqu xmm13, OWORD PTR [rdi+rbx]
+ vmovdqu xmm9, OWORD PTR [rsp+1024]
+ vpshufb xmm8, xmm9, OWORD PTR L_avx512_aes_gcm_bswap_epi64
+ vpaddd xmm9, xmm9, OWORD PTR L_avx512_aes_gcm_one
+ vmovdqu OWORD PTR [rsp+1024], xmm9
+ vpxor xmm8, xmm8, [r15]
+ vpclmulqdq xmm10, xmm6, xmm5, 16
+ vaesenc xmm8, xmm8, [r15+16]
+ vaesenc xmm8, xmm8, [r15+32]
+ vpclmulqdq xmm11, xmm6, xmm5, 1
+ vaesenc xmm8, xmm8, [r15+48]
+ vaesenc xmm8, xmm8, [r15+64]
+ vpclmulqdq xmm12, xmm6, xmm5, 0
+ vaesenc xmm8, xmm8, [r15+80]
+ vpclmulqdq xmm1, xmm6, xmm5, 17
+ vaesenc xmm8, xmm8, [r15+96]
+ vpxor xmm10, xmm10, xmm11
+ vpslldq xmm2, xmm10, 8
+ vpsrldq xmm10, xmm10, 8
+ vaesenc xmm8, xmm8, [r15+112]
+ vpxor xmm2, xmm2, xmm12
+ vpxor xmm3, xmm1, xmm10
+ vmovdqa xmm0, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm2, xmm0, 16
+ vaesenc xmm8, xmm8, [r15+128]
+ vpshufd xmm10, xmm2, 78
+ vpxor xmm10, xmm10, xmm11
+ vpclmulqdq xmm11, xmm10, xmm0, 16
+ vaesenc xmm8, xmm8, [r15+144]
+ vpshufd xmm10, xmm10, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm6, xmm10, xmm3
+ cmp r10d, 11
+ vmovdqa xmm9, OWORD PTR [r15+160]
+ jl L_AES_GCM_encrypt_avx512_aesenc_gfmul_last
+ vaesenc xmm8, xmm8, xmm9
+ vaesenc xmm8, xmm8, [r15+176]
+ cmp r10d, 13
+ vmovdqa xmm9, OWORD PTR [r15+192]
+ jl L_AES_GCM_encrypt_avx512_aesenc_gfmul_last
+ vaesenc xmm8, xmm8, xmm9
+ vaesenc xmm8, xmm8, [r15+208]
+ vmovdqa xmm9, OWORD PTR [r15+224]
+L_AES_GCM_encrypt_avx512_aesenc_gfmul_last:
+ vaesenclast xmm8, xmm8, xmm9
+ vmovdqa xmm0, xmm13
+ vpxor xmm8, xmm8, xmm0
+ vmovdqu OWORD PTR [rsi+rbx], xmm8
+ vpshufb xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ add ebx, 16
+ vpxor xmm6, xmm6, xmm8
+ cmp ebx, r13d
+ jl L_AES_GCM_encrypt_avx512_last_block_start
+L_AES_GCM_encrypt_avx512_last_block_ghash:
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm5, 78
+ vpxor xmm9, xmm9, xmm5
+ vpshufd xmm10, xmm6, 78
+ vpxor xmm10, xmm10, xmm6
+ vpclmulqdq xmm8, xmm6, xmm5, 0
+ vpclmulqdq xmm11, xmm6, xmm5, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm6, xmm11
+L_AES_GCM_encrypt_avx512_last_block_done:
+ mov ecx, r9d
+ mov edx, ecx
+ and ecx, 15
+ jz L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_done
+ vmovdqu xmm4, OWORD PTR [rsp+1024]
+ vpshufb xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_bswap_epi64
+ vpxor xmm4, xmm4, [r15]
+ vaesenc xmm4, xmm4, [r15+16]
+ vaesenc xmm4, xmm4, [r15+32]
+ vaesenc xmm4, xmm4, [r15+48]
+ vaesenc xmm4, xmm4, [r15+64]
+ vaesenc xmm4, xmm4, [r15+80]
+ vaesenc xmm4, xmm4, [r15+96]
+ vaesenc xmm4, xmm4, [r15+112]
+ vaesenc xmm4, xmm4, [r15+128]
+ vaesenc xmm4, xmm4, [r15+144]
+ cmp r10d, 11
+ vmovdqa xmm9, OWORD PTR [r15+160]
+ jl L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_aesenc_avx_last
+ vaesenc xmm4, xmm4, xmm9
+ vaesenc xmm4, xmm4, [r15+176]
+ cmp r10d, 13
+ vmovdqa xmm9, OWORD PTR [r15+192]
+ jl L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_aesenc_avx_last
+ vaesenc xmm4, xmm4, xmm9
+ vaesenc xmm4, xmm4, [r15+208]
+ vmovdqa xmm9, OWORD PTR [r15+224]
+L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_aesenc_avx_last:
+ vaesenclast xmm4, xmm4, xmm9
+ sub rsp, 16
+ xor ecx, ecx
+ vmovdqu OWORD PTR [rsp], xmm4
+L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_loop:
+ movzx r13d, BYTE PTR [rdi+rbx]
+ xor r13b, BYTE PTR [rsp+rcx]
+ mov BYTE PTR [rsi+rbx], r13b
+ mov BYTE PTR [rsp+rcx], r13b
+ inc ebx
+ inc ecx
+ cmp ebx, edx
+ jl L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_loop
+ xor r13, r13
+ cmp ecx, 16
+ je L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_finish_enc
+L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_byte_loop:
+ mov BYTE PTR [rsp+rcx], r13b
+ inc ecx
+ cmp ecx, 16
+ jl L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_byte_loop
+L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_finish_enc:
+ vmovdqu xmm4, OWORD PTR [rsp]
+ add rsp, 16
+ vpshufb xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ vpxor xmm6, xmm6, xmm4
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm5, 78
+ vpxor xmm9, xmm9, xmm5
+ vpshufd xmm10, xmm6, 78
+ vpxor xmm10, xmm10, xmm6
+ vpclmulqdq xmm8, xmm6, xmm5, 0
+ vpclmulqdq xmm11, xmm6, xmm5, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm6, xmm11
+L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_done:
+L_AES_GCM_encrypt_avx512_done_enc:
+ mov edx, r9d
+ mov ecx, r11d
+ shl rdx, 3
+ shl rcx, 3
+ vmovq xmm0, rdx
+ vmovq xmm1, rcx
+ vpunpcklqdq xmm0, xmm0, xmm1
+ vpxor xmm6, xmm6, xmm0
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm5, 78
+ vpxor xmm9, xmm9, xmm5
+ vpshufd xmm10, xmm6, 78
+ vpxor xmm10, xmm10, xmm6
+ vpclmulqdq xmm8, xmm6, xmm5, 0
+ vpclmulqdq xmm11, xmm6, xmm5, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm6, xmm11
+ vpshufb xmm6, xmm6, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ vmovdqu xmm0, OWORD PTR [rsp+1040]
+ vpxor xmm0, xmm0, xmm6
+ cmp r14d, 16
+ je L_AES_GCM_encrypt_avx512_store_tag_16
+ xor rcx, rcx
+ vmovdqu OWORD PTR [rsp], xmm0
+L_AES_GCM_encrypt_avx512_store_tag_loop:
+ movzx r13d, BYTE PTR [rsp+rcx]
+ mov BYTE PTR [r8+rcx], r13b
+ inc ecx
+ cmp ecx, r14d
+ jne L_AES_GCM_encrypt_avx512_store_tag_loop
+ jmp L_AES_GCM_encrypt_avx512_store_tag_done
+L_AES_GCM_encrypt_avx512_store_tag_16:
+ vmovdqu OWORD PTR [r8], xmm0
+L_AES_GCM_encrypt_avx512_store_tag_done:
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp+1088]
+ vmovdqu xmm7, OWORD PTR [rsp+1104]
+ vmovdqu xmm8, OWORD PTR [rsp+1120]
+ vmovdqu xmm9, OWORD PTR [rsp+1136]
+ vmovdqu xmm10, OWORD PTR [rsp+1152]
+ vmovdqu xmm11, OWORD PTR [rsp+1168]
+ vmovdqu xmm12, OWORD PTR [rsp+1184]
+ vmovdqu xmm13, OWORD PTR [rsp+1200]
+ vmovdqu xmm14, OWORD PTR [rsp+1216]
+ vmovdqu xmm15, OWORD PTR [rsp+1232]
+ add rsp, 1248
+ pop r15
+ pop r14
+ pop rbx
+ pop r12
+ pop rsi
+ pop rdi
+ pop r13
+ ret
+AES_GCM_encrypt_avx512 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_decrypt_avx512 PROC
+ push r13
+ push rdi
+ push rsi
+ push r12
+ push rbx
+ push r14
+ push r15
+ push rbp
+ mov rdi, rcx
+ mov rsi, rdx
+ mov r12, r8
+ mov rax, r9
+ mov r8, QWORD PTR [rsp+104]
+ mov r9d, DWORD PTR [rsp+112]
+ mov r11d, DWORD PTR [rsp+120]
+ mov ebx, DWORD PTR [rsp+128]
+ mov r14d, DWORD PTR [rsp+136]
+ mov r15, QWORD PTR [rsp+144]
+ mov r10d, DWORD PTR [rsp+152]
+ mov rbp, QWORD PTR [rsp+160]
+ sub rsp, 1216
+ vmovdqu OWORD PTR [rsp+1056], xmm6
+ vmovdqu OWORD PTR [rsp+1072], xmm7
+ vmovdqu OWORD PTR [rsp+1088], xmm8
+ vmovdqu OWORD PTR [rsp+1104], xmm9
+ vmovdqu OWORD PTR [rsp+1120], xmm10
+ vmovdqu OWORD PTR [rsp+1136], xmm11
+ vmovdqu OWORD PTR [rsp+1152], xmm12
+ vmovdqu OWORD PTR [rsp+1168], xmm13
+ vmovdqu OWORD PTR [rsp+1184], xmm14
+ vmovdqu OWORD PTR [rsp+1200], xmm15
+ vpxor xmm4, xmm4, xmm4
+ vpxor xmm6, xmm6, xmm6
+ cmp ebx, 12
+ mov edx, ebx
+ jne L_AES_GCM_decrypt_avx512_iv_not_12
+ ; # Calculate values when IV is 12 bytes
+ ; Set counter based on IV
+ mov ecx, 16777216
+ vmovq xmm4, QWORD PTR [rax]
+ vpinsrd xmm4, xmm4, DWORD PTR [rax+8], 2
+ vpinsrd xmm4, xmm4, ecx, 3
+ ; H = Encrypt X(=0) and T = Encrypt counter
+ vmovdqa xmm5, OWORD PTR [r15]
+ vpxor xmm1, xmm4, xmm5
+ vmovdqa xmm7, OWORD PTR [r15+16]
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vmovdqa xmm7, OWORD PTR [r15+32]
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vmovdqa xmm7, OWORD PTR [r15+48]
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vmovdqa xmm7, OWORD PTR [r15+64]
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vmovdqa xmm7, OWORD PTR [r15+80]
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vmovdqa xmm7, OWORD PTR [r15+96]
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vmovdqa xmm7, OWORD PTR [r15+112]
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vmovdqa xmm7, OWORD PTR [r15+128]
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vmovdqa xmm7, OWORD PTR [r15+144]
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ cmp r10d, 11
+ vmovdqa xmm7, OWORD PTR [r15+160]
+ jl L_AES_GCM_decrypt_avx512_calc_iv_12_last
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vmovdqa xmm7, OWORD PTR [r15+176]
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ cmp r10d, 13
+ vmovdqa xmm7, OWORD PTR [r15+192]
+ jl L_AES_GCM_decrypt_avx512_calc_iv_12_last
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vmovdqa xmm7, OWORD PTR [r15+208]
+ vaesenc xmm5, xmm5, xmm7
+ vaesenc xmm1, xmm1, xmm7
+ vmovdqa xmm7, OWORD PTR [r15+224]
+L_AES_GCM_decrypt_avx512_calc_iv_12_last:
+ vaesenclast xmm5, xmm5, xmm7
+ vaesenclast xmm1, xmm1, xmm7
+ vpshufb xmm5, xmm5, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ vmovdqu OWORD PTR [rsp+1040], xmm1
+ jmp L_AES_GCM_decrypt_avx512_iv_done
+L_AES_GCM_decrypt_avx512_iv_not_12:
+ ; Calculate values when IV is not 12 bytes
+ ; H = Encrypt X(=0)
+ vmovdqa xmm5, OWORD PTR [r15]
+ vaesenc xmm5, xmm5, [r15+16]
+ vaesenc xmm5, xmm5, [r15+32]
+ vaesenc xmm5, xmm5, [r15+48]
+ vaesenc xmm5, xmm5, [r15+64]
+ vaesenc xmm5, xmm5, [r15+80]
+ vaesenc xmm5, xmm5, [r15+96]
+ vaesenc xmm5, xmm5, [r15+112]
+ vaesenc xmm5, xmm5, [r15+128]
+ vaesenc xmm5, xmm5, [r15+144]
+ cmp r10d, 11
+ vmovdqa xmm9, OWORD PTR [r15+160]
+ jl L_AES_GCM_decrypt_avx512_calc_iv_1_aesenc_avx_last
+ vaesenc xmm5, xmm5, xmm9
+ vaesenc xmm5, xmm5, [r15+176]
+ cmp r10d, 13
+ vmovdqa xmm9, OWORD PTR [r15+192]
+ jl L_AES_GCM_decrypt_avx512_calc_iv_1_aesenc_avx_last
+ vaesenc xmm5, xmm5, xmm9
+ vaesenc xmm5, xmm5, [r15+208]
+ vmovdqa xmm9, OWORD PTR [r15+224]
+L_AES_GCM_decrypt_avx512_calc_iv_1_aesenc_avx_last:
+ vaesenclast xmm5, xmm5, xmm9
+ vpshufb xmm5, xmm5, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ ; Calc counter
+ ; Initialization vector
+ cmp edx, 0
+ mov rcx, 0
+ je L_AES_GCM_decrypt_avx512_calc_iv_done
+ cmp edx, 16
+ jl L_AES_GCM_decrypt_avx512_calc_iv_lt16
+ and edx, 4294967280
+L_AES_GCM_decrypt_avx512_calc_iv_16_loop:
+ vmovdqu xmm8, OWORD PTR [rax+rcx]
+ vpshufb xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ vpxor xmm4, xmm4, xmm8
+ ; ghash_gfmul_avx
+ vpshufd xmm1, xmm4, 78
+ vpshufd xmm2, xmm5, 78
+ vpclmulqdq xmm3, xmm5, xmm4, 17
+ vpclmulqdq xmm0, xmm5, xmm4, 0
+ vpxor xmm1, xmm1, xmm4
+ vpxor xmm2, xmm2, xmm5
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vmovdqa xmm7, xmm0
+ vmovdqa xmm4, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm7, xmm7, xmm2
+ vpxor xmm4, xmm4, xmm1
+ vpsrld xmm0, xmm7, 31
+ vpsrld xmm1, xmm4, 31
+ vpslld xmm7, xmm7, 1
+ vpslld xmm4, xmm4, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm4, xmm4, xmm2
+ vpor xmm7, xmm7, xmm0
+ vpor xmm4, xmm4, xmm1
+ vpslld xmm0, xmm7, 31
+ vpslld xmm1, xmm7, 30
+ vpslld xmm2, xmm7, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm7, xmm7, xmm0
+ vpsrld xmm2, xmm7, 1
+ vpsrld xmm3, xmm7, 2
+ vpsrld xmm0, xmm7, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm4, xmm4, xmm2
+ add ecx, 16
+ cmp ecx, edx
+ jl L_AES_GCM_decrypt_avx512_calc_iv_16_loop
+ mov edx, ebx
+ cmp ecx, edx
+ je L_AES_GCM_decrypt_avx512_calc_iv_done
+L_AES_GCM_decrypt_avx512_calc_iv_lt16:
+ sub rsp, 16
+ vpxor xmm8, xmm8, xmm8
+ xor ebx, ebx
+ vmovdqu OWORD PTR [rsp], xmm8
+L_AES_GCM_decrypt_avx512_calc_iv_loop:
+ movzx r13d, BYTE PTR [rax+rcx]
+ mov BYTE PTR [rsp+rbx], r13b
+ inc ecx
+ inc ebx
+ cmp ecx, edx
+ jl L_AES_GCM_decrypt_avx512_calc_iv_loop
+ vmovdqu xmm8, OWORD PTR [rsp]
+ add rsp, 16
+ vpshufb xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ vpxor xmm4, xmm4, xmm8
+ ; ghash_gfmul_avx
+ vpshufd xmm1, xmm4, 78
+ vpshufd xmm2, xmm5, 78
+ vpclmulqdq xmm3, xmm5, xmm4, 17
+ vpclmulqdq xmm0, xmm5, xmm4, 0
+ vpxor xmm1, xmm1, xmm4
+ vpxor xmm2, xmm2, xmm5
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vmovdqa xmm7, xmm0
+ vmovdqa xmm4, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm7, xmm7, xmm2
+ vpxor xmm4, xmm4, xmm1
+ vpsrld xmm0, xmm7, 31
+ vpsrld xmm1, xmm4, 31
+ vpslld xmm7, xmm7, 1
+ vpslld xmm4, xmm4, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm4, xmm4, xmm2
+ vpor xmm7, xmm7, xmm0
+ vpor xmm4, xmm4, xmm1
+ vpslld xmm0, xmm7, 31
+ vpslld xmm1, xmm7, 30
+ vpslld xmm2, xmm7, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm7, xmm7, xmm0
+ vpsrld xmm2, xmm7, 1
+ vpsrld xmm3, xmm7, 2
+ vpsrld xmm0, xmm7, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm4, xmm4, xmm2
+L_AES_GCM_decrypt_avx512_calc_iv_done:
+ ; T = Encrypt counter
+ vpxor xmm0, xmm0, xmm0
+ shl edx, 3
+ vmovq xmm0, rdx
+ vpxor xmm4, xmm4, xmm0
+ ; ghash_gfmul_avx
+ vpshufd xmm1, xmm4, 78
+ vpshufd xmm2, xmm5, 78
+ vpclmulqdq xmm3, xmm5, xmm4, 17
+ vpclmulqdq xmm0, xmm5, xmm4, 0
+ vpxor xmm1, xmm1, xmm4
+ vpxor xmm2, xmm2, xmm5
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vmovdqa xmm7, xmm0
+ vmovdqa xmm4, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm7, xmm7, xmm2
+ vpxor xmm4, xmm4, xmm1
+ vpsrld xmm0, xmm7, 31
+ vpsrld xmm1, xmm4, 31
+ vpslld xmm7, xmm7, 1
+ vpslld xmm4, xmm4, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm4, xmm4, xmm2
+ vpor xmm7, xmm7, xmm0
+ vpor xmm4, xmm4, xmm1
+ vpslld xmm0, xmm7, 31
+ vpslld xmm1, xmm7, 30
+ vpslld xmm2, xmm7, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm7, xmm7, xmm0
+ vpsrld xmm2, xmm7, 1
+ vpsrld xmm3, xmm7, 2
+ vpsrld xmm0, xmm7, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm4, xmm4, xmm2
+ vpshufb xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ ; Encrypt counter
+ vmovdqa xmm8, OWORD PTR [r15]
+ vpxor xmm8, xmm8, xmm4
+ vaesenc xmm8, xmm8, [r15+16]
+ vaesenc xmm8, xmm8, [r15+32]
+ vaesenc xmm8, xmm8, [r15+48]
+ vaesenc xmm8, xmm8, [r15+64]
+ vaesenc xmm8, xmm8, [r15+80]
+ vaesenc xmm8, xmm8, [r15+96]
+ vaesenc xmm8, xmm8, [r15+112]
+ vaesenc xmm8, xmm8, [r15+128]
+ vaesenc xmm8, xmm8, [r15+144]
+ cmp r10d, 11
+ vmovdqa xmm9, OWORD PTR [r15+160]
+ jl L_AES_GCM_decrypt_avx512_calc_iv_2_aesenc_avx_last
+ vaesenc xmm8, xmm8, xmm9
+ vaesenc xmm8, xmm8, [r15+176]
+ cmp r10d, 13
+ vmovdqa xmm9, OWORD PTR [r15+192]
+ jl L_AES_GCM_decrypt_avx512_calc_iv_2_aesenc_avx_last
+ vaesenc xmm8, xmm8, xmm9
+ vaesenc xmm8, xmm8, [r15+208]
+ vmovdqa xmm9, OWORD PTR [r15+224]
+L_AES_GCM_decrypt_avx512_calc_iv_2_aesenc_avx_last:
+ vaesenclast xmm8, xmm8, xmm9
+ vmovdqu OWORD PTR [rsp+1040], xmm8
+L_AES_GCM_decrypt_avx512_iv_done:
+ ; Additional authentication data
+ mov edx, r11d
+ cmp edx, 0
+ je L_AES_GCM_decrypt_avx512_calc_aad_done
+ xor ecx, ecx
+ cmp edx, 16
+ jl L_AES_GCM_decrypt_avx512_calc_aad_lt16
+ and edx, 4294967280
+L_AES_GCM_decrypt_avx512_calc_aad_16_loop:
+ vmovdqu xmm8, OWORD PTR [r12+rcx]
+ vpshufb xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ vpxor xmm6, xmm6, xmm8
+ ; ghash_gfmul_avx
+ vpshufd xmm1, xmm6, 78
+ vpshufd xmm2, xmm5, 78
+ vpclmulqdq xmm3, xmm5, xmm6, 17
+ vpclmulqdq xmm0, xmm5, xmm6, 0
+ vpxor xmm1, xmm1, xmm6
+ vpxor xmm2, xmm2, xmm5
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vmovdqa xmm7, xmm0
+ vmovdqa xmm6, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm7, xmm7, xmm2
+ vpxor xmm6, xmm6, xmm1
+ vpsrld xmm0, xmm7, 31
+ vpsrld xmm1, xmm6, 31
+ vpslld xmm7, xmm7, 1
+ vpslld xmm6, xmm6, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm6, xmm6, xmm2
+ vpor xmm7, xmm7, xmm0
+ vpor xmm6, xmm6, xmm1
+ vpslld xmm0, xmm7, 31
+ vpslld xmm1, xmm7, 30
+ vpslld xmm2, xmm7, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm7, xmm7, xmm0
+ vpsrld xmm2, xmm7, 1
+ vpsrld xmm3, xmm7, 2
+ vpsrld xmm0, xmm7, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm6, xmm6, xmm2
+ add ecx, 16
+ cmp ecx, edx
+ jl L_AES_GCM_decrypt_avx512_calc_aad_16_loop
+ mov edx, r11d
+ cmp ecx, edx
+ je L_AES_GCM_decrypt_avx512_calc_aad_done
+L_AES_GCM_decrypt_avx512_calc_aad_lt16:
+ sub rsp, 16
+ vpxor xmm8, xmm8, xmm8
+ xor ebx, ebx
+ vmovdqu OWORD PTR [rsp], xmm8
+L_AES_GCM_decrypt_avx512_calc_aad_loop:
+ movzx r13d, BYTE PTR [r12+rcx]
+ mov BYTE PTR [rsp+rbx], r13b
+ inc ecx
+ inc ebx
+ cmp ecx, edx
+ jl L_AES_GCM_decrypt_avx512_calc_aad_loop
+ vmovdqu xmm8, OWORD PTR [rsp]
+ add rsp, 16
+ vpshufb xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ vpxor xmm6, xmm6, xmm8
+ ; ghash_gfmul_avx
+ vpshufd xmm1, xmm6, 78
+ vpshufd xmm2, xmm5, 78
+ vpclmulqdq xmm3, xmm5, xmm6, 17
+ vpclmulqdq xmm0, xmm5, xmm6, 0
+ vpxor xmm1, xmm1, xmm6
+ vpxor xmm2, xmm2, xmm5
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vmovdqa xmm7, xmm0
+ vmovdqa xmm6, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm7, xmm7, xmm2
+ vpxor xmm6, xmm6, xmm1
+ vpsrld xmm0, xmm7, 31
+ vpsrld xmm1, xmm6, 31
+ vpslld xmm7, xmm7, 1
+ vpslld xmm6, xmm6, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm6, xmm6, xmm2
+ vpor xmm7, xmm7, xmm0
+ vpor xmm6, xmm6, xmm1
+ vpslld xmm0, xmm7, 31
+ vpslld xmm1, xmm7, 30
+ vpslld xmm2, xmm7, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm7, xmm7, xmm0
+ vpsrld xmm2, xmm7, 1
+ vpsrld xmm3, xmm7, 2
+ vpsrld xmm0, xmm7, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm7
+ vpxor xmm6, xmm6, xmm2
+L_AES_GCM_decrypt_avx512_calc_aad_done:
+ ; Calculate counter and H
+ vpsrlq xmm9, xmm5, 63
+ vpsllq xmm8, xmm5, 1
+ vpslldq xmm9, xmm9, 8
+ vpor xmm8, xmm8, xmm9
+ vpshufd xmm5, xmm5, 255
+ vpsrad xmm5, xmm5, 31
+ vpshufb xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_bswap_epi64
+ vpand xmm5, xmm5, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpaddd xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_one
+ vpxor xmm5, xmm5, xmm8
+ vmovdqu OWORD PTR [rsp+1024], xmm4
+ xor ebx, ebx
+ cmp r9d, 256
+ jl L_AES_GCM_decrypt_avx512_done_128
+ vmovdqa xmm2, xmm6
+ ; H ^ 1
+ vmovdqu OWORD PTR [rsp], xmm5
+ ; H ^ 2
+ vpclmulqdq xmm8, xmm5, xmm5, 0
+ vpclmulqdq xmm11, xmm5, xmm5, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm0, xmm11
+ vmovdqu OWORD PTR [rsp+16], xmm0
+ ; H ^ 3
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm5, 78
+ vpxor xmm9, xmm9, xmm5
+ vpshufd xmm10, xmm0, 78
+ vpxor xmm10, xmm10, xmm0
+ vpclmulqdq xmm8, xmm0, xmm5, 0
+ vpclmulqdq xmm11, xmm0, xmm5, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm1, xmm11
+ vmovdqu OWORD PTR [rsp+32], xmm1
+ ; H ^ 4
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm3, xmm11
+ vmovdqu OWORD PTR [rsp+48], xmm3
+ ; H ^ 5
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+64], xmm7
+ ; H ^ 6
+ vpclmulqdq xmm8, xmm1, xmm1, 0
+ vpclmulqdq xmm11, xmm1, xmm1, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+80], xmm7
+ ; H ^ 7
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm1, 78
+ vpxor xmm9, xmm9, xmm1
+ vpshufd xmm10, xmm3, 78
+ vpxor xmm10, xmm10, xmm3
+ vpclmulqdq xmm8, xmm3, xmm1, 0
+ vpclmulqdq xmm11, xmm3, xmm1, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm7
+ ; H ^ 8
+ vpclmulqdq xmm8, xmm3, xmm3, 0
+ vpclmulqdq xmm11, xmm3, xmm3, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+112], xmm7
+ ; H ^ 9
+ vmovdqu xmm0, OWORD PTR [rsp+48]
+ vmovdqu xmm1, OWORD PTR [rsp+64]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+128], xmm7
+ ; H ^ 10
+ vmovdqu xmm0, OWORD PTR [rsp+64]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+144], xmm7
+ ; H ^ 11
+ vmovdqu xmm0, OWORD PTR [rsp+64]
+ vmovdqu xmm1, OWORD PTR [rsp+80]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+160], xmm7
+ ; H ^ 12
+ vmovdqu xmm0, OWORD PTR [rsp+80]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+176], xmm7
+ ; H ^ 13
+ vmovdqu xmm0, OWORD PTR [rsp+80]
+ vmovdqu xmm1, OWORD PTR [rsp+96]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+192], xmm7
+ ; H ^ 14
+ vmovdqu xmm0, OWORD PTR [rsp+96]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+208], xmm7
+ ; H ^ 15
+ vmovdqu xmm0, OWORD PTR [rsp+96]
+ vmovdqu xmm1, OWORD PTR [rsp+112]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+224], xmm7
+ ; H ^ 16
+ vmovdqu xmm0, OWORD PTR [rsp+112]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+240], xmm7
+ cmp r9d, 512
+ jl L_AES_GCM_decrypt_avx512_no_ext
+ ; H ^ 17
+ vmovdqu xmm0, OWORD PTR [rsp+112]
+ vmovdqu xmm1, OWORD PTR [rsp+128]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+256], xmm7
+ ; H ^ 18
+ vmovdqu xmm0, OWORD PTR [rsp+128]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+272], xmm7
+ ; H ^ 19
+ vmovdqu xmm0, OWORD PTR [rsp+128]
+ vmovdqu xmm1, OWORD PTR [rsp+144]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+288], xmm7
+ ; H ^ 20
+ vmovdqu xmm0, OWORD PTR [rsp+144]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+304], xmm7
+ ; H ^ 21
+ vmovdqu xmm0, OWORD PTR [rsp+144]
+ vmovdqu xmm1, OWORD PTR [rsp+160]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+320], xmm7
+ ; H ^ 22
+ vmovdqu xmm0, OWORD PTR [rsp+160]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+336], xmm7
+ ; H ^ 23
+ vmovdqu xmm0, OWORD PTR [rsp+160]
+ vmovdqu xmm1, OWORD PTR [rsp+176]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+352], xmm7
+ ; H ^ 24
+ vmovdqu xmm0, OWORD PTR [rsp+176]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+368], xmm7
+ ; H ^ 25
+ vmovdqu xmm0, OWORD PTR [rsp+176]
+ vmovdqu xmm1, OWORD PTR [rsp+192]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+384], xmm7
+ ; H ^ 26
+ vmovdqu xmm0, OWORD PTR [rsp+192]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+400], xmm7
+ ; H ^ 27
+ vmovdqu xmm0, OWORD PTR [rsp+192]
+ vmovdqu xmm1, OWORD PTR [rsp+208]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+416], xmm7
+ ; H ^ 28
+ vmovdqu xmm0, OWORD PTR [rsp+208]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+432], xmm7
+ ; H ^ 29
+ vmovdqu xmm0, OWORD PTR [rsp+208]
+ vmovdqu xmm1, OWORD PTR [rsp+224]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+448], xmm7
+ ; H ^ 30
+ vmovdqu xmm0, OWORD PTR [rsp+224]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+464], xmm7
+ ; H ^ 31
+ vmovdqu xmm0, OWORD PTR [rsp+224]
+ vmovdqu xmm1, OWORD PTR [rsp+240]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+480], xmm7
+ ; H ^ 32
+ vmovdqu xmm0, OWORD PTR [rsp+240]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+496], xmm7
+L_AES_GCM_decrypt_avx512_no_ext:
+ vbroadcasti32x4 zmm22, ptr_L_avx512_aes_gcm_bswap_epi64
+ vbroadcasti32x4 zmm30, ptr_L_avx512_aes_gcm_bswap_mask
+ vbroadcasti32x4 zmm31, ptr_L_avx512_aes_gcm_mod2_128
+ vbroadcasti32x4 zmm9, [r15]
+ vbroadcasti32x4 zmm10, [r15+16]
+ vbroadcasti32x4 zmm11, [r15+32]
+ vbroadcasti32x4 zmm12, [r15+48]
+ vbroadcasti32x4 zmm13, [r15+64]
+ vbroadcasti32x4 zmm14, [r15+80]
+ vbroadcasti32x4 zmm15, [r15+96]
+ vbroadcasti32x4 zmm1, [r15+112]
+ vbroadcasti32x4 zmm2, [r15+128]
+ vbroadcasti32x4 zmm3, [r15+144]
+ cmp r9d, 512
+ jl L_AES_GCM_decrypt_avx512_no_windows
+ mov r13d, r9d
+ and r13d, 4294966784
+ vmovdqu64 zmm23, [rsp+448]
+ vshufi64x2 zmm23, zmm23, zmm23, 27
+ vmovdqu64 zmm24, [rsp+384]
+ vshufi64x2 zmm24, zmm24, zmm24, 27
+ vmovdqu64 zmm25, [rsp+320]
+ vshufi64x2 zmm25, zmm25, zmm25, 27
+ vmovdqu64 zmm26, [rsp+256]
+ vshufi64x2 zmm26, zmm26, zmm26, 27
+ vmovdqu64 [rsp+512], zmm23
+ vmovdqu64 [rsp+576], zmm24
+ vmovdqu64 [rsp+640], zmm25
+ vmovdqu64 [rsp+704], zmm26
+ vmovdqu64 zmm23, [rsp+192]
+ vshufi64x2 zmm23, zmm23, zmm23, 27
+ vmovdqu64 zmm24, [rsp+128]
+ vshufi64x2 zmm24, zmm24, zmm24, 27
+ vmovdqu64 zmm25, [rsp+64]
+ vshufi64x2 zmm25, zmm25, zmm25, 27
+ vmovdqu64 zmm26, [rsp]
+ vshufi64x2 zmm26, zmm26, zmm26, 27
+ vmovdqu64 [rsp+768], zmm23
+ vmovdqu64 [rsp+832], zmm24
+ vmovdqu64 [rsp+896], zmm25
+ vmovdqu64 [rsp+960], zmm26
+ ; 512 bytes of input
+ xor r12d, r12d
+ lea rax, QWORD PTR [rdi+rbx]
+ vpxorq zmm20, zmm20, zmm20
+ vinserti32x4 zmm20, zmm20, xmm6, 0
+ vmovdqu64 zmm23, [rsp+512]
+ vmovdqu64 zmm24, [rsp+576]
+ vmovdqu64 zmm25, [rsp+640]
+ vmovdqu64 zmm26, [rsp+704]
+ vmovdqu64 zmm21, [rax]
+ vpshufb zmm21, zmm21, zmm30
+ vpxorq zmm21, zmm21, zmm20
+ vpclmulqdq zmm16, zmm21, zmm23, 0
+ vpclmulqdq zmm17, zmm21, zmm23, 1
+ vpclmulqdq zmm18, zmm21, zmm23, 16
+ vpclmulqdq zmm19, zmm21, zmm23, 17
+ vmovdqa64 zmm27, zmm16
+ vpxorq zmm28, zmm18, zmm17
+ vmovdqa64 zmm29, zmm19
+ vmovdqu64 zmm21, [rax+64]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm24, 0
+ vpclmulqdq zmm17, zmm21, zmm24, 1
+ vpclmulqdq zmm18, zmm21, zmm24, 16
+ vpclmulqdq zmm19, zmm21, zmm24, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vmovdqu64 zmm21, [rax+128]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm25, 0
+ vpclmulqdq zmm17, zmm21, zmm25, 1
+ vpclmulqdq zmm18, zmm21, zmm25, 16
+ vpclmulqdq zmm19, zmm21, zmm25, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vmovdqu64 zmm21, [rax+192]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm26, 0
+ vpclmulqdq zmm17, zmm21, zmm26, 1
+ vpclmulqdq zmm18, zmm21, zmm26, 16
+ vpclmulqdq zmm19, zmm21, zmm26, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vmovdqu64 zmm23, [rsp+768]
+ vmovdqu64 zmm24, [rsp+832]
+ vmovdqu64 zmm25, [rsp+896]
+ vmovdqu64 zmm26, [rsp+960]
+ vmovdqu64 zmm21, [rax+256]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm23, 0
+ vpclmulqdq zmm17, zmm21, zmm23, 1
+ vpclmulqdq zmm18, zmm21, zmm23, 16
+ vpclmulqdq zmm19, zmm21, zmm23, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vmovdqu64 zmm21, [rax+320]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm24, 0
+ vpclmulqdq zmm17, zmm21, zmm24, 1
+ vpclmulqdq zmm18, zmm21, zmm24, 16
+ vpclmulqdq zmm19, zmm21, zmm24, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vmovdqu64 zmm21, [rax+384]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm25, 0
+ vpclmulqdq zmm17, zmm21, zmm25, 1
+ vpclmulqdq zmm18, zmm21, zmm25, 16
+ vpclmulqdq zmm19, zmm21, zmm25, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vmovdqu64 zmm21, [rax+448]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm26, 0
+ vpclmulqdq zmm17, zmm21, zmm26, 1
+ vpclmulqdq zmm18, zmm21, zmm26, 16
+ vpclmulqdq zmm19, zmm21, zmm26, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vpclmulqdq zmm21, zmm31, zmm27, 1
+ vpshufd zmm27, zmm27, 78
+ vpternlogq zmm28, zmm27, zmm21, 150
+ vpclmulqdq zmm21, zmm31, zmm28, 1
+ vpshufd zmm28, zmm28, 78
+ vpternlogq zmm29, zmm28, zmm21, 150
+ vextracti32x4 xmm0, zmm29, 1
+ vextracti32x4 xmm4, zmm29, 2
+ vextracti32x4 xmm5, zmm29, 3
+ vpxorq xmm6, xmm29, xmm0
+ vpternlogq xmm6, xmm5, xmm4, 150
+ add ebx, 512
+ cmp ebx, r13d
+ jge L_AES_GCM_decrypt_avx512_last_aes
+L_AES_GCM_decrypt_avx512_win_loop:
+ lea rax, QWORD PTR [rdi+rbx]
+ vpxorq zmm21, zmm21, zmm21
+ vinserti32x4 zmm21, zmm21, xmm6, 0
+ vbroadcasti32x4 zmm20, [rsp+1024]
+ vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
+ vpshufb zmm16, zmm16, zmm22
+ vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
+ vpshufb zmm17, zmm17, zmm22
+ vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2
+ vpshufb zmm18, zmm18, zmm22
+ vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3
+ vpshufb zmm19, zmm19, zmm22
+ vmovdqu xmm8, OWORD PTR [rsp+1024]
+ vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen
+ vmovdqu OWORD PTR [rsp+1024], xmm8
+ vpxorq zmm16, zmm16, zmm9
+ vpxorq zmm17, zmm17, zmm9
+ vpxorq zmm18, zmm18, zmm9
+ vpxorq zmm19, zmm19, zmm9
+ vmovdqu64 zmm31, [rax]
+ vpshufb zmm31, zmm31, zmm30
+ vpxorq zmm31, zmm31, zmm21
+ vaesenc zmm16, zmm16, zmm10
+ vaesenc zmm17, zmm17, zmm10
+ vaesenc zmm18, zmm18, zmm10
+ vaesenc zmm19, zmm19, zmm10
+ vpclmulqdq zmm23, zmm31, [rsp+512], 0
+ vpclmulqdq zmm24, zmm31, [rsp+512], 1
+ vpclmulqdq zmm25, zmm31, [rsp+512], 16
+ vpclmulqdq zmm26, zmm31, [rsp+512], 17
+ vmovdqa64 zmm27, zmm23
+ vpxorq zmm28, zmm25, zmm24
+ vmovdqa64 zmm29, zmm26
+ vaesenc zmm16, zmm16, zmm11
+ vaesenc zmm17, zmm17, zmm11
+ vaesenc zmm18, zmm18, zmm11
+ vaesenc zmm19, zmm19, zmm11
+ vmovdqu64 zmm31, [rax+64]
+ vpshufb zmm31, zmm31, zmm30
+ vaesenc zmm16, zmm16, zmm12
+ vaesenc zmm17, zmm17, zmm12
+ vaesenc zmm18, zmm18, zmm12
+ vaesenc zmm19, zmm19, zmm12
+ vpclmulqdq zmm23, zmm31, [rsp+576], 0
+ vpclmulqdq zmm24, zmm31, [rsp+576], 1
+ vpclmulqdq zmm25, zmm31, [rsp+576], 16
+ vpclmulqdq zmm26, zmm31, [rsp+576], 17
+ vpxorq zmm27, zmm27, zmm23
+ vpternlogq zmm28, zmm25, zmm24, 150
+ vpxorq zmm29, zmm29, zmm26
+ vaesenc zmm16, zmm16, zmm13
+ vaesenc zmm17, zmm17, zmm13
+ vaesenc zmm18, zmm18, zmm13
+ vaesenc zmm19, zmm19, zmm13
+ vmovdqu64 zmm31, [rax+128]
+ vpshufb zmm31, zmm31, zmm30
+ vaesenc zmm16, zmm16, zmm14
+ vaesenc zmm17, zmm17, zmm14
+ vaesenc zmm18, zmm18, zmm14
+ vaesenc zmm19, zmm19, zmm14
+ vpclmulqdq zmm23, zmm31, [rsp+640], 0
+ vpclmulqdq zmm24, zmm31, [rsp+640], 1
+ vpclmulqdq zmm25, zmm31, [rsp+640], 16
+ vpclmulqdq zmm26, zmm31, [rsp+640], 17
+ vpxorq zmm27, zmm27, zmm23
+ vpternlogq zmm28, zmm25, zmm24, 150
+ vpxorq zmm29, zmm29, zmm26
+ vaesenc zmm16, zmm16, zmm15
+ vaesenc zmm17, zmm17, zmm15
+ vaesenc zmm18, zmm18, zmm15
+ vaesenc zmm19, zmm19, zmm15
+ vmovdqu64 zmm31, [rax+192]
+ vpshufb zmm31, zmm31, zmm30
+ vaesenc zmm16, zmm16, zmm1
+ vaesenc zmm17, zmm17, zmm1
+ vaesenc zmm18, zmm18, zmm1
+ vaesenc zmm19, zmm19, zmm1
+ vpclmulqdq zmm23, zmm31, [rsp+704], 0
+ vpclmulqdq zmm24, zmm31, [rsp+704], 1
+ vpclmulqdq zmm25, zmm31, [rsp+704], 16
+ vpclmulqdq zmm26, zmm31, [rsp+704], 17
+ vpxorq zmm27, zmm27, zmm23
+ vpternlogq zmm28, zmm25, zmm24, 150
+ vpxorq zmm29, zmm29, zmm26
+ vaesenc zmm16, zmm16, zmm2
+ vaesenc zmm17, zmm17, zmm2
+ vaesenc zmm18, zmm18, zmm2
+ vaesenc zmm19, zmm19, zmm2
+ vaesenc zmm16, zmm16, zmm3
+ vaesenc zmm17, zmm17, zmm3
+ vaesenc zmm18, zmm18, zmm3
+ vaesenc zmm19, zmm19, zmm3
+ cmp r10d, 11
+ vbroadcasti32x4 zmm20, [r15+160]
+ jl L_AES_GCM_decrypt_avx512_a_il_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [r15+176]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ cmp r10d, 13
+ vbroadcasti32x4 zmm20, [r15+192]
+ jl L_AES_GCM_decrypt_avx512_a_il_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [r15+208]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [r15+224]
+L_AES_GCM_decrypt_avx512_a_il_last:
+ vaesenclast zmm16, zmm16, zmm20
+ vaesenclast zmm17, zmm17, zmm20
+ vaesenclast zmm18, zmm18, zmm20
+ vaesenclast zmm19, zmm19, zmm20
+ lea rcx, QWORD PTR [rdi+r12]
+ lea rdx, QWORD PTR [rsi+r12]
+ vmovdqu64 zmm21, [rcx]
+ vpxorq zmm16, zmm16, zmm21
+ vmovdqu64 [rdx], zmm16
+ vmovdqu64 zmm21, [rcx+64]
+ vpxorq zmm17, zmm17, zmm21
+ vmovdqu64 [rdx+64], zmm17
+ vmovdqu64 zmm21, [rcx+128]
+ vpxorq zmm18, zmm18, zmm21
+ vmovdqu64 [rdx+128], zmm18
+ vmovdqu64 zmm21, [rcx+192]
+ vpxorq zmm19, zmm19, zmm21
+ vmovdqu64 [rdx+192], zmm19
+ add r12d, 256
+ vbroadcasti32x4 zmm20, [rsp+1024]
+ vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
+ vpshufb zmm16, zmm16, zmm22
+ vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
+ vpshufb zmm17, zmm17, zmm22
+ vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2
+ vpshufb zmm18, zmm18, zmm22
+ vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3
+ vpshufb zmm19, zmm19, zmm22
+ vmovdqu xmm8, OWORD PTR [rsp+1024]
+ vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen
+ vmovdqu OWORD PTR [rsp+1024], xmm8
+ vpxorq zmm16, zmm16, zmm9
+ vpxorq zmm17, zmm17, zmm9
+ vpxorq zmm18, zmm18, zmm9
+ vpxorq zmm19, zmm19, zmm9
+ vmovdqu64 zmm31, [rax+256]
+ vpshufb zmm31, zmm31, zmm30
+ vaesenc zmm16, zmm16, zmm10
+ vaesenc zmm17, zmm17, zmm10
+ vaesenc zmm18, zmm18, zmm10
+ vaesenc zmm19, zmm19, zmm10
+ vpclmulqdq zmm23, zmm31, [rsp+768], 0
+ vpclmulqdq zmm24, zmm31, [rsp+768], 1
+ vpclmulqdq zmm25, zmm31, [rsp+768], 16
+ vpclmulqdq zmm26, zmm31, [rsp+768], 17
+ vpxorq zmm27, zmm27, zmm23
+ vpternlogq zmm28, zmm25, zmm24, 150
+ vpxorq zmm29, zmm29, zmm26
+ vaesenc zmm16, zmm16, zmm11
+ vaesenc zmm17, zmm17, zmm11
+ vaesenc zmm18, zmm18, zmm11
+ vaesenc zmm19, zmm19, zmm11
+ vmovdqu64 zmm31, [rax+320]
+ vpshufb zmm31, zmm31, zmm30
+ vaesenc zmm16, zmm16, zmm12
+ vaesenc zmm17, zmm17, zmm12
+ vaesenc zmm18, zmm18, zmm12
+ vaesenc zmm19, zmm19, zmm12
+ vpclmulqdq zmm23, zmm31, [rsp+832], 0
+ vpclmulqdq zmm24, zmm31, [rsp+832], 1
+ vpclmulqdq zmm25, zmm31, [rsp+832], 16
+ vpclmulqdq zmm26, zmm31, [rsp+832], 17
+ vpxorq zmm27, zmm27, zmm23
+ vpternlogq zmm28, zmm25, zmm24, 150
+ vpxorq zmm29, zmm29, zmm26
+ vaesenc zmm16, zmm16, zmm13
+ vaesenc zmm17, zmm17, zmm13
+ vaesenc zmm18, zmm18, zmm13
+ vaesenc zmm19, zmm19, zmm13
+ vmovdqu64 zmm31, [rax+384]
+ vpshufb zmm31, zmm31, zmm30
+ vaesenc zmm16, zmm16, zmm14
+ vaesenc zmm17, zmm17, zmm14
+ vaesenc zmm18, zmm18, zmm14
+ vaesenc zmm19, zmm19, zmm14
+ vpclmulqdq zmm23, zmm31, [rsp+896], 0
+ vpclmulqdq zmm24, zmm31, [rsp+896], 1
+ vpclmulqdq zmm25, zmm31, [rsp+896], 16
+ vpclmulqdq zmm26, zmm31, [rsp+896], 17
+ vpxorq zmm27, zmm27, zmm23
+ vpternlogq zmm28, zmm25, zmm24, 150
+ vpxorq zmm29, zmm29, zmm26
+ vaesenc zmm16, zmm16, zmm15
+ vaesenc zmm17, zmm17, zmm15
+ vaesenc zmm18, zmm18, zmm15
+ vaesenc zmm19, zmm19, zmm15
+ vmovdqu64 zmm31, [rax+448]
+ vpshufb zmm31, zmm31, zmm30
+ vaesenc zmm16, zmm16, zmm1
+ vaesenc zmm17, zmm17, zmm1
+ vaesenc zmm18, zmm18, zmm1
+ vaesenc zmm19, zmm19, zmm1
+ vpclmulqdq zmm23, zmm31, [rsp+960], 0
+ vpclmulqdq zmm24, zmm31, [rsp+960], 1
+ vpclmulqdq zmm25, zmm31, [rsp+960], 16
+ vpclmulqdq zmm26, zmm31, [rsp+960], 17
+ vpxorq zmm27, zmm27, zmm23
+ vpternlogq zmm28, zmm25, zmm24, 150
+ vpxorq zmm29, zmm29, zmm26
+ vaesenc zmm16, zmm16, zmm2
+ vaesenc zmm17, zmm17, zmm2
+ vaesenc zmm18, zmm18, zmm2
+ vaesenc zmm19, zmm19, zmm2
+ vaesenc zmm16, zmm16, zmm3
+ vaesenc zmm17, zmm17, zmm3
+ vaesenc zmm18, zmm18, zmm3
+ vaesenc zmm19, zmm19, zmm3
+ cmp r10d, 11
+ vbroadcasti32x4 zmm20, [r15+160]
+ jl L_AES_GCM_decrypt_avx512_b_il_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [r15+176]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ cmp r10d, 13
+ vbroadcasti32x4 zmm20, [r15+192]
+ jl L_AES_GCM_decrypt_avx512_b_il_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [r15+208]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [r15+224]
+L_AES_GCM_decrypt_avx512_b_il_last:
+ vaesenclast zmm16, zmm16, zmm20
+ vaesenclast zmm17, zmm17, zmm20
+ vaesenclast zmm18, zmm18, zmm20
+ vaesenclast zmm19, zmm19, zmm20
+ lea rcx, QWORD PTR [rdi+r12]
+ lea rdx, QWORD PTR [rsi+r12]
+ vmovdqu64 zmm21, [rcx]
+ vpxorq zmm16, zmm16, zmm21
+ vmovdqu64 [rdx], zmm16
+ vmovdqu64 zmm21, [rcx+64]
+ vpxorq zmm17, zmm17, zmm21
+ vmovdqu64 [rdx+64], zmm17
+ vmovdqu64 zmm21, [rcx+128]
+ vpxorq zmm18, zmm18, zmm21
+ vmovdqu64 [rdx+128], zmm18
+ vmovdqu64 zmm21, [rcx+192]
+ vpxorq zmm19, zmm19, zmm21
+ vmovdqu64 [rdx+192], zmm19
+ add r12d, 256
+ vbroadcasti32x4 zmm31, ptr_L_avx512_aes_gcm_mod2_128
+ vpclmulqdq zmm23, zmm31, zmm27, 1
+ vpshufd zmm27, zmm27, 78
+ vpternlogq zmm28, zmm27, zmm23, 150
+ vpclmulqdq zmm23, zmm31, zmm28, 1
+ vpshufd zmm28, zmm28, 78
+ vpternlogq zmm29, zmm28, zmm23, 150
+ vextracti32x4 xmm0, zmm29, 1
+ vextracti32x4 xmm4, zmm29, 2
+ vextracti32x4 xmm5, zmm29, 3
+ vpxorq xmm6, xmm29, xmm0
+ vpternlogq xmm6, xmm5, xmm4, 150
+ add ebx, 512
+ cmp ebx, r13d
+ jl L_AES_GCM_decrypt_avx512_win_loop
+L_AES_GCM_decrypt_avx512_last_aes:
+ vbroadcasti32x4 zmm20, [rsp+1024]
+ vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
+ vpshufb zmm16, zmm16, zmm22
+ vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
+ vpshufb zmm17, zmm17, zmm22
+ vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2
+ vpshufb zmm18, zmm18, zmm22
+ vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3
+ vpshufb zmm19, zmm19, zmm22
+ vmovdqu xmm8, OWORD PTR [rsp+1024]
+ vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen
+ vmovdqu OWORD PTR [rsp+1024], xmm8
+ vpxorq zmm16, zmm16, zmm9
+ vpxorq zmm17, zmm17, zmm9
+ vpxorq zmm18, zmm18, zmm9
+ vpxorq zmm19, zmm19, zmm9
+ vaesenc zmm16, zmm16, zmm10
+ vaesenc zmm17, zmm17, zmm10
+ vaesenc zmm18, zmm18, zmm10
+ vaesenc zmm19, zmm19, zmm10
+ vaesenc zmm16, zmm16, zmm11
+ vaesenc zmm17, zmm17, zmm11
+ vaesenc zmm18, zmm18, zmm11
+ vaesenc zmm19, zmm19, zmm11
+ vaesenc zmm16, zmm16, zmm12
+ vaesenc zmm17, zmm17, zmm12
+ vaesenc zmm18, zmm18, zmm12
+ vaesenc zmm19, zmm19, zmm12
+ vaesenc zmm16, zmm16, zmm13
+ vaesenc zmm17, zmm17, zmm13
+ vaesenc zmm18, zmm18, zmm13
+ vaesenc zmm19, zmm19, zmm13
+ vaesenc zmm16, zmm16, zmm14
+ vaesenc zmm17, zmm17, zmm14
+ vaesenc zmm18, zmm18, zmm14
+ vaesenc zmm19, zmm19, zmm14
+ vaesenc zmm16, zmm16, zmm15
+ vaesenc zmm17, zmm17, zmm15
+ vaesenc zmm18, zmm18, zmm15
+ vaesenc zmm19, zmm19, zmm15
+ vaesenc zmm16, zmm16, zmm1
+ vaesenc zmm17, zmm17, zmm1
+ vaesenc zmm18, zmm18, zmm1
+ vaesenc zmm19, zmm19, zmm1
+ vaesenc zmm16, zmm16, zmm2
+ vaesenc zmm17, zmm17, zmm2
+ vaesenc zmm18, zmm18, zmm2
+ vaesenc zmm19, zmm19, zmm2
+ vaesenc zmm16, zmm16, zmm3
+ vaesenc zmm17, zmm17, zmm3
+ vaesenc zmm18, zmm18, zmm3
+ vaesenc zmm19, zmm19, zmm3
+ cmp r10d, 11
+ vbroadcasti32x4 zmm20, [r15+160]
+ jl L_AES_GCM_decrypt_avx512_l1_avx512_ctr16_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [r15+176]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ cmp r10d, 13
+ vbroadcasti32x4 zmm20, [r15+192]
+ jl L_AES_GCM_decrypt_avx512_l1_avx512_ctr16_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [r15+208]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [r15+224]
+L_AES_GCM_decrypt_avx512_l1_avx512_ctr16_last:
+ vaesenclast zmm16, zmm16, zmm20
+ vaesenclast zmm17, zmm17, zmm20
+ vaesenclast zmm18, zmm18, zmm20
+ vaesenclast zmm19, zmm19, zmm20
+ lea rcx, QWORD PTR [rdi+r12]
+ lea rdx, QWORD PTR [rsi+r12]
+ vmovdqu64 zmm21, [rcx]
+ vpxorq zmm16, zmm16, zmm21
+ vmovdqu64 [rdx], zmm16
+ vmovdqu64 zmm21, [rcx+64]
+ vpxorq zmm17, zmm17, zmm21
+ vmovdqu64 [rdx+64], zmm17
+ vmovdqu64 zmm21, [rcx+128]
+ vpxorq zmm18, zmm18, zmm21
+ vmovdqu64 [rdx+128], zmm18
+ vmovdqu64 zmm21, [rcx+192]
+ vpxorq zmm19, zmm19, zmm21
+ vmovdqu64 [rdx+192], zmm19
+ add r12d, 256
+ vbroadcasti32x4 zmm20, [rsp+1024]
+ vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
+ vpshufb zmm16, zmm16, zmm22
+ vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
+ vpshufb zmm17, zmm17, zmm22
+ vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2
+ vpshufb zmm18, zmm18, zmm22
+ vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3
+ vpshufb zmm19, zmm19, zmm22
+ vmovdqu xmm8, OWORD PTR [rsp+1024]
+ vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen
+ vmovdqu OWORD PTR [rsp+1024], xmm8
+ vpxorq zmm16, zmm16, zmm9
+ vpxorq zmm17, zmm17, zmm9
+ vpxorq zmm18, zmm18, zmm9
+ vpxorq zmm19, zmm19, zmm9
+ vaesenc zmm16, zmm16, zmm10
+ vaesenc zmm17, zmm17, zmm10
+ vaesenc zmm18, zmm18, zmm10
+ vaesenc zmm19, zmm19, zmm10
+ vaesenc zmm16, zmm16, zmm11
+ vaesenc zmm17, zmm17, zmm11
+ vaesenc zmm18, zmm18, zmm11
+ vaesenc zmm19, zmm19, zmm11
+ vaesenc zmm16, zmm16, zmm12
+ vaesenc zmm17, zmm17, zmm12
+ vaesenc zmm18, zmm18, zmm12
+ vaesenc zmm19, zmm19, zmm12
+ vaesenc zmm16, zmm16, zmm13
+ vaesenc zmm17, zmm17, zmm13
+ vaesenc zmm18, zmm18, zmm13
+ vaesenc zmm19, zmm19, zmm13
+ vaesenc zmm16, zmm16, zmm14
+ vaesenc zmm17, zmm17, zmm14
+ vaesenc zmm18, zmm18, zmm14
+ vaesenc zmm19, zmm19, zmm14
+ vaesenc zmm16, zmm16, zmm15
+ vaesenc zmm17, zmm17, zmm15
+ vaesenc zmm18, zmm18, zmm15
+ vaesenc zmm19, zmm19, zmm15
+ vaesenc zmm16, zmm16, zmm1
+ vaesenc zmm17, zmm17, zmm1
+ vaesenc zmm18, zmm18, zmm1
+ vaesenc zmm19, zmm19, zmm1
+ vaesenc zmm16, zmm16, zmm2
+ vaesenc zmm17, zmm17, zmm2
+ vaesenc zmm18, zmm18, zmm2
+ vaesenc zmm19, zmm19, zmm2
+ vaesenc zmm16, zmm16, zmm3
+ vaesenc zmm17, zmm17, zmm3
+ vaesenc zmm18, zmm18, zmm3
+ vaesenc zmm19, zmm19, zmm3
+ cmp r10d, 11
+ vbroadcasti32x4 zmm20, [r15+160]
+ jl L_AES_GCM_decrypt_avx512_l2_avx512_ctr16_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [r15+176]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ cmp r10d, 13
+ vbroadcasti32x4 zmm20, [r15+192]
+ jl L_AES_GCM_decrypt_avx512_l2_avx512_ctr16_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [r15+208]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [r15+224]
+L_AES_GCM_decrypt_avx512_l2_avx512_ctr16_last:
+ vaesenclast zmm16, zmm16, zmm20
+ vaesenclast zmm17, zmm17, zmm20
+ vaesenclast zmm18, zmm18, zmm20
+ vaesenclast zmm19, zmm19, zmm20
+ lea rcx, QWORD PTR [rdi+r12]
+ lea rdx, QWORD PTR [rsi+r12]
+ vmovdqu64 zmm21, [rcx]
+ vpxorq zmm16, zmm16, zmm21
+ vmovdqu64 [rdx], zmm16
+ vmovdqu64 zmm21, [rcx+64]
+ vpxorq zmm17, zmm17, zmm21
+ vmovdqu64 [rdx+64], zmm17
+ vmovdqu64 zmm21, [rcx+128]
+ vpxorq zmm18, zmm18, zmm21
+ vmovdqu64 [rdx+128], zmm18
+ vmovdqu64 zmm21, [rcx+192]
+ vpxorq zmm19, zmm19, zmm21
+ vmovdqu64 [rdx+192], zmm19
+ add r12d, 256
+L_AES_GCM_decrypt_avx512_no_windows:
+ vmovdqu64 zmm23, [rsp+192]
+ vshufi64x2 zmm23, zmm23, zmm23, 27
+ vmovdqu64 zmm24, [rsp+128]
+ vshufi64x2 zmm24, zmm24, zmm24, 27
+ vmovdqu64 zmm25, [rsp+64]
+ vshufi64x2 zmm25, zmm25, zmm25, 27
+ vmovdqu64 zmm26, [rsp]
+ vshufi64x2 zmm26, zmm26, zmm26, 27
+ mov r13d, r9d
+ and r13d, 4294967040
+ cmp ebx, r13d
+ jge L_AES_GCM_decrypt_avx512_after_256
+ ; 256 bytes of input
+ lea rax, QWORD PTR [rdi+rbx]
+ vpxorq zmm20, zmm20, zmm20
+ vinserti32x4 zmm20, zmm20, xmm6, 0
+ vmovdqu64 zmm21, [rax]
+ vpshufb zmm21, zmm21, zmm30
+ vpxorq zmm21, zmm21, zmm20
+ vpclmulqdq zmm16, zmm21, zmm23, 0
+ vpclmulqdq zmm17, zmm21, zmm23, 1
+ vpclmulqdq zmm18, zmm21, zmm23, 16
+ vpclmulqdq zmm19, zmm21, zmm23, 17
+ vmovdqa64 zmm27, zmm16
+ vpxorq zmm28, zmm18, zmm17
+ vmovdqa64 zmm29, zmm19
+ vmovdqu64 zmm21, [rax+64]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm24, 0
+ vpclmulqdq zmm17, zmm21, zmm24, 1
+ vpclmulqdq zmm18, zmm21, zmm24, 16
+ vpclmulqdq zmm19, zmm21, zmm24, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vmovdqu64 zmm21, [rax+128]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm25, 0
+ vpclmulqdq zmm17, zmm21, zmm25, 1
+ vpclmulqdq zmm18, zmm21, zmm25, 16
+ vpclmulqdq zmm19, zmm21, zmm25, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vmovdqu64 zmm21, [rax+192]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm26, 0
+ vpclmulqdq zmm17, zmm21, zmm26, 1
+ vpclmulqdq zmm18, zmm21, zmm26, 16
+ vpclmulqdq zmm19, zmm21, zmm26, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vpclmulqdq zmm21, zmm31, zmm27, 1
+ vpshufd zmm27, zmm27, 78
+ vpternlogq zmm28, zmm27, zmm21, 150
+ vpclmulqdq zmm21, zmm31, zmm28, 1
+ vpshufd zmm28, zmm28, 78
+ vpternlogq zmm29, zmm28, zmm21, 150
+ vextracti32x4 xmm0, zmm29, 1
+ vextracti32x4 xmm4, zmm29, 2
+ vextracti32x4 xmm5, zmm29, 3
+ vpxorq xmm6, xmm29, xmm0
+ vpternlogq xmm6, xmm5, xmm4, 150
+ vbroadcasti32x4 zmm20, [rsp+1024]
+ vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
+ vpshufb zmm16, zmm16, zmm22
+ vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
+ vpshufb zmm17, zmm17, zmm22
+ vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2
+ vpshufb zmm18, zmm18, zmm22
+ vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3
+ vpshufb zmm19, zmm19, zmm22
+ vmovdqu xmm8, OWORD PTR [rsp+1024]
+ vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen
+ vmovdqu OWORD PTR [rsp+1024], xmm8
+ vpxorq zmm16, zmm16, zmm9
+ vpxorq zmm17, zmm17, zmm9
+ vpxorq zmm18, zmm18, zmm9
+ vpxorq zmm19, zmm19, zmm9
+ vaesenc zmm16, zmm16, zmm10
+ vaesenc zmm17, zmm17, zmm10
+ vaesenc zmm18, zmm18, zmm10
+ vaesenc zmm19, zmm19, zmm10
+ vaesenc zmm16, zmm16, zmm11
+ vaesenc zmm17, zmm17, zmm11
+ vaesenc zmm18, zmm18, zmm11
+ vaesenc zmm19, zmm19, zmm11
+ vaesenc zmm16, zmm16, zmm12
+ vaesenc zmm17, zmm17, zmm12
+ vaesenc zmm18, zmm18, zmm12
+ vaesenc zmm19, zmm19, zmm12
+ vaesenc zmm16, zmm16, zmm13
+ vaesenc zmm17, zmm17, zmm13
+ vaesenc zmm18, zmm18, zmm13
+ vaesenc zmm19, zmm19, zmm13
+ vaesenc zmm16, zmm16, zmm14
+ vaesenc zmm17, zmm17, zmm14
+ vaesenc zmm18, zmm18, zmm14
+ vaesenc zmm19, zmm19, zmm14
+ vaesenc zmm16, zmm16, zmm15
+ vaesenc zmm17, zmm17, zmm15
+ vaesenc zmm18, zmm18, zmm15
+ vaesenc zmm19, zmm19, zmm15
+ vaesenc zmm16, zmm16, zmm1
+ vaesenc zmm17, zmm17, zmm1
+ vaesenc zmm18, zmm18, zmm1
+ vaesenc zmm19, zmm19, zmm1
+ vaesenc zmm16, zmm16, zmm2
+ vaesenc zmm17, zmm17, zmm2
+ vaesenc zmm18, zmm18, zmm2
+ vaesenc zmm19, zmm19, zmm2
+ vaesenc zmm16, zmm16, zmm3
+ vaesenc zmm17, zmm17, zmm3
+ vaesenc zmm18, zmm18, zmm3
+ vaesenc zmm19, zmm19, zmm3
+ cmp r10d, 11
+ vbroadcasti32x4 zmm20, [r15+160]
+ jl L_AES_GCM_decrypt_avx512_t_avx512_ctr16_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [r15+176]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ cmp r10d, 13
+ vbroadcasti32x4 zmm20, [r15+192]
+ jl L_AES_GCM_decrypt_avx512_t_avx512_ctr16_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [r15+208]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [r15+224]
+L_AES_GCM_decrypt_avx512_t_avx512_ctr16_last:
+ vaesenclast zmm16, zmm16, zmm20
+ vaesenclast zmm17, zmm17, zmm20
+ vaesenclast zmm18, zmm18, zmm20
+ vaesenclast zmm19, zmm19, zmm20
+ lea rcx, QWORD PTR [rdi+rbx]
+ lea rdx, QWORD PTR [rsi+rbx]
+ vmovdqu64 zmm21, [rcx]
+ vpxorq zmm16, zmm16, zmm21
+ vmovdqu64 [rdx], zmm16
+ vmovdqu64 zmm21, [rcx+64]
+ vpxorq zmm17, zmm17, zmm21
+ vmovdqu64 [rdx+64], zmm17
+ vmovdqu64 zmm21, [rcx+128]
+ vpxorq zmm18, zmm18, zmm21
+ vmovdqu64 [rdx+128], zmm18
+ vmovdqu64 zmm21, [rcx+192]
+ vpxorq zmm19, zmm19, zmm21
+ vmovdqu64 [rdx+192], zmm19
+ add ebx, 256
+L_AES_GCM_decrypt_avx512_after_256:
+ vmovdqu xmm5, OWORD PTR [rsp]
+L_AES_GCM_decrypt_avx512_done_128:
+ mov edx, r9d
+ cmp ebx, edx
+ jge L_AES_GCM_decrypt_avx512_done_dec
+ mov r13d, r9d
+ and r13d, 4294967280
+ cmp ebx, r13d
+ jge L_AES_GCM_decrypt_avx512_last_block_done
+L_AES_GCM_decrypt_avx512_last_block_start:
+ vmovdqu xmm13, OWORD PTR [rdi+rbx]
+ vmovdqa xmm0, xmm5
+ vpshufb xmm1, xmm13, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ vpxor xmm1, xmm1, xmm6
+ vmovdqu xmm9, OWORD PTR [rsp+1024]
+ vpshufb xmm8, xmm9, OWORD PTR L_avx512_aes_gcm_bswap_epi64
+ vpaddd xmm9, xmm9, OWORD PTR L_avx512_aes_gcm_one
+ vmovdqu OWORD PTR [rsp+1024], xmm9
+ vpxor xmm8, xmm8, [r15]
+ vpclmulqdq xmm10, xmm1, xmm0, 16
+ vaesenc xmm8, xmm8, [r15+16]
+ vaesenc xmm8, xmm8, [r15+32]
+ vpclmulqdq xmm11, xmm1, xmm0, 1
+ vaesenc xmm8, xmm8, [r15+48]
+ vaesenc xmm8, xmm8, [r15+64]
+ vpclmulqdq xmm12, xmm1, xmm0, 0
+ vaesenc xmm8, xmm8, [r15+80]
+ vpclmulqdq xmm1, xmm1, xmm0, 17
+ vaesenc xmm8, xmm8, [r15+96]
+ vpxor xmm10, xmm10, xmm11
+ vpslldq xmm2, xmm10, 8
+ vpsrldq xmm10, xmm10, 8
+ vaesenc xmm8, xmm8, [r15+112]
+ vpxor xmm2, xmm2, xmm12
+ vpxor xmm3, xmm1, xmm10
+ vmovdqa xmm0, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm2, xmm0, 16
+ vaesenc xmm8, xmm8, [r15+128]
+ vpshufd xmm10, xmm2, 78
+ vpxor xmm10, xmm10, xmm11
+ vpclmulqdq xmm11, xmm10, xmm0, 16
+ vaesenc xmm8, xmm8, [r15+144]
+ vpshufd xmm10, xmm10, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm6, xmm10, xmm3
+ cmp r10d, 11
+ vmovdqa xmm9, OWORD PTR [r15+160]
+ jl L_AES_GCM_decrypt_avx512_aesenc_gfmul_last
+ vaesenc xmm8, xmm8, xmm9
+ vaesenc xmm8, xmm8, [r15+176]
+ cmp r10d, 13
+ vmovdqa xmm9, OWORD PTR [r15+192]
+ jl L_AES_GCM_decrypt_avx512_aesenc_gfmul_last
+ vaesenc xmm8, xmm8, xmm9
+ vaesenc xmm8, xmm8, [r15+208]
+ vmovdqa xmm9, OWORD PTR [r15+224]
+L_AES_GCM_decrypt_avx512_aesenc_gfmul_last:
+ vaesenclast xmm8, xmm8, xmm9
+ vmovdqa xmm0, xmm13
+ vpxor xmm8, xmm8, xmm0
+ vmovdqu OWORD PTR [rsi+rbx], xmm8
+ add ebx, 16
+ cmp ebx, r13d
+ jl L_AES_GCM_decrypt_avx512_last_block_start
+L_AES_GCM_decrypt_avx512_last_block_done:
+ mov ecx, r9d
+ mov edx, ecx
+ and ecx, 15
+ jz L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_done
+ vmovdqu xmm4, OWORD PTR [rsp+1024]
+ vpshufb xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_bswap_epi64
+ vpxor xmm4, xmm4, [r15]
+ vaesenc xmm4, xmm4, [r15+16]
+ vaesenc xmm4, xmm4, [r15+32]
+ vaesenc xmm4, xmm4, [r15+48]
+ vaesenc xmm4, xmm4, [r15+64]
+ vaesenc xmm4, xmm4, [r15+80]
+ vaesenc xmm4, xmm4, [r15+96]
+ vaesenc xmm4, xmm4, [r15+112]
+ vaesenc xmm4, xmm4, [r15+128]
+ vaesenc xmm4, xmm4, [r15+144]
+ cmp r10d, 11
+ vmovdqa xmm9, OWORD PTR [r15+160]
+ jl L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_aesenc_avx_last
+ vaesenc xmm4, xmm4, xmm9
+ vaesenc xmm4, xmm4, [r15+176]
+ cmp r10d, 13
+ vmovdqa xmm9, OWORD PTR [r15+192]
+ jl L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_aesenc_avx_last
+ vaesenc xmm4, xmm4, xmm9
+ vaesenc xmm4, xmm4, [r15+208]
+ vmovdqa xmm9, OWORD PTR [r15+224]
+L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_aesenc_avx_last:
+ vaesenclast xmm4, xmm4, xmm9
+ sub rsp, 32
+ xor ecx, ecx
+ vmovdqu OWORD PTR [rsp], xmm4
+ vpxor xmm0, xmm0, xmm0
+ vmovdqu OWORD PTR [rsp+16], xmm0
+L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_loop:
+ movzx r13d, BYTE PTR [rdi+rbx]
+ mov BYTE PTR [rsp+rcx+16], r13b
+ xor r13b, BYTE PTR [rsp+rcx]
+ mov BYTE PTR [rsi+rbx], r13b
+ inc ebx
+ inc ecx
+ cmp ebx, edx
+ jl L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_loop
+ vmovdqu xmm4, OWORD PTR [rsp+16]
+ add rsp, 32
+ vpshufb xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ vpxor xmm6, xmm6, xmm4
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm5, 78
+ vpxor xmm9, xmm9, xmm5
+ vpshufd xmm10, xmm6, 78
+ vpxor xmm10, xmm10, xmm6
+ vpclmulqdq xmm8, xmm6, xmm5, 0
+ vpclmulqdq xmm11, xmm6, xmm5, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm6, xmm11
+L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_done:
+L_AES_GCM_decrypt_avx512_done_dec:
+ mov edx, r9d
+ mov ecx, r11d
+ shl rdx, 3
+ shl rcx, 3
+ vmovq xmm0, rdx
+ vmovq xmm1, rcx
+ vpunpcklqdq xmm0, xmm0, xmm1
+ vpxor xmm6, xmm6, xmm0
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm5, 78
+ vpxor xmm9, xmm9, xmm5
+ vpshufd xmm10, xmm6, 78
+ vpxor xmm10, xmm10, xmm6
+ vpclmulqdq xmm8, xmm6, xmm5, 0
+ vpclmulqdq xmm11, xmm6, xmm5, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm6, xmm11
+ vpshufb xmm6, xmm6, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ vmovdqu xmm0, OWORD PTR [rsp+1040]
+ vpxor xmm0, xmm0, xmm6
+ cmp r14d, 16
+ je L_AES_GCM_decrypt_avx512_cmp_tag_16
+ sub rsp, 16
+ xor rcx, rcx
+ xor rbx, rbx
+ vmovdqu OWORD PTR [rsp], xmm0
+L_AES_GCM_decrypt_avx512_cmp_tag_loop:
+ movzx r13d, BYTE PTR [rsp+rcx]
+ xor r13b, BYTE PTR [r8+rcx]
+ or bl, r13b
+ inc ecx
+ cmp ecx, r14d
+ jne L_AES_GCM_decrypt_avx512_cmp_tag_loop
+ cmp bl, 0
+ sete bl
+ add rsp, 16
+ xor rcx, rcx
+ jmp L_AES_GCM_decrypt_avx512_cmp_tag_done
+L_AES_GCM_decrypt_avx512_cmp_tag_16:
+ vmovdqu xmm1, OWORD PTR [r8]
+ vpcmpeqb xmm0, xmm0, xmm1
+ vpmovmskb rdx, xmm0
+ ; if edx == 0xFFFF (all 16 tag bytes matched) return 1, else return 0
+ xor ebx, ebx
+ cmp edx, 65535
+ sete bl
+L_AES_GCM_decrypt_avx512_cmp_tag_done:
+ mov DWORD PTR [rbp], ebx
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp+1056]
+ vmovdqu xmm7, OWORD PTR [rsp+1072]
+ vmovdqu xmm8, OWORD PTR [rsp+1088]
+ vmovdqu xmm9, OWORD PTR [rsp+1104]
+ vmovdqu xmm10, OWORD PTR [rsp+1120]
+ vmovdqu xmm11, OWORD PTR [rsp+1136]
+ vmovdqu xmm12, OWORD PTR [rsp+1152]
+ vmovdqu xmm13, OWORD PTR [rsp+1168]
+ vmovdqu xmm14, OWORD PTR [rsp+1184]
+ vmovdqu xmm15, OWORD PTR [rsp+1200]
+ add rsp, 1216
+ pop rbp
+ pop r15
+ pop r14
+ pop rbx
+ pop r12
+ pop rsi
+ pop rdi
+ pop r13
+ ret
+AES_GCM_decrypt_avx512 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_init_avx512 PROC
+ push rdi
+ push rsi
+ push r12
+ push r13
+ mov rdi, rcx
+ mov rsi, rdx
+ mov r10, r8
+ mov r11d, r9d
+ mov rax, QWORD PTR [rsp+72]
+ mov r8, QWORD PTR [rsp+80]
+ mov r9, QWORD PTR [rsp+88]
+ sub rsp, 80
+ vmovdqu OWORD PTR [rsp+16], xmm6
+ vmovdqu OWORD PTR [rsp+32], xmm7
+ vmovdqu OWORD PTR [rsp+48], xmm8
+ vmovdqu OWORD PTR [rsp+64], xmm15
+ vpxor xmm4, xmm4, xmm4
+ mov edx, r11d
+ cmp edx, 12
+ jne L_AES_GCM_init_avx512_iv_not_12
+ ; Calculate values when IV is 12 bytes
+ ; Set counter based on IV
+ mov ecx, 16777216
+ vmovq xmm4, QWORD PTR [r10]
+ vpinsrd xmm4, xmm4, DWORD PTR [r10+8], 2
+ vpinsrd xmm4, xmm4, ecx, 3
+ ; H = Encrypt X(=0) and T = Encrypt counter
+ vmovdqa xmm5, OWORD PTR [rdi]
+ vpxor xmm1, xmm4, xmm5
+ vmovdqa xmm6, OWORD PTR [rdi+16]
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ vmovdqa xmm6, OWORD PTR [rdi+32]
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ vmovdqa xmm6, OWORD PTR [rdi+48]
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ vmovdqa xmm6, OWORD PTR [rdi+64]
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ vmovdqa xmm6, OWORD PTR [rdi+80]
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ vmovdqa xmm6, OWORD PTR [rdi+96]
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ vmovdqa xmm6, OWORD PTR [rdi+112]
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ vmovdqa xmm6, OWORD PTR [rdi+128]
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ vmovdqa xmm6, OWORD PTR [rdi+144]
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ cmp esi, 11
+ vmovdqa xmm6, OWORD PTR [rdi+160]
+ jl L_AES_GCM_init_avx512_calc_iv_12_last
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ vmovdqa xmm6, OWORD PTR [rdi+176]
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ cmp esi, 13
+ vmovdqa xmm6, OWORD PTR [rdi+192]
+ jl L_AES_GCM_init_avx512_calc_iv_12_last
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ vmovdqa xmm6, OWORD PTR [rdi+208]
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ vmovdqa xmm6, OWORD PTR [rdi+224]
+L_AES_GCM_init_avx512_calc_iv_12_last:
+ vaesenclast xmm5, xmm5, xmm6
+ vaesenclast xmm1, xmm1, xmm6
+ vpshufb xmm5, xmm5, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ vmovdqu xmm15, xmm1
+ jmp L_AES_GCM_init_avx512_iv_done
+L_AES_GCM_init_avx512_iv_not_12:
+ ; Calculate values when IV is not 12 bytes
+ ; H = Encrypt X(=0)
+ vmovdqa xmm5, OWORD PTR [rdi]
+ vaesenc xmm5, xmm5, [rdi+16]
+ vaesenc xmm5, xmm5, [rdi+32]
+ vaesenc xmm5, xmm5, [rdi+48]
+ vaesenc xmm5, xmm5, [rdi+64]
+ vaesenc xmm5, xmm5, [rdi+80]
+ vaesenc xmm5, xmm5, [rdi+96]
+ vaesenc xmm5, xmm5, [rdi+112]
+ vaesenc xmm5, xmm5, [rdi+128]
+ vaesenc xmm5, xmm5, [rdi+144]
+ cmp esi, 11
+ vmovdqa xmm8, OWORD PTR [rdi+160]
+ jl L_AES_GCM_init_avx512_calc_iv_1_aesenc_avx_last
+ vaesenc xmm5, xmm5, xmm8
+ vaesenc xmm5, xmm5, [rdi+176]
+ cmp esi, 13
+ vmovdqa xmm8, OWORD PTR [rdi+192]
+ jl L_AES_GCM_init_avx512_calc_iv_1_aesenc_avx_last
+ vaesenc xmm5, xmm5, xmm8
+ vaesenc xmm5, xmm5, [rdi+208]
+ vmovdqa xmm8, OWORD PTR [rdi+224]
+L_AES_GCM_init_avx512_calc_iv_1_aesenc_avx_last:
+ vaesenclast xmm5, xmm5, xmm8
+ vpshufb xmm5, xmm5, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ ; Calc counter
+ ; Initialization vector
+ cmp edx, 0
+ mov rcx, 0
+ je L_AES_GCM_init_avx512_calc_iv_done
+ cmp edx, 16
+ jl L_AES_GCM_init_avx512_calc_iv_lt16
+ and edx, 4294967280
+L_AES_GCM_init_avx512_calc_iv_16_loop:
+ vmovdqu xmm7, OWORD PTR [r10+rcx]
+ vpshufb xmm7, xmm7, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ vpxor xmm4, xmm4, xmm7
+ ; ghash_gfmul_avx
+ vpshufd xmm1, xmm4, 78
+ vpshufd xmm2, xmm5, 78
+ vpclmulqdq xmm3, xmm5, xmm4, 17
+ vpclmulqdq xmm0, xmm5, xmm4, 0
+ vpxor xmm1, xmm1, xmm4
+ vpxor xmm2, xmm2, xmm5
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vmovdqa xmm6, xmm0
+ vmovdqa xmm4, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm6, xmm6, xmm2
+ vpxor xmm4, xmm4, xmm1
+ vpsrld xmm0, xmm6, 31
+ vpsrld xmm1, xmm4, 31
+ vpslld xmm6, xmm6, 1
+ vpslld xmm4, xmm4, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm4, xmm4, xmm2
+ vpor xmm6, xmm6, xmm0
+ vpor xmm4, xmm4, xmm1
+ vpslld xmm0, xmm6, 31
+ vpslld xmm1, xmm6, 30
+ vpslld xmm2, xmm6, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm6, xmm6, xmm0
+ vpsrld xmm2, xmm6, 1
+ vpsrld xmm3, xmm6, 2
+ vpsrld xmm0, xmm6, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm6
+ vpxor xmm4, xmm4, xmm2
+ add ecx, 16
+ cmp ecx, edx
+ jl L_AES_GCM_init_avx512_calc_iv_16_loop
+ mov edx, r11d
+ cmp ecx, edx
+ je L_AES_GCM_init_avx512_calc_iv_done
+L_AES_GCM_init_avx512_calc_iv_lt16:
+ sub rsp, 16
+ vpxor xmm7, xmm7, xmm7
+ xor r13d, r13d
+ vmovdqu OWORD PTR [rsp], xmm7
+L_AES_GCM_init_avx512_calc_iv_loop:
+ movzx r12d, BYTE PTR [r10+rcx]
+ mov BYTE PTR [rsp+r13], r12b
+ inc ecx
+ inc r13d
+ cmp ecx, edx
+ jl L_AES_GCM_init_avx512_calc_iv_loop
+ vmovdqu xmm7, OWORD PTR [rsp]
+ add rsp, 16
+ vpshufb xmm7, xmm7, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ vpxor xmm4, xmm4, xmm7
+ ; ghash_gfmul_avx
+ vpshufd xmm1, xmm4, 78
+ vpshufd xmm2, xmm5, 78
+ vpclmulqdq xmm3, xmm5, xmm4, 17
+ vpclmulqdq xmm0, xmm5, xmm4, 0
+ vpxor xmm1, xmm1, xmm4
+ vpxor xmm2, xmm2, xmm5
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vmovdqa xmm6, xmm0
+ vmovdqa xmm4, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm6, xmm6, xmm2
+ vpxor xmm4, xmm4, xmm1
+ vpsrld xmm0, xmm6, 31
+ vpsrld xmm1, xmm4, 31
+ vpslld xmm6, xmm6, 1
+ vpslld xmm4, xmm4, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm4, xmm4, xmm2
+ vpor xmm6, xmm6, xmm0
+ vpor xmm4, xmm4, xmm1
+ vpslld xmm0, xmm6, 31
+ vpslld xmm1, xmm6, 30
+ vpslld xmm2, xmm6, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm6, xmm6, xmm0
+ vpsrld xmm2, xmm6, 1
+ vpsrld xmm3, xmm6, 2
+ vpsrld xmm0, xmm6, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm6
+ vpxor xmm4, xmm4, xmm2
+L_AES_GCM_init_avx512_calc_iv_done:
+ ; T = Encrypt counter
+ vpxor xmm0, xmm0, xmm0
+ shl edx, 3
+ vmovq xmm0, rdx
+ vpxor xmm4, xmm4, xmm0
+ ; ghash_gfmul_avx
+ vpshufd xmm1, xmm4, 78
+ vpshufd xmm2, xmm5, 78
+ vpclmulqdq xmm3, xmm5, xmm4, 17
+ vpclmulqdq xmm0, xmm5, xmm4, 0
+ vpxor xmm1, xmm1, xmm4
+ vpxor xmm2, xmm2, xmm5
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vmovdqa xmm6, xmm0
+ vmovdqa xmm4, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm6, xmm6, xmm2
+ vpxor xmm4, xmm4, xmm1
+ vpsrld xmm0, xmm6, 31
+ vpsrld xmm1, xmm4, 31
+ vpslld xmm6, xmm6, 1
+ vpslld xmm4, xmm4, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm4, xmm4, xmm2
+ vpor xmm6, xmm6, xmm0
+ vpor xmm4, xmm4, xmm1
+ vpslld xmm0, xmm6, 31
+ vpslld xmm1, xmm6, 30
+ vpslld xmm2, xmm6, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm6, xmm6, xmm0
+ vpsrld xmm2, xmm6, 1
+ vpsrld xmm3, xmm6, 2
+ vpsrld xmm0, xmm6, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm6
+ vpxor xmm4, xmm4, xmm2
+ vpshufb xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ ; Encrypt counter
+ vmovdqa xmm7, OWORD PTR [rdi]
+ vpxor xmm7, xmm7, xmm4
+ vaesenc xmm7, xmm7, [rdi+16]
+ vaesenc xmm7, xmm7, [rdi+32]
+ vaesenc xmm7, xmm7, [rdi+48]
+ vaesenc xmm7, xmm7, [rdi+64]
+ vaesenc xmm7, xmm7, [rdi+80]
+ vaesenc xmm7, xmm7, [rdi+96]
+ vaesenc xmm7, xmm7, [rdi+112]
+ vaesenc xmm7, xmm7, [rdi+128]
+ vaesenc xmm7, xmm7, [rdi+144]
+ cmp esi, 11
+ vmovdqa xmm8, OWORD PTR [rdi+160]
+ jl L_AES_GCM_init_avx512_calc_iv_2_aesenc_avx_last
+ vaesenc xmm7, xmm7, xmm8
+ vaesenc xmm7, xmm7, [rdi+176]
+ cmp esi, 13
+ vmovdqa xmm8, OWORD PTR [rdi+192]
+ jl L_AES_GCM_init_avx512_calc_iv_2_aesenc_avx_last
+ vaesenc xmm7, xmm7, xmm8
+ vaesenc xmm7, xmm7, [rdi+208]
+ vmovdqa xmm8, OWORD PTR [rdi+224]
+L_AES_GCM_init_avx512_calc_iv_2_aesenc_avx_last:
+ vaesenclast xmm7, xmm7, xmm8
+ vmovdqu xmm15, xmm7
+L_AES_GCM_init_avx512_iv_done:
+ vmovdqa OWORD PTR [r9], xmm15
+ vpshufb xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_bswap_epi64
+ vpaddd xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_one
+ vmovdqa OWORD PTR [rax], xmm5
+ vmovdqa OWORD PTR [r8], xmm4
+ vmovdqu xmm6, OWORD PTR [rsp+16]
+ vmovdqu xmm7, OWORD PTR [rsp+32]
+ vmovdqu xmm8, OWORD PTR [rsp+48]
+ vmovdqu xmm15, OWORD PTR [rsp+64]
+ add rsp, 80
+ pop r13
+ pop r12
+ pop rsi
+ pop rdi
+ ret
+AES_GCM_init_avx512 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_aad_update_avx512 PROC ; Folds 16-byte blocks into the 128-bit state at [r8]. Registers read as arguments: rcx = data pointer, edx = byte count, r8 = state pointer, r9 = multiplier pointer (presumably the GHASH key H - confirm against caller)
+ mov rax, rcx ; rax = data pointer, freeing rcx for use as the byte offset
+ sub rsp, 32 ; stack space to preserve xmm6 and xmm7 for the caller
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqa xmm5, OWORD PTR [r8] ; xmm5 = running state; aligned load, so [r8] must be 16-byte aligned
+ vmovdqa xmm6, OWORD PTR [r9] ; xmm6 = multiplier; aligned load, so [r9] must be 16-byte aligned
+ xor ecx, ecx ; byte offset = 0
+L_AES_GCM_aad_update_avx512_16_loop: ; the count is only tested at the bottom, so one block is always consumed even if edx is 0
+ vmovdqu xmm7, OWORD PTR [rax+rcx] ; next 16 input bytes (unaligned load)
+ vpshufb xmm7, xmm7, OWORD PTR L_avx512_aes_gcm_bswap_mask ; reorder bytes per the mask constant (defined elsewhere; presumably a full byte reversal)
+ vpxor xmm5, xmm5, xmm7 ; state ^= block
+ ; ghash_gfmul_avx: xmm5 = carry-less product xmm5 * xmm6, shifted left one bit, then reduced to 128 bits
+ vpshufd xmm1, xmm5, 78 ; 78 = 01001110b: swap the two 64-bit halves
+ vpshufd xmm2, xmm6, 78
+ vpclmulqdq xmm3, xmm6, xmm5, 17 ; imm 0x11: high qword * high qword
+ vpclmulqdq xmm0, xmm6, xmm5, 0 ; imm 0x00: low qword * low qword
+ vpxor xmm1, xmm1, xmm5 ; (hi ^ lo) of the state
+ vpxor xmm2, xmm2, xmm6 ; (hi ^ lo) of the multiplier
+ vpclmulqdq xmm1, xmm1, xmm2, 0 ; (hi ^ lo) * (hi ^ lo)
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3 ; xmm1 = Karatsuba middle term
+ vmovdqa xmm4, xmm0 ; xmm4 = low 128 bits of the product
+ vmovdqa xmm5, xmm3 ; xmm5 = high 128 bits of the product
+ vpslldq xmm2, xmm1, 8 ; low qword of middle term moves to the top of the low half
+ vpsrldq xmm1, xmm1, 8 ; high qword of middle term moves to the bottom of the high half
+ vpxor xmm4, xmm4, xmm2
+ vpxor xmm5, xmm5, xmm1 ; xmm5:xmm4 = full 256-bit product
+ vpsrld xmm0, xmm4, 31 ; top bit of each dword = carry for the 1-bit left shift
+ vpsrld xmm1, xmm5, 31
+ vpslld xmm4, xmm4, 1
+ vpslld xmm5, xmm5, 1
+ vpsrldq xmm2, xmm0, 12 ; carry out of the low half's top dword
+ vpslldq xmm0, xmm0, 4 ; move the remaining carries up one dword
+ vpslldq xmm1, xmm1, 4
+ vpor xmm5, xmm5, xmm2 ; carry from low half into high half
+ vpor xmm4, xmm4, xmm0
+ vpor xmm5, xmm5, xmm1 ; xmm5:xmm4 = product << 1
+ vpslld xmm0, xmm4, 31 ; reduction, first phase: per-dword left shifts by 31, 30 and 25
+ vpslld xmm1, xmm4, 30
+ vpslld xmm2, xmm4, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4 ; upper three dwords, kept for the second phase
+ vpslldq xmm0, xmm0, 12 ; lowest dword moved to the top
+ vpxor xmm4, xmm4, xmm0
+ vpsrld xmm2, xmm4, 1 ; reduction, second phase: per-dword right shifts by 1, 2 and 7
+ vpsrld xmm3, xmm4, 2
+ vpsrld xmm0, xmm4, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm4
+ vpxor xmm5, xmm5, xmm2 ; xmm5 = reduced 128-bit result, the new state
+ add ecx, 16 ; advance to the next block
+ cmp ecx, edx
+ jl L_AES_GCM_aad_update_avx512_16_loop ; signed compare of offset against byte count
+ vmovdqa OWORD PTR [r8], xmm5 ; write the state back (aligned store)
+ vmovdqu xmm6, OWORD PTR [rsp] ; restore the preserved registers
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ add rsp, 32
+ ret
+AES_GCM_aad_update_avx512 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_encrypt_block_avx512 PROC ; Encrypts the counter block and XORs it with 16 input bytes. Registers read as arguments: rcx = round-key table, edx = round count, r8 = output pointer, r9 = input pointer, [rsp+40] = counter block pointer
+ mov r10, r8 ; r10 = output pointer
+ mov r11, r9 ; r11 = input pointer
+ mov rax, QWORD PTR [rsp+40] ; fifth argument is read from the stack
+ vmovdqu xmm1, OWORD PTR [rax] ; current counter block
+ vpshufb xmm0, xmm1, OWORD PTR L_avx512_aes_gcm_bswap_epi64 ; byte-shuffled copy of the counter becomes the cipher input
+ vpaddd xmm1, xmm1, OWORD PTR L_avx512_aes_gcm_one ; advance the stored counter by the 'one' constant (dword-wise add)
+ vmovdqu OWORD PTR [rax], xmm1 ; write the advanced counter back for the next call
+ vpxor xmm0, xmm0, [rcx] ; XOR with round key 0
+ vaesenc xmm0, xmm0, [rcx+16] ; rounds 1-9 use the keys at offsets 16..144
+ vaesenc xmm0, xmm0, [rcx+32]
+ vaesenc xmm0, xmm0, [rcx+48]
+ vaesenc xmm0, xmm0, [rcx+64]
+ vaesenc xmm0, xmm0, [rcx+80]
+ vaesenc xmm0, xmm0, [rcx+96]
+ vaesenc xmm0, xmm0, [rcx+112]
+ vaesenc xmm0, xmm0, [rcx+128]
+ vaesenc xmm0, xmm0, [rcx+144]
+ cmp edx, 11 ; round count below 11: the key at offset 160 is the final round key
+ vmovdqa xmm1, OWORD PTR [rcx+160] ; aligned load (key table must be 16-byte aligned); vector moves leave the flags from cmp intact
+ jl L_AES_GCM_encrypt_block_avx512_aesenc_block_last
+ vaesenc xmm0, xmm0, xmm1 ; round 10
+ vaesenc xmm0, xmm0, [rcx+176] ; round 11
+ cmp edx, 13 ; round count below 13: the key at offset 192 is the final round key
+ vmovdqa xmm1, OWORD PTR [rcx+192]
+ jl L_AES_GCM_encrypt_block_avx512_aesenc_block_last
+ vaesenc xmm0, xmm0, xmm1 ; round 12
+ vaesenc xmm0, xmm0, [rcx+208] ; round 13
+ vmovdqa xmm1, OWORD PTR [rcx+224] ; otherwise the key at offset 224 is the final round key
+L_AES_GCM_encrypt_block_avx512_aesenc_block_last:
+ vaesenclast xmm0, xmm0, xmm1 ; final round with the key selected above
+ vmovdqu xmm1, OWORD PTR [r11] ; 16 bytes of input
+ vpxor xmm0, xmm0, xmm1 ; output = encrypted counter ^ input
+ vmovdqu OWORD PTR [r10], xmm0 ; store 16 bytes of output
+ vpshufb xmm0, xmm0, OWORD PTR L_avx512_aes_gcm_bswap_mask ; NOTE(review): xmm0 is neither stored nor read again before ret - looks like a leftover; confirm no caller depends on it
+ vzeroupper
+ ret
+AES_GCM_encrypt_block_avx512 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_ghash_block_avx512 PROC ; Folds one 16-byte block into the 128-bit state at [rdx]. Registers read as arguments: rcx = data pointer, rdx = state pointer, r8 = multiplier pointer (presumably the GHASH key H - confirm against caller)
+ sub rsp, 32 ; stack space to preserve xmm6 and xmm7 for the caller
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqa xmm4, OWORD PTR [rdx] ; xmm4 = running state; aligned load, so [rdx] must be 16-byte aligned
+ vmovdqa xmm5, OWORD PTR [r8] ; xmm5 = multiplier; aligned load, so [r8] must be 16-byte aligned
+ vmovdqu xmm7, OWORD PTR [rcx] ; 16 input bytes (unaligned load)
+ vpshufb xmm7, xmm7, OWORD PTR L_avx512_aes_gcm_bswap_mask ; reorder bytes per the mask constant (defined elsewhere; presumably a full byte reversal)
+ vpxor xmm4, xmm4, xmm7 ; state ^= block
+ ; ghash_gfmul_avx: xmm4 = carry-less product xmm4 * xmm5, shifted left one bit, then reduced to 128 bits
+ vpshufd xmm1, xmm4, 78 ; 78 = 01001110b: swap the two 64-bit halves
+ vpshufd xmm2, xmm5, 78
+ vpclmulqdq xmm3, xmm5, xmm4, 17 ; imm 0x11: high qword * high qword
+ vpclmulqdq xmm0, xmm5, xmm4, 0 ; imm 0x00: low qword * low qword
+ vpxor xmm1, xmm1, xmm4 ; (hi ^ lo) of the state
+ vpxor xmm2, xmm2, xmm5 ; (hi ^ lo) of the multiplier
+ vpclmulqdq xmm1, xmm1, xmm2, 0 ; (hi ^ lo) * (hi ^ lo)
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3 ; xmm1 = Karatsuba middle term
+ vmovdqa xmm6, xmm0 ; xmm6 = low 128 bits of the product
+ vmovdqa xmm4, xmm3 ; xmm4 = high 128 bits of the product
+ vpslldq xmm2, xmm1, 8 ; low qword of middle term moves to the top of the low half
+ vpsrldq xmm1, xmm1, 8 ; high qword of middle term moves to the bottom of the high half
+ vpxor xmm6, xmm6, xmm2
+ vpxor xmm4, xmm4, xmm1 ; xmm4:xmm6 = full 256-bit product
+ vpsrld xmm0, xmm6, 31 ; top bit of each dword = carry for the 1-bit left shift
+ vpsrld xmm1, xmm4, 31
+ vpslld xmm6, xmm6, 1
+ vpslld xmm4, xmm4, 1
+ vpsrldq xmm2, xmm0, 12 ; carry out of the low half's top dword
+ vpslldq xmm0, xmm0, 4 ; move the remaining carries up one dword
+ vpslldq xmm1, xmm1, 4
+ vpor xmm4, xmm4, xmm2 ; carry from low half into high half
+ vpor xmm6, xmm6, xmm0
+ vpor xmm4, xmm4, xmm1 ; xmm4:xmm6 = product << 1
+ vpslld xmm0, xmm6, 31 ; reduction, first phase: per-dword left shifts by 31, 30 and 25
+ vpslld xmm1, xmm6, 30
+ vpslld xmm2, xmm6, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4 ; upper three dwords, kept for the second phase
+ vpslldq xmm0, xmm0, 12 ; lowest dword moved to the top
+ vpxor xmm6, xmm6, xmm0
+ vpsrld xmm2, xmm6, 1 ; reduction, second phase: per-dword right shifts by 1, 2 and 7
+ vpsrld xmm3, xmm6, 2
+ vpsrld xmm0, xmm6, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm6
+ vpxor xmm4, xmm4, xmm2 ; xmm4 = reduced 128-bit result, the new state
+ vmovdqa OWORD PTR [rdx], xmm4 ; write the state back (aligned store)
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp] ; restore the preserved registers
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ add rsp, 32
+ ret
+AES_GCM_ghash_block_avx512 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_encrypt_update_avx512 PROC
+ push r13
+ push r12
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbx
+ push rbp
+ mov rax, rcx
+ mov r10, r8
+ mov r8d, edx
+ mov r11, r9
+ mov r9d, DWORD PTR [rsp+104]
+ mov r12, QWORD PTR [rsp+112]
+ mov r14, QWORD PTR [rsp+120]
+ mov r15, QWORD PTR [rsp+128]
+ sub rsp, 1200
+ vmovdqu OWORD PTR [rsp+1040], xmm6
+ vmovdqu OWORD PTR [rsp+1056], xmm7
+ vmovdqu OWORD PTR [rsp+1072], xmm8
+ vmovdqu OWORD PTR [rsp+1088], xmm9
+ vmovdqu OWORD PTR [rsp+1104], xmm10
+ vmovdqu OWORD PTR [rsp+1120], xmm11
+ vmovdqu OWORD PTR [rsp+1136], xmm12
+ vmovdqu OWORD PTR [rsp+1152], xmm13
+ vmovdqu OWORD PTR [rsp+1168], xmm14
+ vmovdqu OWORD PTR [rsp+1184], xmm15
+ vmovdqa xmm6, OWORD PTR [r12]
+ vmovdqa xmm5, OWORD PTR [r14]
+ vpsrlq xmm9, xmm5, 63
+ vpsllq xmm8, xmm5, 1
+ vpslldq xmm9, xmm9, 8
+ vpor xmm8, xmm8, xmm9
+ vpshufd xmm5, xmm5, 255
+ vpsrad xmm5, xmm5, 31
+ vpand xmm5, xmm5, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpxor xmm5, xmm5, xmm8
+ xor edi, edi
+ cmp r9d, 256
+ jl L_AES_GCM_encrypt_update_avx512_done_128
+ vmovdqa xmm2, xmm6
+ ; H ^ 1
+ vmovdqu OWORD PTR [rsp], xmm5
+ ; H ^ 2
+ vpclmulqdq xmm8, xmm5, xmm5, 0
+ vpclmulqdq xmm11, xmm5, xmm5, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm0, xmm11
+ vmovdqu OWORD PTR [rsp+16], xmm0
+ ; H ^ 3
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm5, 78
+ vpxor xmm9, xmm9, xmm5
+ vpshufd xmm10, xmm0, 78
+ vpxor xmm10, xmm10, xmm0
+ vpclmulqdq xmm8, xmm0, xmm5, 0
+ vpclmulqdq xmm11, xmm0, xmm5, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm1, xmm11
+ vmovdqu OWORD PTR [rsp+32], xmm1
+ ; H ^ 4
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm3, xmm11
+ vmovdqu OWORD PTR [rsp+48], xmm3
+ ; H ^ 5
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+64], xmm7
+ ; H ^ 6
+ vpclmulqdq xmm8, xmm1, xmm1, 0
+ vpclmulqdq xmm11, xmm1, xmm1, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+80], xmm7
+ ; H ^ 7
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm1, 78
+ vpxor xmm9, xmm9, xmm1
+ vpshufd xmm10, xmm3, 78
+ vpxor xmm10, xmm10, xmm3
+ vpclmulqdq xmm8, xmm3, xmm1, 0
+ vpclmulqdq xmm11, xmm3, xmm1, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm7
+ ; H ^ 8
+ vpclmulqdq xmm8, xmm3, xmm3, 0
+ vpclmulqdq xmm11, xmm3, xmm3, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+112], xmm7
+ ; H ^ 9
+ vmovdqu xmm0, OWORD PTR [rsp+48]
+ vmovdqu xmm1, OWORD PTR [rsp+64]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+128], xmm7
+ ; H ^ 10
+ vmovdqu xmm0, OWORD PTR [rsp+64]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+144], xmm7
+ ; H ^ 11
+ vmovdqu xmm0, OWORD PTR [rsp+64]
+ vmovdqu xmm1, OWORD PTR [rsp+80]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+160], xmm7
+ ; H ^ 12
+ vmovdqu xmm0, OWORD PTR [rsp+80]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+176], xmm7
+ ; H ^ 13
+ vmovdqu xmm0, OWORD PTR [rsp+80]
+ vmovdqu xmm1, OWORD PTR [rsp+96]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+192], xmm7
+ ; H ^ 14
+ vmovdqu xmm0, OWORD PTR [rsp+96]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+208], xmm7
+ ; H ^ 15
+ vmovdqu xmm0, OWORD PTR [rsp+96]
+ vmovdqu xmm1, OWORD PTR [rsp+112]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+224], xmm7
+ ; H ^ 16
+ vmovdqu xmm0, OWORD PTR [rsp+112]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+240], xmm7
+ cmp r9d, 512
+ jl L_AES_GCM_encrypt_update_avx512_no_ext
+ ; H ^ 17
+ vmovdqu xmm0, OWORD PTR [rsp+112]
+ vmovdqu xmm1, OWORD PTR [rsp+128]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+256], xmm7
+ ; H ^ 18
+ vmovdqu xmm0, OWORD PTR [rsp+128]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+272], xmm7
+ ; H ^ 19
+ vmovdqu xmm0, OWORD PTR [rsp+128]
+ vmovdqu xmm1, OWORD PTR [rsp+144]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+288], xmm7
+ ; H ^ 20
+ vmovdqu xmm0, OWORD PTR [rsp+144]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+304], xmm7
+ ; H ^ 21
+ vmovdqu xmm0, OWORD PTR [rsp+144]
+ vmovdqu xmm1, OWORD PTR [rsp+160]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+320], xmm7
+ ; H ^ 22
+ vmovdqu xmm0, OWORD PTR [rsp+160]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+336], xmm7
+ ; H ^ 23
+ vmovdqu xmm0, OWORD PTR [rsp+160]
+ vmovdqu xmm1, OWORD PTR [rsp+176]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+352], xmm7
+ ; H ^ 24
+ vmovdqu xmm0, OWORD PTR [rsp+176]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+368], xmm7
+ ; H ^ 25
+ vmovdqu xmm0, OWORD PTR [rsp+176]
+ vmovdqu xmm1, OWORD PTR [rsp+192]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+384], xmm7
+ ; H ^ 26
+ vmovdqu xmm0, OWORD PTR [rsp+192]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+400], xmm7
+ ; H ^ 27
+ vmovdqu xmm0, OWORD PTR [rsp+192]
+ vmovdqu xmm1, OWORD PTR [rsp+208]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+416], xmm7
+ ; H ^ 28
+ vmovdqu xmm0, OWORD PTR [rsp+208]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+432], xmm7
+ ; H ^ 29
+ vmovdqu xmm0, OWORD PTR [rsp+208]
+ vmovdqu xmm1, OWORD PTR [rsp+224]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+448], xmm7
+ ; H ^ 30
+ vmovdqu xmm0, OWORD PTR [rsp+224]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+464], xmm7
+ ; H ^ 31
+ vmovdqu xmm0, OWORD PTR [rsp+224]
+ vmovdqu xmm1, OWORD PTR [rsp+240]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+480], xmm7
+ ; H ^ 32
+ vmovdqu xmm0, OWORD PTR [rsp+240]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+496], xmm7
+L_AES_GCM_encrypt_update_avx512_no_ext:
+ vbroadcasti32x4 zmm22, ptr_L_avx512_aes_gcm_bswap_epi64
+ vbroadcasti32x4 zmm30, ptr_L_avx512_aes_gcm_bswap_mask
+ vbroadcasti32x4 zmm31, ptr_L_avx512_aes_gcm_mod2_128
+ vbroadcasti32x4 zmm9, [rax]
+ vbroadcasti32x4 zmm10, [rax+16]
+ vbroadcasti32x4 zmm11, [rax+32]
+ vbroadcasti32x4 zmm12, [rax+48]
+ vbroadcasti32x4 zmm13, [rax+64]
+ vbroadcasti32x4 zmm14, [rax+80]
+ vbroadcasti32x4 zmm15, [rax+96]
+ vbroadcasti32x4 zmm1, [rax+112]
+ vbroadcasti32x4 zmm2, [rax+128]
+ vbroadcasti32x4 zmm3, [rax+144]
+ cmp r9d, 512
+ jl L_AES_GCM_encrypt_update_avx512_no_windows
+ mov ebp, r9d
+ and ebp, 4294966784
+ vmovdqu64 zmm23, [rsp+448]
+ vshufi64x2 zmm23, zmm23, zmm23, 27
+ vmovdqu64 zmm24, [rsp+384]
+ vshufi64x2 zmm24, zmm24, zmm24, 27
+ vmovdqu64 zmm25, [rsp+320]
+ vshufi64x2 zmm25, zmm25, zmm25, 27
+ vmovdqu64 zmm26, [rsp+256]
+ vshufi64x2 zmm26, zmm26, zmm26, 27
+ vmovdqu64 [rsp+512], zmm23
+ vmovdqu64 [rsp+576], zmm24
+ vmovdqu64 [rsp+640], zmm25
+ vmovdqu64 [rsp+704], zmm26
+ vmovdqu64 zmm23, [rsp+192]
+ vshufi64x2 zmm23, zmm23, zmm23, 27
+ vmovdqu64 zmm24, [rsp+128]
+ vshufi64x2 zmm24, zmm24, zmm24, 27
+ vmovdqu64 zmm25, [rsp+64]
+ vshufi64x2 zmm25, zmm25, zmm25, 27
+ vmovdqu64 zmm26, [rsp]
+ vshufi64x2 zmm26, zmm26, zmm26, 27
+ vmovdqu64 [rsp+768], zmm23
+ vmovdqu64 [rsp+832], zmm24
+ vmovdqu64 [rsp+896], zmm25
+ vmovdqu64 [rsp+960], zmm26
+ ; 512 bytes of input
+ lea rsi, QWORD PTR [r10+rdi]
+ vbroadcasti32x4 zmm20, [r15]
+ vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
+ vpshufb zmm16, zmm16, zmm22
+ vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
+ vpshufb zmm17, zmm17, zmm22
+ vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2
+ vpshufb zmm18, zmm18, zmm22
+ vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3
+ vpshufb zmm19, zmm19, zmm22
+ vmovdqu xmm8, OWORD PTR [r15]
+ vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen
+ vmovdqu OWORD PTR [r15], xmm8
+ vpxorq zmm16, zmm16, zmm9
+ vpxorq zmm17, zmm17, zmm9
+ vpxorq zmm18, zmm18, zmm9
+ vpxorq zmm19, zmm19, zmm9
+ vaesenc zmm16, zmm16, zmm10
+ vaesenc zmm17, zmm17, zmm10
+ vaesenc zmm18, zmm18, zmm10
+ vaesenc zmm19, zmm19, zmm10
+ vaesenc zmm16, zmm16, zmm11
+ vaesenc zmm17, zmm17, zmm11
+ vaesenc zmm18, zmm18, zmm11
+ vaesenc zmm19, zmm19, zmm11
+ vaesenc zmm16, zmm16, zmm12
+ vaesenc zmm17, zmm17, zmm12
+ vaesenc zmm18, zmm18, zmm12
+ vaesenc zmm19, zmm19, zmm12
+ vaesenc zmm16, zmm16, zmm13
+ vaesenc zmm17, zmm17, zmm13
+ vaesenc zmm18, zmm18, zmm13
+ vaesenc zmm19, zmm19, zmm13
+ vaesenc zmm16, zmm16, zmm14
+ vaesenc zmm17, zmm17, zmm14
+ vaesenc zmm18, zmm18, zmm14
+ vaesenc zmm19, zmm19, zmm14
+ vaesenc zmm16, zmm16, zmm15
+ vaesenc zmm17, zmm17, zmm15
+ vaesenc zmm18, zmm18, zmm15
+ vaesenc zmm19, zmm19, zmm15
+ vaesenc zmm16, zmm16, zmm1
+ vaesenc zmm17, zmm17, zmm1
+ vaesenc zmm18, zmm18, zmm1
+ vaesenc zmm19, zmm19, zmm1
+ vaesenc zmm16, zmm16, zmm2
+ vaesenc zmm17, zmm17, zmm2
+ vaesenc zmm18, zmm18, zmm2
+ vaesenc zmm19, zmm19, zmm2
+ vaesenc zmm16, zmm16, zmm3
+ vaesenc zmm17, zmm17, zmm3
+ vaesenc zmm18, zmm18, zmm3
+ vaesenc zmm19, zmm19, zmm3
+ cmp r8d, 11
+ vbroadcasti32x4 zmm20, [rax+160]
+ jl L_AES_GCM_encrypt_update_avx512_p1_avx512_ctr16_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [rax+176]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ cmp r8d, 13
+ vbroadcasti32x4 zmm20, [rax+192]
+ jl L_AES_GCM_encrypt_update_avx512_p1_avx512_ctr16_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [rax+208]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [rax+224]
+L_AES_GCM_encrypt_update_avx512_p1_avx512_ctr16_last:
+ vaesenclast zmm16, zmm16, zmm20
+ vaesenclast zmm17, zmm17, zmm20
+ vaesenclast zmm18, zmm18, zmm20
+ vaesenclast zmm19, zmm19, zmm20
+ lea rcx, QWORD PTR [r11+rdi]
+ lea rdx, QWORD PTR [r10+rdi]
+ vmovdqu64 zmm21, [rcx]
+ vpxorq zmm16, zmm16, zmm21
+ vmovdqu64 [rdx], zmm16
+ vmovdqu64 zmm21, [rcx+64]
+ vpxorq zmm17, zmm17, zmm21
+ vmovdqu64 [rdx+64], zmm17
+ vmovdqu64 zmm21, [rcx+128]
+ vpxorq zmm18, zmm18, zmm21
+ vmovdqu64 [rdx+128], zmm18
+ vmovdqu64 zmm21, [rcx+192]
+ vpxorq zmm19, zmm19, zmm21
+ vmovdqu64 [rdx+192], zmm19
+ add edi, 256
+ vbroadcasti32x4 zmm20, [r15]
+ vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
+ vpshufb zmm16, zmm16, zmm22
+ vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
+ vpshufb zmm17, zmm17, zmm22
+ vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2
+ vpshufb zmm18, zmm18, zmm22
+ vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3
+ vpshufb zmm19, zmm19, zmm22
+ vmovdqu xmm8, OWORD PTR [r15]
+ vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen
+ vmovdqu OWORD PTR [r15], xmm8
+ vpxorq zmm16, zmm16, zmm9
+ vpxorq zmm17, zmm17, zmm9
+ vpxorq zmm18, zmm18, zmm9
+ vpxorq zmm19, zmm19, zmm9
+ vaesenc zmm16, zmm16, zmm10
+ vaesenc zmm17, zmm17, zmm10
+ vaesenc zmm18, zmm18, zmm10
+ vaesenc zmm19, zmm19, zmm10
+ vaesenc zmm16, zmm16, zmm11
+ vaesenc zmm17, zmm17, zmm11
+ vaesenc zmm18, zmm18, zmm11
+ vaesenc zmm19, zmm19, zmm11
+ vaesenc zmm16, zmm16, zmm12
+ vaesenc zmm17, zmm17, zmm12
+ vaesenc zmm18, zmm18, zmm12
+ vaesenc zmm19, zmm19, zmm12
+ vaesenc zmm16, zmm16, zmm13
+ vaesenc zmm17, zmm17, zmm13
+ vaesenc zmm18, zmm18, zmm13
+ vaesenc zmm19, zmm19, zmm13
+ vaesenc zmm16, zmm16, zmm14
+ vaesenc zmm17, zmm17, zmm14
+ vaesenc zmm18, zmm18, zmm14
+ vaesenc zmm19, zmm19, zmm14
+ vaesenc zmm16, zmm16, zmm15
+ vaesenc zmm17, zmm17, zmm15
+ vaesenc zmm18, zmm18, zmm15
+ vaesenc zmm19, zmm19, zmm15
+ vaesenc zmm16, zmm16, zmm1
+ vaesenc zmm17, zmm17, zmm1
+ vaesenc zmm18, zmm18, zmm1
+ vaesenc zmm19, zmm19, zmm1
+ vaesenc zmm16, zmm16, zmm2
+ vaesenc zmm17, zmm17, zmm2
+ vaesenc zmm18, zmm18, zmm2
+ vaesenc zmm19, zmm19, zmm2
+ vaesenc zmm16, zmm16, zmm3
+ vaesenc zmm17, zmm17, zmm3
+ vaesenc zmm18, zmm18, zmm3
+ vaesenc zmm19, zmm19, zmm3
+ cmp r8d, 11
+ vbroadcasti32x4 zmm20, [rax+160]
+ jl L_AES_GCM_encrypt_update_avx512_p2_avx512_ctr16_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [rax+176]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ cmp r8d, 13
+ vbroadcasti32x4 zmm20, [rax+192]
+ jl L_AES_GCM_encrypt_update_avx512_p2_avx512_ctr16_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [rax+208]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [rax+224]
+L_AES_GCM_encrypt_update_avx512_p2_avx512_ctr16_last:
+ vaesenclast zmm16, zmm16, zmm20
+ vaesenclast zmm17, zmm17, zmm20
+ vaesenclast zmm18, zmm18, zmm20
+ vaesenclast zmm19, zmm19, zmm20
+ lea rcx, QWORD PTR [r11+rdi]
+ lea rdx, QWORD PTR [r10+rdi]
+ vmovdqu64 zmm21, [rcx]
+ vpxorq zmm16, zmm16, zmm21
+ vmovdqu64 [rdx], zmm16
+ vmovdqu64 zmm21, [rcx+64]
+ vpxorq zmm17, zmm17, zmm21
+ vmovdqu64 [rdx+64], zmm17
+ vmovdqu64 zmm21, [rcx+128]
+ vpxorq zmm18, zmm18, zmm21
+ vmovdqu64 [rdx+128], zmm18
+ vmovdqu64 zmm21, [rcx+192]
+ vpxorq zmm19, zmm19, zmm21
+ vmovdqu64 [rdx+192], zmm19
+ add edi, 256
+ cmp edi, ebp
+ jge L_AES_GCM_encrypt_update_avx512_last_win
+L_AES_GCM_encrypt_update_avx512_win_loop:
+ lea rbx, QWORD PTR [r10+rdi]
+ vpxorq zmm21, zmm21, zmm21
+ vinserti32x4 zmm21, zmm21, xmm6, 0
+ vbroadcasti32x4 zmm20, [r15]
+ vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
+ vpshufb zmm16, zmm16, zmm22
+ vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
+ vpshufb zmm17, zmm17, zmm22
+ vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2
+ vpshufb zmm18, zmm18, zmm22
+ vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3
+ vpshufb zmm19, zmm19, zmm22
+ vmovdqu xmm8, OWORD PTR [r15]
+ vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen
+ vmovdqu OWORD PTR [r15], xmm8
+ vpxorq zmm16, zmm16, zmm9
+ vpxorq zmm17, zmm17, zmm9
+ vpxorq zmm18, zmm18, zmm9
+ vpxorq zmm19, zmm19, zmm9
+ vmovdqu64 zmm31, [rsi]
+ vpshufb zmm31, zmm31, zmm30
+ vpxorq zmm31, zmm31, zmm21
+ vaesenc zmm16, zmm16, zmm10
+ vaesenc zmm17, zmm17, zmm10
+ vaesenc zmm18, zmm18, zmm10
+ vaesenc zmm19, zmm19, zmm10
+ vpclmulqdq zmm23, zmm31, [rsp+512], 0
+ vpclmulqdq zmm24, zmm31, [rsp+512], 1
+ vpclmulqdq zmm25, zmm31, [rsp+512], 16
+ vpclmulqdq zmm26, zmm31, [rsp+512], 17
+ vmovdqa64 zmm27, zmm23
+ vpxorq zmm28, zmm25, zmm24
+ vmovdqa64 zmm29, zmm26
+ vaesenc zmm16, zmm16, zmm11
+ vaesenc zmm17, zmm17, zmm11
+ vaesenc zmm18, zmm18, zmm11
+ vaesenc zmm19, zmm19, zmm11
+ vmovdqu64 zmm31, [rsi+64]
+ vpshufb zmm31, zmm31, zmm30
+ vaesenc zmm16, zmm16, zmm12
+ vaesenc zmm17, zmm17, zmm12
+ vaesenc zmm18, zmm18, zmm12
+ vaesenc zmm19, zmm19, zmm12
+ vpclmulqdq zmm23, zmm31, [rsp+576], 0
+ vpclmulqdq zmm24, zmm31, [rsp+576], 1
+ vpclmulqdq zmm25, zmm31, [rsp+576], 16
+ vpclmulqdq zmm26, zmm31, [rsp+576], 17
+ vpxorq zmm27, zmm27, zmm23
+ vpternlogq zmm28, zmm25, zmm24, 150
+ vpxorq zmm29, zmm29, zmm26
+ vaesenc zmm16, zmm16, zmm13
+ vaesenc zmm17, zmm17, zmm13
+ vaesenc zmm18, zmm18, zmm13
+ vaesenc zmm19, zmm19, zmm13
+ vmovdqu64 zmm31, [rsi+128]
+ vpshufb zmm31, zmm31, zmm30
+ vaesenc zmm16, zmm16, zmm14
+ vaesenc zmm17, zmm17, zmm14
+ vaesenc zmm18, zmm18, zmm14
+ vaesenc zmm19, zmm19, zmm14
+ vpclmulqdq zmm23, zmm31, [rsp+640], 0
+ vpclmulqdq zmm24, zmm31, [rsp+640], 1
+ vpclmulqdq zmm25, zmm31, [rsp+640], 16
+ vpclmulqdq zmm26, zmm31, [rsp+640], 17
+ vpxorq zmm27, zmm27, zmm23
+ vpternlogq zmm28, zmm25, zmm24, 150
+ vpxorq zmm29, zmm29, zmm26
+ vaesenc zmm16, zmm16, zmm15
+ vaesenc zmm17, zmm17, zmm15
+ vaesenc zmm18, zmm18, zmm15
+ vaesenc zmm19, zmm19, zmm15
+ vmovdqu64 zmm31, [rsi+192]
+ vpshufb zmm31, zmm31, zmm30
+ vaesenc zmm16, zmm16, zmm1
+ vaesenc zmm17, zmm17, zmm1
+ vaesenc zmm18, zmm18, zmm1
+ vaesenc zmm19, zmm19, zmm1
+ vpclmulqdq zmm23, zmm31, [rsp+704], 0
+ vpclmulqdq zmm24, zmm31, [rsp+704], 1
+ vpclmulqdq zmm25, zmm31, [rsp+704], 16
+ vpclmulqdq zmm26, zmm31, [rsp+704], 17
+ vpxorq zmm27, zmm27, zmm23
+ vpternlogq zmm28, zmm25, zmm24, 150
+ vpxorq zmm29, zmm29, zmm26
+ vaesenc zmm16, zmm16, zmm2
+ vaesenc zmm17, zmm17, zmm2
+ vaesenc zmm18, zmm18, zmm2
+ vaesenc zmm19, zmm19, zmm2
+ vaesenc zmm16, zmm16, zmm3
+ vaesenc zmm17, zmm17, zmm3
+ vaesenc zmm18, zmm18, zmm3
+ vaesenc zmm19, zmm19, zmm3
+ cmp r8d, 11
+ vbroadcasti32x4 zmm20, [rax+160]
+ jl L_AES_GCM_encrypt_update_avx512_a_il_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [rax+176]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ cmp r8d, 13
+ vbroadcasti32x4 zmm20, [rax+192]
+ jl L_AES_GCM_encrypt_update_avx512_a_il_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [rax+208]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [rax+224]
+L_AES_GCM_encrypt_update_avx512_a_il_last:
+ vaesenclast zmm16, zmm16, zmm20
+ vaesenclast zmm17, zmm17, zmm20
+ vaesenclast zmm18, zmm18, zmm20
+ vaesenclast zmm19, zmm19, zmm20
+ lea rcx, QWORD PTR [r11+rdi]
+ lea rdx, QWORD PTR [r10+rdi]
+ vmovdqu64 zmm21, [rcx]
+ vpxorq zmm16, zmm16, zmm21
+ vmovdqu64 [rdx], zmm16
+ vmovdqu64 zmm21, [rcx+64]
+ vpxorq zmm17, zmm17, zmm21
+ vmovdqu64 [rdx+64], zmm17
+ vmovdqu64 zmm21, [rcx+128]
+ vpxorq zmm18, zmm18, zmm21
+ vmovdqu64 [rdx+128], zmm18
+ vmovdqu64 zmm21, [rcx+192]
+ vpxorq zmm19, zmm19, zmm21
+ vmovdqu64 [rdx+192], zmm19
+ add edi, 256
+ vbroadcasti32x4 zmm20, [r15]
+ vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
+ vpshufb zmm16, zmm16, zmm22
+ vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
+ vpshufb zmm17, zmm17, zmm22
+ vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2
+ vpshufb zmm18, zmm18, zmm22
+ vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3
+ vpshufb zmm19, zmm19, zmm22
+ vmovdqu xmm8, OWORD PTR [r15]
+ vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen
+ vmovdqu OWORD PTR [r15], xmm8
+ vpxorq zmm16, zmm16, zmm9
+ vpxorq zmm17, zmm17, zmm9
+ vpxorq zmm18, zmm18, zmm9
+ vpxorq zmm19, zmm19, zmm9
+ vmovdqu64 zmm31, [rsi+256]
+ vpshufb zmm31, zmm31, zmm30
+ vaesenc zmm16, zmm16, zmm10
+ vaesenc zmm17, zmm17, zmm10
+ vaesenc zmm18, zmm18, zmm10
+ vaesenc zmm19, zmm19, zmm10
+ vpclmulqdq zmm23, zmm31, [rsp+768], 0
+ vpclmulqdq zmm24, zmm31, [rsp+768], 1
+ vpclmulqdq zmm25, zmm31, [rsp+768], 16
+ vpclmulqdq zmm26, zmm31, [rsp+768], 17
+ vpxorq zmm27, zmm27, zmm23
+ vpternlogq zmm28, zmm25, zmm24, 150
+ vpxorq zmm29, zmm29, zmm26
+ vaesenc zmm16, zmm16, zmm11
+ vaesenc zmm17, zmm17, zmm11
+ vaesenc zmm18, zmm18, zmm11
+ vaesenc zmm19, zmm19, zmm11
+ vmovdqu64 zmm31, [rsi+320]
+ vpshufb zmm31, zmm31, zmm30
+ vaesenc zmm16, zmm16, zmm12
+ vaesenc zmm17, zmm17, zmm12
+ vaesenc zmm18, zmm18, zmm12
+ vaesenc zmm19, zmm19, zmm12
+ vpclmulqdq zmm23, zmm31, [rsp+832], 0
+ vpclmulqdq zmm24, zmm31, [rsp+832], 1
+ vpclmulqdq zmm25, zmm31, [rsp+832], 16
+ vpclmulqdq zmm26, zmm31, [rsp+832], 17
+ vpxorq zmm27, zmm27, zmm23
+ vpternlogq zmm28, zmm25, zmm24, 150
+ vpxorq zmm29, zmm29, zmm26
+ vaesenc zmm16, zmm16, zmm13
+ vaesenc zmm17, zmm17, zmm13
+ vaesenc zmm18, zmm18, zmm13
+ vaesenc zmm19, zmm19, zmm13
+ vmovdqu64 zmm31, [rsi+384]
+ vpshufb zmm31, zmm31, zmm30
+ vaesenc zmm16, zmm16, zmm14
+ vaesenc zmm17, zmm17, zmm14
+ vaesenc zmm18, zmm18, zmm14
+ vaesenc zmm19, zmm19, zmm14
+ vpclmulqdq zmm23, zmm31, [rsp+896], 0
+ vpclmulqdq zmm24, zmm31, [rsp+896], 1
+ vpclmulqdq zmm25, zmm31, [rsp+896], 16
+ vpclmulqdq zmm26, zmm31, [rsp+896], 17
+ vpxorq zmm27, zmm27, zmm23
+ vpternlogq zmm28, zmm25, zmm24, 150
+ vpxorq zmm29, zmm29, zmm26
+ vaesenc zmm16, zmm16, zmm15
+ vaesenc zmm17, zmm17, zmm15
+ vaesenc zmm18, zmm18, zmm15
+ vaesenc zmm19, zmm19, zmm15
+ vmovdqu64 zmm31, [rsi+448]
+ vpshufb zmm31, zmm31, zmm30
+ vaesenc zmm16, zmm16, zmm1
+ vaesenc zmm17, zmm17, zmm1
+ vaesenc zmm18, zmm18, zmm1
+ vaesenc zmm19, zmm19, zmm1
+ vpclmulqdq zmm23, zmm31, [rsp+960], 0
+ vpclmulqdq zmm24, zmm31, [rsp+960], 1
+ vpclmulqdq zmm25, zmm31, [rsp+960], 16
+ vpclmulqdq zmm26, zmm31, [rsp+960], 17
+ vpxorq zmm27, zmm27, zmm23
+ vpternlogq zmm28, zmm25, zmm24, 150
+ vpxorq zmm29, zmm29, zmm26
+ vaesenc zmm16, zmm16, zmm2
+ vaesenc zmm17, zmm17, zmm2
+ vaesenc zmm18, zmm18, zmm2
+ vaesenc zmm19, zmm19, zmm2
+ vaesenc zmm16, zmm16, zmm3
+ vaesenc zmm17, zmm17, zmm3
+ vaesenc zmm18, zmm18, zmm3
+ vaesenc zmm19, zmm19, zmm3
+ cmp r8d, 11
+ vbroadcasti32x4 zmm20, [rax+160]
+ jl L_AES_GCM_encrypt_update_avx512_b_il_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [rax+176]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ cmp r8d, 13
+ vbroadcasti32x4 zmm20, [rax+192]
+ jl L_AES_GCM_encrypt_update_avx512_b_il_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [rax+208]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [rax+224]
+L_AES_GCM_encrypt_update_avx512_b_il_last:
+ vaesenclast zmm16, zmm16, zmm20
+ vaesenclast zmm17, zmm17, zmm20
+ vaesenclast zmm18, zmm18, zmm20
+ vaesenclast zmm19, zmm19, zmm20
+ lea rcx, QWORD PTR [r11+rdi]
+ lea rdx, QWORD PTR [r10+rdi]
+ vmovdqu64 zmm21, [rcx]
+ vpxorq zmm16, zmm16, zmm21
+ vmovdqu64 [rdx], zmm16
+ vmovdqu64 zmm21, [rcx+64]
+ vpxorq zmm17, zmm17, zmm21
+ vmovdqu64 [rdx+64], zmm17
+ vmovdqu64 zmm21, [rcx+128]
+ vpxorq zmm18, zmm18, zmm21
+ vmovdqu64 [rdx+128], zmm18
+ vmovdqu64 zmm21, [rcx+192]
+ vpxorq zmm19, zmm19, zmm21
+ vmovdqu64 [rdx+192], zmm19
+ add edi, 256
+ vbroadcasti32x4 zmm31, ptr_L_avx512_aes_gcm_mod2_128
+ vpclmulqdq zmm23, zmm31, zmm27, 1
+ vpshufd zmm27, zmm27, 78
+ vpternlogq zmm28, zmm27, zmm23, 150
+ vpclmulqdq zmm23, zmm31, zmm28, 1
+ vpshufd zmm28, zmm28, 78
+ vpternlogq zmm29, zmm28, zmm23, 150
+ vextracti32x4 xmm0, zmm29, 1
+ vextracti32x4 xmm4, zmm29, 2
+ vextracti32x4 xmm5, zmm29, 3
+ vpxorq xmm6, xmm29, xmm0
+ vpternlogq xmm6, xmm5, xmm4, 150
+ mov rsi, rbx
+ cmp edi, ebp
+ jl L_AES_GCM_encrypt_update_avx512_win_loop
+L_AES_GCM_encrypt_update_avx512_last_win:
+ vpxorq zmm20, zmm20, zmm20
+ vinserti32x4 zmm20, zmm20, xmm6, 0
+ vmovdqu64 zmm23, [rsp+512]
+ vmovdqu64 zmm24, [rsp+576]
+ vmovdqu64 zmm25, [rsp+640]
+ vmovdqu64 zmm26, [rsp+704]
+ vmovdqu64 zmm21, [rsi]
+ vpshufb zmm21, zmm21, zmm30
+ vpxorq zmm21, zmm21, zmm20
+ vpclmulqdq zmm16, zmm21, zmm23, 0
+ vpclmulqdq zmm17, zmm21, zmm23, 1
+ vpclmulqdq zmm18, zmm21, zmm23, 16
+ vpclmulqdq zmm19, zmm21, zmm23, 17
+ vmovdqa64 zmm27, zmm16
+ vpxorq zmm28, zmm18, zmm17
+ vmovdqa64 zmm29, zmm19
+ vmovdqu64 zmm21, [rsi+64]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm24, 0
+ vpclmulqdq zmm17, zmm21, zmm24, 1
+ vpclmulqdq zmm18, zmm21, zmm24, 16
+ vpclmulqdq zmm19, zmm21, zmm24, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vmovdqu64 zmm21, [rsi+128]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm25, 0
+ vpclmulqdq zmm17, zmm21, zmm25, 1
+ vpclmulqdq zmm18, zmm21, zmm25, 16
+ vpclmulqdq zmm19, zmm21, zmm25, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vmovdqu64 zmm21, [rsi+192]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm26, 0
+ vpclmulqdq zmm17, zmm21, zmm26, 1
+ vpclmulqdq zmm18, zmm21, zmm26, 16
+ vpclmulqdq zmm19, zmm21, zmm26, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vmovdqu64 zmm23, [rsp+768]
+ vmovdqu64 zmm24, [rsp+832]
+ vmovdqu64 zmm25, [rsp+896]
+ vmovdqu64 zmm26, [rsp+960]
+ vmovdqu64 zmm21, [rsi+256]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm23, 0
+ vpclmulqdq zmm17, zmm21, zmm23, 1
+ vpclmulqdq zmm18, zmm21, zmm23, 16
+ vpclmulqdq zmm19, zmm21, zmm23, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vmovdqu64 zmm21, [rsi+320]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm24, 0
+ vpclmulqdq zmm17, zmm21, zmm24, 1
+ vpclmulqdq zmm18, zmm21, zmm24, 16
+ vpclmulqdq zmm19, zmm21, zmm24, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vmovdqu64 zmm21, [rsi+384]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm25, 0
+ vpclmulqdq zmm17, zmm21, zmm25, 1
+ vpclmulqdq zmm18, zmm21, zmm25, 16
+ vpclmulqdq zmm19, zmm21, zmm25, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vmovdqu64 zmm21, [rsi+448]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm26, 0
+ vpclmulqdq zmm17, zmm21, zmm26, 1
+ vpclmulqdq zmm18, zmm21, zmm26, 16
+ vpclmulqdq zmm19, zmm21, zmm26, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vpclmulqdq zmm21, zmm31, zmm27, 1
+ vpshufd zmm27, zmm27, 78
+ vpternlogq zmm28, zmm27, zmm21, 150
+ vpclmulqdq zmm21, zmm31, zmm28, 1
+ vpshufd zmm28, zmm28, 78
+ vpternlogq zmm29, zmm28, zmm21, 150
+ vextracti32x4 xmm0, zmm29, 1
+ vextracti32x4 xmm4, zmm29, 2
+ vextracti32x4 xmm5, zmm29, 3
+ vpxorq xmm6, xmm29, xmm0
+ vpternlogq xmm6, xmm5, xmm4, 150
+L_AES_GCM_encrypt_update_avx512_no_windows:
+ vmovdqu64 zmm23, [rsp+192]
+ vshufi64x2 zmm23, zmm23, zmm23, 27
+ vmovdqu64 zmm24, [rsp+128]
+ vshufi64x2 zmm24, zmm24, zmm24, 27
+ vmovdqu64 zmm25, [rsp+64]
+ vshufi64x2 zmm25, zmm25, zmm25, 27
+ vmovdqu64 zmm26, [rsp]
+ vshufi64x2 zmm26, zmm26, zmm26, 27
+ mov r13d, r9d
+ and r13d, 4294967040
+ cmp edi, r13d
+ jge L_AES_GCM_encrypt_update_avx512_after_256
+ ; 256 bytes of input
+ vbroadcasti32x4 zmm20, [r15]
+ vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
+ vpshufb zmm16, zmm16, zmm22
+ vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
+ vpshufb zmm17, zmm17, zmm22
+ vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2
+ vpshufb zmm18, zmm18, zmm22
+ vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3
+ vpshufb zmm19, zmm19, zmm22
+ vmovdqu xmm8, OWORD PTR [r15]
+ vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen
+ vmovdqu OWORD PTR [r15], xmm8
+ vpxorq zmm16, zmm16, zmm9
+ vpxorq zmm17, zmm17, zmm9
+ vpxorq zmm18, zmm18, zmm9
+ vpxorq zmm19, zmm19, zmm9
+ vaesenc zmm16, zmm16, zmm10
+ vaesenc zmm17, zmm17, zmm10
+ vaesenc zmm18, zmm18, zmm10
+ vaesenc zmm19, zmm19, zmm10
+ vaesenc zmm16, zmm16, zmm11
+ vaesenc zmm17, zmm17, zmm11
+ vaesenc zmm18, zmm18, zmm11
+ vaesenc zmm19, zmm19, zmm11
+ vaesenc zmm16, zmm16, zmm12
+ vaesenc zmm17, zmm17, zmm12
+ vaesenc zmm18, zmm18, zmm12
+ vaesenc zmm19, zmm19, zmm12
+ vaesenc zmm16, zmm16, zmm13
+ vaesenc zmm17, zmm17, zmm13
+ vaesenc zmm18, zmm18, zmm13
+ vaesenc zmm19, zmm19, zmm13
+ vaesenc zmm16, zmm16, zmm14
+ vaesenc zmm17, zmm17, zmm14
+ vaesenc zmm18, zmm18, zmm14
+ vaesenc zmm19, zmm19, zmm14
+ vaesenc zmm16, zmm16, zmm15
+ vaesenc zmm17, zmm17, zmm15
+ vaesenc zmm18, zmm18, zmm15
+ vaesenc zmm19, zmm19, zmm15
+ vaesenc zmm16, zmm16, zmm1
+ vaesenc zmm17, zmm17, zmm1
+ vaesenc zmm18, zmm18, zmm1
+ vaesenc zmm19, zmm19, zmm1
+ vaesenc zmm16, zmm16, zmm2
+ vaesenc zmm17, zmm17, zmm2
+ vaesenc zmm18, zmm18, zmm2
+ vaesenc zmm19, zmm19, zmm2
+ vaesenc zmm16, zmm16, zmm3
+ vaesenc zmm17, zmm17, zmm3
+ vaesenc zmm18, zmm18, zmm3
+ vaesenc zmm19, zmm19, zmm3
+ cmp r8d, 11
+ vbroadcasti32x4 zmm20, [rax+160]
+ jl L_AES_GCM_encrypt_update_avx512_pro_avx512_ctr16_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [rax+176]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ cmp r8d, 13
+ vbroadcasti32x4 zmm20, [rax+192]
+ jl L_AES_GCM_encrypt_update_avx512_pro_avx512_ctr16_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [rax+208]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [rax+224]
+L_AES_GCM_encrypt_update_avx512_pro_avx512_ctr16_last:
+ vaesenclast zmm16, zmm16, zmm20
+ vaesenclast zmm17, zmm17, zmm20
+ vaesenclast zmm18, zmm18, zmm20
+ vaesenclast zmm19, zmm19, zmm20
+ lea rcx, QWORD PTR [r11+rdi]
+ lea rdx, QWORD PTR [r10+rdi]
+ vmovdqu64 zmm21, [rcx]
+ vpxorq zmm16, zmm16, zmm21
+ vmovdqu64 [rdx], zmm16
+ vmovdqu64 zmm21, [rcx+64]
+ vpxorq zmm17, zmm17, zmm21
+ vmovdqu64 [rdx+64], zmm17
+ vmovdqu64 zmm21, [rcx+128]
+ vpxorq zmm18, zmm18, zmm21
+ vmovdqu64 [rdx+128], zmm18
+ vmovdqu64 zmm21, [rcx+192]
+ vpxorq zmm19, zmm19, zmm21
+ vmovdqu64 [rdx+192], zmm19
+ mov rsi, rdx
+ add edi, 256
+ cmp edi, r13d
+ jge L_AES_GCM_encrypt_update_avx512_last_ghash
+L_AES_GCM_encrypt_update_avx512_ghash_128:
+ vbroadcasti32x4 zmm20, [r15]
+ vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
+ vpshufb zmm16, zmm16, zmm22
+ vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
+ vpshufb zmm17, zmm17, zmm22
+ vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2
+ vpshufb zmm18, zmm18, zmm22
+ vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3
+ vpshufb zmm19, zmm19, zmm22
+ vmovdqu xmm8, OWORD PTR [r15]
+ vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen
+ vmovdqu OWORD PTR [r15], xmm8
+ vpxorq zmm16, zmm16, zmm9
+ vpxorq zmm17, zmm17, zmm9
+ vpxorq zmm18, zmm18, zmm9
+ vpxorq zmm19, zmm19, zmm9
+ vaesenc zmm16, zmm16, zmm10
+ vaesenc zmm17, zmm17, zmm10
+ vaesenc zmm18, zmm18, zmm10
+ vaesenc zmm19, zmm19, zmm10
+ vaesenc zmm16, zmm16, zmm11
+ vaesenc zmm17, zmm17, zmm11
+ vaesenc zmm18, zmm18, zmm11
+ vaesenc zmm19, zmm19, zmm11
+ vaesenc zmm16, zmm16, zmm12
+ vaesenc zmm17, zmm17, zmm12
+ vaesenc zmm18, zmm18, zmm12
+ vaesenc zmm19, zmm19, zmm12
+ vaesenc zmm16, zmm16, zmm13
+ vaesenc zmm17, zmm17, zmm13
+ vaesenc zmm18, zmm18, zmm13
+ vaesenc zmm19, zmm19, zmm13
+ vaesenc zmm16, zmm16, zmm14
+ vaesenc zmm17, zmm17, zmm14
+ vaesenc zmm18, zmm18, zmm14
+ vaesenc zmm19, zmm19, zmm14
+ vaesenc zmm16, zmm16, zmm15
+ vaesenc zmm17, zmm17, zmm15
+ vaesenc zmm18, zmm18, zmm15
+ vaesenc zmm19, zmm19, zmm15
+ vaesenc zmm16, zmm16, zmm1
+ vaesenc zmm17, zmm17, zmm1
+ vaesenc zmm18, zmm18, zmm1
+ vaesenc zmm19, zmm19, zmm1
+ vaesenc zmm16, zmm16, zmm2
+ vaesenc zmm17, zmm17, zmm2
+ vaesenc zmm18, zmm18, zmm2
+ vaesenc zmm19, zmm19, zmm2
+ vaesenc zmm16, zmm16, zmm3
+ vaesenc zmm17, zmm17, zmm3
+ vaesenc zmm18, zmm18, zmm3
+ vaesenc zmm19, zmm19, zmm3
+ cmp r8d, 11
+ vbroadcasti32x4 zmm20, [rax+160]
+ jl L_AES_GCM_encrypt_update_avx512_pip_avx512_ctr16_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [rax+176]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ cmp r8d, 13
+ vbroadcasti32x4 zmm20, [rax+192]
+ jl L_AES_GCM_encrypt_update_avx512_pip_avx512_ctr16_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [rax+208]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [rax+224]
+L_AES_GCM_encrypt_update_avx512_pip_avx512_ctr16_last:
+ vaesenclast zmm16, zmm16, zmm20
+ vaesenclast zmm17, zmm17, zmm20
+ vaesenclast zmm18, zmm18, zmm20
+ vaesenclast zmm19, zmm19, zmm20
+ lea rcx, QWORD PTR [r11+rdi]
+ lea rdx, QWORD PTR [r10+rdi]
+ vmovdqu64 zmm21, [rcx]
+ vpxorq zmm16, zmm16, zmm21
+ vmovdqu64 [rdx], zmm16
+ vmovdqu64 zmm21, [rcx+64]
+ vpxorq zmm17, zmm17, zmm21
+ vmovdqu64 [rdx+64], zmm17
+ vmovdqu64 zmm21, [rcx+128]
+ vpxorq zmm18, zmm18, zmm21
+ vmovdqu64 [rdx+128], zmm18
+ vmovdqu64 zmm21, [rcx+192]
+ vpxorq zmm19, zmm19, zmm21
+ vmovdqu64 [rdx+192], zmm19
+ vpxorq zmm20, zmm20, zmm20
+ vinserti32x4 zmm20, zmm20, xmm6, 0
+ vmovdqu64 zmm21, [rsi]
+ vpshufb zmm21, zmm21, zmm30
+ vpxorq zmm21, zmm21, zmm20
+ vpclmulqdq zmm16, zmm21, zmm23, 0
+ vpclmulqdq zmm17, zmm21, zmm23, 1
+ vpclmulqdq zmm18, zmm21, zmm23, 16
+ vpclmulqdq zmm19, zmm21, zmm23, 17
+ vmovdqa64 zmm27, zmm16
+ vpxorq zmm28, zmm18, zmm17
+ vmovdqa64 zmm29, zmm19
+ vmovdqu64 zmm21, [rsi+64]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm24, 0
+ vpclmulqdq zmm17, zmm21, zmm24, 1
+ vpclmulqdq zmm18, zmm21, zmm24, 16
+ vpclmulqdq zmm19, zmm21, zmm24, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vmovdqu64 zmm21, [rsi+128]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm25, 0
+ vpclmulqdq zmm17, zmm21, zmm25, 1
+ vpclmulqdq zmm18, zmm21, zmm25, 16
+ vpclmulqdq zmm19, zmm21, zmm25, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vmovdqu64 zmm21, [rsi+192]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm26, 0
+ vpclmulqdq zmm17, zmm21, zmm26, 1
+ vpclmulqdq zmm18, zmm21, zmm26, 16
+ vpclmulqdq zmm19, zmm21, zmm26, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vpclmulqdq zmm21, zmm31, zmm27, 1
+ vpshufd zmm27, zmm27, 78
+ vpternlogq zmm28, zmm27, zmm21, 150
+ vpclmulqdq zmm21, zmm31, zmm28, 1
+ vpshufd zmm28, zmm28, 78
+ vpternlogq zmm29, zmm28, zmm21, 150
+ vextracti32x4 xmm0, zmm29, 1
+ vextracti32x4 xmm4, zmm29, 2
+ vextracti32x4 xmm5, zmm29, 3
+ vpxorq xmm6, xmm29, xmm0
+ vpternlogq xmm6, xmm5, xmm4, 150
+ mov rsi, rdx
+ add edi, 256
+ cmp edi, r13d
+ jl L_AES_GCM_encrypt_update_avx512_ghash_128
+L_AES_GCM_encrypt_update_avx512_last_ghash:
+ vpxorq zmm20, zmm20, zmm20
+ vinserti32x4 zmm20, zmm20, xmm6, 0
+ vmovdqu64 zmm21, [rsi]
+ vpshufb zmm21, zmm21, zmm30
+ vpxorq zmm21, zmm21, zmm20
+ vpclmulqdq zmm16, zmm21, zmm23, 0
+ vpclmulqdq zmm17, zmm21, zmm23, 1
+ vpclmulqdq zmm18, zmm21, zmm23, 16
+ vpclmulqdq zmm19, zmm21, zmm23, 17
+ vmovdqa64 zmm27, zmm16
+ vpxorq zmm28, zmm18, zmm17
+ vmovdqa64 zmm29, zmm19
+ vmovdqu64 zmm21, [rsi+64]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm24, 0
+ vpclmulqdq zmm17, zmm21, zmm24, 1
+ vpclmulqdq zmm18, zmm21, zmm24, 16
+ vpclmulqdq zmm19, zmm21, zmm24, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vmovdqu64 zmm21, [rsi+128]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm25, 0
+ vpclmulqdq zmm17, zmm21, zmm25, 1
+ vpclmulqdq zmm18, zmm21, zmm25, 16
+ vpclmulqdq zmm19, zmm21, zmm25, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vmovdqu64 zmm21, [rsi+192]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm26, 0
+ vpclmulqdq zmm17, zmm21, zmm26, 1
+ vpclmulqdq zmm18, zmm21, zmm26, 16
+ vpclmulqdq zmm19, zmm21, zmm26, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vpclmulqdq zmm21, zmm31, zmm27, 1
+ vpshufd zmm27, zmm27, 78
+ vpternlogq zmm28, zmm27, zmm21, 150
+ vpclmulqdq zmm21, zmm31, zmm28, 1
+ vpshufd zmm28, zmm28, 78
+ vpternlogq zmm29, zmm28, zmm21, 150
+ vextracti32x4 xmm0, zmm29, 1
+ vextracti32x4 xmm4, zmm29, 2
+ vextracti32x4 xmm5, zmm29, 3
+ vpxorq xmm6, xmm29, xmm0
+ vpternlogq xmm6, xmm5, xmm4, 150
+L_AES_GCM_encrypt_update_avx512_after_256:
+ vmovdqu xmm5, OWORD PTR [rsp]
+L_AES_GCM_encrypt_update_avx512_done_128:
+ mov edx, r9d
+ cmp edi, edx
+ jge L_AES_GCM_encrypt_update_avx512_done_enc
+ mov r13d, r9d
+ and r13d, 4294967280
+ cmp edi, r13d
+ jge L_AES_GCM_encrypt_update_avx512_last_block_done
+ vmovdqu xmm9, OWORD PTR [r15]
+ vpshufb xmm8, xmm9, OWORD PTR L_avx512_aes_gcm_bswap_epi64
+ vpaddd xmm9, xmm9, OWORD PTR L_avx512_aes_gcm_one
+ vmovdqu OWORD PTR [r15], xmm9
+ vpxor xmm8, xmm8, [rax]
+ vaesenc xmm8, xmm8, [rax+16]
+ vaesenc xmm8, xmm8, [rax+32]
+ vaesenc xmm8, xmm8, [rax+48]
+ vaesenc xmm8, xmm8, [rax+64]
+ vaesenc xmm8, xmm8, [rax+80]
+ vaesenc xmm8, xmm8, [rax+96]
+ vaesenc xmm8, xmm8, [rax+112]
+ vaesenc xmm8, xmm8, [rax+128]
+ vaesenc xmm8, xmm8, [rax+144]
+ cmp r8d, 11
+ vmovdqa xmm9, OWORD PTR [rax+160]
+ jl L_AES_GCM_encrypt_update_avx512_aesenc_block_last
+ vaesenc xmm8, xmm8, xmm9
+ vaesenc xmm8, xmm8, [rax+176]
+ cmp r8d, 13
+ vmovdqa xmm9, OWORD PTR [rax+192]
+ jl L_AES_GCM_encrypt_update_avx512_aesenc_block_last
+ vaesenc xmm8, xmm8, xmm9
+ vaesenc xmm8, xmm8, [rax+208]
+ vmovdqa xmm9, OWORD PTR [rax+224]
+L_AES_GCM_encrypt_update_avx512_aesenc_block_last:
+ vaesenclast xmm8, xmm8, xmm9
+ vmovdqu xmm9, OWORD PTR [r11+rdi]
+ vpxor xmm8, xmm8, xmm9
+ vmovdqu OWORD PTR [r10+rdi], xmm8
+ vpshufb xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ vpxor xmm6, xmm6, xmm8
+ add edi, 16
+ cmp edi, r13d
+ jge L_AES_GCM_encrypt_update_avx512_last_block_ghash
+L_AES_GCM_encrypt_update_avx512_last_block_start:
+ vmovdqu xmm13, OWORD PTR [r11+rdi]
+ vmovdqu xmm9, OWORD PTR [r15]
+ vpshufb xmm8, xmm9, OWORD PTR L_avx512_aes_gcm_bswap_epi64
+ vpaddd xmm9, xmm9, OWORD PTR L_avx512_aes_gcm_one
+ vmovdqu OWORD PTR [r15], xmm9
+ vpxor xmm8, xmm8, [rax]
+ vpclmulqdq xmm10, xmm6, xmm5, 16
+ vaesenc xmm8, xmm8, [rax+16]
+ vaesenc xmm8, xmm8, [rax+32]
+ vpclmulqdq xmm11, xmm6, xmm5, 1
+ vaesenc xmm8, xmm8, [rax+48]
+ vaesenc xmm8, xmm8, [rax+64]
+ vpclmulqdq xmm12, xmm6, xmm5, 0
+ vaesenc xmm8, xmm8, [rax+80]
+ vpclmulqdq xmm1, xmm6, xmm5, 17
+ vaesenc xmm8, xmm8, [rax+96]
+ vpxor xmm10, xmm10, xmm11
+ vpslldq xmm2, xmm10, 8
+ vpsrldq xmm10, xmm10, 8
+ vaesenc xmm8, xmm8, [rax+112]
+ vpxor xmm2, xmm2, xmm12
+ vpxor xmm3, xmm1, xmm10
+ vmovdqa xmm0, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm2, xmm0, 16
+ vaesenc xmm8, xmm8, [rax+128]
+ vpshufd xmm10, xmm2, 78
+ vpxor xmm10, xmm10, xmm11
+ vpclmulqdq xmm11, xmm10, xmm0, 16
+ vaesenc xmm8, xmm8, [rax+144]
+ vpshufd xmm10, xmm10, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm6, xmm10, xmm3
+ cmp r8d, 11
+ vmovdqa xmm9, OWORD PTR [rax+160]
+ jl L_AES_GCM_encrypt_update_avx512_aesenc_gfmul_last
+ vaesenc xmm8, xmm8, xmm9
+ vaesenc xmm8, xmm8, [rax+176]
+ cmp r8d, 13
+ vmovdqa xmm9, OWORD PTR [rax+192]
+ jl L_AES_GCM_encrypt_update_avx512_aesenc_gfmul_last
+ vaesenc xmm8, xmm8, xmm9
+ vaesenc xmm8, xmm8, [rax+208]
+ vmovdqa xmm9, OWORD PTR [rax+224]
+L_AES_GCM_encrypt_update_avx512_aesenc_gfmul_last:
+ vaesenclast xmm8, xmm8, xmm9
+ vmovdqa xmm0, xmm13
+ vpxor xmm8, xmm8, xmm0
+ vmovdqu OWORD PTR [r10+rdi], xmm8
+ vpshufb xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ add edi, 16
+ vpxor xmm6, xmm6, xmm8
+ cmp edi, r13d
+ jl L_AES_GCM_encrypt_update_avx512_last_block_start
+L_AES_GCM_encrypt_update_avx512_last_block_ghash:
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm5, 78
+ vpxor xmm9, xmm9, xmm5
+ vpshufd xmm10, xmm6, 78
+ vpxor xmm10, xmm10, xmm6
+ vpclmulqdq xmm8, xmm6, xmm5, 0
+ vpclmulqdq xmm11, xmm6, xmm5, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm6, xmm11
+L_AES_GCM_encrypt_update_avx512_last_block_done:
+L_AES_GCM_encrypt_update_avx512_done_enc:
+ vmovdqa OWORD PTR [r12], xmm6
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp+1040]
+ vmovdqu xmm7, OWORD PTR [rsp+1056]
+ vmovdqu xmm8, OWORD PTR [rsp+1072]
+ vmovdqu xmm9, OWORD PTR [rsp+1088]
+ vmovdqu xmm10, OWORD PTR [rsp+1104]
+ vmovdqu xmm11, OWORD PTR [rsp+1120]
+ vmovdqu xmm12, OWORD PTR [rsp+1136]
+ vmovdqu xmm13, OWORD PTR [rsp+1152]
+ vmovdqu xmm14, OWORD PTR [rsp+1168]
+ vmovdqu xmm15, OWORD PTR [rsp+1184]
+ add rsp, 1200
+ pop rbp
+ pop rbx
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r12
+ pop r13
+ ret
+AES_GCM_encrypt_update_avx512 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_encrypt_final_avx512 PROC ; Folds two lengths into the hash accumulator, multiplies by the doubled hash key, XORs with the block at the 7th argument and writes r8d tag bytes to the 2nd argument
+ push r13 ; r13, r12 and r14 are overwritten below, so preserve them for the caller
+ push r12
+ push r14
+ mov rax, rcx ; rax = 1st argument: pointer to the 16-byte hash accumulator loaded into xmm4 below
+ mov r10d, r9d ; r10d = 4th argument: first length (presumably in bytes; it is multiplied by 8 below)
+ mov r9, rdx ; r9 = 2nd argument: destination buffer the tag bytes are written to
+ mov r11d, DWORD PTR [rsp+64] ; r11d = 5th argument: second length; offset 64 = 3 pushes + return address + 32-byte home space (Microsoft x64 layout)
+ mov r12, QWORD PTR [rsp+72] ; r12 = 6th argument: pointer to the 16-byte value loaded into xmm5 (presumably the hash key H)
+ mov r14, QWORD PTR [rsp+80] ; r14 = 7th argument: pointer to the 16-byte block XORed into the final tag
+ sub rsp, 144 ; 16 bytes of scratch at [rsp] plus 8 x 16 bytes for xmm6-xmm13
+ vmovdqu OWORD PTR [rsp+16], xmm6 ; save xmm6-xmm13; they are restored just before returning
+ vmovdqu OWORD PTR [rsp+32], xmm7
+ vmovdqu OWORD PTR [rsp+48], xmm8
+ vmovdqu OWORD PTR [rsp+64], xmm9
+ vmovdqu OWORD PTR [rsp+80], xmm10
+ vmovdqu OWORD PTR [rsp+96], xmm11
+ vmovdqu OWORD PTR [rsp+112], xmm12
+ vmovdqu OWORD PTR [rsp+128], xmm13
+ vmovdqa xmm4, OWORD PTR [rax] ; aligned loads: rax, r12 and r14 must each point to 16-byte aligned memory
+ vmovdqa xmm5, OWORD PTR [r12]
+ vmovdqa xmm6, OWORD PTR [r14]
+ vpsrlq xmm8, xmm5, 63 ; bit 63 of each qword, i.e. the bits the vpsllq below shifts out
+ vpsllq xmm7, xmm5, 1 ; each qword of xmm5 shifted left by one
+ vpslldq xmm8, xmm8, 8 ; move the low qword's carry up into the high qword; the high qword's carry is dropped
+ vpor xmm7, xmm7, xmm8 ; xmm7 = xmm5 << 1 as a single 128-bit value
+ vpshufd xmm5, xmm5, 255 ; broadcast the top dword of xmm5 to all four lanes
+ vpsrad xmm5, xmm5, 31 ; all-ones if bit 127 of the original value was set, otherwise zero
+ vpand xmm5, xmm5, OWORD PTR L_avx512_aes_gcm_mod2_128 ; keep the constant only when bit 127 was set
+ vpxor xmm5, xmm5, xmm7 ; xmm5 = (xmm5 << 1) ^ (constant if bit 127 was set)
+ mov edx, r10d ; 32-bit moves zero-extend, so the 64-bit shifts below cannot lose bits
+ mov ecx, r11d
+ shl rdx, 3 ; multiply both lengths by 8 (presumably bytes -> bits)
+ shl rcx, 3
+ vmovq xmm0, rdx
+ vmovq xmm1, rcx
+ vpunpcklqdq xmm0, xmm0, xmm1 ; xmm0 = { low qword: rdx, high qword: rcx }
+ vpxor xmm4, xmm4, xmm0 ; fold the length block into the accumulator
+ ; ghash_gfmul_red_avx: carry-less multiply of xmm4 by xmm5 followed by two folds with the mod2_128 constant; result ends up in xmm4
+ vpshufd xmm8, xmm5, 78 ; swap the qword halves of xmm5
+ vpxor xmm8, xmm8, xmm5 ; xmm8 = hi ^ lo of xmm5 (in both halves)
+ vpshufd xmm9, xmm4, 78 ; swap the qword halves of xmm4
+ vpxor xmm9, xmm9, xmm4 ; xmm9 = hi ^ lo of xmm4 (in both halves)
+ vpclmulqdq xmm7, xmm4, xmm5, 0 ; xmm7 = lo(xmm4) * lo(xmm5)
+ vpclmulqdq xmm10, xmm4, xmm5, 17 ; xmm10 = hi(xmm4) * hi(xmm5)
+ vpclmulqdq xmm8, xmm8, xmm9, 0 ; xmm8 = (hi ^ lo of xmm5) * (hi ^ lo of xmm4)
+ vpternlogq xmm8, xmm10, xmm7, 150 ; imm 150 is a three-way XOR: xmm8 ^= xmm10 ^ xmm7, leaving the middle term
+ vmovdqa xmm9, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1 ; xmm11 = hi(constant) * lo(xmm7)
+ vpshufd xmm7, xmm7, 78 ; swap the halves of the low product
+ vpternlogq xmm8, xmm7, xmm11, 150 ; first fold: xmm8 ^= xmm7 ^ xmm11
+ vpclmulqdq xmm11, xmm9, xmm8, 1 ; xmm11 = hi(constant) * lo(xmm8)
+ vpshufd xmm8, xmm8, 78 ; swap the halves of the folded middle term
+ vpternlogq xmm10, xmm8, xmm11, 150 ; second fold: xmm10 ^= xmm8 ^ xmm11
+ vmovdqa xmm4, xmm10 ; xmm4 = reduced product
+ vpshufb xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_bswap_mask ; byte shuffle by a mask defined elsewhere (presumably a full byte reversal)
+ vpxor xmm0, xmm4, xmm6 ; tag = shuffled hash ^ block loaded from [r14]
+ cmp r8d, 16 ; r8d = 3rd argument: number of tag bytes to write
+ je L_AES_GCM_encrypt_final_avx512_store_tag_16 ; exactly 16 bytes: write them with one unaligned store
+ xor rcx, rcx ; rcx = byte index for the partial-tag copy
+ vmovdqu OWORD PTR [rsp], xmm0 ; spill the tag to the scratch slot so it can be copied bytewise
+L_AES_GCM_encrypt_final_avx512_store_tag_loop:
+ movzx r13d, BYTE PTR [rsp+rcx] ; read one tag byte from the scratch slot
+ mov BYTE PTR [r9+rcx], r13b ; write it to the destination buffer
+ inc ecx
+ cmp ecx, r8d
+ jne L_AES_GCM_encrypt_final_avx512_store_tag_loop ; the test follows the copy, so r8d must be non-zero, and at most 16 to stay inside the scratch slot
+ jmp L_AES_GCM_encrypt_final_avx512_store_tag_done
+L_AES_GCM_encrypt_final_avx512_store_tag_16:
+ vmovdqu OWORD PTR [r9], xmm0 ; unaligned store, so the destination needs no particular alignment
+L_AES_GCM_encrypt_final_avx512_store_tag_done:
+ vzeroupper ; clear the upper vector-register bits before returning to the caller
+ vmovdqu xmm6, OWORD PTR [rsp+16] ; restore xmm6-xmm13 from the slots filled at entry
+ vmovdqu xmm7, OWORD PTR [rsp+32]
+ vmovdqu xmm8, OWORD PTR [rsp+48]
+ vmovdqu xmm9, OWORD PTR [rsp+64]
+ vmovdqu xmm10, OWORD PTR [rsp+80]
+ vmovdqu xmm11, OWORD PTR [rsp+96]
+ vmovdqu xmm12, OWORD PTR [rsp+112]
+ vmovdqu xmm13, OWORD PTR [rsp+128]
+ add rsp, 144 ; release the scratch and save area
+ pop r14 ; pops mirror the pushes at entry
+ pop r12
+ pop r13
+ ret
+AES_GCM_encrypt_final_avx512 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_decrypt_update_avx512 PROC
+ push r13
+ push r12
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbx
+ mov rax, rcx
+ mov r10, r8
+ mov r8d, edx
+ mov r11, r9
+ mov r9d, DWORD PTR [rsp+96]
+ mov r12, QWORD PTR [rsp+104]
+ mov r14, QWORD PTR [rsp+112]
+ mov r15, QWORD PTR [rsp+120]
+ sub rsp, 1200
+ vmovdqu OWORD PTR [rsp+1040], xmm6
+ vmovdqu OWORD PTR [rsp+1056], xmm7
+ vmovdqu OWORD PTR [rsp+1072], xmm8
+ vmovdqu OWORD PTR [rsp+1088], xmm9
+ vmovdqu OWORD PTR [rsp+1104], xmm10
+ vmovdqu OWORD PTR [rsp+1120], xmm11
+ vmovdqu OWORD PTR [rsp+1136], xmm12
+ vmovdqu OWORD PTR [rsp+1152], xmm13
+ vmovdqu OWORD PTR [rsp+1168], xmm14
+ vmovdqu OWORD PTR [rsp+1184], xmm15
+ vmovdqa xmm6, OWORD PTR [r12]
+ vmovdqa xmm5, OWORD PTR [r14]
+ vpsrlq xmm9, xmm5, 63
+ vpsllq xmm8, xmm5, 1
+ vpslldq xmm9, xmm9, 8
+ vpor xmm8, xmm8, xmm9
+ vpshufd xmm5, xmm5, 255
+ vpsrad xmm5, xmm5, 31
+ vpand xmm5, xmm5, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpxor xmm5, xmm5, xmm8
+ xor edi, edi
+ cmp r9d, 256
+ jl L_AES_GCM_decrypt_update_avx512_done_128
+ vmovdqa xmm2, xmm6
+ ; H ^ 1
+ vmovdqu OWORD PTR [rsp], xmm5
+ ; H ^ 2
+ vpclmulqdq xmm8, xmm5, xmm5, 0
+ vpclmulqdq xmm11, xmm5, xmm5, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm0, xmm11
+ vmovdqu OWORD PTR [rsp+16], xmm0
+ ; H ^ 3
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm5, 78
+ vpxor xmm9, xmm9, xmm5
+ vpshufd xmm10, xmm0, 78
+ vpxor xmm10, xmm10, xmm0
+ vpclmulqdq xmm8, xmm0, xmm5, 0
+ vpclmulqdq xmm11, xmm0, xmm5, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm1, xmm11
+ vmovdqu OWORD PTR [rsp+32], xmm1
+ ; H ^ 4
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm3, xmm11
+ vmovdqu OWORD PTR [rsp+48], xmm3
+ ; H ^ 5
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+64], xmm7
+ ; H ^ 6
+ vpclmulqdq xmm8, xmm1, xmm1, 0
+ vpclmulqdq xmm11, xmm1, xmm1, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+80], xmm7
+ ; H ^ 7
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm1, 78
+ vpxor xmm9, xmm9, xmm1
+ vpshufd xmm10, xmm3, 78
+ vpxor xmm10, xmm10, xmm3
+ vpclmulqdq xmm8, xmm3, xmm1, 0
+ vpclmulqdq xmm11, xmm3, xmm1, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm7
+ ; H ^ 8
+ vpclmulqdq xmm8, xmm3, xmm3, 0
+ vpclmulqdq xmm11, xmm3, xmm3, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+112], xmm7
+ ; H ^ 9
+ vmovdqu xmm0, OWORD PTR [rsp+48]
+ vmovdqu xmm1, OWORD PTR [rsp+64]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+128], xmm7
+ ; H ^ 10
+ vmovdqu xmm0, OWORD PTR [rsp+64]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+144], xmm7
+ ; H ^ 11
+ vmovdqu xmm0, OWORD PTR [rsp+64]
+ vmovdqu xmm1, OWORD PTR [rsp+80]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+160], xmm7
+ ; H ^ 12
+ vmovdqu xmm0, OWORD PTR [rsp+80]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+176], xmm7
+ ; H ^ 13
+ vmovdqu xmm0, OWORD PTR [rsp+80]
+ vmovdqu xmm1, OWORD PTR [rsp+96]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+192], xmm7
+ ; H ^ 14
+ vmovdqu xmm0, OWORD PTR [rsp+96]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+208], xmm7
+ ; H ^ 15
+ vmovdqu xmm0, OWORD PTR [rsp+96]
+ vmovdqu xmm1, OWORD PTR [rsp+112]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+224], xmm7
+ ; H ^ 16
+ vmovdqu xmm0, OWORD PTR [rsp+112]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+240], xmm7
+ cmp r9d, 512
+ jl L_AES_GCM_decrypt_update_avx512_no_ext
+ ; H ^ 17
+ vmovdqu xmm0, OWORD PTR [rsp+112]
+ vmovdqu xmm1, OWORD PTR [rsp+128]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+256], xmm7
+ ; H ^ 18
+ vmovdqu xmm0, OWORD PTR [rsp+128]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+272], xmm7
+ ; H ^ 19
+ vmovdqu xmm0, OWORD PTR [rsp+128]
+ vmovdqu xmm1, OWORD PTR [rsp+144]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+288], xmm7
+ ; H ^ 20
+ vmovdqu xmm0, OWORD PTR [rsp+144]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+304], xmm7
+ ; H ^ 21
+ vmovdqu xmm0, OWORD PTR [rsp+144]
+ vmovdqu xmm1, OWORD PTR [rsp+160]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+320], xmm7
+ ; H ^ 22
+ vmovdqu xmm0, OWORD PTR [rsp+160]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+336], xmm7
+ ; H ^ 23
+ vmovdqu xmm0, OWORD PTR [rsp+160]
+ vmovdqu xmm1, OWORD PTR [rsp+176]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+352], xmm7
+ ; H ^ 24
+ vmovdqu xmm0, OWORD PTR [rsp+176]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+368], xmm7
+ ; H ^ 25
+ vmovdqu xmm0, OWORD PTR [rsp+176]
+ vmovdqu xmm1, OWORD PTR [rsp+192]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+384], xmm7
+ ; H ^ 26
+ vmovdqu xmm0, OWORD PTR [rsp+192]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+400], xmm7
+ ; H ^ 27
+ vmovdqu xmm0, OWORD PTR [rsp+192]
+ vmovdqu xmm1, OWORD PTR [rsp+208]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+416], xmm7
+ ; H ^ 28
+ vmovdqu xmm0, OWORD PTR [rsp+208]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+432], xmm7
+ ; H ^ 29
+ vmovdqu xmm0, OWORD PTR [rsp+208]
+ vmovdqu xmm1, OWORD PTR [rsp+224]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+448], xmm7
+ ; H ^ 30
+ vmovdqu xmm0, OWORD PTR [rsp+224]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+464], xmm7
+ ; H ^ 31
+ vmovdqu xmm0, OWORD PTR [rsp+224]
+ vmovdqu xmm1, OWORD PTR [rsp+240]
+ ; ghash_gfmul_red_avx
+ vpshufd xmm9, xmm0, 78
+ vpxor xmm9, xmm9, xmm0
+ vpshufd xmm10, xmm1, 78
+ vpxor xmm10, xmm10, xmm1
+ vpclmulqdq xmm8, xmm1, xmm0, 0
+ vpclmulqdq xmm11, xmm1, xmm0, 17
+ vpclmulqdq xmm9, xmm9, xmm10, 0
+ vpternlogq xmm9, xmm11, xmm8, 150
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+480], xmm7
+ ; H ^ 32
+ vmovdqu xmm0, OWORD PTR [rsp+240]
+ vpclmulqdq xmm8, xmm0, xmm0, 0
+ vpclmulqdq xmm11, xmm0, xmm0, 17
+ vpxor xmm9, xmm9, xmm9
+ vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm12, xmm10, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm9, xmm8, xmm12, 150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm7, xmm11
+ vmovdqu OWORD PTR [rsp+496], xmm7
+L_AES_GCM_decrypt_update_avx512_no_ext:
+ vbroadcasti32x4 zmm22, ptr_L_avx512_aes_gcm_bswap_epi64
+ vbroadcasti32x4 zmm30, ptr_L_avx512_aes_gcm_bswap_mask
+ vbroadcasti32x4 zmm31, ptr_L_avx512_aes_gcm_mod2_128
+ vbroadcasti32x4 zmm9, [rax]
+ vbroadcasti32x4 zmm10, [rax+16]
+ vbroadcasti32x4 zmm11, [rax+32]
+ vbroadcasti32x4 zmm12, [rax+48]
+ vbroadcasti32x4 zmm13, [rax+64]
+ vbroadcasti32x4 zmm14, [rax+80]
+ vbroadcasti32x4 zmm15, [rax+96]
+ vbroadcasti32x4 zmm1, [rax+112]
+ vbroadcasti32x4 zmm2, [rax+128]
+ vbroadcasti32x4 zmm3, [rax+144]
+ cmp r9d, 512
+ jl L_AES_GCM_decrypt_update_avx512_no_windows
+ mov r13d, r9d
+ and r13d, 4294966784
+ vmovdqu64 zmm23, [rsp+448]
+ vshufi64x2 zmm23, zmm23, zmm23, 27
+ vmovdqu64 zmm24, [rsp+384]
+ vshufi64x2 zmm24, zmm24, zmm24, 27
+ vmovdqu64 zmm25, [rsp+320]
+ vshufi64x2 zmm25, zmm25, zmm25, 27
+ vmovdqu64 zmm26, [rsp+256]
+ vshufi64x2 zmm26, zmm26, zmm26, 27
+ vmovdqu64 [rsp+512], zmm23
+ vmovdqu64 [rsp+576], zmm24
+ vmovdqu64 [rsp+640], zmm25
+ vmovdqu64 [rsp+704], zmm26
+ vmovdqu64 zmm23, [rsp+192]
+ vshufi64x2 zmm23, zmm23, zmm23, 27
+ vmovdqu64 zmm24, [rsp+128]
+ vshufi64x2 zmm24, zmm24, zmm24, 27
+ vmovdqu64 zmm25, [rsp+64]
+ vshufi64x2 zmm25, zmm25, zmm25, 27
+ vmovdqu64 zmm26, [rsp]
+ vshufi64x2 zmm26, zmm26, zmm26, 27
+ vmovdqu64 [rsp+768], zmm23
+ vmovdqu64 [rsp+832], zmm24
+ vmovdqu64 [rsp+896], zmm25
+ vmovdqu64 [rsp+960], zmm26
+ ; 512 bytes of input
+ xor esi, esi
+ lea rbx, QWORD PTR [r11+rdi]
+ vpxorq zmm20, zmm20, zmm20
+ vinserti32x4 zmm20, zmm20, xmm6, 0
+ vmovdqu64 zmm23, [rsp+512]
+ vmovdqu64 zmm24, [rsp+576]
+ vmovdqu64 zmm25, [rsp+640]
+ vmovdqu64 zmm26, [rsp+704]
+ vmovdqu64 zmm21, [rbx]
+ vpshufb zmm21, zmm21, zmm30
+ vpxorq zmm21, zmm21, zmm20
+ vpclmulqdq zmm16, zmm21, zmm23, 0
+ vpclmulqdq zmm17, zmm21, zmm23, 1
+ vpclmulqdq zmm18, zmm21, zmm23, 16
+ vpclmulqdq zmm19, zmm21, zmm23, 17
+ vmovdqa64 zmm27, zmm16
+ vpxorq zmm28, zmm18, zmm17
+ vmovdqa64 zmm29, zmm19
+ vmovdqu64 zmm21, [rbx+64]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm24, 0
+ vpclmulqdq zmm17, zmm21, zmm24, 1
+ vpclmulqdq zmm18, zmm21, zmm24, 16
+ vpclmulqdq zmm19, zmm21, zmm24, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vmovdqu64 zmm21, [rbx+128]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm25, 0
+ vpclmulqdq zmm17, zmm21, zmm25, 1
+ vpclmulqdq zmm18, zmm21, zmm25, 16
+ vpclmulqdq zmm19, zmm21, zmm25, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vmovdqu64 zmm21, [rbx+192]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm26, 0
+ vpclmulqdq zmm17, zmm21, zmm26, 1
+ vpclmulqdq zmm18, zmm21, zmm26, 16
+ vpclmulqdq zmm19, zmm21, zmm26, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vmovdqu64 zmm23, [rsp+768]
+ vmovdqu64 zmm24, [rsp+832]
+ vmovdqu64 zmm25, [rsp+896]
+ vmovdqu64 zmm26, [rsp+960]
+ vmovdqu64 zmm21, [rbx+256]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm23, 0
+ vpclmulqdq zmm17, zmm21, zmm23, 1
+ vpclmulqdq zmm18, zmm21, zmm23, 16
+ vpclmulqdq zmm19, zmm21, zmm23, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vmovdqu64 zmm21, [rbx+320]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm24, 0
+ vpclmulqdq zmm17, zmm21, zmm24, 1
+ vpclmulqdq zmm18, zmm21, zmm24, 16
+ vpclmulqdq zmm19, zmm21, zmm24, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vmovdqu64 zmm21, [rbx+384]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm25, 0
+ vpclmulqdq zmm17, zmm21, zmm25, 1
+ vpclmulqdq zmm18, zmm21, zmm25, 16
+ vpclmulqdq zmm19, zmm21, zmm25, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vmovdqu64 zmm21, [rbx+448]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm26, 0
+ vpclmulqdq zmm17, zmm21, zmm26, 1
+ vpclmulqdq zmm18, zmm21, zmm26, 16
+ vpclmulqdq zmm19, zmm21, zmm26, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vpclmulqdq zmm21, zmm31, zmm27, 1
+ vpshufd zmm27, zmm27, 78
+ vpternlogq zmm28, zmm27, zmm21, 150
+ vpclmulqdq zmm21, zmm31, zmm28, 1
+ vpshufd zmm28, zmm28, 78
+ vpternlogq zmm29, zmm28, zmm21, 150
+ vextracti32x4 xmm0, zmm29, 1
+ vextracti32x4 xmm4, zmm29, 2
+ vextracti32x4 xmm5, zmm29, 3
+ vpxorq xmm6, xmm29, xmm0
+ vpternlogq xmm6, xmm5, xmm4, 150
+ add edi, 512
+ cmp edi, r13d
+ jge L_AES_GCM_decrypt_update_avx512_last_aes
+L_AES_GCM_decrypt_update_avx512_win_loop:
+ lea rbx, QWORD PTR [r11+rdi]
+ vpxorq zmm21, zmm21, zmm21
+ vinserti32x4 zmm21, zmm21, xmm6, 0
+ vbroadcasti32x4 zmm20, [r15]
+ vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
+ vpshufb zmm16, zmm16, zmm22
+ vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
+ vpshufb zmm17, zmm17, zmm22
+ vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2
+ vpshufb zmm18, zmm18, zmm22
+ vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3
+ vpshufb zmm19, zmm19, zmm22
+ vmovdqu xmm8, OWORD PTR [r15]
+ vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen
+ vmovdqu OWORD PTR [r15], xmm8
+ vpxorq zmm16, zmm16, zmm9
+ vpxorq zmm17, zmm17, zmm9
+ vpxorq zmm18, zmm18, zmm9
+ vpxorq zmm19, zmm19, zmm9
+ vmovdqu64 zmm31, [rbx]
+ vpshufb zmm31, zmm31, zmm30
+ vpxorq zmm31, zmm31, zmm21
+ vaesenc zmm16, zmm16, zmm10
+ vaesenc zmm17, zmm17, zmm10
+ vaesenc zmm18, zmm18, zmm10
+ vaesenc zmm19, zmm19, zmm10
+ vpclmulqdq zmm23, zmm31, [rsp+512], 0
+ vpclmulqdq zmm24, zmm31, [rsp+512], 1
+ vpclmulqdq zmm25, zmm31, [rsp+512], 16
+ vpclmulqdq zmm26, zmm31, [rsp+512], 17
+ vmovdqa64 zmm27, zmm23
+ vpxorq zmm28, zmm25, zmm24
+ vmovdqa64 zmm29, zmm26
+ vaesenc zmm16, zmm16, zmm11
+ vaesenc zmm17, zmm17, zmm11
+ vaesenc zmm18, zmm18, zmm11
+ vaesenc zmm19, zmm19, zmm11
+ vmovdqu64 zmm31, [rbx+64]
+ vpshufb zmm31, zmm31, zmm30
+ vaesenc zmm16, zmm16, zmm12
+ vaesenc zmm17, zmm17, zmm12
+ vaesenc zmm18, zmm18, zmm12
+ vaesenc zmm19, zmm19, zmm12
+ vpclmulqdq zmm23, zmm31, [rsp+576], 0
+ vpclmulqdq zmm24, zmm31, [rsp+576], 1
+ vpclmulqdq zmm25, zmm31, [rsp+576], 16
+ vpclmulqdq zmm26, zmm31, [rsp+576], 17
+ vpxorq zmm27, zmm27, zmm23
+ vpternlogq zmm28, zmm25, zmm24, 150
+ vpxorq zmm29, zmm29, zmm26
+ vaesenc zmm16, zmm16, zmm13
+ vaesenc zmm17, zmm17, zmm13
+ vaesenc zmm18, zmm18, zmm13
+ vaesenc zmm19, zmm19, zmm13
+ vmovdqu64 zmm31, [rbx+128]
+ vpshufb zmm31, zmm31, zmm30
+ vaesenc zmm16, zmm16, zmm14
+ vaesenc zmm17, zmm17, zmm14
+ vaesenc zmm18, zmm18, zmm14
+ vaesenc zmm19, zmm19, zmm14
+ vpclmulqdq zmm23, zmm31, [rsp+640], 0
+ vpclmulqdq zmm24, zmm31, [rsp+640], 1
+ vpclmulqdq zmm25, zmm31, [rsp+640], 16
+ vpclmulqdq zmm26, zmm31, [rsp+640], 17
+ vpxorq zmm27, zmm27, zmm23
+ vpternlogq zmm28, zmm25, zmm24, 150
+ vpxorq zmm29, zmm29, zmm26
+ vaesenc zmm16, zmm16, zmm15
+ vaesenc zmm17, zmm17, zmm15
+ vaesenc zmm18, zmm18, zmm15
+ vaesenc zmm19, zmm19, zmm15
+ vmovdqu64 zmm31, [rbx+192]
+ vpshufb zmm31, zmm31, zmm30
+ vaesenc zmm16, zmm16, zmm1
+ vaesenc zmm17, zmm17, zmm1
+ vaesenc zmm18, zmm18, zmm1
+ vaesenc zmm19, zmm19, zmm1
+ vpclmulqdq zmm23, zmm31, [rsp+704], 0
+ vpclmulqdq zmm24, zmm31, [rsp+704], 1
+ vpclmulqdq zmm25, zmm31, [rsp+704], 16
+ vpclmulqdq zmm26, zmm31, [rsp+704], 17
+ vpxorq zmm27, zmm27, zmm23
+ vpternlogq zmm28, zmm25, zmm24, 150
+ vpxorq zmm29, zmm29, zmm26
+ vaesenc zmm16, zmm16, zmm2
+ vaesenc zmm17, zmm17, zmm2
+ vaesenc zmm18, zmm18, zmm2
+ vaesenc zmm19, zmm19, zmm2
+ vaesenc zmm16, zmm16, zmm3
+ vaesenc zmm17, zmm17, zmm3
+ vaesenc zmm18, zmm18, zmm3
+ vaesenc zmm19, zmm19, zmm3
+ cmp r8d, 11
+ vbroadcasti32x4 zmm20, [rax+160]
+ jl L_AES_GCM_decrypt_update_avx512_a_il_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [rax+176]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ cmp r8d, 13
+ vbroadcasti32x4 zmm20, [rax+192]
+ jl L_AES_GCM_decrypt_update_avx512_a_il_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [rax+208]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [rax+224]
+L_AES_GCM_decrypt_update_avx512_a_il_last:
+ vaesenclast zmm16, zmm16, zmm20
+ vaesenclast zmm17, zmm17, zmm20
+ vaesenclast zmm18, zmm18, zmm20
+ vaesenclast zmm19, zmm19, zmm20
+ lea rcx, QWORD PTR [r11+rsi]
+ lea rdx, QWORD PTR [r10+rsi]
+ vmovdqu64 zmm21, [rcx]
+ vpxorq zmm16, zmm16, zmm21
+ vmovdqu64 [rdx], zmm16
+ vmovdqu64 zmm21, [rcx+64]
+ vpxorq zmm17, zmm17, zmm21
+ vmovdqu64 [rdx+64], zmm17
+ vmovdqu64 zmm21, [rcx+128]
+ vpxorq zmm18, zmm18, zmm21
+ vmovdqu64 [rdx+128], zmm18
+ vmovdqu64 zmm21, [rcx+192]
+ vpxorq zmm19, zmm19, zmm21
+ vmovdqu64 [rdx+192], zmm19
+ add esi, 256
+ vbroadcasti32x4 zmm20, [r15]
+ vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
+ vpshufb zmm16, zmm16, zmm22
+ vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
+ vpshufb zmm17, zmm17, zmm22
+ vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2
+ vpshufb zmm18, zmm18, zmm22
+ vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3
+ vpshufb zmm19, zmm19, zmm22
+ vmovdqu xmm8, OWORD PTR [r15]
+ vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen
+ vmovdqu OWORD PTR [r15], xmm8
+ vpxorq zmm16, zmm16, zmm9
+ vpxorq zmm17, zmm17, zmm9
+ vpxorq zmm18, zmm18, zmm9
+ vpxorq zmm19, zmm19, zmm9
+ vmovdqu64 zmm31, [rbx+256]
+ vpshufb zmm31, zmm31, zmm30
+ vaesenc zmm16, zmm16, zmm10
+ vaesenc zmm17, zmm17, zmm10
+ vaesenc zmm18, zmm18, zmm10
+ vaesenc zmm19, zmm19, zmm10
+ vpclmulqdq zmm23, zmm31, [rsp+768], 0
+ vpclmulqdq zmm24, zmm31, [rsp+768], 1
+ vpclmulqdq zmm25, zmm31, [rsp+768], 16
+ vpclmulqdq zmm26, zmm31, [rsp+768], 17
+ vpxorq zmm27, zmm27, zmm23
+ vpternlogq zmm28, zmm25, zmm24, 150
+ vpxorq zmm29, zmm29, zmm26
+ vaesenc zmm16, zmm16, zmm11
+ vaesenc zmm17, zmm17, zmm11
+ vaesenc zmm18, zmm18, zmm11
+ vaesenc zmm19, zmm19, zmm11
+ vmovdqu64 zmm31, [rbx+320]
+ vpshufb zmm31, zmm31, zmm30
+ vaesenc zmm16, zmm16, zmm12
+ vaesenc zmm17, zmm17, zmm12
+ vaesenc zmm18, zmm18, zmm12
+ vaesenc zmm19, zmm19, zmm12
+ vpclmulqdq zmm23, zmm31, [rsp+832], 0
+ vpclmulqdq zmm24, zmm31, [rsp+832], 1
+ vpclmulqdq zmm25, zmm31, [rsp+832], 16
+ vpclmulqdq zmm26, zmm31, [rsp+832], 17
+ vpxorq zmm27, zmm27, zmm23
+ vpternlogq zmm28, zmm25, zmm24, 150
+ vpxorq zmm29, zmm29, zmm26
+ vaesenc zmm16, zmm16, zmm13
+ vaesenc zmm17, zmm17, zmm13
+ vaesenc zmm18, zmm18, zmm13
+ vaesenc zmm19, zmm19, zmm13
+ vmovdqu64 zmm31, [rbx+384]
+ vpshufb zmm31, zmm31, zmm30
+ vaesenc zmm16, zmm16, zmm14
+ vaesenc zmm17, zmm17, zmm14
+ vaesenc zmm18, zmm18, zmm14
+ vaesenc zmm19, zmm19, zmm14
+ vpclmulqdq zmm23, zmm31, [rsp+896], 0
+ vpclmulqdq zmm24, zmm31, [rsp+896], 1
+ vpclmulqdq zmm25, zmm31, [rsp+896], 16
+ vpclmulqdq zmm26, zmm31, [rsp+896], 17
+ vpxorq zmm27, zmm27, zmm23
+ vpternlogq zmm28, zmm25, zmm24, 150
+ vpxorq zmm29, zmm29, zmm26
+ vaesenc zmm16, zmm16, zmm15
+ vaesenc zmm17, zmm17, zmm15
+ vaesenc zmm18, zmm18, zmm15
+ vaesenc zmm19, zmm19, zmm15
+ vmovdqu64 zmm31, [rbx+448]
+ vpshufb zmm31, zmm31, zmm30
+ vaesenc zmm16, zmm16, zmm1
+ vaesenc zmm17, zmm17, zmm1
+ vaesenc zmm18, zmm18, zmm1
+ vaesenc zmm19, zmm19, zmm1
+ vpclmulqdq zmm23, zmm31, [rsp+960], 0
+ vpclmulqdq zmm24, zmm31, [rsp+960], 1
+ vpclmulqdq zmm25, zmm31, [rsp+960], 16
+ vpclmulqdq zmm26, zmm31, [rsp+960], 17
+ vpxorq zmm27, zmm27, zmm23
+ vpternlogq zmm28, zmm25, zmm24, 150
+ vpxorq zmm29, zmm29, zmm26
+ vaesenc zmm16, zmm16, zmm2
+ vaesenc zmm17, zmm17, zmm2
+ vaesenc zmm18, zmm18, zmm2
+ vaesenc zmm19, zmm19, zmm2
+ vaesenc zmm16, zmm16, zmm3
+ vaesenc zmm17, zmm17, zmm3
+ vaesenc zmm18, zmm18, zmm3
+ vaesenc zmm19, zmm19, zmm3
+ cmp r8d, 11
+ vbroadcasti32x4 zmm20, [rax+160]
+ jl L_AES_GCM_decrypt_update_avx512_b_il_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [rax+176]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ cmp r8d, 13
+ vbroadcasti32x4 zmm20, [rax+192]
+ jl L_AES_GCM_decrypt_update_avx512_b_il_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [rax+208]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [rax+224]
+L_AES_GCM_decrypt_update_avx512_b_il_last:
+ vaesenclast zmm16, zmm16, zmm20
+ vaesenclast zmm17, zmm17, zmm20
+ vaesenclast zmm18, zmm18, zmm20
+ vaesenclast zmm19, zmm19, zmm20
+ lea rcx, QWORD PTR [r11+rsi]
+ lea rdx, QWORD PTR [r10+rsi]
+ vmovdqu64 zmm21, [rcx]
+ vpxorq zmm16, zmm16, zmm21
+ vmovdqu64 [rdx], zmm16
+ vmovdqu64 zmm21, [rcx+64]
+ vpxorq zmm17, zmm17, zmm21
+ vmovdqu64 [rdx+64], zmm17
+ vmovdqu64 zmm21, [rcx+128]
+ vpxorq zmm18, zmm18, zmm21
+ vmovdqu64 [rdx+128], zmm18
+ vmovdqu64 zmm21, [rcx+192]
+ vpxorq zmm19, zmm19, zmm21
+ vmovdqu64 [rdx+192], zmm19
+ add esi, 256
+ vbroadcasti32x4 zmm31, ptr_L_avx512_aes_gcm_mod2_128
+ vpclmulqdq zmm23, zmm31, zmm27, 1
+ vpshufd zmm27, zmm27, 78
+ vpternlogq zmm28, zmm27, zmm23, 150
+ vpclmulqdq zmm23, zmm31, zmm28, 1
+ vpshufd zmm28, zmm28, 78
+ vpternlogq zmm29, zmm28, zmm23, 150
+ vextracti32x4 xmm0, zmm29, 1
+ vextracti32x4 xmm4, zmm29, 2
+ vextracti32x4 xmm5, zmm29, 3
+ vpxorq xmm6, xmm29, xmm0
+ vpternlogq xmm6, xmm5, xmm4, 150
+ add edi, 512
+ cmp edi, r13d
+ jl L_AES_GCM_decrypt_update_avx512_win_loop
+L_AES_GCM_decrypt_update_avx512_last_aes:
+ vbroadcasti32x4 zmm20, [r15]
+ vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
+ vpshufb zmm16, zmm16, zmm22
+ vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
+ vpshufb zmm17, zmm17, zmm22
+ vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2
+ vpshufb zmm18, zmm18, zmm22
+ vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3
+ vpshufb zmm19, zmm19, zmm22
+ vmovdqu xmm8, OWORD PTR [r15]
+ vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen
+ vmovdqu OWORD PTR [r15], xmm8
+ vpxorq zmm16, zmm16, zmm9
+ vpxorq zmm17, zmm17, zmm9
+ vpxorq zmm18, zmm18, zmm9
+ vpxorq zmm19, zmm19, zmm9
+ vaesenc zmm16, zmm16, zmm10
+ vaesenc zmm17, zmm17, zmm10
+ vaesenc zmm18, zmm18, zmm10
+ vaesenc zmm19, zmm19, zmm10
+ vaesenc zmm16, zmm16, zmm11
+ vaesenc zmm17, zmm17, zmm11
+ vaesenc zmm18, zmm18, zmm11
+ vaesenc zmm19, zmm19, zmm11
+ vaesenc zmm16, zmm16, zmm12
+ vaesenc zmm17, zmm17, zmm12
+ vaesenc zmm18, zmm18, zmm12
+ vaesenc zmm19, zmm19, zmm12
+ vaesenc zmm16, zmm16, zmm13
+ vaesenc zmm17, zmm17, zmm13
+ vaesenc zmm18, zmm18, zmm13
+ vaesenc zmm19, zmm19, zmm13
+ vaesenc zmm16, zmm16, zmm14
+ vaesenc zmm17, zmm17, zmm14
+ vaesenc zmm18, zmm18, zmm14
+ vaesenc zmm19, zmm19, zmm14
+ vaesenc zmm16, zmm16, zmm15
+ vaesenc zmm17, zmm17, zmm15
+ vaesenc zmm18, zmm18, zmm15
+ vaesenc zmm19, zmm19, zmm15
+ vaesenc zmm16, zmm16, zmm1
+ vaesenc zmm17, zmm17, zmm1
+ vaesenc zmm18, zmm18, zmm1
+ vaesenc zmm19, zmm19, zmm1
+ vaesenc zmm16, zmm16, zmm2
+ vaesenc zmm17, zmm17, zmm2
+ vaesenc zmm18, zmm18, zmm2
+ vaesenc zmm19, zmm19, zmm2
+ vaesenc zmm16, zmm16, zmm3
+ vaesenc zmm17, zmm17, zmm3
+ vaesenc zmm18, zmm18, zmm3
+ vaesenc zmm19, zmm19, zmm3
+ cmp r8d, 11
+ vbroadcasti32x4 zmm20, [rax+160]
+ jl L_AES_GCM_decrypt_update_avx512_l1_avx512_ctr16_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [rax+176]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ cmp r8d, 13
+ vbroadcasti32x4 zmm20, [rax+192]
+ jl L_AES_GCM_decrypt_update_avx512_l1_avx512_ctr16_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [rax+208]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [rax+224]
+L_AES_GCM_decrypt_update_avx512_l1_avx512_ctr16_last:
+ vaesenclast zmm16, zmm16, zmm20
+ vaesenclast zmm17, zmm17, zmm20
+ vaesenclast zmm18, zmm18, zmm20
+ vaesenclast zmm19, zmm19, zmm20
+ lea rcx, QWORD PTR [r11+rsi]
+ lea rdx, QWORD PTR [r10+rsi]
+ vmovdqu64 zmm21, [rcx]
+ vpxorq zmm16, zmm16, zmm21
+ vmovdqu64 [rdx], zmm16
+ vmovdqu64 zmm21, [rcx+64]
+ vpxorq zmm17, zmm17, zmm21
+ vmovdqu64 [rdx+64], zmm17
+ vmovdqu64 zmm21, [rcx+128]
+ vpxorq zmm18, zmm18, zmm21
+ vmovdqu64 [rdx+128], zmm18
+ vmovdqu64 zmm21, [rcx+192]
+ vpxorq zmm19, zmm19, zmm21
+ vmovdqu64 [rdx+192], zmm19
+ add esi, 256
+ vbroadcasti32x4 zmm20, [r15]
+ vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
+ vpshufb zmm16, zmm16, zmm22
+ vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
+ vpshufb zmm17, zmm17, zmm22
+ vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2
+ vpshufb zmm18, zmm18, zmm22
+ vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3
+ vpshufb zmm19, zmm19, zmm22
+ vmovdqu xmm8, OWORD PTR [r15]
+ vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen
+ vmovdqu OWORD PTR [r15], xmm8
+ vpxorq zmm16, zmm16, zmm9
+ vpxorq zmm17, zmm17, zmm9
+ vpxorq zmm18, zmm18, zmm9
+ vpxorq zmm19, zmm19, zmm9
+ vaesenc zmm16, zmm16, zmm10
+ vaesenc zmm17, zmm17, zmm10
+ vaesenc zmm18, zmm18, zmm10
+ vaesenc zmm19, zmm19, zmm10
+ vaesenc zmm16, zmm16, zmm11
+ vaesenc zmm17, zmm17, zmm11
+ vaesenc zmm18, zmm18, zmm11
+ vaesenc zmm19, zmm19, zmm11
+ vaesenc zmm16, zmm16, zmm12
+ vaesenc zmm17, zmm17, zmm12
+ vaesenc zmm18, zmm18, zmm12
+ vaesenc zmm19, zmm19, zmm12
+ vaesenc zmm16, zmm16, zmm13
+ vaesenc zmm17, zmm17, zmm13
+ vaesenc zmm18, zmm18, zmm13
+ vaesenc zmm19, zmm19, zmm13
+ vaesenc zmm16, zmm16, zmm14
+ vaesenc zmm17, zmm17, zmm14
+ vaesenc zmm18, zmm18, zmm14
+ vaesenc zmm19, zmm19, zmm14
+ vaesenc zmm16, zmm16, zmm15
+ vaesenc zmm17, zmm17, zmm15
+ vaesenc zmm18, zmm18, zmm15
+ vaesenc zmm19, zmm19, zmm15
+ vaesenc zmm16, zmm16, zmm1
+ vaesenc zmm17, zmm17, zmm1
+ vaesenc zmm18, zmm18, zmm1
+ vaesenc zmm19, zmm19, zmm1
+ vaesenc zmm16, zmm16, zmm2
+ vaesenc zmm17, zmm17, zmm2
+ vaesenc zmm18, zmm18, zmm2
+ vaesenc zmm19, zmm19, zmm2
+ vaesenc zmm16, zmm16, zmm3
+ vaesenc zmm17, zmm17, zmm3
+ vaesenc zmm18, zmm18, zmm3
+ vaesenc zmm19, zmm19, zmm3
+ cmp r8d, 11
+ vbroadcasti32x4 zmm20, [rax+160]
+ jl L_AES_GCM_decrypt_update_avx512_l2_avx512_ctr16_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [rax+176]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ cmp r8d, 13
+ vbroadcasti32x4 zmm20, [rax+192]
+ jl L_AES_GCM_decrypt_update_avx512_l2_avx512_ctr16_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [rax+208]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [rax+224]
+L_AES_GCM_decrypt_update_avx512_l2_avx512_ctr16_last:
+ vaesenclast zmm16, zmm16, zmm20
+ vaesenclast zmm17, zmm17, zmm20
+ vaesenclast zmm18, zmm18, zmm20
+ vaesenclast zmm19, zmm19, zmm20
+ lea rcx, QWORD PTR [r11+rsi]
+ lea rdx, QWORD PTR [r10+rsi]
+ vmovdqu64 zmm21, [rcx]
+ vpxorq zmm16, zmm16, zmm21
+ vmovdqu64 [rdx], zmm16
+ vmovdqu64 zmm21, [rcx+64]
+ vpxorq zmm17, zmm17, zmm21
+ vmovdqu64 [rdx+64], zmm17
+ vmovdqu64 zmm21, [rcx+128]
+ vpxorq zmm18, zmm18, zmm21
+ vmovdqu64 [rdx+128], zmm18
+ vmovdqu64 zmm21, [rcx+192]
+ vpxorq zmm19, zmm19, zmm21
+ vmovdqu64 [rdx+192], zmm19
+ add esi, 256
+L_AES_GCM_decrypt_update_avx512_no_windows:
+ vmovdqu64 zmm23, [rsp+192]
+ vshufi64x2 zmm23, zmm23, zmm23, 27
+ vmovdqu64 zmm24, [rsp+128]
+ vshufi64x2 zmm24, zmm24, zmm24, 27
+ vmovdqu64 zmm25, [rsp+64]
+ vshufi64x2 zmm25, zmm25, zmm25, 27
+ vmovdqu64 zmm26, [rsp]
+ vshufi64x2 zmm26, zmm26, zmm26, 27
+ mov r13d, r9d
+ and r13d, 4294967040
+ cmp edi, r13d
+ jge L_AES_GCM_decrypt_update_avx512_after_256
+ ; 256 bytes of input
+ lea rbx, QWORD PTR [r11+rdi]
+ vpxorq zmm20, zmm20, zmm20
+ vinserti32x4 zmm20, zmm20, xmm6, 0
+ vmovdqu64 zmm21, [rbx]
+ vpshufb zmm21, zmm21, zmm30
+ vpxorq zmm21, zmm21, zmm20
+ vpclmulqdq zmm16, zmm21, zmm23, 0
+ vpclmulqdq zmm17, zmm21, zmm23, 1
+ vpclmulqdq zmm18, zmm21, zmm23, 16
+ vpclmulqdq zmm19, zmm21, zmm23, 17
+ vmovdqa64 zmm27, zmm16
+ vpxorq zmm28, zmm18, zmm17
+ vmovdqa64 zmm29, zmm19
+ vmovdqu64 zmm21, [rbx+64]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm24, 0
+ vpclmulqdq zmm17, zmm21, zmm24, 1
+ vpclmulqdq zmm18, zmm21, zmm24, 16
+ vpclmulqdq zmm19, zmm21, zmm24, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vmovdqu64 zmm21, [rbx+128]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm25, 0
+ vpclmulqdq zmm17, zmm21, zmm25, 1
+ vpclmulqdq zmm18, zmm21, zmm25, 16
+ vpclmulqdq zmm19, zmm21, zmm25, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vmovdqu64 zmm21, [rbx+192]
+ vpshufb zmm21, zmm21, zmm30
+ vpclmulqdq zmm16, zmm21, zmm26, 0
+ vpclmulqdq zmm17, zmm21, zmm26, 1
+ vpclmulqdq zmm18, zmm21, zmm26, 16
+ vpclmulqdq zmm19, zmm21, zmm26, 17
+ vpxorq zmm27, zmm27, zmm16
+ vpternlogq zmm28, zmm18, zmm17, 150
+ vpxorq zmm29, zmm29, zmm19
+ vpclmulqdq zmm21, zmm31, zmm27, 1
+ vpshufd zmm27, zmm27, 78
+ vpternlogq zmm28, zmm27, zmm21, 150
+ vpclmulqdq zmm21, zmm31, zmm28, 1
+ vpshufd zmm28, zmm28, 78
+ vpternlogq zmm29, zmm28, zmm21, 150
+ vextracti32x4 xmm0, zmm29, 1
+ vextracti32x4 xmm4, zmm29, 2
+ vextracti32x4 xmm5, zmm29, 3
+ vpxorq xmm6, xmm29, xmm0
+ vpternlogq xmm6, xmm5, xmm4, 150
+ vbroadcasti32x4 zmm20, [r15]
+ vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0
+ vpshufb zmm16, zmm16, zmm22
+ vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1
+ vpshufb zmm17, zmm17, zmm22
+ vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2
+ vpshufb zmm18, zmm18, zmm22
+ vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3
+ vpshufb zmm19, zmm19, zmm22
+ vmovdqu xmm8, OWORD PTR [r15]
+ vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen
+ vmovdqu OWORD PTR [r15], xmm8
+ vpxorq zmm16, zmm16, zmm9
+ vpxorq zmm17, zmm17, zmm9
+ vpxorq zmm18, zmm18, zmm9
+ vpxorq zmm19, zmm19, zmm9
+ vaesenc zmm16, zmm16, zmm10
+ vaesenc zmm17, zmm17, zmm10
+ vaesenc zmm18, zmm18, zmm10
+ vaesenc zmm19, zmm19, zmm10
+ vaesenc zmm16, zmm16, zmm11
+ vaesenc zmm17, zmm17, zmm11
+ vaesenc zmm18, zmm18, zmm11
+ vaesenc zmm19, zmm19, zmm11
+ vaesenc zmm16, zmm16, zmm12
+ vaesenc zmm17, zmm17, zmm12
+ vaesenc zmm18, zmm18, zmm12
+ vaesenc zmm19, zmm19, zmm12
+ vaesenc zmm16, zmm16, zmm13
+ vaesenc zmm17, zmm17, zmm13
+ vaesenc zmm18, zmm18, zmm13
+ vaesenc zmm19, zmm19, zmm13
+ vaesenc zmm16, zmm16, zmm14
+ vaesenc zmm17, zmm17, zmm14
+ vaesenc zmm18, zmm18, zmm14
+ vaesenc zmm19, zmm19, zmm14
+ vaesenc zmm16, zmm16, zmm15
+ vaesenc zmm17, zmm17, zmm15
+ vaesenc zmm18, zmm18, zmm15
+ vaesenc zmm19, zmm19, zmm15
+ vaesenc zmm16, zmm16, zmm1
+ vaesenc zmm17, zmm17, zmm1
+ vaesenc zmm18, zmm18, zmm1
+ vaesenc zmm19, zmm19, zmm1
+ vaesenc zmm16, zmm16, zmm2
+ vaesenc zmm17, zmm17, zmm2
+ vaesenc zmm18, zmm18, zmm2
+ vaesenc zmm19, zmm19, zmm2
+ vaesenc zmm16, zmm16, zmm3
+ vaesenc zmm17, zmm17, zmm3
+ vaesenc zmm18, zmm18, zmm3
+ vaesenc zmm19, zmm19, zmm3
+ cmp r8d, 11
+ vbroadcasti32x4 zmm20, [rax+160]
+ jl L_AES_GCM_decrypt_update_avx512_t_avx512_ctr16_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [rax+176]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ cmp r8d, 13
+ vbroadcasti32x4 zmm20, [rax+192]
+ jl L_AES_GCM_decrypt_update_avx512_t_avx512_ctr16_last
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [rax+208]
+ vaesenc zmm16, zmm16, zmm20
+ vaesenc zmm17, zmm17, zmm20
+ vaesenc zmm18, zmm18, zmm20
+ vaesenc zmm19, zmm19, zmm20
+ vbroadcasti32x4 zmm20, [rax+224]
+L_AES_GCM_decrypt_update_avx512_t_avx512_ctr16_last:
+ vaesenclast zmm16, zmm16, zmm20
+ vaesenclast zmm17, zmm17, zmm20
+ vaesenclast zmm18, zmm18, zmm20
+ vaesenclast zmm19, zmm19, zmm20
+ lea rcx, QWORD PTR [r11+rdi]
+ lea rdx, QWORD PTR [r10+rdi]
+ vmovdqu64 zmm21, [rcx]
+ vpxorq zmm16, zmm16, zmm21
+ vmovdqu64 [rdx], zmm16
+ vmovdqu64 zmm21, [rcx+64]
+ vpxorq zmm17, zmm17, zmm21
+ vmovdqu64 [rdx+64], zmm17
+ vmovdqu64 zmm21, [rcx+128]
+ vpxorq zmm18, zmm18, zmm21
+ vmovdqu64 [rdx+128], zmm18
+ vmovdqu64 zmm21, [rcx+192]
+ vpxorq zmm19, zmm19, zmm21
+ vmovdqu64 [rdx+192], zmm19
+ add edi, 256
+L_AES_GCM_decrypt_update_avx512_after_256:
+ vmovdqu xmm5, OWORD PTR [rsp]
+L_AES_GCM_decrypt_update_avx512_done_128:
+ mov edx, r9d
+ cmp edi, edx
+ jge L_AES_GCM_decrypt_update_avx512_done_dec
+ mov r13d, r9d
+ and r13d, 4294967280
+ cmp edi, r13d
+ jge L_AES_GCM_decrypt_update_avx512_last_block_done
+L_AES_GCM_decrypt_update_avx512_last_block_start:
+ vmovdqu xmm13, OWORD PTR [r11+rdi]
+ vmovdqa xmm0, xmm5
+ vpshufb xmm1, xmm13, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ vpxor xmm1, xmm1, xmm6
+ vmovdqu xmm9, OWORD PTR [r15]
+ vpshufb xmm8, xmm9, OWORD PTR L_avx512_aes_gcm_bswap_epi64
+ vpaddd xmm9, xmm9, OWORD PTR L_avx512_aes_gcm_one
+ vmovdqu OWORD PTR [r15], xmm9
+ vpxor xmm8, xmm8, [rax]
+ vpclmulqdq xmm10, xmm1, xmm0, 16
+ vaesenc xmm8, xmm8, [rax+16]
+ vaesenc xmm8, xmm8, [rax+32]
+ vpclmulqdq xmm11, xmm1, xmm0, 1
+ vaesenc xmm8, xmm8, [rax+48]
+ vaesenc xmm8, xmm8, [rax+64]
+ vpclmulqdq xmm12, xmm1, xmm0, 0
+ vaesenc xmm8, xmm8, [rax+80]
+ vpclmulqdq xmm1, xmm1, xmm0, 17
+ vaesenc xmm8, xmm8, [rax+96]
+ vpxor xmm10, xmm10, xmm11
+ vpslldq xmm2, xmm10, 8
+ vpsrldq xmm10, xmm10, 8
+ vaesenc xmm8, xmm8, [rax+112]
+ vpxor xmm2, xmm2, xmm12
+ vpxor xmm3, xmm1, xmm10
+ vmovdqa xmm0, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm2, xmm0, 16
+ vaesenc xmm8, xmm8, [rax+128]
+ vpshufd xmm10, xmm2, 78
+ vpxor xmm10, xmm10, xmm11
+ vpclmulqdq xmm11, xmm10, xmm0, 16
+ vaesenc xmm8, xmm8, [rax+144]
+ vpshufd xmm10, xmm10, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm6, xmm10, xmm3
+ cmp r8d, 11
+ vmovdqa xmm9, OWORD PTR [rax+160]
+ jl L_AES_GCM_decrypt_update_avx512_aesenc_gfmul_last
+ vaesenc xmm8, xmm8, xmm9
+ vaesenc xmm8, xmm8, [rax+176]
+ cmp r8d, 13
+ vmovdqa xmm9, OWORD PTR [rax+192]
+ jl L_AES_GCM_decrypt_update_avx512_aesenc_gfmul_last
+ vaesenc xmm8, xmm8, xmm9
+ vaesenc xmm8, xmm8, [rax+208]
+ vmovdqa xmm9, OWORD PTR [rax+224]
+L_AES_GCM_decrypt_update_avx512_aesenc_gfmul_last:
+ vaesenclast xmm8, xmm8, xmm9
+ vmovdqa xmm0, xmm13
+ vpxor xmm8, xmm8, xmm0
+ vmovdqu OWORD PTR [r10+rdi], xmm8
+ add edi, 16
+ cmp edi, r13d
+ jl L_AES_GCM_decrypt_update_avx512_last_block_start
+L_AES_GCM_decrypt_update_avx512_last_block_done:
+L_AES_GCM_decrypt_update_avx512_done_dec:
+ vmovdqa OWORD PTR [r12], xmm6
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp+1040]
+ vmovdqu xmm7, OWORD PTR [rsp+1056]
+ vmovdqu xmm8, OWORD PTR [rsp+1072]
+ vmovdqu xmm9, OWORD PTR [rsp+1088]
+ vmovdqu xmm10, OWORD PTR [rsp+1104]
+ vmovdqu xmm11, OWORD PTR [rsp+1120]
+ vmovdqu xmm12, OWORD PTR [rsp+1136]
+ vmovdqu xmm13, OWORD PTR [rsp+1152]
+ vmovdqu xmm14, OWORD PTR [rsp+1168]
+ vmovdqu xmm15, OWORD PTR [rsp+1184]
+ add rsp, 1200
+ pop rbx
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r12
+ pop r13
+ ret
+AES_GCM_decrypt_update_avx512 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_decrypt_final_avx512 PROC ; Final step of GCM decrypt: fold the two bit-lengths into the hash block, multiply by the hash key, XOR with the block at arg7, compare against the caller's tag and store 1 (match) or 0 through arg8.
+ push r13 ; save the callee-saved general registers used below (r12-r15, rbp)
+ push r12
+ push r14
+ push rbp
+ push r15
+ mov rax, rcx ; rax = arg1: pointer to the 16-byte running hash block (read with an aligned load)
+ mov r10d, r9d ; r10d = arg4: 32-bit byte count (presumably the ciphertext length - confirm against the C prototype)
+ mov r9, rdx ; r9 = arg2: pointer to the tag bytes to verify against
+ mov r11d, DWORD PTR [rsp+80] ; arg5: first stack argument (5 pushes + return address + 32-byte shadow space = 80); second 32-bit byte count (presumably the AAD length - confirm)
+ mov r12, QWORD PTR [rsp+88] ; arg6: pointer to the 16-byte multiplier loaded into xmm5 (presumably the hash key H - confirm)
+ mov r14, QWORD PTR [rsp+96] ; arg7: pointer to the 16-byte block XORed into the final hash (presumably the encrypted initial counter - confirm)
+ mov rbp, QWORD PTR [rsp+104] ; arg8: pointer to the 32-bit result
+ sub rsp, 160 ; stack space for the XMM registers saved below
+ vmovdqu OWORD PTR [rsp+16], xmm6 ; xmm6-xmm13 and xmm15 are modified here and must be preserved for the caller
+ vmovdqu OWORD PTR [rsp+32], xmm7
+ vmovdqu OWORD PTR [rsp+48], xmm8
+ vmovdqu OWORD PTR [rsp+64], xmm9
+ vmovdqu OWORD PTR [rsp+80], xmm10
+ vmovdqu OWORD PTR [rsp+96], xmm11
+ vmovdqu OWORD PTR [rsp+112], xmm12
+ vmovdqu OWORD PTR [rsp+128], xmm13
+ vmovdqu OWORD PTR [rsp+144], xmm15
+ vmovdqa xmm6, OWORD PTR [rax] ; xmm6 = running hash block (aligned load: arg1 must be 16-byte aligned)
+ vmovdqa xmm5, OWORD PTR [r12] ; xmm5 = multiplier from arg6 (aligned load)
+ vmovdqa xmm15, OWORD PTR [r14] ; xmm15 = block from arg7 (aligned load)
+ vpsrlq xmm8, xmm5, 63 ; next 4 instructions: xmm7 = xmm5 shifted left by one bit as a 128-bit value
+ vpsllq xmm7, xmm5, 1
+ vpslldq xmm8, xmm8, 8 ; move the carry bit of the low qword into the high qword
+ vpor xmm7, xmm7, xmm8
+ vpshufd xmm5, xmm5, 255 ; broadcast the top dword so its sign bit can be turned into a mask
+ vpsrad xmm5, xmm5, 31 ; all-ones if the bit shifted out was 1, else zero
+ vpand xmm5, xmm5, OWORD PTR L_avx512_aes_gcm_mod2_128 ; select the reduction constant only when that bit was set
+ vpxor xmm5, xmm5, xmm7 ; xmm5 = shifted multiplier, conditionally reduced
+ mov edx, r10d ; 32-bit moves zero-extend the two byte counts to 64 bits
+ mov ecx, r11d
+ shl rdx, 3 ; convert byte counts to bit counts
+ shl rcx, 3
+ vmovq xmm0, rdx
+ vmovq xmm1, rcx
+ vpunpcklqdq xmm0, xmm0, xmm1 ; xmm0 = { low qword: arg4 * 8, high qword: arg5 * 8 }
+ vpxor xmm6, xmm6, xmm0 ; fold the length block into the hash
+ ; ghash_gfmul_red_avx
+ vpshufd xmm8, xmm5, 78 ; 78 swaps the two qwords; with the XOR below this forms (hi ^ lo) for the middle product
+ vpxor xmm8, xmm8, xmm5
+ vpshufd xmm9, xmm6, 78
+ vpxor xmm9, xmm9, xmm6
+ vpclmulqdq xmm7, xmm6, xmm5, 0 ; low halves product
+ vpclmulqdq xmm10, xmm6, xmm5, 17 ; high halves product
+ vpclmulqdq xmm8, xmm8, xmm9, 0 ; (hi ^ lo) product
+ vpternlogq xmm8, xmm10, xmm7, 150 ; imm 150 (0x96) is a three-input XOR: xmm8 = middle term
+ vmovdqa xmm9, OWORD PTR L_avx512_aes_gcm_mod2_128 ; reduction constant
+ vpclmulqdq xmm11, xmm9, xmm7, 1 ; first reduction fold: low part into the middle term
+ vpshufd xmm7, xmm7, 78
+ vpternlogq xmm8, xmm7, xmm11, 150
+ vpclmulqdq xmm11, xmm9, xmm8, 1 ; second reduction fold: middle term into the high part
+ vpshufd xmm8, xmm8, 78
+ vpternlogq xmm10, xmm8, xmm11, 150 ; xmm10 = reduced 128-bit product
+ vmovdqa xmm6, xmm10
+ vpshufb xmm6, xmm6, OWORD PTR L_avx512_aes_gcm_bswap_mask ; byte shuffle by the bswap mask (table defined outside this chunk; presumably a full 16-byte reversal)
+ vpxor xmm0, xmm6, xmm15 ; xmm0 = computed tag
+ cmp r8d, 16 ; r8d = arg3: number of tag bytes to compare; exactly 16 takes the vector path
+ je L_AES_GCM_decrypt_final_avx512_cmp_tag_16
+ sub rsp, 16 ; temporary 16-byte slot so the computed tag can be read byte by byte
+ xor rcx, rcx ; rcx = byte index
+ xor r15, r15 ; r15b accumulates the OR of all byte differences
+ vmovdqu OWORD PTR [rsp], xmm0
+L_AES_GCM_decrypt_final_avx512_cmp_tag_loop: ; compares all r8d bytes with no early exit on mismatch; NOTE(review): the count is tested after the first byte, so r8d is assumed to be non-zero
+ movzx r13d, BYTE PTR [rsp+rcx]
+ xor r13b, BYTE PTR [r9+rcx]
+ or r15b, r13b
+ inc ecx
+ cmp ecx, r8d
+ jne L_AES_GCM_decrypt_final_avx512_cmp_tag_loop
+ cmp r15b, 0
+ sete r15b ; r15 = 1 if every compared byte matched, else 0 (upper bits already zero)
+ add rsp, 16 ; release the temporary slot
+ xor rcx, rcx
+ jmp L_AES_GCM_decrypt_final_avx512_cmp_tag_done
+L_AES_GCM_decrypt_final_avx512_cmp_tag_16:
+ vmovdqu xmm1, OWORD PTR [r9] ; load the caller's 16-byte tag (unaligned)
+ vpcmpeqb xmm0, xmm0, xmm1 ; each equal byte becomes 0xFF
+ vpmovmskb rdx, xmm0 ; one mask bit per byte
+ ; edx == 0xFFFF (all 16 bytes equal) then result = 1 else result = 0
+ xor r15d, r15d
+ cmp edx, 65535
+ sete r15b
+L_AES_GCM_decrypt_final_avx512_cmp_tag_done:
+ mov DWORD PTR [rbp], r15d ; store the match flag through arg8
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp+16] ; restore the saved XMM registers
+ vmovdqu xmm7, OWORD PTR [rsp+32]
+ vmovdqu xmm8, OWORD PTR [rsp+48]
+ vmovdqu xmm9, OWORD PTR [rsp+64]
+ vmovdqu xmm10, OWORD PTR [rsp+80]
+ vmovdqu xmm11, OWORD PTR [rsp+96]
+ vmovdqu xmm12, OWORD PTR [rsp+112]
+ vmovdqu xmm13, OWORD PTR [rsp+128]
+ vmovdqu xmm15, OWORD PTR [rsp+144]
+ add rsp, 160
+ pop r15 ; restore general registers in reverse push order
+ pop rbp
+ pop r14
+ pop r12
+ pop r13
+ ret
+AES_GCM_decrypt_final_avx512 ENDP
+_TEXT ENDS
+ENDIF
END
diff --git a/wolfcrypt/src/aes_x86_64_asm.S b/wolfcrypt/src/aes_x86_64_asm.S
new file mode 100644
index 00000000000..9eb85b49c73
--- /dev/null
+++ b/wolfcrypt/src/aes_x86_64_asm.S
@@ -0,0 +1,4375 @@
+/* aes_x86_64_asm.S */
+/*
+ * Copyright (C) 2006-2026 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+#ifdef WOLFSSL_USER_SETTINGS
+#ifdef WOLFSSL_USER_SETTINGS_ASM
+/*
+ * user_settings_asm.h is a file generated by the script user_settings_asm.sh.
+ * The script takes in a user_settings.h and produces user_settings_asm.h, which
+ * is a stripped down version of user_settings.h containing only preprocessor
+ * directives. This makes the header safe to include in assembly (.S) files.
+ */
+#include "user_settings_asm.h"
+#else
+/*
+ * Note: if user_settings.h contains any C code (e.g. a typedef or function
+ * prototype), including it here in an assembly (.S) file will cause an
+ * assembler failure. See user_settings_asm.h above.
+ */
+#include "user_settings.h"
+#endif /* WOLFSSL_USER_SETTINGS_ASM */
+#endif /* WOLFSSL_USER_SETTINGS */
+
+#ifndef HAVE_INTEL_AVX1 /* AVX1 is always enabled for this file */
+#define HAVE_INTEL_AVX1
+#endif /* HAVE_INTEL_AVX1 */
+#ifndef NO_AVX2_SUPPORT /* AVX2 is enabled unless NO_AVX2_SUPPORT is defined */
+#ifndef HAVE_INTEL_AVX2
+#define HAVE_INTEL_AVX2
+#endif /* HAVE_INTEL_AVX2 */
+#endif /* NO_AVX2_SUPPORT */
+#ifndef NO_VAES_SUPPORT /* VAES is enabled unless NO_VAES_SUPPORT is defined */
+#ifndef HAVE_INTEL_VAES
+#define HAVE_INTEL_VAES
+#endif /* HAVE_INTEL_VAES */
+#endif /* NO_VAES_SUPPORT */
+#ifndef NO_AVX512_SUPPORT /* AVX512 is enabled unless NO_AVX512_SUPPORT is defined */
+#ifndef HAVE_INTEL_AVX512
+#define HAVE_INTEL_AVX512
+#endif /* HAVE_INTEL_AVX512 */
+#endif /* NO_AVX512_SUPPORT */
+
+#ifdef WOLFSSL_X86_64_BUILD
+#ifndef __APPLE__ /* AES_128_Key_Expansion_AESNI: expands the 16-byte key at (%rdi) into 11 round keys (176 bytes) written to (%rsi); unaligned accesses; clobbers xmm0-xmm2 */
+.text
+.globl AES_128_Key_Expansion_AESNI
+.type AES_128_Key_Expansion_AESNI,@function
+.align 16
+AES_128_Key_Expansion_AESNI:
+#else
+.section __TEXT,__text
+.globl _AES_128_Key_Expansion_AESNI
+.p2align 4
+_AES_128_Key_Expansion_AESNI:
+#endif /* __APPLE__ */
+ movdqu (%rdi), %xmm0 /* xmm0 = user key = round key 0 */
+ movdqu %xmm0, (%rsi)
+ aeskeygenassist $0x01, %xmm0, %xmm1 /* round 1, rcon 0x01 */
+ pshufd $0xff, %xmm1, %xmm1 /* broadcast dword 3 of the assist result: RotWord(SubWord(last key word)) ^ rcon */
+ movdqa %xmm0, %xmm2 /* xmm2 = copy of the previous round key */
+ pslldq $4, %xmm2 /* three shift-by-one-word + XOR steps leave word i of xmm0 = XOR of previous words 0..i */
+ pxor %xmm2, %xmm0
+ pslldq $4, %xmm2
+ pxor %xmm2, %xmm0
+ pslldq $4, %xmm2
+ pxor %xmm2, %xmm0
+ pxor %xmm1, %xmm0 /* mix in the broadcast word: xmm0 = next round key */
+ movdqu %xmm0, 16(%rsi) /* round key 1 */
+ aeskeygenassist $2, %xmm0, %xmm1 /* round 2, rcon 0x02 (same sequence as round 1) */
+ pshufd $0xff, %xmm1, %xmm1
+ movdqa %xmm0, %xmm2
+ pslldq $4, %xmm2
+ pxor %xmm2, %xmm0
+ pslldq $4, %xmm2
+ pxor %xmm2, %xmm0
+ pslldq $4, %xmm2
+ pxor %xmm2, %xmm0
+ pxor %xmm1, %xmm0
+ movdqu %xmm0, 32(%rsi) /* round key 2 */
+ aeskeygenassist $4, %xmm0, %xmm1 /* round 3, rcon 0x04 */
+ pshufd $0xff, %xmm1, %xmm1
+ movdqa %xmm0, %xmm2
+ pslldq $4, %xmm2
+ pxor %xmm2, %xmm0
+ pslldq $4, %xmm2
+ pxor %xmm2, %xmm0
+ pslldq $4, %xmm2
+ pxor %xmm2, %xmm0
+ pxor %xmm1, %xmm0
+ movdqu %xmm0, 48(%rsi) /* round key 3 */
+ aeskeygenassist $8, %xmm0, %xmm1 /* round 4, rcon 0x08 */
+ pshufd $0xff, %xmm1, %xmm1
+ movdqa %xmm0, %xmm2
+ pslldq $4, %xmm2
+ pxor %xmm2, %xmm0
+ pslldq $4, %xmm2
+ pxor %xmm2, %xmm0
+ pslldq $4, %xmm2
+ pxor %xmm2, %xmm0
+ pxor %xmm1, %xmm0
+ movdqu %xmm0, 64(%rsi) /* round key 4 */
+ aeskeygenassist $16, %xmm0, %xmm1 /* round 5, rcon 0x10 */
+ pshufd $0xff, %xmm1, %xmm1
+ movdqa %xmm0, %xmm2
+ pslldq $4, %xmm2
+ pxor %xmm2, %xmm0
+ pslldq $4, %xmm2
+ pxor %xmm2, %xmm0
+ pslldq $4, %xmm2
+ pxor %xmm2, %xmm0
+ pxor %xmm1, %xmm0
+ movdqu %xmm0, 80(%rsi) /* round key 5 */
+ aeskeygenassist $32, %xmm0, %xmm1 /* round 6, rcon 0x20 */
+ pshufd $0xff, %xmm1, %xmm1
+ movdqa %xmm0, %xmm2
+ pslldq $4, %xmm2
+ pxor %xmm2, %xmm0
+ pslldq $4, %xmm2
+ pxor %xmm2, %xmm0
+ pslldq $4, %xmm2
+ pxor %xmm2, %xmm0
+ pxor %xmm1, %xmm0
+ movdqu %xmm0, 96(%rsi) /* round key 6 */
+ aeskeygenassist $0x40, %xmm0, %xmm1 /* round 7, rcon 0x40 */
+ pshufd $0xff, %xmm1, %xmm1
+ movdqa %xmm0, %xmm2
+ pslldq $4, %xmm2
+ pxor %xmm2, %xmm0
+ pslldq $4, %xmm2
+ pxor %xmm2, %xmm0
+ pslldq $4, %xmm2
+ pxor %xmm2, %xmm0
+ pxor %xmm1, %xmm0
+ movdqu %xmm0, 112(%rsi) /* round key 7 */
+ aeskeygenassist $0x80, %xmm0, %xmm1 /* round 8, rcon 0x80 */
+ pshufd $0xff, %xmm1, %xmm1
+ movdqa %xmm0, %xmm2
+ pslldq $4, %xmm2
+ pxor %xmm2, %xmm0
+ pslldq $4, %xmm2
+ pxor %xmm2, %xmm0
+ pslldq $4, %xmm2
+ pxor %xmm2, %xmm0
+ pxor %xmm1, %xmm0
+ movdqu %xmm0, 128(%rsi) /* round key 8 */
+ aeskeygenassist $27, %xmm0, %xmm1 /* round 9, rcon 0x1b */
+ pshufd $0xff, %xmm1, %xmm1
+ movdqa %xmm0, %xmm2
+ pslldq $4, %xmm2
+ pxor %xmm2, %xmm0
+ pslldq $4, %xmm2
+ pxor %xmm2, %xmm0
+ pslldq $4, %xmm2
+ pxor %xmm2, %xmm0
+ pxor %xmm1, %xmm0
+ movdqu %xmm0, 144(%rsi) /* round key 9 */
+ aeskeygenassist $54, %xmm0, %xmm1 /* round 10, rcon 0x36 */
+ pshufd $0xff, %xmm1, %xmm1
+ movdqa %xmm0, %xmm2
+ pslldq $4, %xmm2
+ pxor %xmm2, %xmm0
+ pslldq $4, %xmm2
+ pxor %xmm2, %xmm0
+ pslldq $4, %xmm2
+ pxor %xmm2, %xmm0
+ pxor %xmm1, %xmm0
+ movdqu %xmm0, 160(%rsi) /* round key 10 (last) */
+ repz retq
+#ifndef __APPLE__
+.size AES_128_Key_Expansion_AESNI,.-AES_128_Key_Expansion_AESNI
+#endif /* __APPLE__ */
+#ifndef __APPLE__ /* AES_192_Key_Expansion_AESNI: expands the 24-byte key at (%rdi) into the AES-192 key schedule at (%rsi); writes 224 bytes in total (the final 16-byte store at offset 208 goes past the 208-byte schedule, so the buffer must hold at least 224 bytes); clobbers xmm0-xmm5 */
+.text
+.globl AES_192_Key_Expansion_AESNI
+.type AES_192_Key_Expansion_AESNI,@function
+.align 16
+AES_192_Key_Expansion_AESNI:
+#else
+.section __TEXT,__text
+.globl _AES_192_Key_Expansion_AESNI
+.p2align 4
+_AES_192_Key_Expansion_AESNI:
+#endif /* __APPLE__ */
+ movdqu (%rdi), %xmm0 /* xmm0 = key words 0..3 */
+ pxor %xmm1, %xmm1
+ pinsrq $0x00, 16(%rdi), %xmm1 /* xmm1 low qword = key words 4..5, high qword zero */
+ movdqu %xmm0, (%rsi) /* schedule words 0..3 */
+ movdqa %xmm1, %xmm4 /* keep words 4..5 until words 6..7 are available */
+ aeskeygenassist $0x01, %xmm1, %xmm2 /* step 1, rcon 0x01 */
+ pshufd $0x55, %xmm2, %xmm2 /* broadcast dword 1 of the assist result: RotWord(SubWord(word 1 of xmm1)) ^ rcon */
+ movdqa %xmm0, %xmm3
+ pslldq $4, %xmm3 /* three shift-by-one-word + XOR steps leave word i of xmm0 = XOR of previous words 0..i */
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pxor %xmm2, %xmm0 /* xmm0 = next four schedule words (6..9) */
+ pshufd $0xff, %xmm0, %xmm2 /* broadcast the newest word to extend the two-word tail */
+ movdqa %xmm1, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pxor %xmm2, %xmm1 /* xmm1 low qword = next two schedule words (10..11); the high qword is scratch */
+ shufpd $0x00, %xmm0, %xmm4 /* xmm4 = { low qword of xmm4, low qword of xmm0 } = words 4..7 */
+ movdqu %xmm4, 16(%rsi) /* schedule words 4..7 */
+ movdqa %xmm0, %xmm5
+ shufpd $0x01, %xmm1, %xmm5 /* xmm5 = { high qword of xmm0, low qword of xmm1 } = words 8..11 */
+ movdqu %xmm5, 32(%rsi) /* schedule words 8..11 */
+ aeskeygenassist $2, %xmm1, %xmm2 /* step 2, rcon 0x02 */
+ pshufd $0x55, %xmm2, %xmm2
+ movdqa %xmm0, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pxor %xmm2, %xmm0
+ pshufd $0xff, %xmm0, %xmm2
+ movdqa %xmm1, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pxor %xmm2, %xmm1
+ movdqu %xmm0, 48(%rsi) /* schedule words 12..15 */
+ movdqa %xmm1, %xmm4 /* keep words 16..17 for the next combined store */
+ aeskeygenassist $4, %xmm1, %xmm2 /* step 3, rcon 0x04 */
+ pshufd $0x55, %xmm2, %xmm2
+ movdqa %xmm0, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pxor %xmm2, %xmm0
+ pshufd $0xff, %xmm0, %xmm2
+ movdqa %xmm1, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pxor %xmm2, %xmm1
+ shufpd $0x00, %xmm0, %xmm4
+ movdqu %xmm4, 64(%rsi) /* schedule words 16..19 */
+ movdqa %xmm0, %xmm5
+ shufpd $0x01, %xmm1, %xmm5
+ movdqu %xmm5, 80(%rsi) /* schedule words 20..23 */
+ aeskeygenassist $8, %xmm1, %xmm2 /* step 4, rcon 0x08 */
+ pshufd $0x55, %xmm2, %xmm2
+ movdqa %xmm0, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pxor %xmm2, %xmm0
+ pshufd $0xff, %xmm0, %xmm2
+ movdqa %xmm1, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pxor %xmm2, %xmm1
+ movdqu %xmm0, 96(%rsi) /* schedule words 24..27 */
+ movdqa %xmm1, %xmm4 /* keep words 28..29 for the next combined store */
+ aeskeygenassist $16, %xmm1, %xmm2 /* step 5, rcon 0x10 */
+ pshufd $0x55, %xmm2, %xmm2
+ movdqa %xmm0, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pxor %xmm2, %xmm0
+ pshufd $0xff, %xmm0, %xmm2
+ movdqa %xmm1, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pxor %xmm2, %xmm1
+ shufpd $0x00, %xmm0, %xmm4
+ movdqu %xmm4, 112(%rsi) /* schedule words 28..31 */
+ movdqa %xmm0, %xmm5
+ shufpd $0x01, %xmm1, %xmm5
+ movdqu %xmm5, 128(%rsi) /* schedule words 32..35 */
+ aeskeygenassist $32, %xmm1, %xmm2 /* step 6, rcon 0x20 */
+ pshufd $0x55, %xmm2, %xmm2
+ movdqa %xmm0, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pxor %xmm2, %xmm0
+ pshufd $0xff, %xmm0, %xmm2
+ movdqa %xmm1, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pxor %xmm2, %xmm1
+ movdqu %xmm0, 144(%rsi) /* schedule words 36..39 */
+ movdqa %xmm1, %xmm4 /* keep words 40..41 for the next combined store */
+ aeskeygenassist $0x40, %xmm1, %xmm2 /* step 7, rcon 0x40 */
+ pshufd $0x55, %xmm2, %xmm2
+ movdqa %xmm0, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pxor %xmm2, %xmm0
+ pshufd $0xff, %xmm0, %xmm2
+ movdqa %xmm1, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pxor %xmm2, %xmm1
+ shufpd $0x00, %xmm0, %xmm4
+ movdqu %xmm4, 160(%rsi) /* schedule words 40..43 */
+ movdqa %xmm0, %xmm5
+ shufpd $0x01, %xmm1, %xmm5
+ movdqu %xmm5, 176(%rsi) /* schedule words 44..47 */
+ aeskeygenassist $0x80, %xmm1, %xmm2 /* step 8, rcon 0x80 */
+ pshufd $0x55, %xmm2, %xmm2
+ movdqa %xmm0, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pxor %xmm2, %xmm0
+ pshufd $0xff, %xmm0, %xmm2
+ movdqa %xmm1, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pxor %xmm2, %xmm1
+ movdqu %xmm0, 192(%rsi) /* schedule words 48..51 (last AES-192 round key) */
+ movdqu %xmm1, 208(%rsi) /* 16 bytes past the 13 round keys: two further words plus scratch */
+ repz retq
+#ifndef __APPLE__
+.size AES_192_Key_Expansion_AESNI,.-AES_192_Key_Expansion_AESNI
+#endif /* __APPLE__ */
+#ifndef __APPLE__ /* AES_256_Key_Expansion_AESNI: expands the 32-byte key at (%rdi) into 15 round keys (240 bytes) written to (%rsi); unaligned accesses; clobbers xmm0-xmm3 */
+.text
+.globl AES_256_Key_Expansion_AESNI
+.type AES_256_Key_Expansion_AESNI,@function
+.align 16
+AES_256_Key_Expansion_AESNI:
+#else
+.section __TEXT,__text
+.globl _AES_256_Key_Expansion_AESNI
+.p2align 4
+_AES_256_Key_Expansion_AESNI:
+#endif /* __APPLE__ */
+ movdqu (%rdi), %xmm0 /* xmm0 = first key half = round key 0 */
+ movdqu 16(%rdi), %xmm1 /* xmm1 = second key half = round key 1 */
+ movdqu %xmm0, (%rsi)
+ movdqu %xmm1, 16(%rsi)
+ aeskeygenassist $0x01, %xmm1, %xmm2 /* even round key, rcon 0x01 */
+ pshufd $0xff, %xmm2, %xmm2 /* broadcast dword 3 of the assist result: RotWord(SubWord(last word of xmm1)) ^ rcon */
+ movdqa %xmm0, %xmm3
+ pslldq $4, %xmm3 /* three shift-by-one-word + XOR steps leave word i of xmm0 = XOR of previous words 0..i */
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pxor %xmm2, %xmm0
+ movdqu %xmm0, 32(%rsi) /* round key 2 */
+ aeskeygenassist $0x00, %xmm0, %xmm2 /* odd round key: no round constant */
+ pshufd $0xaa, %xmm2, %xmm2 /* broadcast dword 2 of the assist result: SubWord(last word of xmm0), no rotation */
+ movdqa %xmm1, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pxor %xmm2, %xmm1
+ movdqu %xmm1, 48(%rsi) /* round key 3 */
+ aeskeygenassist $2, %xmm1, %xmm2 /* rcon 0x02 */
+ pshufd $0xff, %xmm2, %xmm2
+ movdqa %xmm0, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pxor %xmm2, %xmm0
+ movdqu %xmm0, 64(%rsi) /* round key 4 */
+ aeskeygenassist $0x00, %xmm0, %xmm2
+ pshufd $0xaa, %xmm2, %xmm2
+ movdqa %xmm1, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pxor %xmm2, %xmm1
+ movdqu %xmm1, 80(%rsi) /* round key 5 */
+ aeskeygenassist $4, %xmm1, %xmm2 /* rcon 0x04 */
+ pshufd $0xff, %xmm2, %xmm2
+ movdqa %xmm0, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pxor %xmm2, %xmm0
+ movdqu %xmm0, 96(%rsi) /* round key 6 */
+ aeskeygenassist $0x00, %xmm0, %xmm2
+ pshufd $0xaa, %xmm2, %xmm2
+ movdqa %xmm1, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pxor %xmm2, %xmm1
+ movdqu %xmm1, 112(%rsi) /* round key 7 */
+ aeskeygenassist $8, %xmm1, %xmm2 /* rcon 0x08 */
+ pshufd $0xff, %xmm2, %xmm2
+ movdqa %xmm0, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pxor %xmm2, %xmm0
+ movdqu %xmm0, 128(%rsi) /* round key 8 */
+ aeskeygenassist $0x00, %xmm0, %xmm2
+ pshufd $0xaa, %xmm2, %xmm2
+ movdqa %xmm1, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pxor %xmm2, %xmm1
+ movdqu %xmm1, 144(%rsi) /* round key 9 */
+ aeskeygenassist $16, %xmm1, %xmm2 /* rcon 0x10 */
+ pshufd $0xff, %xmm2, %xmm2
+ movdqa %xmm0, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pxor %xmm2, %xmm0
+ movdqu %xmm0, 160(%rsi) /* round key 10 */
+ aeskeygenassist $0x00, %xmm0, %xmm2
+ pshufd $0xaa, %xmm2, %xmm2
+ movdqa %xmm1, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pxor %xmm2, %xmm1
+ movdqu %xmm1, 176(%rsi) /* round key 11 */
+ aeskeygenassist $32, %xmm1, %xmm2 /* rcon 0x20 */
+ pshufd $0xff, %xmm2, %xmm2
+ movdqa %xmm0, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pxor %xmm2, %xmm0
+ movdqu %xmm0, 192(%rsi) /* round key 12 */
+ aeskeygenassist $0x00, %xmm0, %xmm2
+ pshufd $0xaa, %xmm2, %xmm2
+ movdqa %xmm1, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pxor %xmm2, %xmm1
+ movdqu %xmm1, 208(%rsi) /* round key 13 */
+ aeskeygenassist $0x40, %xmm1, %xmm2 /* rcon 0x40 */
+ pshufd $0xff, %xmm2, %xmm2
+ movdqa %xmm0, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pxor %xmm2, %xmm0
+ movdqu %xmm0, 224(%rsi) /* round key 14 (last) */
+ repz retq
+#ifndef __APPLE__
+.size AES_256_Key_Expansion_AESNI,.-AES_256_Key_Expansion_AESNI
+#endif /* __APPLE__ */
+#ifndef __APPLE__ /* AES_ECB_encrypt_AESNI: AES-ECB encryption with AES-NI, four 16-byte blocks per iteration, then one block per iteration for the rest. */
+.text /* Registers as used below: rdi = input base, rsi = output base, edx = length in bytes, rcx = round-key base, r8d = value tested against 11 and 13 to pick 10, 12 or 14 rounds. */
+.globl AES_ECB_encrypt_AESNI
+.type AES_ECB_encrypt_AESNI,@function
+.align 16
+AES_ECB_encrypt_AESNI:
+#else
+.section __TEXT,__text
+.globl _AES_ECB_encrypt_AESNI
+.p2align 4
+_AES_ECB_encrypt_AESNI:
+#endif /* __APPLE__ */
+ xorl %eax, %eax /* eax = running byte offset into input and output */
+ cmpl $0x40, %edx
+ movl %edx, %r9d
+ jl L_AES_ECB_encrypt_AESNI_done_64 /* fewer than 64 bytes: skip the 4-block loop. NOTE(review): signed compare, so a length with bit 31 set is treated as negative - confirm callers never pass one. */
+ andl $0xffffffc0, %r9d /* r9d = length rounded down to a multiple of 64 */
+L_AES_ECB_encrypt_AESNI_enc_64:
+ # 64 bytes of input
+ # aes_ecb_enc_64
+ leaq (%rdi,%rax,1), %r10 /* r10 = current input position */
+ leaq (%rsi,%rax,1), %r11 /* r11 = current output position */
+ movdqu (%r10), %xmm0
+ movdqu 16(%r10), %xmm1
+ movdqu 32(%r10), %xmm2
+ movdqu 48(%r10), %xmm3
+ # aes_enc_block
+ movdqu (%rcx), %xmm4 /* round key 0 is XORed into all four blocks */
+ pxor %xmm4, %xmm0
+ pxor %xmm4, %xmm1
+ pxor %xmm4, %xmm2
+ pxor %xmm4, %xmm3
+ movdqu 16(%rcx), %xmm4 /* each following round key is loaded once and applied to the four blocks */
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 32(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 48(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 64(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 80(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 96(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 112(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 128(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 144(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ cmpl $11, %r8d /* r8d < 11: the key at offset 160 is the final round key (10 rounds) */
+ movdqu 160(%rcx), %xmm4 /* the load leaves flags untouched, so the jl below still tests the cmpl above */
+ jl L_AES_ECB_encrypt_AESNI_64_aes_enc_block_last
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 176(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ cmpl $13, %r8d /* r8d < 13: the key at offset 192 is the final round key (12 rounds) */
+ movdqu 192(%rcx), %xmm4
+ jl L_AES_ECB_encrypt_AESNI_64_aes_enc_block_last
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 208(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 224(%rcx), %xmm4 /* otherwise the key at offset 224 is the final round key (14 rounds) */
+L_AES_ECB_encrypt_AESNI_64_aes_enc_block_last:
+ aesenclast %xmm4, %xmm0
+ aesenclast %xmm4, %xmm1
+ aesenclast %xmm4, %xmm2
+ aesenclast %xmm4, %xmm3
+ movdqu %xmm0, (%r11)
+ movdqu %xmm1, 16(%r11)
+ movdqu %xmm2, 32(%r11)
+ movdqu %xmm3, 48(%r11)
+ addl $0x40, %eax
+ cmpl %r9d, %eax
+ jl L_AES_ECB_encrypt_AESNI_enc_64 /* repeat while offset < 64-byte-aligned length */
+L_AES_ECB_encrypt_AESNI_done_64:
+ cmpl %edx, %eax
+ movl %edx, %r9d
+ je L_AES_ECB_encrypt_AESNI_done_enc /* offset equals length (including length 0): nothing left */
+ andl $0xfffffff0, %r9d /* r9d = length rounded down to a multiple of 16 */
+L_AES_ECB_encrypt_AESNI_enc_16: /* NOTE(review): the bound is only checked after the body, so if 1..15 bytes remain on entry a full 16-byte block is still read and written; presumably callers pass whole blocks - confirm. */
+ # 16 bytes of input
+ leaq (%rdi,%rax,1), %r10
+ movdqu (%r10), %xmm0
+ # aes_enc_block
+ pxor (%rcx), %xmm0 /* memory operand of a legacy SSE pxor must be 16-byte aligned; presumably the key schedule is - confirm */
+ movdqu 16(%rcx), %xmm5
+ aesenc %xmm5, %xmm0
+ movdqu 32(%rcx), %xmm5
+ aesenc %xmm5, %xmm0
+ movdqu 48(%rcx), %xmm5
+ aesenc %xmm5, %xmm0
+ movdqu 64(%rcx), %xmm5
+ aesenc %xmm5, %xmm0
+ movdqu 80(%rcx), %xmm5
+ aesenc %xmm5, %xmm0
+ movdqu 96(%rcx), %xmm5
+ aesenc %xmm5, %xmm0
+ movdqu 112(%rcx), %xmm5
+ aesenc %xmm5, %xmm0
+ movdqu 128(%rcx), %xmm5
+ aesenc %xmm5, %xmm0
+ movdqu 144(%rcx), %xmm5
+ aesenc %xmm5, %xmm0
+ cmpl $11, %r8d /* same round-count selection as in the 4-block loop */
+ movdqu 160(%rcx), %xmm5
+ jl L_AES_ECB_encrypt_AESNI_16_aes_enc_block_last
+ aesenc %xmm5, %xmm0
+ movdqu 176(%rcx), %xmm6
+ aesenc %xmm6, %xmm0
+ cmpl $13, %r8d
+ movdqu 192(%rcx), %xmm5
+ jl L_AES_ECB_encrypt_AESNI_16_aes_enc_block_last
+ aesenc %xmm5, %xmm0
+ movdqu 208(%rcx), %xmm6
+ aesenc %xmm6, %xmm0
+ movdqu 224(%rcx), %xmm5
+L_AES_ECB_encrypt_AESNI_16_aes_enc_block_last: /* xmm5 holds the final round key on every path reaching here */
+ aesenclast %xmm5, %xmm0
+ leaq (%rsi,%rax,1), %r10
+ movdqu %xmm0, (%r10)
+ addl $16, %eax
+ cmpl %r9d, %eax
+ jl L_AES_ECB_encrypt_AESNI_enc_16
+L_AES_ECB_encrypt_AESNI_done_enc:
+ repz retq /* eax still holds the final byte offset; whether callers use it is not visible here */
+#ifndef __APPLE__
+.size AES_ECB_encrypt_AESNI,.-AES_ECB_encrypt_AESNI
+#endif /* __APPLE__ */
+#ifndef __APPLE__ /* AES_ECB_decrypt_AESNI: AES-ECB decryption with AES-NI, four 16-byte blocks per iteration, then one block per iteration for the rest. */
+.text /* Registers as used below: rdi = input base, rsi = output base, edx = length in bytes, rcx = round-key base, r8d = value tested against 11 and 13 to pick 10, 12 or 14 rounds. */
+.globl AES_ECB_decrypt_AESNI
+.type AES_ECB_decrypt_AESNI,@function
+.align 16
+AES_ECB_decrypt_AESNI:
+#else
+.section __TEXT,__text
+.globl _AES_ECB_decrypt_AESNI
+.p2align 4
+_AES_ECB_decrypt_AESNI:
+#endif /* __APPLE__ */
+ xorl %eax, %eax /* eax = running byte offset into input and output */
+ cmpl $0x40, %edx
+ movl %edx, %r9d
+ jl L_AES_ECB_decrypt_AESNI_done_64 /* fewer than 64 bytes: skip the 4-block loop. NOTE(review): signed compare, so a length with bit 31 set is treated as negative - confirm callers never pass one. */
+ andl $0xffffffc0, %r9d /* r9d = length rounded down to a multiple of 64 */
+L_AES_ECB_decrypt_AESNI_dec_64:
+ # 64 bytes of input
+ # aes_ecb_dec_64
+ leaq (%rdi,%rax,1), %r10 /* r10 = current input position */
+ leaq (%rsi,%rax,1), %r11 /* r11 = current output position */
+ movdqu (%r10), %xmm0
+ movdqu 16(%r10), %xmm1
+ movdqu 32(%r10), %xmm2
+ movdqu 48(%r10), %xmm3
+ # aes_dec_block
+ movdqu (%rcx), %xmm4 /* first key in the schedule is XORed into all four blocks; keys are consumed in ascending offset order, so presumably the caller supplies a schedule already prepared for aesdec - confirm */
+ pxor %xmm4, %xmm0
+ pxor %xmm4, %xmm1
+ pxor %xmm4, %xmm2
+ pxor %xmm4, %xmm3
+ movdqu 16(%rcx), %xmm4 /* each following round key is loaded once and applied to the four blocks */
+ aesdec %xmm4, %xmm0
+ aesdec %xmm4, %xmm1
+ aesdec %xmm4, %xmm2
+ aesdec %xmm4, %xmm3
+ movdqu 32(%rcx), %xmm4
+ aesdec %xmm4, %xmm0
+ aesdec %xmm4, %xmm1
+ aesdec %xmm4, %xmm2
+ aesdec %xmm4, %xmm3
+ movdqu 48(%rcx), %xmm4
+ aesdec %xmm4, %xmm0
+ aesdec %xmm4, %xmm1
+ aesdec %xmm4, %xmm2
+ aesdec %xmm4, %xmm3
+ movdqu 64(%rcx), %xmm4
+ aesdec %xmm4, %xmm0
+ aesdec %xmm4, %xmm1
+ aesdec %xmm4, %xmm2
+ aesdec %xmm4, %xmm3
+ movdqu 80(%rcx), %xmm4
+ aesdec %xmm4, %xmm0
+ aesdec %xmm4, %xmm1
+ aesdec %xmm4, %xmm2
+ aesdec %xmm4, %xmm3
+ movdqu 96(%rcx), %xmm4
+ aesdec %xmm4, %xmm0
+ aesdec %xmm4, %xmm1
+ aesdec %xmm4, %xmm2
+ aesdec %xmm4, %xmm3
+ movdqu 112(%rcx), %xmm4
+ aesdec %xmm4, %xmm0
+ aesdec %xmm4, %xmm1
+ aesdec %xmm4, %xmm2
+ aesdec %xmm4, %xmm3
+ movdqu 128(%rcx), %xmm4
+ aesdec %xmm4, %xmm0
+ aesdec %xmm4, %xmm1
+ aesdec %xmm4, %xmm2
+ aesdec %xmm4, %xmm3
+ movdqu 144(%rcx), %xmm4
+ aesdec %xmm4, %xmm0
+ aesdec %xmm4, %xmm1
+ aesdec %xmm4, %xmm2
+ aesdec %xmm4, %xmm3
+ cmpl $11, %r8d /* r8d < 11: the key at offset 160 is the final round key (10 rounds) */
+ movdqu 160(%rcx), %xmm4 /* the load leaves flags untouched, so the jl below still tests the cmpl above */
+ jl L_AES_ECB_decrypt_AESNI_64_aes_dec_block_last
+ aesdec %xmm4, %xmm0
+ aesdec %xmm4, %xmm1
+ aesdec %xmm4, %xmm2
+ aesdec %xmm4, %xmm3
+ movdqu 176(%rcx), %xmm4
+ aesdec %xmm4, %xmm0
+ aesdec %xmm4, %xmm1
+ aesdec %xmm4, %xmm2
+ aesdec %xmm4, %xmm3
+ cmpl $13, %r8d /* r8d < 13: the key at offset 192 is the final round key (12 rounds) */
+ movdqu 192(%rcx), %xmm4
+ jl L_AES_ECB_decrypt_AESNI_64_aes_dec_block_last
+ aesdec %xmm4, %xmm0
+ aesdec %xmm4, %xmm1
+ aesdec %xmm4, %xmm2
+ aesdec %xmm4, %xmm3
+ movdqu 208(%rcx), %xmm4
+ aesdec %xmm4, %xmm0
+ aesdec %xmm4, %xmm1
+ aesdec %xmm4, %xmm2
+ aesdec %xmm4, %xmm3
+ movdqu 224(%rcx), %xmm4 /* otherwise the key at offset 224 is the final round key (14 rounds) */
+L_AES_ECB_decrypt_AESNI_64_aes_dec_block_last:
+ aesdeclast %xmm4, %xmm0
+ aesdeclast %xmm4, %xmm1
+ aesdeclast %xmm4, %xmm2
+ aesdeclast %xmm4, %xmm3
+ movdqu %xmm0, (%r11)
+ movdqu %xmm1, 16(%r11)
+ movdqu %xmm2, 32(%r11)
+ movdqu %xmm3, 48(%r11)
+ addl $0x40, %eax
+ cmpl %r9d, %eax
+ jl L_AES_ECB_decrypt_AESNI_dec_64 /* repeat while offset < 64-byte-aligned length */
+L_AES_ECB_decrypt_AESNI_done_64:
+ cmpl %edx, %eax
+ movl %edx, %r9d
+ je L_AES_ECB_decrypt_AESNI_done_dec /* offset equals length (including length 0): nothing left */
+ andl $0xfffffff0, %r9d /* r9d = length rounded down to a multiple of 16 */
+L_AES_ECB_decrypt_AESNI_dec_16: /* NOTE(review): the bound is only checked after the body, so if 1..15 bytes remain on entry a full 16-byte block is still read and written; presumably callers pass whole blocks - confirm. */
+ # 16 bytes of input
+ leaq (%rdi,%rax,1), %r10
+ movdqu (%r10), %xmm0
+ # aes_dec_block
+ pxor (%rcx), %xmm0 /* memory operand of a legacy SSE pxor must be 16-byte aligned; presumably the key schedule is - confirm */
+ movdqu 16(%rcx), %xmm5
+ aesdec %xmm5, %xmm0
+ movdqu 32(%rcx), %xmm5
+ aesdec %xmm5, %xmm0
+ movdqu 48(%rcx), %xmm5
+ aesdec %xmm5, %xmm0
+ movdqu 64(%rcx), %xmm5
+ aesdec %xmm5, %xmm0
+ movdqu 80(%rcx), %xmm5
+ aesdec %xmm5, %xmm0
+ movdqu 96(%rcx), %xmm5
+ aesdec %xmm5, %xmm0
+ movdqu 112(%rcx), %xmm5
+ aesdec %xmm5, %xmm0
+ movdqu 128(%rcx), %xmm5
+ aesdec %xmm5, %xmm0
+ movdqu 144(%rcx), %xmm5
+ aesdec %xmm5, %xmm0
+ cmpl $11, %r8d /* same round-count selection as in the 4-block loop */
+ movdqu 160(%rcx), %xmm5
+ jl L_AES_ECB_decrypt_AESNI_16_aes_dec_block_last
+ aesdec %xmm5, %xmm0
+ movdqu 176(%rcx), %xmm6
+ aesdec %xmm6, %xmm0
+ cmpl $13, %r8d
+ movdqu 192(%rcx), %xmm5
+ jl L_AES_ECB_decrypt_AESNI_16_aes_dec_block_last
+ aesdec %xmm5, %xmm0
+ movdqu 208(%rcx), %xmm6
+ aesdec %xmm6, %xmm0
+ movdqu 224(%rcx), %xmm5
+L_AES_ECB_decrypt_AESNI_16_aes_dec_block_last: /* xmm5 holds the final round key on every path reaching here */
+ aesdeclast %xmm5, %xmm0
+ leaq (%rsi,%rax,1), %r10
+ movdqu %xmm0, (%r10)
+ addl $16, %eax
+ cmpl %r9d, %eax
+ jl L_AES_ECB_decrypt_AESNI_dec_16
+L_AES_ECB_decrypt_AESNI_done_dec:
+ repz retq /* eax still holds the final byte offset; whether callers use it is not visible here */
+#ifndef __APPLE__
+.size AES_ECB_decrypt_AESNI,.-AES_ECB_decrypt_AESNI
+#endif /* __APPLE__ */
+#ifndef __APPLE__ /* AES_CBC_encrypt_AESNI: AES-CBC encryption with AES-NI, one 16-byte block per iteration because each block depends on the previous ciphertext. */
+.text /* Registers as used below: rdi = input base, rsi = output base, rdx = 16-byte chaining value (read at entry, rewritten at exit), ecx = length in bytes, r8 = round-key base, r9d = value tested against 11 and 13 to pick 10, 12 or 14 rounds. */
+.globl AES_CBC_encrypt_AESNI
+.type AES_CBC_encrypt_AESNI,@function
+.align 16
+AES_CBC_encrypt_AESNI:
+#else
+.section __TEXT,__text
+.globl _AES_CBC_encrypt_AESNI
+.p2align 4
+_AES_CBC_encrypt_AESNI:
+#endif /* __APPLE__ */
+ movdqu (%rdx), %xmm0 /* xmm0 = chaining value XORed into the next plaintext block */
+ xorl %eax, %eax /* eax = running byte offset */
+ cmpl %ecx, %eax
+ je L_AES_CBC_encrypt_AESNI_done /* zero length: the chaining value is stored back unchanged */
+L_AES_CBC_encrypt_AESNI_loop: /* NOTE(review): the bound is checked after the body with a signed compare, so a length that is not a multiple of 16 still processes a full final block; presumably callers pass whole blocks - confirm. */
+ # 16 bytes of input
+ leaq (%rdi,%rax,1), %r10
+ movdqu (%r10), %xmm1
+ pxor %xmm0, %xmm1 /* plaintext XOR previous ciphertext (or initial chaining value) */
+ # aes_enc_block
+ pxor (%r8), %xmm1 /* memory operand of a legacy SSE pxor must be 16-byte aligned; presumably the key schedule is - confirm */
+ movdqu 16(%r8), %xmm3
+ aesenc %xmm3, %xmm1
+ movdqu 32(%r8), %xmm3
+ aesenc %xmm3, %xmm1
+ movdqu 48(%r8), %xmm3
+ aesenc %xmm3, %xmm1
+ movdqu 64(%r8), %xmm3
+ aesenc %xmm3, %xmm1
+ movdqu 80(%r8), %xmm3
+ aesenc %xmm3, %xmm1
+ movdqu 96(%r8), %xmm3
+ aesenc %xmm3, %xmm1
+ movdqu 112(%r8), %xmm3
+ aesenc %xmm3, %xmm1
+ movdqu 128(%r8), %xmm3
+ aesenc %xmm3, %xmm1
+ movdqu 144(%r8), %xmm3
+ aesenc %xmm3, %xmm1
+ cmpl $11, %r9d /* r9d < 11: the key at offset 160 is the final round key (10 rounds) */
+ movdqu 160(%r8), %xmm3 /* the load leaves flags untouched, so the jl below still tests the cmpl above */
+ jl L_AES_CBC_encrypt_AESNI_aes_enc_block_last
+ aesenc %xmm3, %xmm1
+ movdqu 176(%r8), %xmm4
+ aesenc %xmm4, %xmm1
+ cmpl $13, %r9d /* r9d < 13: the key at offset 192 is the final round key (12 rounds) */
+ movdqu 192(%r8), %xmm3
+ jl L_AES_CBC_encrypt_AESNI_aes_enc_block_last
+ aesenc %xmm3, %xmm1
+ movdqu 208(%r8), %xmm4
+ aesenc %xmm4, %xmm1
+ movdqu 224(%r8), %xmm3 /* otherwise the key at offset 224 is the final round key (14 rounds) */
+L_AES_CBC_encrypt_AESNI_aes_enc_block_last: /* xmm3 holds the final round key on every path reaching here */
+ aesenclast %xmm3, %xmm1
+ leaq (%rsi,%rax,1), %r11
+ movdqu %xmm1, (%r11)
+ movdqa %xmm1, %xmm0 /* this ciphertext block becomes the next chaining value */
+ addl $16, %eax
+ cmpl %ecx, %eax
+ jl L_AES_CBC_encrypt_AESNI_loop
+L_AES_CBC_encrypt_AESNI_done:
+ movdqu %xmm0, (%rdx) /* write the last ciphertext block back as the updated chaining value */
+ repz retq
+#ifndef __APPLE__
+.size AES_CBC_encrypt_AESNI,.-AES_CBC_encrypt_AESNI
+#endif /* __APPLE__ */
+#ifndef __APPLE__ /* AES_CBC_decrypt_AESNI: AES-CBC decryption with AES-NI, four 16-byte blocks per iteration, then one block per iteration for the rest. */
+.text /* Registers as used below: rdi = input base, rsi = output base, rdx = 16-byte chaining value (read at entry, rewritten at exit), ecx = length in bytes, r8 = round-key base, r9d = value tested against 11 and 13 to pick 10, 12 or 14 rounds. */
+.globl AES_CBC_decrypt_AESNI
+.type AES_CBC_decrypt_AESNI,@function
+.align 16
+AES_CBC_decrypt_AESNI:
+#else
+.section __TEXT,__text
+.globl _AES_CBC_decrypt_AESNI
+.p2align 4
+_AES_CBC_decrypt_AESNI:
+#endif /* __APPLE__ */
+ pushq %r12 /* r12 is used as the output pointer in the 4-block loop and restored before return */
+ movdqu (%rdx), %xmm4 /* xmm4 = chaining value: the ciphertext block preceding the one being decrypted */
+ xorl %eax, %eax /* eax = running byte offset */
+ cmpl $0x40, %ecx
+ movl %ecx, %r10d
+ jl L_AES_CBC_decrypt_AESNI_done_64 /* fewer than 64 bytes: skip the 4-block loop. NOTE(review): signed compare, so a length with bit 31 set is treated as negative - confirm callers never pass one. */
+ andl $0xffffffc0, %r10d /* r10d = length rounded down to a multiple of 64 */
+L_AES_CBC_decrypt_AESNI_dec_64:
+ # 64 bytes of input
+ # aes_cbc_dec_64
+ leaq (%rdi,%rax,1), %r11 /* r11 = current input position */
+ leaq (%rsi,%rax,1), %r12 /* r12 = current output position */
+ movdqu (%r11), %xmm0
+ movdqu 16(%r11), %xmm1
+ movdqu 32(%r11), %xmm2
+ movdqu 48(%r11), %xmm3
+ # aes_dec_block
+ movdqu (%r8), %xmm5 /* first key in the schedule is XORed into all four blocks; keys are consumed in ascending offset order, so presumably the caller supplies a schedule already prepared for aesdec - confirm */
+ pxor %xmm5, %xmm0
+ pxor %xmm5, %xmm1
+ pxor %xmm5, %xmm2
+ pxor %xmm5, %xmm3
+ movdqu 16(%r8), %xmm5
+ aesdec %xmm5, %xmm0
+ aesdec %xmm5, %xmm1
+ aesdec %xmm5, %xmm2
+ aesdec %xmm5, %xmm3
+ movdqu 32(%r8), %xmm5
+ aesdec %xmm5, %xmm0
+ aesdec %xmm5, %xmm1
+ aesdec %xmm5, %xmm2
+ aesdec %xmm5, %xmm3
+ movdqu 48(%r8), %xmm5
+ aesdec %xmm5, %xmm0
+ aesdec %xmm5, %xmm1
+ aesdec %xmm5, %xmm2
+ aesdec %xmm5, %xmm3
+ movdqu 64(%r8), %xmm5
+ aesdec %xmm5, %xmm0
+ aesdec %xmm5, %xmm1
+ aesdec %xmm5, %xmm2
+ aesdec %xmm5, %xmm3
+ movdqu 80(%r8), %xmm5
+ aesdec %xmm5, %xmm0
+ aesdec %xmm5, %xmm1
+ aesdec %xmm5, %xmm2
+ aesdec %xmm5, %xmm3
+ movdqu 96(%r8), %xmm5
+ aesdec %xmm5, %xmm0
+ aesdec %xmm5, %xmm1
+ aesdec %xmm5, %xmm2
+ aesdec %xmm5, %xmm3
+ movdqu 112(%r8), %xmm5
+ aesdec %xmm5, %xmm0
+ aesdec %xmm5, %xmm1
+ aesdec %xmm5, %xmm2
+ aesdec %xmm5, %xmm3
+ movdqu 128(%r8), %xmm5
+ aesdec %xmm5, %xmm0
+ aesdec %xmm5, %xmm1
+ aesdec %xmm5, %xmm2
+ aesdec %xmm5, %xmm3
+ movdqu 144(%r8), %xmm5
+ aesdec %xmm5, %xmm0
+ aesdec %xmm5, %xmm1
+ aesdec %xmm5, %xmm2
+ aesdec %xmm5, %xmm3
+ cmpl $11, %r9d /* r9d < 11: the key at offset 160 is the final round key (10 rounds) */
+ movdqu 160(%r8), %xmm5 /* the load leaves flags untouched, so the jl below still tests the cmpl above */
+ jl L_AES_CBC_decrypt_AESNI_64_aes_dec_block_last
+ aesdec %xmm5, %xmm0
+ aesdec %xmm5, %xmm1
+ aesdec %xmm5, %xmm2
+ aesdec %xmm5, %xmm3
+ movdqu 176(%r8), %xmm5
+ aesdec %xmm5, %xmm0
+ aesdec %xmm5, %xmm1
+ aesdec %xmm5, %xmm2
+ aesdec %xmm5, %xmm3
+ cmpl $13, %r9d /* r9d < 13: the key at offset 192 is the final round key (12 rounds) */
+ movdqu 192(%r8), %xmm5
+ jl L_AES_CBC_decrypt_AESNI_64_aes_dec_block_last
+ aesdec %xmm5, %xmm0
+ aesdec %xmm5, %xmm1
+ aesdec %xmm5, %xmm2
+ aesdec %xmm5, %xmm3
+ movdqu 208(%r8), %xmm5
+ aesdec %xmm5, %xmm0
+ aesdec %xmm5, %xmm1
+ aesdec %xmm5, %xmm2
+ aesdec %xmm5, %xmm3
+ movdqu 224(%r8), %xmm5 /* otherwise the key at offset 224 is the final round key (14 rounds) */
+L_AES_CBC_decrypt_AESNI_64_aes_dec_block_last:
+ aesdeclast %xmm5, %xmm0
+ aesdeclast %xmm5, %xmm1
+ aesdeclast %xmm5, %xmm2
+ aesdeclast %xmm5, %xmm3
+ pxor %xmm4, %xmm0 /* block 0 is XORed with the carried-in chaining value */
+ movdqu (%r11), %xmm5 /* blocks 1..3 are XORed with the preceding ciphertext block, re-read from the input */
+ pxor %xmm5, %xmm1
+ movdqu 16(%r11), %xmm5
+ pxor %xmm5, %xmm2
+ movdqu 32(%r11), %xmm5
+ pxor %xmm5, %xmm3
+ movdqu 48(%r11), %xmm4 /* last ciphertext block of this group becomes the next chaining value */
+ movdqu %xmm0, (%r12) /* all input re-reads above happen before any store, so the ordering stays correct if output overlaps input exactly */
+ movdqu %xmm1, 16(%r12)
+ movdqu %xmm2, 32(%r12)
+ movdqu %xmm3, 48(%r12)
+ addl $0x40, %eax
+ cmpl %r10d, %eax
+ jl L_AES_CBC_decrypt_AESNI_dec_64
+L_AES_CBC_decrypt_AESNI_done_64:
+ cmpl %ecx, %eax
+ movl %ecx, %r10d
+ je L_AES_CBC_decrypt_AESNI_done_dec /* offset equals length (including length 0): nothing left */
+ andl $0xfffffff0, %r10d /* r10d = length rounded down to a multiple of 16 */
+L_AES_CBC_decrypt_AESNI_dec_16: /* NOTE(review): the bound is only checked after the body, so if 1..15 bytes remain on entry a full 16-byte block is still read and written; presumably callers pass whole blocks - confirm. */
+ # 16 bytes of input
+ leaq (%rdi,%rax,1), %r11
+ movdqu (%r11), %xmm0
+ movdqa %xmm0, %xmm8 /* keep a copy of the ciphertext block to use as the next chaining value */
+ # aes_dec_block
+ pxor (%r8), %xmm0 /* memory operand of a legacy SSE pxor must be 16-byte aligned; presumably the key schedule is - confirm */
+ movdqu 16(%r8), %xmm6
+ aesdec %xmm6, %xmm0
+ movdqu 32(%r8), %xmm6
+ aesdec %xmm6, %xmm0
+ movdqu 48(%r8), %xmm6
+ aesdec %xmm6, %xmm0
+ movdqu 64(%r8), %xmm6
+ aesdec %xmm6, %xmm0
+ movdqu 80(%r8), %xmm6
+ aesdec %xmm6, %xmm0
+ movdqu 96(%r8), %xmm6
+ aesdec %xmm6, %xmm0
+ movdqu 112(%r8), %xmm6
+ aesdec %xmm6, %xmm0
+ movdqu 128(%r8), %xmm6
+ aesdec %xmm6, %xmm0
+ movdqu 144(%r8), %xmm6
+ aesdec %xmm6, %xmm0
+ cmpl $11, %r9d /* same round-count selection as in the 4-block loop */
+ movdqu 160(%r8), %xmm6
+ jl L_AES_CBC_decrypt_AESNI_16_aes_dec_block_last
+ aesdec %xmm6, %xmm0
+ movdqu 176(%r8), %xmm7
+ aesdec %xmm7, %xmm0
+ cmpl $13, %r9d
+ movdqu 192(%r8), %xmm6
+ jl L_AES_CBC_decrypt_AESNI_16_aes_dec_block_last
+ aesdec %xmm6, %xmm0
+ movdqu 208(%r8), %xmm7
+ aesdec %xmm7, %xmm0
+ movdqu 224(%r8), %xmm6
+L_AES_CBC_decrypt_AESNI_16_aes_dec_block_last: /* xmm6 holds the final round key on every path reaching here */
+ aesdeclast %xmm6, %xmm0
+ pxor %xmm4, %xmm0 /* XOR with the previous ciphertext block to recover the plaintext */
+ movdqa %xmm8, %xmm4 /* the saved ciphertext copy becomes the chaining value */
+ leaq (%rsi,%rax,1), %r11
+ movdqu %xmm0, (%r11)
+ addl $16, %eax
+ cmpl %r10d, %eax
+ jl L_AES_CBC_decrypt_AESNI_dec_16
+L_AES_CBC_decrypt_AESNI_done_dec:
+ movdqu %xmm4, (%rdx) /* write the last ciphertext block back as the updated chaining value */
+ popq %r12
+ repz retq
+#ifndef __APPLE__
+.size AES_CBC_decrypt_AESNI,.-AES_CBC_decrypt_AESNI
+#endif /* __APPLE__ */
+#ifndef __APPLE__ /* L_aes_ctr_aesni_bswap: 16-byte aligned pshufb mask used by AES_CTR_encrypt_AESNI. */
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_aes_ctr_aesni_bswap:
+.quad 0x08090a0b0c0d0e0f,0x0001020304050607 /* in memory the mask bytes are 15,14,...,1,0, so pshufb with it reverses all 16 bytes of a register */
+#ifndef __APPLE__ /* L_aes_ctr_aesni_one: 16-byte aligned constant added with paddq to step the counter in AES_CTR_encrypt_AESNI. */
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_aes_ctr_aesni_one:
+.quad 0x0000000000000001,0x0000000000000000 /* low quadword = 1, high quadword = 0 */
+#ifndef __APPLE__ /* AES_CTR_encrypt_AESNI: AES-CTR with AES-NI; encrypts successive counter values and XORs them with the input, four blocks per iteration then one block per iteration. */
+.text /* Registers as used below: rdi = input base, rsi = output base, edx = length in bytes, rcx = round-key base, r8d = value tested against 11 and 13 to pick 10, 12 or 14 rounds, r9 = 16-byte counter block (read at entry, rewritten at exit). */
+.globl AES_CTR_encrypt_AESNI
+.type AES_CTR_encrypt_AESNI,@function
+.align 16
+AES_CTR_encrypt_AESNI:
+#else
+.section __TEXT,__text
+.globl _AES_CTR_encrypt_AESNI
+.p2align 4
+_AES_CTR_encrypt_AESNI:
+#endif /* __APPLE__ */
+ pushq %rbx /* rbx is used as the output pointer in the 4-block loop and restored before return */
+ movdqu L_aes_ctr_aesni_bswap(%rip), %xmm8 /* xmm8 = byte-reversal shuffle mask */
+ movdqu L_aes_ctr_aesni_one(%rip), %xmm9 /* xmm9 = 1 in the low quadword */
+ pxor %xmm10, %xmm10 /* xmm10 = zero, compared against to detect low-quadword wrap */
+ movdqu (%r9), %xmm7
+ pshufb %xmm8, %xmm7 /* xmm7 = counter with bytes reversed so paddq can increment it as an integer */
+ xorl %eax, %eax /* eax = running byte offset */
+ cmpl $0x40, %edx
+ movl %edx, %r10d
+ jl L_AES_CTR_encrypt_AESNI_done_64 /* fewer than 64 bytes: skip the 4-block loop. NOTE(review): signed compare, so a length with bit 31 set is treated as negative - confirm callers never pass one. */
+ andl $0xffffffc0, %r10d /* r10d = length rounded down to a multiple of 64 */
+L_AES_CTR_encrypt_AESNI_enc_64:
+ # 64 bytes of input
+ # aes_ctr_enc_64
+ leaq (%rdi,%rax,1), %r11 /* r11 = current input position */
+ leaq (%rsi,%rax,1), %rbx /* rbx = current output position */
+ movdqa %xmm7, %xmm0 /* block 0 uses the current counter value */
+ pshufb %xmm8, %xmm0 /* reverse back to the stored byte order before encrypting */
+ paddq %xmm9, %xmm7 /* low 64 bits of the counter += 1 */
+ movdqa %xmm7, %xmm11
+ pcmpeqq %xmm10, %xmm11 /* all-ones in each quadword of the counter that is now zero */
+ pslldq $8, %xmm11 /* move the low-quadword result into the high quadword, clearing the low one */
+ psrlq $63, %xmm11 /* reduce it to 1 when the low quadword wrapped to zero, else 0 */
+ paddq %xmm11, %xmm7 /* propagate that carry into the high 64 bits */
+ movdqa %xmm7, %xmm1 /* same take-then-increment sequence for blocks 1, 2 and 3 */
+ pshufb %xmm8, %xmm1
+ paddq %xmm9, %xmm7
+ movdqa %xmm7, %xmm11
+ pcmpeqq %xmm10, %xmm11
+ pslldq $8, %xmm11
+ psrlq $63, %xmm11
+ paddq %xmm11, %xmm7
+ movdqa %xmm7, %xmm2
+ pshufb %xmm8, %xmm2
+ paddq %xmm9, %xmm7
+ movdqa %xmm7, %xmm11
+ pcmpeqq %xmm10, %xmm11
+ pslldq $8, %xmm11
+ psrlq $63, %xmm11
+ paddq %xmm11, %xmm7
+ movdqa %xmm7, %xmm3
+ pshufb %xmm8, %xmm3
+ paddq %xmm9, %xmm7
+ movdqa %xmm7, %xmm11
+ pcmpeqq %xmm10, %xmm11
+ pslldq $8, %xmm11
+ psrlq $63, %xmm11
+ paddq %xmm11, %xmm7
+ # aes_enc_block
+ movdqu (%rcx), %xmm4 /* round key 0 is XORed into all four counter blocks */
+ pxor %xmm4, %xmm0
+ pxor %xmm4, %xmm1
+ pxor %xmm4, %xmm2
+ pxor %xmm4, %xmm3
+ movdqu 16(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 32(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 48(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 64(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 80(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 96(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 112(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 128(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 144(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ cmpl $11, %r8d /* r8d < 11: the key at offset 160 is the final round key (10 rounds) */
+ movdqu 160(%rcx), %xmm4 /* the load leaves flags untouched, so the jl below still tests the cmpl above */
+ jl L_AES_CTR_encrypt_AESNI_64_aes_enc_block_last
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 176(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ cmpl $13, %r8d /* r8d < 13: the key at offset 192 is the final round key (12 rounds) */
+ movdqu 192(%rcx), %xmm4
+ jl L_AES_CTR_encrypt_AESNI_64_aes_enc_block_last
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 208(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 224(%rcx), %xmm4 /* otherwise the key at offset 224 is the final round key (14 rounds) */
+L_AES_CTR_encrypt_AESNI_64_aes_enc_block_last:
+ aesenclast %xmm4, %xmm0
+ aesenclast %xmm4, %xmm1
+ aesenclast %xmm4, %xmm2
+ aesenclast %xmm4, %xmm3
+ movdqu (%r11), %xmm4 /* XOR each encrypted counter block with the matching 16 input bytes */
+ pxor %xmm4, %xmm0
+ movdqu 16(%r11), %xmm4
+ pxor %xmm4, %xmm1
+ movdqu 32(%r11), %xmm4
+ pxor %xmm4, %xmm2
+ movdqu 48(%r11), %xmm4
+ pxor %xmm4, %xmm3
+ movdqu %xmm0, (%rbx)
+ movdqu %xmm1, 16(%rbx)
+ movdqu %xmm2, 32(%rbx)
+ movdqu %xmm3, 48(%rbx)
+ addl $0x40, %eax
+ cmpl %r10d, %eax
+ jl L_AES_CTR_encrypt_AESNI_enc_64
+L_AES_CTR_encrypt_AESNI_done_64:
+ cmpl %edx, %eax
+ movl %edx, %r10d
+ je L_AES_CTR_encrypt_AESNI_done_enc /* offset equals length (including length 0): nothing left */
+ andl $0xfffffff0, %r10d /* r10d = length rounded down to a multiple of 16 */
+L_AES_CTR_encrypt_AESNI_enc_16: /* NOTE(review): the bound is only checked after the body, so if 1..15 bytes remain on entry a full 16-byte block is still read and written; presumably partial blocks are handled by the caller - confirm. */
+ # 16 bytes of input
+ movdqa %xmm7, %xmm0 /* take the current counter, then increment it with carry as in the 4-block loop */
+ pshufb %xmm8, %xmm0
+ paddq %xmm9, %xmm7
+ movdqa %xmm7, %xmm11
+ pcmpeqq %xmm10, %xmm11
+ pslldq $8, %xmm11
+ psrlq $63, %xmm11
+ paddq %xmm11, %xmm7
+ # aes_enc_block
+ pxor (%rcx), %xmm0 /* memory operand of a legacy SSE pxor must be 16-byte aligned; presumably the key schedule is - confirm */
+ movdqu 16(%rcx), %xmm5
+ aesenc %xmm5, %xmm0
+ movdqu 32(%rcx), %xmm5
+ aesenc %xmm5, %xmm0
+ movdqu 48(%rcx), %xmm5
+ aesenc %xmm5, %xmm0
+ movdqu 64(%rcx), %xmm5
+ aesenc %xmm5, %xmm0
+ movdqu 80(%rcx), %xmm5
+ aesenc %xmm5, %xmm0
+ movdqu 96(%rcx), %xmm5
+ aesenc %xmm5, %xmm0
+ movdqu 112(%rcx), %xmm5
+ aesenc %xmm5, %xmm0
+ movdqu 128(%rcx), %xmm5
+ aesenc %xmm5, %xmm0
+ movdqu 144(%rcx), %xmm5
+ aesenc %xmm5, %xmm0
+ cmpl $11, %r8d /* same round-count selection as in the 4-block loop */
+ movdqu 160(%rcx), %xmm5
+ jl L_AES_CTR_encrypt_AESNI_16_aes_enc_block_last
+ aesenc %xmm5, %xmm0
+ movdqu 176(%rcx), %xmm6
+ aesenc %xmm6, %xmm0
+ cmpl $13, %r8d
+ movdqu 192(%rcx), %xmm5
+ jl L_AES_CTR_encrypt_AESNI_16_aes_enc_block_last
+ aesenc %xmm5, %xmm0
+ movdqu 208(%rcx), %xmm6
+ aesenc %xmm6, %xmm0
+ movdqu 224(%rcx), %xmm5
+L_AES_CTR_encrypt_AESNI_16_aes_enc_block_last: /* xmm5 holds the final round key on every path reaching here */
+ aesenclast %xmm5, %xmm0
+ leaq (%rdi,%rax,1), %r11
+ movdqu (%r11), %xmm4
+ pxor %xmm4, %xmm0 /* encrypted counter XOR input block */
+ leaq (%rsi,%rax,1), %r11
+ movdqu %xmm0, (%r11)
+ addl $16, %eax
+ cmpl %r10d, %eax
+ jl L_AES_CTR_encrypt_AESNI_enc_16
+L_AES_CTR_encrypt_AESNI_done_enc:
+ pshufb %xmm8, %xmm7 /* restore the stored byte order of the advanced counter */
+ movdqu %xmm7, (%r9) /* write the next unused counter value back for the caller */
+ popq %rbx
+ repz retq
+#ifndef __APPLE__
+.size AES_CTR_encrypt_AESNI,.-AES_CTR_encrypt_AESNI
+#endif /* __APPLE__ */
+#ifdef HAVE_INTEL_AVX1
+#ifndef __APPLE__ /* AES_ECB_encrypt_avx1: AES-ECB encryption using VEX-encoded AES instructions on xmm registers, four 16-byte blocks per iteration, then one block per iteration for the rest. */
+.text /* Registers as used below: rdi = input base, rsi = output base, edx = length in bytes, rcx = round-key base, r8d = value tested against 11 and 13 to pick 10, 12 or 14 rounds. */
+.globl AES_ECB_encrypt_avx1
+.type AES_ECB_encrypt_avx1,@function
+.align 16
+AES_ECB_encrypt_avx1:
+#else
+.section __TEXT,__text
+.globl _AES_ECB_encrypt_avx1
+.p2align 4
+_AES_ECB_encrypt_avx1:
+#endif /* __APPLE__ */
+ xorl %eax, %eax /* eax = running byte offset into input and output */
+ cmpl $0x40, %edx
+ movl %edx, %r9d
+ jl L_AES_ECB_encrypt_avx1_done_64 /* fewer than 64 bytes: skip the 4-block loop. NOTE(review): signed compare, so a length with bit 31 set is treated as negative - confirm callers never pass one. */
+ andl $0xffffffc0, %r9d /* r9d = length rounded down to a multiple of 64 */
+L_AES_ECB_encrypt_avx1_enc_64:
+ # 64 bytes of input
+ # aes_ecb_enc_64
+ leaq (%rdi,%rax,1), %r10 /* r10 = current input position */
+ leaq (%rsi,%rax,1), %r11 /* r11 = current output position */
+ vmovdqu (%r10), %xmm0
+ vmovdqu 16(%r10), %xmm1
+ vmovdqu 32(%r10), %xmm2
+ vmovdqu 48(%r10), %xmm3
+ # aes_enc_block
+ vmovdqu (%rcx), %xmm4 /* round key 0 is XORed into all four blocks */
+ vpxor %xmm4, %xmm0, %xmm0
+ vpxor %xmm4, %xmm1, %xmm1
+ vpxor %xmm4, %xmm2, %xmm2
+ vpxor %xmm4, %xmm3, %xmm3
+ vmovdqu 16(%rcx), %xmm4 /* each following round key is loaded once and applied to the four blocks */
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ vmovdqu 32(%rcx), %xmm4
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ vmovdqu 48(%rcx), %xmm4
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ vmovdqu 64(%rcx), %xmm4
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ vmovdqu 80(%rcx), %xmm4
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ vmovdqu 96(%rcx), %xmm4
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ vmovdqu 112(%rcx), %xmm4
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ vmovdqu 128(%rcx), %xmm4
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ vmovdqu 144(%rcx), %xmm4
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ cmpl $11, %r8d /* r8d < 11: the key at offset 160 is the final round key (10 rounds) */
+ vmovdqu 160(%rcx), %xmm4 /* the load leaves flags untouched, so the jl below still tests the cmpl above */
+ jl L_AES_ECB_encrypt_avx1_64_aes_enc_block_last
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ vmovdqu 176(%rcx), %xmm4
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ cmpl $13, %r8d /* r8d < 13: the key at offset 192 is the final round key (12 rounds) */
+ vmovdqu 192(%rcx), %xmm4
+ jl L_AES_ECB_encrypt_avx1_64_aes_enc_block_last
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ vmovdqu 208(%rcx), %xmm4
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ vmovdqu 224(%rcx), %xmm4 /* otherwise the key at offset 224 is the final round key (14 rounds) */
+L_AES_ECB_encrypt_avx1_64_aes_enc_block_last:
+ vaesenclast %xmm4, %xmm0, %xmm0
+ vaesenclast %xmm4, %xmm1, %xmm1
+ vaesenclast %xmm4, %xmm2, %xmm2
+ vaesenclast %xmm4, %xmm3, %xmm3
+ vmovdqu %xmm0, (%r11)
+ vmovdqu %xmm1, 16(%r11)
+ vmovdqu %xmm2, 32(%r11)
+ vmovdqu %xmm3, 48(%r11)
+ addl $0x40, %eax
+ cmpl %r9d, %eax
+ jl L_AES_ECB_encrypt_avx1_enc_64 /* repeat while offset < 64-byte-aligned length */
+L_AES_ECB_encrypt_avx1_done_64:
+ cmpl %edx, %eax
+ movl %edx, %r9d
+ je L_AES_ECB_encrypt_avx1_done_enc /* offset equals length (including length 0): nothing left */
+ andl $0xfffffff0, %r9d /* r9d = length rounded down to a multiple of 16 */
+L_AES_ECB_encrypt_avx1_enc_16: /* NOTE(review): the bound is only checked after the body, so if 1..15 bytes remain on entry a full 16-byte block is still read and written; presumably callers pass whole blocks - confirm. */
+ # 16 bytes of input
+ leaq (%rdi,%rax,1), %r10
+ vmovdqu (%r10), %xmm0
+ # aes_enc_block
+ vpxor (%rcx), %xmm0, %xmm0 /* round key 0 taken directly from memory */
+ vmovdqu 16(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 32(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 48(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 64(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 80(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 96(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 112(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 128(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 144(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ cmpl $11, %r8d /* same round-count selection as in the 4-block loop */
+ vmovdqu 160(%rcx), %xmm5
+ jl L_AES_ECB_encrypt_avx1_16_aes_enc_block_last
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 176(%rcx), %xmm6
+ vaesenc %xmm6, %xmm0, %xmm0
+ cmpl $13, %r8d
+ vmovdqu 192(%rcx), %xmm5
+ jl L_AES_ECB_encrypt_avx1_16_aes_enc_block_last
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 208(%rcx), %xmm6
+ vaesenc %xmm6, %xmm0, %xmm0
+ vmovdqu 224(%rcx), %xmm5
+L_AES_ECB_encrypt_avx1_16_aes_enc_block_last: /* xmm5 holds the final round key on every path reaching here */
+ vaesenclast %xmm5, %xmm0, %xmm0
+ leaq (%rsi,%rax,1), %r10
+ vmovdqu %xmm0, (%r10)
+ addl $16, %eax
+ cmpl %r9d, %eax
+ jl L_AES_ECB_encrypt_avx1_enc_16
+L_AES_ECB_encrypt_avx1_done_enc:
+ repz retq /* eax still holds the final byte offset; whether callers use it is not visible here */
+#ifndef __APPLE__
+.size AES_ECB_encrypt_avx1,.-AES_ECB_encrypt_avx1
+#endif /* __APPLE__ */
+#ifndef __APPLE__ /* AES_ECB_decrypt_avx1: AES-ECB decryption using VEX-encoded AES instructions on xmm registers, four 16-byte blocks per iteration, then one block per iteration for the rest. */
+.text /* Registers as used below: rdi = input base, rsi = output base, edx = length in bytes, rcx = round-key base, r8d = value tested against 11 and 13 to pick 10, 12 or 14 rounds. */
+.globl AES_ECB_decrypt_avx1
+.type AES_ECB_decrypt_avx1,@function
+.align 16
+AES_ECB_decrypt_avx1:
+#else
+.section __TEXT,__text
+.globl _AES_ECB_decrypt_avx1
+.p2align 4
+_AES_ECB_decrypt_avx1:
+#endif /* __APPLE__ */
+ xorl %eax, %eax /* eax = running byte offset into input and output */
+ cmpl $0x40, %edx
+ movl %edx, %r9d
+ jl L_AES_ECB_decrypt_avx1_done_64 /* fewer than 64 bytes: skip the 4-block loop. NOTE(review): signed compare, so a length with bit 31 set is treated as negative - confirm callers never pass one. */
+ andl $0xffffffc0, %r9d /* r9d = length rounded down to a multiple of 64 */
+L_AES_ECB_decrypt_avx1_dec_64:
+ # 64 bytes of input
+ # aes_ecb_dec_64
+ leaq (%rdi,%rax,1), %r10 /* r10 = current input position */
+ leaq (%rsi,%rax,1), %r11 /* r11 = current output position */
+ vmovdqu (%r10), %xmm0
+ vmovdqu 16(%r10), %xmm1
+ vmovdqu 32(%r10), %xmm2
+ vmovdqu 48(%r10), %xmm3
+ # aes_dec_block
+ vmovdqu (%rcx), %xmm4 /* first key in the schedule is XORed into all four blocks; keys are consumed in ascending offset order, so presumably the caller supplies a schedule already prepared for vaesdec - confirm */
+ vpxor %xmm4, %xmm0, %xmm0
+ vpxor %xmm4, %xmm1, %xmm1
+ vpxor %xmm4, %xmm2, %xmm2
+ vpxor %xmm4, %xmm3, %xmm3
+ vmovdqu 16(%rcx), %xmm4 /* each following round key is loaded once and applied to the four blocks */
+ vaesdec %xmm4, %xmm0, %xmm0
+ vaesdec %xmm4, %xmm1, %xmm1
+ vaesdec %xmm4, %xmm2, %xmm2
+ vaesdec %xmm4, %xmm3, %xmm3
+ vmovdqu 32(%rcx), %xmm4
+ vaesdec %xmm4, %xmm0, %xmm0
+ vaesdec %xmm4, %xmm1, %xmm1
+ vaesdec %xmm4, %xmm2, %xmm2
+ vaesdec %xmm4, %xmm3, %xmm3
+ vmovdqu 48(%rcx), %xmm4
+ vaesdec %xmm4, %xmm0, %xmm0
+ vaesdec %xmm4, %xmm1, %xmm1
+ vaesdec %xmm4, %xmm2, %xmm2
+ vaesdec %xmm4, %xmm3, %xmm3
+ vmovdqu 64(%rcx), %xmm4
+ vaesdec %xmm4, %xmm0, %xmm0
+ vaesdec %xmm4, %xmm1, %xmm1
+ vaesdec %xmm4, %xmm2, %xmm2
+ vaesdec %xmm4, %xmm3, %xmm3
+ vmovdqu 80(%rcx), %xmm4
+ vaesdec %xmm4, %xmm0, %xmm0
+ vaesdec %xmm4, %xmm1, %xmm1
+ vaesdec %xmm4, %xmm2, %xmm2
+ vaesdec %xmm4, %xmm3, %xmm3
+ vmovdqu 96(%rcx), %xmm4
+ vaesdec %xmm4, %xmm0, %xmm0
+ vaesdec %xmm4, %xmm1, %xmm1
+ vaesdec %xmm4, %xmm2, %xmm2
+ vaesdec %xmm4, %xmm3, %xmm3
+ vmovdqu 112(%rcx), %xmm4
+ vaesdec %xmm4, %xmm0, %xmm0
+ vaesdec %xmm4, %xmm1, %xmm1
+ vaesdec %xmm4, %xmm2, %xmm2
+ vaesdec %xmm4, %xmm3, %xmm3
+ vmovdqu 128(%rcx), %xmm4
+ vaesdec %xmm4, %xmm0, %xmm0
+ vaesdec %xmm4, %xmm1, %xmm1
+ vaesdec %xmm4, %xmm2, %xmm2
+ vaesdec %xmm4, %xmm3, %xmm3
+ vmovdqu 144(%rcx), %xmm4
+ vaesdec %xmm4, %xmm0, %xmm0
+ vaesdec %xmm4, %xmm1, %xmm1
+ vaesdec %xmm4, %xmm2, %xmm2
+ vaesdec %xmm4, %xmm3, %xmm3
+ cmpl $11, %r8d /* r8d < 11: the key at offset 160 is the final round key (10 rounds) */
+ vmovdqu 160(%rcx), %xmm4 /* the load leaves flags untouched, so the jl below still tests the cmpl above */
+ jl L_AES_ECB_decrypt_avx1_64_aes_dec_block_last
+ vaesdec %xmm4, %xmm0, %xmm0
+ vaesdec %xmm4, %xmm1, %xmm1
+ vaesdec %xmm4, %xmm2, %xmm2
+ vaesdec %xmm4, %xmm3, %xmm3
+ vmovdqu 176(%rcx), %xmm4
+ vaesdec %xmm4, %xmm0, %xmm0
+ vaesdec %xmm4, %xmm1, %xmm1
+ vaesdec %xmm4, %xmm2, %xmm2
+ vaesdec %xmm4, %xmm3, %xmm3
+ cmpl $13, %r8d /* r8d < 13: the key at offset 192 is the final round key (12 rounds) */
+ vmovdqu 192(%rcx), %xmm4
+ jl L_AES_ECB_decrypt_avx1_64_aes_dec_block_last
+ vaesdec %xmm4, %xmm0, %xmm0
+ vaesdec %xmm4, %xmm1, %xmm1
+ vaesdec %xmm4, %xmm2, %xmm2
+ vaesdec %xmm4, %xmm3, %xmm3
+ vmovdqu 208(%rcx), %xmm4
+ vaesdec %xmm4, %xmm0, %xmm0
+ vaesdec %xmm4, %xmm1, %xmm1
+ vaesdec %xmm4, %xmm2, %xmm2
+ vaesdec %xmm4, %xmm3, %xmm3
+ vmovdqu 224(%rcx), %xmm4 /* otherwise the key at offset 224 is the final round key (14 rounds) */
+L_AES_ECB_decrypt_avx1_64_aes_dec_block_last:
+ vaesdeclast %xmm4, %xmm0, %xmm0
+ vaesdeclast %xmm4, %xmm1, %xmm1
+ vaesdeclast %xmm4, %xmm2, %xmm2
+ vaesdeclast %xmm4, %xmm3, %xmm3
+ vmovdqu %xmm0, (%r11)
+ vmovdqu %xmm1, 16(%r11)
+ vmovdqu %xmm2, 32(%r11)
+ vmovdqu %xmm3, 48(%r11)
+ addl $0x40, %eax
+ cmpl %r9d, %eax
+ jl L_AES_ECB_decrypt_avx1_dec_64 /* repeat while offset < 64-byte-aligned length */
+L_AES_ECB_decrypt_avx1_done_64:
+ cmpl %edx, %eax
+ movl %edx, %r9d
+ je L_AES_ECB_decrypt_avx1_done_dec /* offset equals length (including length 0): nothing left */
+ andl $0xfffffff0, %r9d /* r9d = length rounded down to a multiple of 16 */
+L_AES_ECB_decrypt_avx1_dec_16: /* NOTE(review): the bound is only checked after the body, so if 1..15 bytes remain on entry a full 16-byte block is still read and written; presumably callers pass whole blocks - confirm. */
+ # 16 bytes of input
+ leaq (%rdi,%rax,1), %r10
+ vmovdqu (%r10), %xmm0
+ # aes_dec_block
+ vpxor (%rcx), %xmm0, %xmm0 /* first key taken directly from memory */
+ vmovdqu 16(%rcx), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 32(%rcx), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 48(%rcx), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 64(%rcx), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 80(%rcx), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 96(%rcx), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 112(%rcx), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 128(%rcx), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 144(%rcx), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ cmpl $11, %r8d /* same round-count selection as in the 4-block loop */
+ vmovdqu 160(%rcx), %xmm5
+ jl L_AES_ECB_decrypt_avx1_16_aes_dec_block_last
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 176(%rcx), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ cmpl $13, %r8d
+ vmovdqu 192(%rcx), %xmm5
+ jl L_AES_ECB_decrypt_avx1_16_aes_dec_block_last
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 208(%rcx), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ vmovdqu 224(%rcx), %xmm5
+L_AES_ECB_decrypt_avx1_16_aes_dec_block_last: /* xmm5 holds the final round key on every path reaching here */
+ vaesdeclast %xmm5, %xmm0, %xmm0
+ leaq (%rsi,%rax,1), %r10
+ vmovdqu %xmm0, (%r10)
+ addl $16, %eax
+ cmpl %r9d, %eax
+ jl L_AES_ECB_decrypt_avx1_dec_16
+L_AES_ECB_decrypt_avx1_done_dec:
+ repz retq /* eax still holds the final byte offset; whether callers use it is not visible here */
+#ifndef __APPLE__
+.size AES_ECB_decrypt_avx1,.-AES_ECB_decrypt_avx1
+#endif /* __APPLE__ */
+#ifndef __APPLE__ /* AES_CBC_encrypt_avx1: AES-CBC encryption using VEX-encoded AES instructions on xmm registers, one 16-byte block per iteration because each block depends on the previous ciphertext. */
+.text /* Registers as used below: rdi = input base, rsi = output base, rdx = 16-byte chaining value (read at entry, rewritten at exit), ecx = length in bytes, r8 = round-key base, r9d = value tested against 11 and 13 to pick 10, 12 or 14 rounds. */
+.globl AES_CBC_encrypt_avx1
+.type AES_CBC_encrypt_avx1,@function
+.align 16
+AES_CBC_encrypt_avx1:
+#else
+.section __TEXT,__text
+.globl _AES_CBC_encrypt_avx1
+.p2align 4
+_AES_CBC_encrypt_avx1:
+#endif /* __APPLE__ */
+ vmovdqu (%rdx), %xmm0 /* xmm0 = chaining value XORed into the next plaintext block */
+ xorl %eax, %eax /* eax = running byte offset */
+ cmpl %ecx, %eax
+ je L_AES_CBC_encrypt_avx1_done /* zero length: the chaining value is stored back unchanged */
+L_AES_CBC_encrypt_avx1_loop: /* NOTE(review): the bound is checked after the body with a signed compare, so a length that is not a multiple of 16 still processes a full final block; presumably callers pass whole blocks - confirm. */
+ # 16 bytes of input
+ leaq (%rdi,%rax,1), %r10
+ vmovdqu (%r10), %xmm1
+ vpxor %xmm0, %xmm1, %xmm1 /* plaintext XOR previous ciphertext (or initial chaining value) */
+ # aes_enc_block
+ vpxor (%r8), %xmm1, %xmm1 /* round key 0 taken directly from memory */
+ vmovdqu 16(%r8), %xmm3
+ vaesenc %xmm3, %xmm1, %xmm1
+ vmovdqu 32(%r8), %xmm3
+ vaesenc %xmm3, %xmm1, %xmm1
+ vmovdqu 48(%r8), %xmm3
+ vaesenc %xmm3, %xmm1, %xmm1
+ vmovdqu 64(%r8), %xmm3
+ vaesenc %xmm3, %xmm1, %xmm1
+ vmovdqu 80(%r8), %xmm3
+ vaesenc %xmm3, %xmm1, %xmm1
+ vmovdqu 96(%r8), %xmm3
+ vaesenc %xmm3, %xmm1, %xmm1
+ vmovdqu 112(%r8), %xmm3
+ vaesenc %xmm3, %xmm1, %xmm1
+ vmovdqu 128(%r8), %xmm3
+ vaesenc %xmm3, %xmm1, %xmm1
+ vmovdqu 144(%r8), %xmm3
+ vaesenc %xmm3, %xmm1, %xmm1
+ cmpl $11, %r9d /* r9d < 11: the key at offset 160 is the final round key (10 rounds) */
+ vmovdqu 160(%r8), %xmm3 /* the load leaves flags untouched, so the jl below still tests the cmpl above */
+ jl L_AES_CBC_encrypt_avx1_aes_enc_block_last
+ vaesenc %xmm3, %xmm1, %xmm1
+ vmovdqu 176(%r8), %xmm4
+ vaesenc %xmm4, %xmm1, %xmm1
+ cmpl $13, %r9d /* r9d < 13: the key at offset 192 is the final round key (12 rounds) */
+ vmovdqu 192(%r8), %xmm3
+ jl L_AES_CBC_encrypt_avx1_aes_enc_block_last
+ vaesenc %xmm3, %xmm1, %xmm1
+ vmovdqu 208(%r8), %xmm4
+ vaesenc %xmm4, %xmm1, %xmm1
+ vmovdqu 224(%r8), %xmm3 /* otherwise the key at offset 224 is the final round key (14 rounds) */
+L_AES_CBC_encrypt_avx1_aes_enc_block_last: /* xmm3 holds the final round key on every path reaching here */
+ vaesenclast %xmm3, %xmm1, %xmm1
+ leaq (%rsi,%rax,1), %r11
+ vmovdqu %xmm1, (%r11)
+ vmovdqa %xmm1, %xmm0 /* this ciphertext block becomes the next chaining value */
+ addl $16, %eax
+ cmpl %ecx, %eax
+ jl L_AES_CBC_encrypt_avx1_loop
+L_AES_CBC_encrypt_avx1_done:
+ vmovdqu %xmm0, (%rdx) /* write the last ciphertext block back as the updated chaining value */
+ repz retq
+#ifndef __APPLE__
+.size AES_CBC_encrypt_avx1,.-AES_CBC_encrypt_avx1
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl AES_CBC_decrypt_avx1
+.type AES_CBC_decrypt_avx1,@function
+.align 16
+AES_CBC_decrypt_avx1:
+#else
+.section __TEXT,__text
+.globl _AES_CBC_decrypt_avx1
+.p2align 4
+_AES_CBC_decrypt_avx1:
+#endif /* __APPLE__ */
+ pushq %r12
+ vmovdqu (%rdx), %xmm4
+ xorl %eax, %eax
+ cmpl $0x40, %ecx
+ movl %ecx, %r10d
+ jl L_AES_CBC_decrypt_avx1_done_64
+ andl $0xffffffc0, %r10d
+L_AES_CBC_decrypt_avx1_dec_64:
+ # 64 bytes of input
+ # aes_cbc_dec_64
+ leaq (%rdi,%rax,1), %r11
+ leaq (%rsi,%rax,1), %r12
+ vmovdqu (%r11), %xmm0
+ vmovdqu 16(%r11), %xmm1
+ vmovdqu 32(%r11), %xmm2
+ vmovdqu 48(%r11), %xmm3
+ # aes_dec_block
+ vmovdqu (%r8), %xmm5
+ vpxor %xmm5, %xmm0, %xmm0
+ vpxor %xmm5, %xmm1, %xmm1
+ vpxor %xmm5, %xmm2, %xmm2
+ vpxor %xmm5, %xmm3, %xmm3
+ vmovdqu 16(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vaesdec %xmm5, %xmm1, %xmm1
+ vaesdec %xmm5, %xmm2, %xmm2
+ vaesdec %xmm5, %xmm3, %xmm3
+ vmovdqu 32(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vaesdec %xmm5, %xmm1, %xmm1
+ vaesdec %xmm5, %xmm2, %xmm2
+ vaesdec %xmm5, %xmm3, %xmm3
+ vmovdqu 48(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vaesdec %xmm5, %xmm1, %xmm1
+ vaesdec %xmm5, %xmm2, %xmm2
+ vaesdec %xmm5, %xmm3, %xmm3
+ vmovdqu 64(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vaesdec %xmm5, %xmm1, %xmm1
+ vaesdec %xmm5, %xmm2, %xmm2
+ vaesdec %xmm5, %xmm3, %xmm3
+ vmovdqu 80(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vaesdec %xmm5, %xmm1, %xmm1
+ vaesdec %xmm5, %xmm2, %xmm2
+ vaesdec %xmm5, %xmm3, %xmm3
+ vmovdqu 96(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vaesdec %xmm5, %xmm1, %xmm1
+ vaesdec %xmm5, %xmm2, %xmm2
+ vaesdec %xmm5, %xmm3, %xmm3
+ vmovdqu 112(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vaesdec %xmm5, %xmm1, %xmm1
+ vaesdec %xmm5, %xmm2, %xmm2
+ vaesdec %xmm5, %xmm3, %xmm3
+ vmovdqu 128(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vaesdec %xmm5, %xmm1, %xmm1
+ vaesdec %xmm5, %xmm2, %xmm2
+ vaesdec %xmm5, %xmm3, %xmm3
+ vmovdqu 144(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vaesdec %xmm5, %xmm1, %xmm1
+ vaesdec %xmm5, %xmm2, %xmm2
+ vaesdec %xmm5, %xmm3, %xmm3
+ cmpl $11, %r9d
+ vmovdqu 160(%r8), %xmm5
+ jl L_AES_CBC_decrypt_avx1_64_aes_dec_block_last
+ vaesdec %xmm5, %xmm0, %xmm0
+ vaesdec %xmm5, %xmm1, %xmm1
+ vaesdec %xmm5, %xmm2, %xmm2
+ vaesdec %xmm5, %xmm3, %xmm3
+ vmovdqu 176(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vaesdec %xmm5, %xmm1, %xmm1
+ vaesdec %xmm5, %xmm2, %xmm2
+ vaesdec %xmm5, %xmm3, %xmm3
+ cmpl $13, %r9d
+ vmovdqu 192(%r8), %xmm5
+ jl L_AES_CBC_decrypt_avx1_64_aes_dec_block_last
+ vaesdec %xmm5, %xmm0, %xmm0
+ vaesdec %xmm5, %xmm1, %xmm1
+ vaesdec %xmm5, %xmm2, %xmm2
+ vaesdec %xmm5, %xmm3, %xmm3
+ vmovdqu 208(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vaesdec %xmm5, %xmm1, %xmm1
+ vaesdec %xmm5, %xmm2, %xmm2
+ vaesdec %xmm5, %xmm3, %xmm3
+ vmovdqu 224(%r8), %xmm5
+L_AES_CBC_decrypt_avx1_64_aes_dec_block_last:
+ vaesdeclast %xmm5, %xmm0, %xmm0
+ vaesdeclast %xmm5, %xmm1, %xmm1
+ vaesdeclast %xmm5, %xmm2, %xmm2
+ vaesdeclast %xmm5, %xmm3, %xmm3
+ vpxor %xmm4, %xmm0, %xmm0
+ vpxor (%r11), %xmm1, %xmm1
+ vpxor 16(%r11), %xmm2, %xmm2
+ vpxor 32(%r11), %xmm3, %xmm3
+ vmovdqu 48(%r11), %xmm4
+ vmovdqu %xmm0, (%r12)
+ vmovdqu %xmm1, 16(%r12)
+ vmovdqu %xmm2, 32(%r12)
+ vmovdqu %xmm3, 48(%r12)
+ addl $0x40, %eax
+ cmpl %r10d, %eax
+ jl L_AES_CBC_decrypt_avx1_dec_64
+L_AES_CBC_decrypt_avx1_done_64:
+ cmpl %ecx, %eax
+ movl %ecx, %r10d
+ je L_AES_CBC_decrypt_avx1_done_dec
+ andl $0xfffffff0, %r10d
+L_AES_CBC_decrypt_avx1_dec_16:
+ # 16 bytes of input
+ leaq (%rdi,%rax,1), %r11
+ vmovdqu (%r11), %xmm0
+ vmovdqa %xmm0, %xmm8
+ # aes_dec_block
+ vpxor (%r8), %xmm0, %xmm0
+ vmovdqu 16(%r8), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ vmovdqu 32(%r8), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ vmovdqu 48(%r8), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ vmovdqu 64(%r8), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ vmovdqu 80(%r8), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ vmovdqu 96(%r8), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ vmovdqu 112(%r8), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ vmovdqu 128(%r8), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ vmovdqu 144(%r8), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ cmpl $11, %r9d
+ vmovdqu 160(%r8), %xmm6
+ jl L_AES_CBC_decrypt_avx1_16_aes_dec_block_last
+ vaesdec %xmm6, %xmm0, %xmm0
+ vmovdqu 176(%r8), %xmm7
+ vaesdec %xmm7, %xmm0, %xmm0
+ cmpl $13, %r9d
+ vmovdqu 192(%r8), %xmm6
+ jl L_AES_CBC_decrypt_avx1_16_aes_dec_block_last
+ vaesdec %xmm6, %xmm0, %xmm0
+ vmovdqu 208(%r8), %xmm7
+ vaesdec %xmm7, %xmm0, %xmm0
+ vmovdqu 224(%r8), %xmm6
+L_AES_CBC_decrypt_avx1_16_aes_dec_block_last:
+ vaesdeclast %xmm6, %xmm0, %xmm0
+ vpxor %xmm4, %xmm0, %xmm0
+ vmovdqa %xmm8, %xmm4
+ leaq (%rsi,%rax,1), %r11
+ vmovdqu %xmm0, (%r11)
+ addl $16, %eax
+ cmpl %r10d, %eax
+ jl L_AES_CBC_decrypt_avx1_dec_16
+L_AES_CBC_decrypt_avx1_done_dec:
+ vmovdqu %xmm4, (%rdx)
+ popq %r12
+ repz retq
+#ifndef __APPLE__
+.size AES_CBC_decrypt_avx1,.-AES_CBC_decrypt_avx1
+#endif /* __APPLE__ */
+#ifndef __APPLE__ /* Constants loaded by AES_CTR_encrypt_avx1; each is placed in the data section on a 16-byte boundary */
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_aes_ctr_avx1_bswap: /* vpshufb control mask: output byte i takes input byte 15-i (full 16-byte reversal) */
+.quad 0x08090a0b0c0d0e0f,0x0001020304050607
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_aes_ctr_avx1_one: /* 128-bit value with 1 in the low qword and 0 in the high qword; added to the byte-reversed counter */
+.quad 0x0000000000000001,0x0000000000000000
+#ifndef __APPLE__ /* AES_CTR_encrypt_avx1: XOR input with AES-encrypted counter blocks. rdi=input, rsi=output, edx=byte count, rcx=round keys, r8d=round count, r9=16-byte counter (loaded at entry, stored back at exit) */
+.text
+.globl AES_CTR_encrypt_avx1
+.type AES_CTR_encrypt_avx1,@function
+.align 16
+AES_CTR_encrypt_avx1:
+#else
+.section __TEXT,__text
+.globl _AES_CTR_encrypt_avx1
+.p2align 4
+_AES_CTR_encrypt_avx1:
+#endif /* __APPLE__ */
+ pushq %rbx /* preserve rbx; it holds the output pointer inside the 64-byte loop */
+ vmovdqu L_aes_ctr_avx1_bswap(%rip), %xmm8 /* xmm8 = 16-byte reversal mask */
+ vmovdqu L_aes_ctr_avx1_one(%rip), %xmm9 /* xmm9 = 1 in the low qword */
+ vpxor %xmm10, %xmm10, %xmm10 /* xmm10 = 0, used to detect a low qword that wrapped to zero */
+ vmovdqu (%r9), %xmm7 /* load the counter block */
+ vpshufb %xmm8, %xmm7, %xmm7 /* keep the counter byte-reversed in xmm7 so vpaddq can increment it */
+ xorl %eax, %eax /* eax = byte offset processed so far */
+ cmpl $0x40, %edx
+ movl %edx, %r10d
+ jl L_AES_CTR_encrypt_avx1_done_64 /* signed compare: fewer than 64 bytes skips the 4-block loop */
+ andl $0xffffffc0, %r10d /* r10d = byte count rounded down to a multiple of 64 */
+L_AES_CTR_encrypt_avx1_enc_64:
+ # 64 bytes of input
+ # aes_ctr_enc_64
+ leaq (%rdi,%rax,1), %r11 /* r11 = input + offset */
+ leaq (%rsi,%rax,1), %rbx /* rbx = output + offset */
+ vpshufb %xmm8, %xmm7, %xmm0 /* block 0 = current counter, reversed back to stored byte order */
+ vpaddq %xmm9, %xmm7, %xmm7 /* low qword += 1 */
+ vpcmpeqq %xmm10, %xmm7, %xmm11 /* all-ones in every qword lane that is now zero */
+ vpslldq $8, %xmm11, %xmm11 /* move the low-lane result into the high lane, clearing the low lane */
+ vpsrlq $63, %xmm11, %xmm11 /* high qword = 1 if the low qword wrapped, else 0 */
+ vpaddq %xmm11, %xmm7, %xmm7 /* add the carry into the high qword */
+ vpshufb %xmm8, %xmm7, %xmm1 /* block 1, followed by the same increment-with-carry sequence */
+ vpaddq %xmm9, %xmm7, %xmm7
+ vpcmpeqq %xmm10, %xmm7, %xmm11
+ vpslldq $8, %xmm11, %xmm11
+ vpsrlq $63, %xmm11, %xmm11
+ vpaddq %xmm11, %xmm7, %xmm7
+ vpshufb %xmm8, %xmm7, %xmm2 /* block 2 */
+ vpaddq %xmm9, %xmm7, %xmm7
+ vpcmpeqq %xmm10, %xmm7, %xmm11
+ vpslldq $8, %xmm11, %xmm11
+ vpsrlq $63, %xmm11, %xmm11
+ vpaddq %xmm11, %xmm7, %xmm7
+ vpshufb %xmm8, %xmm7, %xmm3 /* block 3 */
+ vpaddq %xmm9, %xmm7, %xmm7
+ vpcmpeqq %xmm10, %xmm7, %xmm11
+ vpslldq $8, %xmm11, %xmm11
+ vpsrlq $63, %xmm11, %xmm11
+ vpaddq %xmm11, %xmm7, %xmm7
+ # aes_enc_block
+ vmovdqu (%rcx), %xmm4 /* round key at offset 0 is XORed in before the first vaesenc */
+ vpxor %xmm4, %xmm0, %xmm0
+ vpxor %xmm4, %xmm1, %xmm1
+ vpxor %xmm4, %xmm2, %xmm2
+ vpxor %xmm4, %xmm3, %xmm3
+ vmovdqu 16(%rcx), %xmm4 /* each following 16-byte key is applied to all four blocks */
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ vmovdqu 32(%rcx), %xmm4
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ vmovdqu 48(%rcx), %xmm4
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ vmovdqu 64(%rcx), %xmm4
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ vmovdqu 80(%rcx), %xmm4
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ vmovdqu 96(%rcx), %xmm4
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ vmovdqu 112(%rcx), %xmm4
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ vmovdqu 128(%rcx), %xmm4
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ vmovdqu 144(%rcx), %xmm4
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ cmpl $11, %r8d /* flags survive the vmovdqu below; round count under 11 means the key at 160 is the final one */
+ vmovdqu 160(%rcx), %xmm4
+ jl L_AES_CTR_encrypt_avx1_64_aes_enc_block_last
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ vmovdqu 176(%rcx), %xmm4
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ cmpl $13, %r8d /* round count under 13 means the key at 192 is the final one */
+ vmovdqu 192(%rcx), %xmm4
+ jl L_AES_CTR_encrypt_avx1_64_aes_enc_block_last
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ vmovdqu 208(%rcx), %xmm4
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ vmovdqu 224(%rcx), %xmm4 /* otherwise the key at 224 is the final one */
+L_AES_CTR_encrypt_avx1_64_aes_enc_block_last:
+ vaesenclast %xmm4, %xmm0, %xmm0 /* xmm4 holds the final round key on every path into this label */
+ vaesenclast %xmm4, %xmm1, %xmm1
+ vaesenclast %xmm4, %xmm2, %xmm2
+ vaesenclast %xmm4, %xmm3, %xmm3
+ vpxor (%r11), %xmm0, %xmm0 /* XOR the encrypted counters with 64 bytes of input */
+ vpxor 16(%r11), %xmm1, %xmm1
+ vpxor 32(%r11), %xmm2, %xmm2
+ vpxor 48(%r11), %xmm3, %xmm3
+ vmovdqu %xmm0, (%rbx) /* all input reads for this iteration happen before the first output store */
+ vmovdqu %xmm1, 16(%rbx)
+ vmovdqu %xmm2, 32(%rbx)
+ vmovdqu %xmm3, 48(%rbx)
+ addl $0x40, %eax
+ cmpl %r10d, %eax
+ jl L_AES_CTR_encrypt_avx1_enc_64
+L_AES_CTR_encrypt_avx1_done_64:
+ cmpl %edx, %eax /* finished if the offset already equals the byte count */
+ movl %edx, %r10d
+ je L_AES_CTR_encrypt_avx1_done_enc
+ andl $0xfffffff0, %r10d /* r10d = byte count rounded down to a multiple of 16 */
+L_AES_CTR_encrypt_avx1_enc_16: /* NOTE(review): loop is bottom-tested, so a count that is not a multiple of 16 still gets one full 16-byte read/write here; presumably callers pass whole blocks only - confirm */
+ # 16 bytes of input
+ vpshufb %xmm8, %xmm7, %xmm0 /* counter block in stored byte order */
+ vpaddq %xmm9, %xmm7, %xmm7 /* same increment-with-carry sequence as in the 64-byte loop */
+ vpcmpeqq %xmm10, %xmm7, %xmm11
+ vpslldq $8, %xmm11, %xmm11
+ vpsrlq $63, %xmm11, %xmm11
+ vpaddq %xmm11, %xmm7, %xmm7
+ # aes_enc_block
+ vpxor (%rcx), %xmm0, %xmm0
+ vmovdqu 16(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 32(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 48(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 64(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 80(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 96(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 112(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 128(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 144(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ cmpl $11, %r8d
+ vmovdqu 160(%rcx), %xmm5 /* xmm5 always carries the candidate final round key into the label below */
+ jl L_AES_CTR_encrypt_avx1_16_aes_enc_block_last
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 176(%rcx), %xmm6
+ vaesenc %xmm6, %xmm0, %xmm0
+ cmpl $13, %r8d
+ vmovdqu 192(%rcx), %xmm5
+ jl L_AES_CTR_encrypt_avx1_16_aes_enc_block_last
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 208(%rcx), %xmm6
+ vaesenc %xmm6, %xmm0, %xmm0
+ vmovdqu 224(%rcx), %xmm5
+L_AES_CTR_encrypt_avx1_16_aes_enc_block_last:
+ vaesenclast %xmm5, %xmm0, %xmm0
+ leaq (%rdi,%rax,1), %r11
+ vpxor (%r11), %xmm0, %xmm0 /* XOR the encrypted counter with 16 bytes of input */
+ leaq (%rsi,%rax,1), %r11
+ vmovdqu %xmm0, (%r11)
+ addl $16, %eax
+ cmpl %r10d, %eax
+ jl L_AES_CTR_encrypt_avx1_enc_16
+L_AES_CTR_encrypt_avx1_done_enc:
+ vpshufb %xmm8, %xmm7, %xmm7 /* reverse the advanced counter back to stored byte order */
+ vmovdqu %xmm7, (%r9) /* write the advanced counter back through r9 */
+ popq %rbx
+ repz retq
+#ifndef __APPLE__
+.size AES_CTR_encrypt_avx1,.-AES_CTR_encrypt_avx1
+#endif /* __APPLE__ */
+#endif /* HAVE_INTEL_AVX1 */
+#ifdef HAVE_INTEL_VAES
+#ifndef __APPLE__ /* AES_ECB_encrypt_vaes: encrypt each 16-byte block independently, in chunks of 128, 32, then 16 bytes. rdi=input, rsi=output, edx=byte count, rcx=round keys, r8d=round count */
+.text
+.globl AES_ECB_encrypt_vaes
+.type AES_ECB_encrypt_vaes,@function
+.align 16
+AES_ECB_encrypt_vaes:
+#else
+.section __TEXT,__text
+.globl _AES_ECB_encrypt_vaes
+.p2align 4
+_AES_ECB_encrypt_vaes:
+#endif /* __APPLE__ */
+ xorl %eax, %eax /* eax = byte offset processed so far */
+ cmpl $0x80, %edx
+ movl %edx, %r9d
+ jl L_AES_ECB_encrypt_vaes_done_128 /* signed compare: fewer than 128 bytes skips the 8-block loop */
+ andl $0xffffff80, %r9d /* r9d = byte count rounded down to a multiple of 128 */
+L_AES_ECB_encrypt_vaes_enc_128:
+ # 128 bytes of input
+ # aes_ecb_enc_128
+ leaq (%rdi,%rax,1), %r10 /* r10 = input + offset */
+ leaq (%rsi,%rax,1), %r11 /* r11 = output + offset */
+ vmovdqu (%r10), %ymm0 /* each ymm register carries two consecutive 16-byte blocks */
+ vmovdqu 32(%r10), %ymm1
+ vmovdqu 64(%r10), %ymm2
+ vmovdqu 96(%r10), %ymm3
+ # aes_enc_block
+ vbroadcasti128 (%rcx), %ymm7 /* copy the 16-byte round key into both halves so both blocks see the same key */
+ vpxor %ymm7, %ymm0, %ymm0
+ vpxor %ymm7, %ymm1, %ymm1
+ vpxor %ymm7, %ymm2, %ymm2
+ vpxor %ymm7, %ymm3, %ymm3
+ vbroadcasti128 16(%rcx), %ymm7
+ vaesenc %ymm7, %ymm0, %ymm0
+ vaesenc %ymm7, %ymm1, %ymm1
+ vaesenc %ymm7, %ymm2, %ymm2
+ vaesenc %ymm7, %ymm3, %ymm3
+ vbroadcasti128 32(%rcx), %ymm7
+ vaesenc %ymm7, %ymm0, %ymm0
+ vaesenc %ymm7, %ymm1, %ymm1
+ vaesenc %ymm7, %ymm2, %ymm2
+ vaesenc %ymm7, %ymm3, %ymm3
+ vbroadcasti128 48(%rcx), %ymm7
+ vaesenc %ymm7, %ymm0, %ymm0
+ vaesenc %ymm7, %ymm1, %ymm1
+ vaesenc %ymm7, %ymm2, %ymm2
+ vaesenc %ymm7, %ymm3, %ymm3
+ vbroadcasti128 64(%rcx), %ymm7
+ vaesenc %ymm7, %ymm0, %ymm0
+ vaesenc %ymm7, %ymm1, %ymm1
+ vaesenc %ymm7, %ymm2, %ymm2
+ vaesenc %ymm7, %ymm3, %ymm3
+ vbroadcasti128 80(%rcx), %ymm7
+ vaesenc %ymm7, %ymm0, %ymm0
+ vaesenc %ymm7, %ymm1, %ymm1
+ vaesenc %ymm7, %ymm2, %ymm2
+ vaesenc %ymm7, %ymm3, %ymm3
+ vbroadcasti128 96(%rcx), %ymm7
+ vaesenc %ymm7, %ymm0, %ymm0
+ vaesenc %ymm7, %ymm1, %ymm1
+ vaesenc %ymm7, %ymm2, %ymm2
+ vaesenc %ymm7, %ymm3, %ymm3
+ vbroadcasti128 112(%rcx), %ymm7
+ vaesenc %ymm7, %ymm0, %ymm0
+ vaesenc %ymm7, %ymm1, %ymm1
+ vaesenc %ymm7, %ymm2, %ymm2
+ vaesenc %ymm7, %ymm3, %ymm3
+ vbroadcasti128 128(%rcx), %ymm7
+ vaesenc %ymm7, %ymm0, %ymm0
+ vaesenc %ymm7, %ymm1, %ymm1
+ vaesenc %ymm7, %ymm2, %ymm2
+ vaesenc %ymm7, %ymm3, %ymm3
+ vbroadcasti128 144(%rcx), %ymm7
+ vaesenc %ymm7, %ymm0, %ymm0
+ vaesenc %ymm7, %ymm1, %ymm1
+ vaesenc %ymm7, %ymm2, %ymm2
+ vaesenc %ymm7, %ymm3, %ymm3
+ cmpl $11, %r8d /* flags survive the broadcast below; round count under 11 means the key at 160 is the final one */
+ vbroadcasti128 160(%rcx), %ymm7
+ jl L_AES_ECB_encrypt_vaes_128_aes_enc_block_last
+ vaesenc %ymm7, %ymm0, %ymm0
+ vaesenc %ymm7, %ymm1, %ymm1
+ vaesenc %ymm7, %ymm2, %ymm2
+ vaesenc %ymm7, %ymm3, %ymm3
+ vbroadcasti128 176(%rcx), %ymm7
+ vaesenc %ymm7, %ymm0, %ymm0
+ vaesenc %ymm7, %ymm1, %ymm1
+ vaesenc %ymm7, %ymm2, %ymm2
+ vaesenc %ymm7, %ymm3, %ymm3
+ cmpl $13, %r8d /* round count under 13 means the key at 192 is the final one */
+ vbroadcasti128 192(%rcx), %ymm7
+ jl L_AES_ECB_encrypt_vaes_128_aes_enc_block_last
+ vaesenc %ymm7, %ymm0, %ymm0
+ vaesenc %ymm7, %ymm1, %ymm1
+ vaesenc %ymm7, %ymm2, %ymm2
+ vaesenc %ymm7, %ymm3, %ymm3
+ vbroadcasti128 208(%rcx), %ymm7
+ vaesenc %ymm7, %ymm0, %ymm0
+ vaesenc %ymm7, %ymm1, %ymm1
+ vaesenc %ymm7, %ymm2, %ymm2
+ vaesenc %ymm7, %ymm3, %ymm3
+ vbroadcasti128 224(%rcx), %ymm7 /* otherwise the key at 224 is the final one */
+L_AES_ECB_encrypt_vaes_128_aes_enc_block_last:
+ vaesenclast %ymm7, %ymm0, %ymm0 /* ymm7 holds the final round key on every path into this label */
+ vaesenclast %ymm7, %ymm1, %ymm1
+ vaesenclast %ymm7, %ymm2, %ymm2
+ vaesenclast %ymm7, %ymm3, %ymm3
+ vmovdqu %ymm0, (%r11) /* all 128 input bytes were loaded before this first store */
+ vmovdqu %ymm1, 32(%r11)
+ vmovdqu %ymm2, 64(%r11)
+ vmovdqu %ymm3, 96(%r11)
+ addl $0x80, %eax
+ cmpl %r9d, %eax
+ jl L_AES_ECB_encrypt_vaes_enc_128
+L_AES_ECB_encrypt_vaes_done_128:
+ movl %edx, %r9d
+ andl $0xffffffe0, %r9d /* r9d = byte count rounded down to a multiple of 32 */
+ cmpl %r9d, %eax
+ je L_AES_ECB_encrypt_vaes_done_32 /* no whole 32-byte chunk remains */
+L_AES_ECB_encrypt_vaes_enc_32:
+ # 32 bytes of input
+ # aes_ecb_enc_32
+ leaq (%rdi,%rax,1), %r10
+ leaq (%rsi,%rax,1), %r11
+ vmovdqu (%r10), %ymm0 /* two blocks in one ymm register */
+ # aes_enc_block
+ vbroadcasti128 (%rcx), %ymm7
+ vpxor %ymm7, %ymm0, %ymm0
+ vbroadcasti128 16(%rcx), %ymm7
+ vaesenc %ymm7, %ymm0, %ymm0
+ vbroadcasti128 32(%rcx), %ymm7
+ vaesenc %ymm7, %ymm0, %ymm0
+ vbroadcasti128 48(%rcx), %ymm7
+ vaesenc %ymm7, %ymm0, %ymm0
+ vbroadcasti128 64(%rcx), %ymm7
+ vaesenc %ymm7, %ymm0, %ymm0
+ vbroadcasti128 80(%rcx), %ymm7
+ vaesenc %ymm7, %ymm0, %ymm0
+ vbroadcasti128 96(%rcx), %ymm7
+ vaesenc %ymm7, %ymm0, %ymm0
+ vbroadcasti128 112(%rcx), %ymm7
+ vaesenc %ymm7, %ymm0, %ymm0
+ vbroadcasti128 128(%rcx), %ymm7
+ vaesenc %ymm7, %ymm0, %ymm0
+ vbroadcasti128 144(%rcx), %ymm7
+ vaesenc %ymm7, %ymm0, %ymm0
+ cmpl $11, %r8d
+ vbroadcasti128 160(%rcx), %ymm7
+ jl L_AES_ECB_encrypt_vaes_32_aes_enc_block_last
+ vaesenc %ymm7, %ymm0, %ymm0
+ vbroadcasti128 176(%rcx), %ymm7
+ vaesenc %ymm7, %ymm0, %ymm0
+ cmpl $13, %r8d
+ vbroadcasti128 192(%rcx), %ymm7
+ jl L_AES_ECB_encrypt_vaes_32_aes_enc_block_last
+ vaesenc %ymm7, %ymm0, %ymm0
+ vbroadcasti128 208(%rcx), %ymm7
+ vaesenc %ymm7, %ymm0, %ymm0
+ vbroadcasti128 224(%rcx), %ymm7
+L_AES_ECB_encrypt_vaes_32_aes_enc_block_last:
+ vaesenclast %ymm7, %ymm0, %ymm0
+ vmovdqu %ymm0, (%r11)
+ addl $32, %eax
+ cmpl %r9d, %eax
+ jl L_AES_ECB_encrypt_vaes_enc_32
+L_AES_ECB_encrypt_vaes_done_32:
+ cmpl %edx, %eax /* finished if the offset already equals the byte count */
+ movl %edx, %r9d
+ je L_AES_ECB_encrypt_vaes_done_enc
+ andl $0xfffffff0, %r9d /* r9d = byte count rounded down to a multiple of 16 */
+L_AES_ECB_encrypt_vaes_enc_16: /* NOTE(review): loop is bottom-tested, so a count that is not a multiple of 16 still gets one full 16-byte read/write here; presumably callers pass whole blocks only - confirm */
+ # 16 bytes of input
+ leaq (%rdi,%rax,1), %r10
+ vmovdqu (%r10), %xmm0
+ # aes_enc_block
+ vpxor (%rcx), %xmm0, %xmm0
+ vmovdqu 16(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 32(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 48(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 64(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 80(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 96(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 112(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 128(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 144(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ cmpl $11, %r8d
+ vmovdqu 160(%rcx), %xmm5 /* xmm5 always carries the candidate final round key into the label below */
+ jl L_AES_ECB_encrypt_vaes_16_aes_enc_block_last
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 176(%rcx), %xmm6
+ vaesenc %xmm6, %xmm0, %xmm0
+ cmpl $13, %r8d
+ vmovdqu 192(%rcx), %xmm5
+ jl L_AES_ECB_encrypt_vaes_16_aes_enc_block_last
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 208(%rcx), %xmm6
+ vaesenc %xmm6, %xmm0, %xmm0
+ vmovdqu 224(%rcx), %xmm5
+L_AES_ECB_encrypt_vaes_16_aes_enc_block_last:
+ vaesenclast %xmm5, %xmm0, %xmm0
+ leaq (%rsi,%rax,1), %r10
+ vmovdqu %xmm0, (%r10)
+ addl $16, %eax
+ cmpl %r9d, %eax
+ jl L_AES_ECB_encrypt_vaes_enc_16
+L_AES_ECB_encrypt_vaes_done_enc:
+ repz retq /* NOTE(review): returns with no vzeroupper although ymm registers were used above; confirm whether that is intended */
+#ifndef __APPLE__
+.size AES_ECB_encrypt_vaes,.-AES_ECB_encrypt_vaes
+#endif /* __APPLE__ */
+#ifndef __APPLE__ /* AES_ECB_decrypt_vaes: decrypt each 16-byte block independently, in chunks of 128, 32, then 16 bytes. rdi=input, rsi=output, edx=byte count, rcx=round keys, r8d=round count */
+.text
+.globl AES_ECB_decrypt_vaes
+.type AES_ECB_decrypt_vaes,@function
+.align 16
+AES_ECB_decrypt_vaes:
+#else
+.section __TEXT,__text
+.globl _AES_ECB_decrypt_vaes
+.p2align 4
+_AES_ECB_decrypt_vaes:
+#endif /* __APPLE__ */
+ xorl %eax, %eax /* eax = byte offset processed so far */
+ cmpl $0x80, %edx
+ movl %edx, %r9d
+ jl L_AES_ECB_decrypt_vaes_done_128 /* signed compare: fewer than 128 bytes skips the 8-block loop */
+ andl $0xffffff80, %r9d /* r9d = byte count rounded down to a multiple of 128 */
+L_AES_ECB_decrypt_vaes_dec_128:
+ # 128 bytes of input
+ # aes_ecb_dec_128
+ leaq (%rdi,%rax,1), %r10 /* r10 = input + offset */
+ leaq (%rsi,%rax,1), %r11 /* r11 = output + offset */
+ vmovdqu (%r10), %ymm0 /* each ymm register carries two consecutive 16-byte blocks */
+ vmovdqu 32(%r10), %ymm1
+ vmovdqu 64(%r10), %ymm2
+ vmovdqu 96(%r10), %ymm3
+ # aes_dec_block
+ vbroadcasti128 (%rcx), %ymm7 /* keys are read in ascending offset order; presumably rcx is a schedule prepared for vaesdec - TODO confirm */
+ vpxor %ymm7, %ymm0, %ymm0
+ vpxor %ymm7, %ymm1, %ymm1
+ vpxor %ymm7, %ymm2, %ymm2
+ vpxor %ymm7, %ymm3, %ymm3
+ vbroadcasti128 16(%rcx), %ymm7
+ vaesdec %ymm7, %ymm0, %ymm0
+ vaesdec %ymm7, %ymm1, %ymm1
+ vaesdec %ymm7, %ymm2, %ymm2
+ vaesdec %ymm7, %ymm3, %ymm3
+ vbroadcasti128 32(%rcx), %ymm7
+ vaesdec %ymm7, %ymm0, %ymm0
+ vaesdec %ymm7, %ymm1, %ymm1
+ vaesdec %ymm7, %ymm2, %ymm2
+ vaesdec %ymm7, %ymm3, %ymm3
+ vbroadcasti128 48(%rcx), %ymm7
+ vaesdec %ymm7, %ymm0, %ymm0
+ vaesdec %ymm7, %ymm1, %ymm1
+ vaesdec %ymm7, %ymm2, %ymm2
+ vaesdec %ymm7, %ymm3, %ymm3
+ vbroadcasti128 64(%rcx), %ymm7
+ vaesdec %ymm7, %ymm0, %ymm0
+ vaesdec %ymm7, %ymm1, %ymm1
+ vaesdec %ymm7, %ymm2, %ymm2
+ vaesdec %ymm7, %ymm3, %ymm3
+ vbroadcasti128 80(%rcx), %ymm7
+ vaesdec %ymm7, %ymm0, %ymm0
+ vaesdec %ymm7, %ymm1, %ymm1
+ vaesdec %ymm7, %ymm2, %ymm2
+ vaesdec %ymm7, %ymm3, %ymm3
+ vbroadcasti128 96(%rcx), %ymm7
+ vaesdec %ymm7, %ymm0, %ymm0
+ vaesdec %ymm7, %ymm1, %ymm1
+ vaesdec %ymm7, %ymm2, %ymm2
+ vaesdec %ymm7, %ymm3, %ymm3
+ vbroadcasti128 112(%rcx), %ymm7
+ vaesdec %ymm7, %ymm0, %ymm0
+ vaesdec %ymm7, %ymm1, %ymm1
+ vaesdec %ymm7, %ymm2, %ymm2
+ vaesdec %ymm7, %ymm3, %ymm3
+ vbroadcasti128 128(%rcx), %ymm7
+ vaesdec %ymm7, %ymm0, %ymm0
+ vaesdec %ymm7, %ymm1, %ymm1
+ vaesdec %ymm7, %ymm2, %ymm2
+ vaesdec %ymm7, %ymm3, %ymm3
+ vbroadcasti128 144(%rcx), %ymm7
+ vaesdec %ymm7, %ymm0, %ymm0
+ vaesdec %ymm7, %ymm1, %ymm1
+ vaesdec %ymm7, %ymm2, %ymm2
+ vaesdec %ymm7, %ymm3, %ymm3
+ cmpl $11, %r8d /* flags survive the broadcast below; round count under 11 means the key at 160 is the final one */
+ vbroadcasti128 160(%rcx), %ymm7
+ jl L_AES_ECB_decrypt_vaes_128_aes_dec_block_last
+ vaesdec %ymm7, %ymm0, %ymm0
+ vaesdec %ymm7, %ymm1, %ymm1
+ vaesdec %ymm7, %ymm2, %ymm2
+ vaesdec %ymm7, %ymm3, %ymm3
+ vbroadcasti128 176(%rcx), %ymm7
+ vaesdec %ymm7, %ymm0, %ymm0
+ vaesdec %ymm7, %ymm1, %ymm1
+ vaesdec %ymm7, %ymm2, %ymm2
+ vaesdec %ymm7, %ymm3, %ymm3
+ cmpl $13, %r8d /* round count under 13 means the key at 192 is the final one */
+ vbroadcasti128 192(%rcx), %ymm7
+ jl L_AES_ECB_decrypt_vaes_128_aes_dec_block_last
+ vaesdec %ymm7, %ymm0, %ymm0
+ vaesdec %ymm7, %ymm1, %ymm1
+ vaesdec %ymm7, %ymm2, %ymm2
+ vaesdec %ymm7, %ymm3, %ymm3
+ vbroadcasti128 208(%rcx), %ymm7
+ vaesdec %ymm7, %ymm0, %ymm0
+ vaesdec %ymm7, %ymm1, %ymm1
+ vaesdec %ymm7, %ymm2, %ymm2
+ vaesdec %ymm7, %ymm3, %ymm3
+ vbroadcasti128 224(%rcx), %ymm7 /* otherwise the key at 224 is the final one */
+L_AES_ECB_decrypt_vaes_128_aes_dec_block_last:
+ vaesdeclast %ymm7, %ymm0, %ymm0 /* ymm7 holds the final round key on every path into this label */
+ vaesdeclast %ymm7, %ymm1, %ymm1
+ vaesdeclast %ymm7, %ymm2, %ymm2
+ vaesdeclast %ymm7, %ymm3, %ymm3
+ vmovdqu %ymm0, (%r11) /* all 128 input bytes were loaded before this first store */
+ vmovdqu %ymm1, 32(%r11)
+ vmovdqu %ymm2, 64(%r11)
+ vmovdqu %ymm3, 96(%r11)
+ addl $0x80, %eax
+ cmpl %r9d, %eax
+ jl L_AES_ECB_decrypt_vaes_dec_128
+L_AES_ECB_decrypt_vaes_done_128:
+ movl %edx, %r9d
+ andl $0xffffffe0, %r9d /* r9d = byte count rounded down to a multiple of 32 */
+ cmpl %r9d, %eax
+ je L_AES_ECB_decrypt_vaes_done_32 /* no whole 32-byte chunk remains */
+L_AES_ECB_decrypt_vaes_dec_32:
+ # 32 bytes of input
+ # aes_ecb_dec_32
+ leaq (%rdi,%rax,1), %r10
+ leaq (%rsi,%rax,1), %r11
+ vmovdqu (%r10), %ymm0 /* two blocks in one ymm register */
+ # aes_dec_block
+ vbroadcasti128 (%rcx), %ymm7
+ vpxor %ymm7, %ymm0, %ymm0
+ vbroadcasti128 16(%rcx), %ymm7
+ vaesdec %ymm7, %ymm0, %ymm0
+ vbroadcasti128 32(%rcx), %ymm7
+ vaesdec %ymm7, %ymm0, %ymm0
+ vbroadcasti128 48(%rcx), %ymm7
+ vaesdec %ymm7, %ymm0, %ymm0
+ vbroadcasti128 64(%rcx), %ymm7
+ vaesdec %ymm7, %ymm0, %ymm0
+ vbroadcasti128 80(%rcx), %ymm7
+ vaesdec %ymm7, %ymm0, %ymm0
+ vbroadcasti128 96(%rcx), %ymm7
+ vaesdec %ymm7, %ymm0, %ymm0
+ vbroadcasti128 112(%rcx), %ymm7
+ vaesdec %ymm7, %ymm0, %ymm0
+ vbroadcasti128 128(%rcx), %ymm7
+ vaesdec %ymm7, %ymm0, %ymm0
+ vbroadcasti128 144(%rcx), %ymm7
+ vaesdec %ymm7, %ymm0, %ymm0
+ cmpl $11, %r8d
+ vbroadcasti128 160(%rcx), %ymm7
+ jl L_AES_ECB_decrypt_vaes_32_aes_dec_block_last
+ vaesdec %ymm7, %ymm0, %ymm0
+ vbroadcasti128 176(%rcx), %ymm7
+ vaesdec %ymm7, %ymm0, %ymm0
+ cmpl $13, %r8d
+ vbroadcasti128 192(%rcx), %ymm7
+ jl L_AES_ECB_decrypt_vaes_32_aes_dec_block_last
+ vaesdec %ymm7, %ymm0, %ymm0
+ vbroadcasti128 208(%rcx), %ymm7
+ vaesdec %ymm7, %ymm0, %ymm0
+ vbroadcasti128 224(%rcx), %ymm7
+L_AES_ECB_decrypt_vaes_32_aes_dec_block_last:
+ vaesdeclast %ymm7, %ymm0, %ymm0
+ vmovdqu %ymm0, (%r11)
+ addl $32, %eax
+ cmpl %r9d, %eax
+ jl L_AES_ECB_decrypt_vaes_dec_32
+L_AES_ECB_decrypt_vaes_done_32:
+ cmpl %edx, %eax /* finished if the offset already equals the byte count */
+ movl %edx, %r9d
+ je L_AES_ECB_decrypt_vaes_done_dec
+ andl $0xfffffff0, %r9d /* r9d = byte count rounded down to a multiple of 16 */
+L_AES_ECB_decrypt_vaes_dec_16: /* NOTE(review): loop is bottom-tested, so a count that is not a multiple of 16 still gets one full 16-byte read/write here; presumably callers pass whole blocks only - confirm */
+ # 16 bytes of input
+ leaq (%rdi,%rax,1), %r10
+ vmovdqu (%r10), %xmm0
+ # aes_dec_block
+ vpxor (%rcx), %xmm0, %xmm0
+ vmovdqu 16(%rcx), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 32(%rcx), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 48(%rcx), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 64(%rcx), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 80(%rcx), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 96(%rcx), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 112(%rcx), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 128(%rcx), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 144(%rcx), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ cmpl $11, %r8d
+ vmovdqu 160(%rcx), %xmm5 /* xmm5 always carries the candidate final round key into the label below */
+ jl L_AES_ECB_decrypt_vaes_16_aes_dec_block_last
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 176(%rcx), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ cmpl $13, %r8d
+ vmovdqu 192(%rcx), %xmm5
+ jl L_AES_ECB_decrypt_vaes_16_aes_dec_block_last
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 208(%rcx), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ vmovdqu 224(%rcx), %xmm5
+L_AES_ECB_decrypt_vaes_16_aes_dec_block_last:
+ vaesdeclast %xmm5, %xmm0, %xmm0
+ leaq (%rsi,%rax,1), %r10
+ vmovdqu %xmm0, (%r10)
+ addl $16, %eax
+ cmpl %r9d, %eax
+ jl L_AES_ECB_decrypt_vaes_dec_16
+L_AES_ECB_decrypt_vaes_done_dec:
+ repz retq /* NOTE(review): returns with no vzeroupper although ymm registers were used above; confirm whether that is intended */
+#ifndef __APPLE__
+.size AES_ECB_decrypt_vaes,.-AES_ECB_decrypt_vaes
+#endif /* __APPLE__ */
+#ifndef __APPLE__ /* AES_CBC_encrypt_vaes: chained encryption, one 16-byte block per iteration. rdi=input, rsi=output, rdx=16-byte chaining value (read at entry, last output block stored back), ecx=byte count, r8=round keys, r9d=round count */
+.text
+.globl AES_CBC_encrypt_vaes
+.type AES_CBC_encrypt_vaes,@function
+.align 16
+AES_CBC_encrypt_vaes:
+#else
+.section __TEXT,__text
+.globl _AES_CBC_encrypt_vaes
+.p2align 4
+_AES_CBC_encrypt_vaes:
+#endif /* __APPLE__ */
+ vmovdqu (%rdx), %xmm0 /* xmm0 = chaining value carried between blocks */
+ xorl %eax, %eax /* eax = byte offset processed so far */
+ cmpl %ecx, %eax
+ je L_AES_CBC_encrypt_vaes_done /* zero-length input: only the chaining value is written back */
+L_AES_CBC_encrypt_vaes_loop: /* only xmm registers are used here: each block depends on the previous output, so blocks are handled one at a time */
+ # 16 bytes of input
+ leaq (%rdi,%rax,1), %r10
+ vmovdqu (%r10), %xmm1
+ vpxor %xmm0, %xmm1, %xmm1 /* XOR the input block with the previous output block (or the initial chaining value) */
+ # aes_enc_block
+ vpxor (%r8), %xmm1, %xmm1
+ vmovdqu 16(%r8), %xmm3
+ vaesenc %xmm3, %xmm1, %xmm1
+ vmovdqu 32(%r8), %xmm3
+ vaesenc %xmm3, %xmm1, %xmm1
+ vmovdqu 48(%r8), %xmm3
+ vaesenc %xmm3, %xmm1, %xmm1
+ vmovdqu 64(%r8), %xmm3
+ vaesenc %xmm3, %xmm1, %xmm1
+ vmovdqu 80(%r8), %xmm3
+ vaesenc %xmm3, %xmm1, %xmm1
+ vmovdqu 96(%r8), %xmm3
+ vaesenc %xmm3, %xmm1, %xmm1
+ vmovdqu 112(%r8), %xmm3
+ vaesenc %xmm3, %xmm1, %xmm1
+ vmovdqu 128(%r8), %xmm3
+ vaesenc %xmm3, %xmm1, %xmm1
+ vmovdqu 144(%r8), %xmm3
+ vaesenc %xmm3, %xmm1, %xmm1
+ cmpl $11, %r9d /* flags survive the vmovdqu below; round count under 11 means the key at 160 is the final one */
+ vmovdqu 160(%r8), %xmm3
+ jl L_AES_CBC_encrypt_vaes_aes_enc_block_last
+ vaesenc %xmm3, %xmm1, %xmm1
+ vmovdqu 176(%r8), %xmm4
+ vaesenc %xmm4, %xmm1, %xmm1
+ cmpl $13, %r9d /* round count under 13 means the key at 192 is the final one */
+ vmovdqu 192(%r8), %xmm3
+ jl L_AES_CBC_encrypt_vaes_aes_enc_block_last
+ vaesenc %xmm3, %xmm1, %xmm1
+ vmovdqu 208(%r8), %xmm4
+ vaesenc %xmm4, %xmm1, %xmm1
+ vmovdqu 224(%r8), %xmm3 /* otherwise the key at 224 is the final one */
+L_AES_CBC_encrypt_vaes_aes_enc_block_last:
+ vaesenclast %xmm3, %xmm1, %xmm1 /* xmm3 holds the final round key on every path into this label */
+ leaq (%rsi,%rax,1), %r11
+ vmovdqu %xmm1, (%r11)
+ vmovdqa %xmm1, %xmm0 /* the block just written becomes the next chaining value */
+ addl $16, %eax
+ cmpl %ecx, %eax
+ jl L_AES_CBC_encrypt_vaes_loop /* NOTE(review): compares against the raw count, so a count that is not a multiple of 16 yields one more full-block iteration; presumably callers pass whole blocks only - confirm */
+L_AES_CBC_encrypt_vaes_done:
+ vmovdqu %xmm0, (%rdx) /* store the final chaining value back through rdx */
+ repz retq
+#ifndef __APPLE__
+.size AES_CBC_encrypt_vaes,.-AES_CBC_encrypt_vaes
+#endif /* __APPLE__ */
+#ifndef __APPLE__ /* AES_CBC_decrypt_vaes: chained decryption in chunks of 128, 32, then 16 bytes. rdi=input, rsi=output, rdx=16-byte chaining value (read at entry, last input block stored back), ecx=byte count, r8=round keys, r9d=round count */
+.text
+.globl AES_CBC_decrypt_vaes
+.type AES_CBC_decrypt_vaes,@function
+.align 16
+AES_CBC_decrypt_vaes:
+#else
+.section __TEXT,__text
+.globl _AES_CBC_decrypt_vaes
+.p2align 4
+_AES_CBC_decrypt_vaes:
+#endif /* __APPLE__ */
+ pushq %r12 /* preserve r12; it holds the output pointer in the wide loops */
+ vmovdqu (%rdx), %xmm8 /* xmm8 = chaining value: the input block preceding the next one to decrypt */
+ xorl %eax, %eax /* eax = byte offset processed so far */
+ cmpl $0x80, %ecx
+ movl %ecx, %r10d
+ jl L_AES_CBC_decrypt_vaes_done_128 /* signed compare: fewer than 128 bytes skips the 8-block loop */
+ andl $0xffffff80, %r10d /* r10d = byte count rounded down to a multiple of 128 */
+L_AES_CBC_decrypt_vaes_dec_128:
+ # 128 bytes of input
+ # aes_cbc_dec_128
+ leaq (%rdi,%rax,1), %r11 /* r11 = input + offset */
+ leaq (%rsi,%rax,1), %r12 /* r12 = output + offset */
+ vmovdqu (%r11), %ymm0 /* input blocks 0-1 */
+ vmovdqu 32(%r11), %ymm1 /* input blocks 2-3 */
+ vmovdqu 64(%r11), %ymm2 /* input blocks 4-5 */
+ vmovdqu 96(%r11), %ymm3 /* input blocks 6-7 */
+ vinserti128 $0x01, %xmm0, %ymm8, %ymm10 /* ymm10 = { chaining value, input block 0 }: XOR operands for output blocks 0-1 */
+ vmovdqu 16(%r11), %ymm11 /* input blocks 1-2: XOR operands for output blocks 2-3 */
+ vmovdqu 48(%r11), %ymm12 /* input blocks 3-4: XOR operands for output blocks 4-5 */
+ vmovdqu 80(%r11), %ymm13 /* input blocks 5-6: XOR operands for output blocks 6-7 */
+ vextracti128 $0x01, %ymm3, %xmm8 /* input block 7 becomes the chaining value for the next chunk */
+ # aes_dec_block
+ vbroadcasti128 (%r8), %ymm9 /* copy the 16-byte round key into both halves so both blocks see the same key */
+ vpxor %ymm9, %ymm0, %ymm0
+ vpxor %ymm9, %ymm1, %ymm1
+ vpxor %ymm9, %ymm2, %ymm2
+ vpxor %ymm9, %ymm3, %ymm3
+ vbroadcasti128 16(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ vbroadcasti128 32(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ vbroadcasti128 48(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ vbroadcasti128 64(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ vbroadcasti128 80(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ vbroadcasti128 96(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ vbroadcasti128 112(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ vbroadcasti128 128(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ vbroadcasti128 144(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ cmpl $11, %r9d /* flags survive the broadcast below; round count under 11 means the key at 160 is the final one */
+ vbroadcasti128 160(%r8), %ymm9
+ jl L_AES_CBC_decrypt_vaes_128_aes_dec_block_last
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ vbroadcasti128 176(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ cmpl $13, %r9d /* round count under 13 means the key at 192 is the final one */
+ vbroadcasti128 192(%r8), %ymm9
+ jl L_AES_CBC_decrypt_vaes_128_aes_dec_block_last
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ vbroadcasti128 208(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ vbroadcasti128 224(%r8), %ymm9 /* otherwise the key at 224 is the final one */
+L_AES_CBC_decrypt_vaes_128_aes_dec_block_last:
+ vaesdeclast %ymm9, %ymm0, %ymm0 /* ymm9 holds the final round key on every path into this label */
+ vaesdeclast %ymm9, %ymm1, %ymm1
+ vaesdeclast %ymm9, %ymm2, %ymm2
+ vaesdeclast %ymm9, %ymm3, %ymm3
+ vpxor %ymm10, %ymm0, %ymm0 /* XOR each decrypted block with the input block that preceded it */
+ vpxor %ymm11, %ymm1, %ymm1
+ vpxor %ymm12, %ymm2, %ymm2
+ vpxor %ymm13, %ymm3, %ymm3
+ vmovdqu %ymm0, (%r12) /* every input read for this chunk, including the shifted copies, happened before this first store */
+ vmovdqu %ymm1, 32(%r12)
+ vmovdqu %ymm2, 64(%r12)
+ vmovdqu %ymm3, 96(%r12)
+ addl $0x80, %eax
+ cmpl %r10d, %eax
+ jl L_AES_CBC_decrypt_vaes_dec_128
+L_AES_CBC_decrypt_vaes_done_128:
+ movl %ecx, %r10d
+ andl $0xffffffe0, %r10d /* r10d = byte count rounded down to a multiple of 32 */
+ cmpl %r10d, %eax
+ je L_AES_CBC_decrypt_vaes_done_32 /* no whole 32-byte chunk remains */
+L_AES_CBC_decrypt_vaes_dec_32:
+ # 32 bytes of input
+ # aes_cbc_dec_32
+ leaq (%rdi,%rax,1), %r11
+ leaq (%rsi,%rax,1), %r12
+ vmovdqu (%r11), %ymm0 /* input blocks 0-1 of this chunk */
+ vinserti128 $0x01, %xmm0, %ymm8, %ymm10 /* ymm10 = { chaining value, input block 0 } */
+ vextracti128 $0x01, %ymm0, %xmm8 /* input block 1 becomes the next chaining value */
+ # aes_dec_block
+ vbroadcasti128 (%r8), %ymm9
+ vpxor %ymm9, %ymm0, %ymm0
+ vbroadcasti128 16(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vbroadcasti128 32(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vbroadcasti128 48(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vbroadcasti128 64(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vbroadcasti128 80(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vbroadcasti128 96(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vbroadcasti128 112(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vbroadcasti128 128(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vbroadcasti128 144(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ cmpl $11, %r9d
+ vbroadcasti128 160(%r8), %ymm9
+ jl L_AES_CBC_decrypt_vaes_32_aes_dec_block_last
+ vaesdec %ymm9, %ymm0, %ymm0
+ vbroadcasti128 176(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ cmpl $13, %r9d
+ vbroadcasti128 192(%r8), %ymm9
+ jl L_AES_CBC_decrypt_vaes_32_aes_dec_block_last
+ vaesdec %ymm9, %ymm0, %ymm0
+ vbroadcasti128 208(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vbroadcasti128 224(%r8), %ymm9
+L_AES_CBC_decrypt_vaes_32_aes_dec_block_last:
+ vaesdeclast %ymm9, %ymm0, %ymm0
+ vpxor %ymm10, %ymm0, %ymm0
+ vmovdqu %ymm0, (%r12)
+ addl $32, %eax
+ cmpl %r10d, %eax
+ jl L_AES_CBC_decrypt_vaes_dec_32
+L_AES_CBC_decrypt_vaes_done_32:
+ cmpl %ecx, %eax /* finished if the offset already equals the byte count */
+ movl %ecx, %r10d
+ je L_AES_CBC_decrypt_vaes_done_dec
+ andl $0xfffffff0, %r10d /* r10d = byte count rounded down to a multiple of 16 */
+L_AES_CBC_decrypt_vaes_dec_16: /* NOTE(review): loop is bottom-tested, so a count that is not a multiple of 16 still gets one full 16-byte read/write here; presumably callers pass whole blocks only - confirm */
+ # 16 bytes of input
+ leaq (%rdi,%rax,1), %r11
+ vmovdqu (%r11), %xmm0
+ vmovdqa %xmm0, %xmm7 /* keep a copy of the input block; it is the chaining value for the following block */
+ # aes_dec_block
+ vpxor (%r8), %xmm0, %xmm0
+ vmovdqu 16(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 32(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 48(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 64(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 80(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 96(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 112(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 128(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 144(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ cmpl $11, %r9d
+ vmovdqu 160(%r8), %xmm5 /* xmm5 always carries the candidate final round key into the label below */
+ jl L_AES_CBC_decrypt_vaes_16_aes_dec_block_last
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 176(%r8), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ cmpl $13, %r9d
+ vmovdqu 192(%r8), %xmm5
+ jl L_AES_CBC_decrypt_vaes_16_aes_dec_block_last
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 208(%r8), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ vmovdqu 224(%r8), %xmm5
+L_AES_CBC_decrypt_vaes_16_aes_dec_block_last:
+ vaesdeclast %xmm5, %xmm0, %xmm0
+ vpxor %xmm8, %xmm0, %xmm0 /* XOR with the preceding input block (or the initial chaining value) */
+ vmovdqa %xmm7, %xmm8 /* advance the chaining value to the block just consumed */
+ leaq (%rsi,%rax,1), %r11
+ vmovdqu %xmm0, (%r11)
+ addl $16, %eax
+ cmpl %r10d, %eax
+ jl L_AES_CBC_decrypt_vaes_dec_16
+L_AES_CBC_decrypt_vaes_done_dec:
+ vmovdqu %xmm8, (%rdx) /* store the final chaining value back through rdx */
+ popq %r12
+ repz retq /* NOTE(review): returns with no vzeroupper although ymm registers may have been used above; confirm whether that is intended */
+#ifndef __APPLE__
+.size AES_CBC_decrypt_vaes,.-AES_CBC_decrypt_vaes
+#endif /* __APPLE__ */
+#ifndef __APPLE__ /* Constants loaded by AES_CTR_encrypt_vaes, placed in the data section */
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_aes_ctr_bswap_vaes: /* vpshufb control mask: output byte i takes input byte 15-i (full 16-byte reversal) */
+.quad 0x08090a0b0c0d0e0f,0x0001020304050607
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 32
+#else
+.p2align 5
+#endif /* __APPLE__ */
+L_aes_ctr_inc_vaes: /* 17 consecutive 128-bit entries; entry n (at byte offset 16*n) has n in the low qword and 0 in the high qword, for n = 0..16 */
+.quad 0x0000000000000000,0x0000000000000000
+.quad 0x0000000000000001,0x0000000000000000
+.quad 0x0000000000000002,0x0000000000000000
+.quad 0x0000000000000003,0x0000000000000000
+.quad 0x0000000000000004,0x0000000000000000
+.quad 0x0000000000000005,0x0000000000000000
+.quad 0x0000000000000006,0x0000000000000000
+.quad 0x0000000000000007,0x0000000000000000
+.quad 0x0000000000000008,0x0000000000000000
+.quad 0x0000000000000009,0x0000000000000000
+.quad 0x000000000000000a,0x0000000000000000
+.quad 0x000000000000000b,0x0000000000000000
+.quad 0x000000000000000c,0x0000000000000000
+.quad 0x000000000000000d,0x0000000000000000
+.quad 0x000000000000000e,0x0000000000000000
+.quad 0x000000000000000f,0x0000000000000000
+.quad 0x0000000000000010,0x0000000000000000
+/* AES_CTR_encrypt_vaes
+ *
+ * AES counter-mode processing with 256-bit VAES instructions (two AES blocks
+ * per ymm register): encrypts successive counter values and XORs the result
+ * with the input.
+ *
+ * Register use on entry, as read by the code below:
+ *   %rdi  base of the data that is XORed with the encrypted counters
+ *   %rsi  base of the output
+ *   %edx  length in bytes
+ *   %rcx  round keys, 16 bytes each, starting with round 0
+ *   %r8d  compared against 11 and 13 to select the last round key at offset
+ *         160, 192 or 224 (presumably the round count 10, 12 or 14)
+ *   %r9   16-byte counter block; read on entry and updated on exit
+ *
+ * The counter is byte-reversed so that it can be incremented as two 64-bit
+ * quadwords with a carry from the low into the high quadword, and reversed
+ * again before it is encrypted or stored.
+ *
+ * Data is handled in 128-byte, then 32-byte, then 16-byte steps.
+ * NOTE(review): the 16-byte loop always processes a whole block once any
+ * bytes remain, so the length is presumably a multiple of 16 - confirm with
+ * callers.
+ * NOTE(review): the length comparisons are signed (jl) - confirm lengths stay
+ * below 2^31.
+ */
+#ifndef __APPLE__
+.text
+.globl AES_CTR_encrypt_vaes
+.type AES_CTR_encrypt_vaes,@function
+.align 16
+AES_CTR_encrypt_vaes:
+#else
+.section __TEXT,__text
+.globl _AES_CTR_encrypt_vaes
+.p2align 4
+_AES_CTR_encrypt_vaes:
+#endif /* __APPLE__ */
+ # rbx is used as the output pointer in the wide loops, so save it
+ pushq %rbx
+ # ymm8 = byte-reversal mask; ymm7 = byte-reversed counter in both lanes
+ vbroadcasti128 L_aes_ctr_bswap_vaes(%rip), %ymm8
+ vbroadcasti128 (%r9), %ymm7
+ vpshufb %ymm8, %ymm7, %ymm7
+ # ymm10, ymm11, ymm12 = increments of 8, 2 and 1 in both lanes
+ vbroadcasti128 128+L_aes_ctr_inc_vaes(%rip), %ymm10
+ vbroadcasti128 32+L_aes_ctr_inc_vaes(%rip), %ymm11
+ vbroadcasti128 16+L_aes_ctr_inc_vaes(%rip), %ymm12
+ # eax = byte offset processed so far
+ xorl %eax, %eax
+ cmpl $0x80, %edx
+ movl %edx, %r10d
+ jl L_AES_CTR_encrypt_vaes_done_128
+ # r10d = length rounded down to a multiple of 128
+ andl $0xffffff80, %r10d
+ # ymm4..ymm7 = counter + {0,1}, {2,3}, {4,5}, {6,7}.  The carry out of each
+ # low quadword is the top bit of (old & inc) | ((old | inc) & ~sum); it is
+ # shifted down to bit 0, moved into the high quadword and added.
+ vmovdqa %ymm7, %ymm9
+ vpaddq 0+L_aes_ctr_inc_vaes(%rip), %ymm7, %ymm4
+ vpand 0+L_aes_ctr_inc_vaes(%rip), %ymm9, %ymm14
+ vpor 0+L_aes_ctr_inc_vaes(%rip), %ymm9, %ymm9
+ vpandn %ymm9, %ymm4, %ymm9
+ vpor %ymm14, %ymm9, %ymm9
+ vpsrlq $63, %ymm9, %ymm9
+ vpslldq $8, %ymm9, %ymm9
+ vpaddq %ymm9, %ymm4, %ymm4
+ vmovdqa %ymm7, %ymm9
+ vpaddq 32+L_aes_ctr_inc_vaes(%rip), %ymm7, %ymm5
+ vpand 32+L_aes_ctr_inc_vaes(%rip), %ymm9, %ymm14
+ vpor 32+L_aes_ctr_inc_vaes(%rip), %ymm9, %ymm9
+ vpandn %ymm9, %ymm5, %ymm9
+ vpor %ymm14, %ymm9, %ymm9
+ vpsrlq $63, %ymm9, %ymm9
+ vpslldq $8, %ymm9, %ymm9
+ vpaddq %ymm9, %ymm5, %ymm5
+ vmovdqa %ymm7, %ymm9
+ vpaddq 64+L_aes_ctr_inc_vaes(%rip), %ymm7, %ymm6
+ vpand 64+L_aes_ctr_inc_vaes(%rip), %ymm9, %ymm14
+ vpor 64+L_aes_ctr_inc_vaes(%rip), %ymm9, %ymm9
+ vpandn %ymm9, %ymm6, %ymm9
+ vpor %ymm14, %ymm9, %ymm9
+ vpsrlq $63, %ymm9, %ymm9
+ vpslldq $8, %ymm9, %ymm9
+ vpaddq %ymm9, %ymm6, %ymm6
+ vmovdqa %ymm7, %ymm9
+ vpaddq 96+L_aes_ctr_inc_vaes(%rip), %ymm7, %ymm7
+ vpand 96+L_aes_ctr_inc_vaes(%rip), %ymm9, %ymm14
+ vpor 96+L_aes_ctr_inc_vaes(%rip), %ymm9, %ymm9
+ vpandn %ymm9, %ymm7, %ymm9
+ vpor %ymm14, %ymm9, %ymm9
+ vpsrlq $63, %ymm9, %ymm9
+ vpslldq $8, %ymm9, %ymm9
+ vpaddq %ymm9, %ymm7, %ymm7
+L_AES_CTR_encrypt_vaes_enc_128:
+ # 128 bytes of input
+ # r11 = current input position, rbx = current output position
+ leaq (%rdi,%rax,1), %r11
+ leaq (%rsi,%rax,1), %rbx
+ # ymm0..ymm3 = the eight counters for this iteration, byte-reversed back
+ vpshufb %ymm8, %ymm4, %ymm0
+ vpshufb %ymm8, %ymm5, %ymm1
+ vpshufb %ymm8, %ymm6, %ymm2
+ vpshufb %ymm8, %ymm7, %ymm3
+ # advance ymm4..ymm7 by 8 (ymm10) with carry, ready for the next iteration
+ vmovdqa %ymm4, %ymm9
+ vpaddq %ymm10, %ymm4, %ymm4
+ vpand %ymm10, %ymm9, %ymm14
+ vpor %ymm10, %ymm9, %ymm9
+ vpandn %ymm9, %ymm4, %ymm9
+ vpor %ymm14, %ymm9, %ymm9
+ vpsrlq $63, %ymm9, %ymm9
+ vpslldq $8, %ymm9, %ymm9
+ vpaddq %ymm9, %ymm4, %ymm4
+ vmovdqa %ymm5, %ymm9
+ vpaddq %ymm10, %ymm5, %ymm5
+ vpand %ymm10, %ymm9, %ymm14
+ vpor %ymm10, %ymm9, %ymm9
+ vpandn %ymm9, %ymm5, %ymm9
+ vpor %ymm14, %ymm9, %ymm9
+ vpsrlq $63, %ymm9, %ymm9
+ vpslldq $8, %ymm9, %ymm9
+ vpaddq %ymm9, %ymm5, %ymm5
+ vmovdqa %ymm6, %ymm9
+ vpaddq %ymm10, %ymm6, %ymm6
+ vpand %ymm10, %ymm9, %ymm14
+ vpor %ymm10, %ymm9, %ymm9
+ vpandn %ymm9, %ymm6, %ymm9
+ vpor %ymm14, %ymm9, %ymm9
+ vpsrlq $63, %ymm9, %ymm9
+ vpslldq $8, %ymm9, %ymm9
+ vpaddq %ymm9, %ymm6, %ymm6
+ vmovdqa %ymm7, %ymm9
+ vpaddq %ymm10, %ymm7, %ymm7
+ vpand %ymm10, %ymm9, %ymm14
+ vpor %ymm10, %ymm9, %ymm9
+ vpandn %ymm9, %ymm7, %ymm9
+ vpor %ymm14, %ymm9, %ymm9
+ vpsrlq $63, %ymm9, %ymm9
+ vpslldq $8, %ymm9, %ymm9
+ vpaddq %ymm9, %ymm7, %ymm7
+ # aes_enc_block
+ # each round key is broadcast to both lanes of ymm13 and applied to ymm0..ymm3
+ vbroadcasti128 (%rcx), %ymm13
+ vpxor %ymm13, %ymm0, %ymm0
+ vpxor %ymm13, %ymm1, %ymm1
+ vpxor %ymm13, %ymm2, %ymm2
+ vpxor %ymm13, %ymm3, %ymm3
+ vbroadcasti128 16(%rcx), %ymm13
+ vaesenc %ymm13, %ymm0, %ymm0
+ vaesenc %ymm13, %ymm1, %ymm1
+ vaesenc %ymm13, %ymm2, %ymm2
+ vaesenc %ymm13, %ymm3, %ymm3
+ vbroadcasti128 32(%rcx), %ymm13
+ vaesenc %ymm13, %ymm0, %ymm0
+ vaesenc %ymm13, %ymm1, %ymm1
+ vaesenc %ymm13, %ymm2, %ymm2
+ vaesenc %ymm13, %ymm3, %ymm3
+ vbroadcasti128 48(%rcx), %ymm13
+ vaesenc %ymm13, %ymm0, %ymm0
+ vaesenc %ymm13, %ymm1, %ymm1
+ vaesenc %ymm13, %ymm2, %ymm2
+ vaesenc %ymm13, %ymm3, %ymm3
+ vbroadcasti128 64(%rcx), %ymm13
+ vaesenc %ymm13, %ymm0, %ymm0
+ vaesenc %ymm13, %ymm1, %ymm1
+ vaesenc %ymm13, %ymm2, %ymm2
+ vaesenc %ymm13, %ymm3, %ymm3
+ vbroadcasti128 80(%rcx), %ymm13
+ vaesenc %ymm13, %ymm0, %ymm0
+ vaesenc %ymm13, %ymm1, %ymm1
+ vaesenc %ymm13, %ymm2, %ymm2
+ vaesenc %ymm13, %ymm3, %ymm3
+ vbroadcasti128 96(%rcx), %ymm13
+ vaesenc %ymm13, %ymm0, %ymm0
+ vaesenc %ymm13, %ymm1, %ymm1
+ vaesenc %ymm13, %ymm2, %ymm2
+ vaesenc %ymm13, %ymm3, %ymm3
+ vbroadcasti128 112(%rcx), %ymm13
+ vaesenc %ymm13, %ymm0, %ymm0
+ vaesenc %ymm13, %ymm1, %ymm1
+ vaesenc %ymm13, %ymm2, %ymm2
+ vaesenc %ymm13, %ymm3, %ymm3
+ vbroadcasti128 128(%rcx), %ymm13
+ vaesenc %ymm13, %ymm0, %ymm0
+ vaesenc %ymm13, %ymm1, %ymm1
+ vaesenc %ymm13, %ymm2, %ymm2
+ vaesenc %ymm13, %ymm3, %ymm3
+ vbroadcasti128 144(%rcx), %ymm13
+ vaesenc %ymm13, %ymm0, %ymm0
+ vaesenc %ymm13, %ymm1, %ymm1
+ vaesenc %ymm13, %ymm2, %ymm2
+ vaesenc %ymm13, %ymm3, %ymm3
+ # flags from this compare are consumed by the jl after the key load
+ cmpl $11, %r8d
+ vbroadcasti128 160(%rcx), %ymm13
+ jl L_AES_CTR_encrypt_vaes_128_aes_enc_block_last
+ vaesenc %ymm13, %ymm0, %ymm0
+ vaesenc %ymm13, %ymm1, %ymm1
+ vaesenc %ymm13, %ymm2, %ymm2
+ vaesenc %ymm13, %ymm3, %ymm3
+ vbroadcasti128 176(%rcx), %ymm13
+ vaesenc %ymm13, %ymm0, %ymm0
+ vaesenc %ymm13, %ymm1, %ymm1
+ vaesenc %ymm13, %ymm2, %ymm2
+ vaesenc %ymm13, %ymm3, %ymm3
+ cmpl $13, %r8d
+ vbroadcasti128 192(%rcx), %ymm13
+ jl L_AES_CTR_encrypt_vaes_128_aes_enc_block_last
+ vaesenc %ymm13, %ymm0, %ymm0
+ vaesenc %ymm13, %ymm1, %ymm1
+ vaesenc %ymm13, %ymm2, %ymm2
+ vaesenc %ymm13, %ymm3, %ymm3
+ vbroadcasti128 208(%rcx), %ymm13
+ vaesenc %ymm13, %ymm0, %ymm0
+ vaesenc %ymm13, %ymm1, %ymm1
+ vaesenc %ymm13, %ymm2, %ymm2
+ vaesenc %ymm13, %ymm3, %ymm3
+ vbroadcasti128 224(%rcx), %ymm13
+L_AES_CTR_encrypt_vaes_128_aes_enc_block_last:
+ # ymm13 holds the final round key selected above
+ vaesenclast %ymm13, %ymm0, %ymm0
+ vaesenclast %ymm13, %ymm1, %ymm1
+ vaesenclast %ymm13, %ymm2, %ymm2
+ vaesenclast %ymm13, %ymm3, %ymm3
+ # XOR the encrypted counters with the input and store to the output
+ vpxor (%r11), %ymm0, %ymm0
+ vpxor 32(%r11), %ymm1, %ymm1
+ vpxor 64(%r11), %ymm2, %ymm2
+ vpxor 96(%r11), %ymm3, %ymm3
+ vmovdqu %ymm0, (%rbx)
+ vmovdqu %ymm1, 32(%rbx)
+ vmovdqu %ymm2, 64(%rbx)
+ vmovdqu %ymm3, 96(%rbx)
+ addl $0x80, %eax
+ cmpl %r10d, %eax
+ jl L_AES_CTR_encrypt_vaes_enc_128
+ # both lanes of ymm7 = low lane of ymm4, which is the next unused counter
+ vperm2i128 $0x00, %ymm4, %ymm4, %ymm7
+L_AES_CTR_encrypt_vaes_done_128:
+ # r10d = length rounded down to a multiple of 32
+ movl %edx, %r10d
+ andl $0xffffffe0, %r10d
+ cmpl %r10d, %eax
+ je L_AES_CTR_encrypt_vaes_done_32
+L_AES_CTR_encrypt_vaes_enc_32:
+ # 32 bytes of input
+ # aes_ctr_enc_32
+ leaq (%rdi,%rax,1), %r11
+ leaq (%rsi,%rax,1), %rbx
+ # ymm0 = counter + {0,1} with carry, then byte-reversed back
+ vpaddq 0+L_aes_ctr_inc_vaes(%rip), %ymm7, %ymm0
+ vmovdqa %ymm7, %ymm9
+ vpand 0+L_aes_ctr_inc_vaes(%rip), %ymm9, %ymm14
+ vpor 0+L_aes_ctr_inc_vaes(%rip), %ymm9, %ymm9
+ vpandn %ymm9, %ymm0, %ymm9
+ vpor %ymm14, %ymm9, %ymm9
+ vpsrlq $63, %ymm9, %ymm9
+ vpslldq $8, %ymm9, %ymm9
+ vpaddq %ymm9, %ymm0, %ymm0
+ vpshufb %ymm8, %ymm0, %ymm0
+ # advance the counter in both lanes of ymm7 by 2 (ymm11) with carry
+ vmovdqa %ymm7, %ymm9
+ vpaddq %ymm11, %ymm7, %ymm7
+ vpand %ymm11, %ymm9, %ymm14
+ vpor %ymm11, %ymm9, %ymm9
+ vpandn %ymm9, %ymm7, %ymm9
+ vpor %ymm14, %ymm9, %ymm9
+ vpsrlq $63, %ymm9, %ymm9
+ vpslldq $8, %ymm9, %ymm9
+ vpaddq %ymm9, %ymm7, %ymm7
+ # aes_enc_block
+ vbroadcasti128 (%rcx), %ymm13
+ vpxor %ymm13, %ymm0, %ymm0
+ vbroadcasti128 16(%rcx), %ymm13
+ vaesenc %ymm13, %ymm0, %ymm0
+ vbroadcasti128 32(%rcx), %ymm13
+ vaesenc %ymm13, %ymm0, %ymm0
+ vbroadcasti128 48(%rcx), %ymm13
+ vaesenc %ymm13, %ymm0, %ymm0
+ vbroadcasti128 64(%rcx), %ymm13
+ vaesenc %ymm13, %ymm0, %ymm0
+ vbroadcasti128 80(%rcx), %ymm13
+ vaesenc %ymm13, %ymm0, %ymm0
+ vbroadcasti128 96(%rcx), %ymm13
+ vaesenc %ymm13, %ymm0, %ymm0
+ vbroadcasti128 112(%rcx), %ymm13
+ vaesenc %ymm13, %ymm0, %ymm0
+ vbroadcasti128 128(%rcx), %ymm13
+ vaesenc %ymm13, %ymm0, %ymm0
+ vbroadcasti128 144(%rcx), %ymm13
+ vaesenc %ymm13, %ymm0, %ymm0
+ cmpl $11, %r8d
+ vbroadcasti128 160(%rcx), %ymm13
+ jl L_AES_CTR_encrypt_vaes_32_aes_enc_block_last
+ vaesenc %ymm13, %ymm0, %ymm0
+ vbroadcasti128 176(%rcx), %ymm13
+ vaesenc %ymm13, %ymm0, %ymm0
+ cmpl $13, %r8d
+ vbroadcasti128 192(%rcx), %ymm13
+ jl L_AES_CTR_encrypt_vaes_32_aes_enc_block_last
+ vaesenc %ymm13, %ymm0, %ymm0
+ vbroadcasti128 208(%rcx), %ymm13
+ vaesenc %ymm13, %ymm0, %ymm0
+ vbroadcasti128 224(%rcx), %ymm13
+L_AES_CTR_encrypt_vaes_32_aes_enc_block_last:
+ vaesenclast %ymm13, %ymm0, %ymm0
+ vpxor (%r11), %ymm0, %ymm0
+ vmovdqu %ymm0, (%rbx)
+ addl $32, %eax
+ cmpl %r10d, %eax
+ jl L_AES_CTR_encrypt_vaes_enc_32
+L_AES_CTR_encrypt_vaes_done_32:
+ # finished when the offset equals the length; otherwise run the 16-byte loop
+ cmpl %edx, %eax
+ movl %edx, %r10d
+ je L_AES_CTR_encrypt_vaes_done_enc
+ andl $0xfffffff0, %r10d
+L_AES_CTR_encrypt_vaes_enc_16:
+ # 16 bytes of input
+ # xmm0 = current counter byte-reversed back; then advance ymm7 by 1 (ymm12)
+ vpshufb %xmm8, %xmm7, %xmm0
+ vmovdqa %ymm7, %ymm9
+ vpaddq %ymm12, %ymm7, %ymm7
+ vpand %ymm12, %ymm9, %ymm14
+ vpor %ymm12, %ymm9, %ymm9
+ vpandn %ymm9, %ymm7, %ymm9
+ vpor %ymm14, %ymm9, %ymm9
+ vpsrlq $63, %ymm9, %ymm9
+ vpslldq $8, %ymm9, %ymm9
+ vpaddq %ymm9, %ymm7, %ymm7
+ # aes_enc_block
+ # single block: round keys are loaded from memory into xmm5/xmm6
+ vpxor (%rcx), %xmm0, %xmm0
+ vmovdqu 16(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 32(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 48(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 64(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 80(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 96(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 112(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 128(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 144(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ cmpl $11, %r8d
+ vmovdqu 160(%rcx), %xmm5
+ jl L_AES_CTR_encrypt_vaes_16_aes_enc_block_last
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 176(%rcx), %xmm6
+ vaesenc %xmm6, %xmm0, %xmm0
+ cmpl $13, %r8d
+ vmovdqu 192(%rcx), %xmm5
+ jl L_AES_CTR_encrypt_vaes_16_aes_enc_block_last
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 208(%rcx), %xmm6
+ vaesenc %xmm6, %xmm0, %xmm0
+ vmovdqu 224(%rcx), %xmm5
+L_AES_CTR_encrypt_vaes_16_aes_enc_block_last:
+ vaesenclast %xmm5, %xmm0, %xmm0
+ leaq (%rdi,%rax,1), %r11
+ vpxor (%r11), %xmm0, %xmm0
+ leaq (%rsi,%rax,1), %r11
+ vmovdqu %xmm0, (%r11)
+ addl $16, %eax
+ cmpl %r10d, %eax
+ jl L_AES_CTR_encrypt_vaes_enc_16
+L_AES_CTR_encrypt_vaes_done_enc:
+ # store the next counter value back through r9 in its original byte order
+ vpshufb %xmm8, %xmm7, %xmm0
+ vmovdqu %xmm0, (%r9)
+ popq %rbx
+ repz retq
+#ifndef __APPLE__
+.size AES_CTR_encrypt_vaes,.-AES_CTR_encrypt_vaes
+#endif /* __APPLE__ */
+#endif /* HAVE_INTEL_VAES */
+#ifdef HAVE_INTEL_AVX512
+/* AES_ECB_encrypt_avx512
+ *
+ * AES encryption of independent 16-byte blocks using 512-bit VAES
+ * instructions (four AES blocks per zmm register).
+ *
+ * Register use on entry, as read by the code below:
+ *   %rdi  base of the input
+ *   %rsi  base of the output
+ *   %edx  length in bytes
+ *   %rcx  round keys, 16 bytes each, starting with round 0
+ *   %r8d  compared against 11 and 13 to select the last round key at offset
+ *         160, 192 or 224 (presumably the round count 10, 12 or 14)
+ *
+ * When at least 64 bytes are supplied the round keys are broadcast once into
+ * zmm8 and up; data is then handled in 256-byte, 64-byte and 16-byte steps.
+ * The 16-byte loop reads the round keys from memory.
+ * NOTE(review): the 16-byte loop always processes a whole block once any
+ * bytes remain, so the length is presumably a multiple of 16 - confirm with
+ * callers.
+ * NOTE(review): the length comparisons are signed (jl) - confirm lengths stay
+ * below 2^31.
+ */
+#ifndef __APPLE__
+.text
+.globl AES_ECB_encrypt_avx512
+.type AES_ECB_encrypt_avx512,@function
+.align 16
+AES_ECB_encrypt_avx512:
+#else
+.section __TEXT,__text
+.globl _AES_ECB_encrypt_avx512
+.p2align 4
+_AES_ECB_encrypt_avx512:
+#endif /* __APPLE__ */
+ # eax = byte offset processed so far
+ xorl %eax, %eax
+ # fewer than 64 bytes: skip the key broadcast and go to the 16-byte loop
+ cmpl $0x40, %edx
+ jl L_AES_ECB_encrypt_avx512_done_64
+ # zmm8..zmm18 = round keys 0..10, each copied to all four 128-bit lanes
+ vbroadcasti32x4 (%rcx), %zmm8
+ vbroadcasti32x4 16(%rcx), %zmm9
+ vbroadcasti32x4 32(%rcx), %zmm10
+ vbroadcasti32x4 48(%rcx), %zmm11
+ vbroadcasti32x4 64(%rcx), %zmm12
+ vbroadcasti32x4 80(%rcx), %zmm13
+ vbroadcasti32x4 96(%rcx), %zmm14
+ vbroadcasti32x4 112(%rcx), %zmm15
+ vbroadcasti32x4 128(%rcx), %zmm16
+ vbroadcasti32x4 144(%rcx), %zmm17
+ vbroadcasti32x4 160(%rcx), %zmm18
+ # round keys 11..14 are only loaded when r8d asks for them
+ cmpl $11, %r8d
+ jl L_AES_ECB_encrypt_avx512_key_cached
+ vbroadcasti32x4 176(%rcx), %zmm19
+ vbroadcasti32x4 192(%rcx), %zmm20
+ cmpl $13, %r8d
+ jl L_AES_ECB_encrypt_avx512_key_cached
+ vbroadcasti32x4 208(%rcx), %zmm21
+ vbroadcasti32x4 224(%rcx), %zmm22
+L_AES_ECB_encrypt_avx512_key_cached:
+ cmpl $0x100, %edx
+ movl %edx, %r9d
+ jl L_AES_ECB_encrypt_avx512_done_256
+ # r9d = length rounded down to a multiple of 256
+ andl $0xffffff00, %r9d
+L_AES_ECB_encrypt_avx512_enc_256:
+ # 256 bytes of input
+ # aes_ecb_enc_256
+ # r10 = current input position, r11 = current output position
+ leaq (%rdi,%rax,1), %r10
+ leaq (%rsi,%rax,1), %r11
+ vmovdqu64 (%r10), %zmm0
+ vmovdqu64 64(%r10), %zmm1
+ vmovdqu64 128(%r10), %zmm2
+ vmovdqu64 192(%r10), %zmm3
+ # aes_enc_block
+ vpxorq %zmm8, %zmm0, %zmm0
+ vpxorq %zmm8, %zmm1, %zmm1
+ vpxorq %zmm8, %zmm2, %zmm2
+ vpxorq %zmm8, %zmm3, %zmm3
+ vaesenc %zmm9, %zmm0, %zmm0
+ vaesenc %zmm9, %zmm1, %zmm1
+ vaesenc %zmm9, %zmm2, %zmm2
+ vaesenc %zmm9, %zmm3, %zmm3
+ vaesenc %zmm10, %zmm0, %zmm0
+ vaesenc %zmm10, %zmm1, %zmm1
+ vaesenc %zmm10, %zmm2, %zmm2
+ vaesenc %zmm10, %zmm3, %zmm3
+ vaesenc %zmm11, %zmm0, %zmm0
+ vaesenc %zmm11, %zmm1, %zmm1
+ vaesenc %zmm11, %zmm2, %zmm2
+ vaesenc %zmm11, %zmm3, %zmm3
+ vaesenc %zmm12, %zmm0, %zmm0
+ vaesenc %zmm12, %zmm1, %zmm1
+ vaesenc %zmm12, %zmm2, %zmm2
+ vaesenc %zmm12, %zmm3, %zmm3
+ vaesenc %zmm13, %zmm0, %zmm0
+ vaesenc %zmm13, %zmm1, %zmm1
+ vaesenc %zmm13, %zmm2, %zmm2
+ vaesenc %zmm13, %zmm3, %zmm3
+ vaesenc %zmm14, %zmm0, %zmm0
+ vaesenc %zmm14, %zmm1, %zmm1
+ vaesenc %zmm14, %zmm2, %zmm2
+ vaesenc %zmm14, %zmm3, %zmm3
+ vaesenc %zmm15, %zmm0, %zmm0
+ vaesenc %zmm15, %zmm1, %zmm1
+ vaesenc %zmm15, %zmm2, %zmm2
+ vaesenc %zmm15, %zmm3, %zmm3
+ vaesenc %zmm16, %zmm0, %zmm0
+ vaesenc %zmm16, %zmm1, %zmm1
+ vaesenc %zmm16, %zmm2, %zmm2
+ vaesenc %zmm16, %zmm3, %zmm3
+ vaesenc %zmm17, %zmm0, %zmm0
+ vaesenc %zmm17, %zmm1, %zmm1
+ vaesenc %zmm17, %zmm2, %zmm2
+ vaesenc %zmm17, %zmm3, %zmm3
+ # zmm7 receives the candidate final round key; the move leaves the flags
+ # from the compare intact for the jl that follows
+ cmpl $11, %r8d
+ vmovdqa64 %zmm18, %zmm7
+ jl L_AES_ECB_encrypt_avx512_256_aes_enc_block_last
+ vaesenc %zmm18, %zmm0, %zmm0
+ vaesenc %zmm18, %zmm1, %zmm1
+ vaesenc %zmm18, %zmm2, %zmm2
+ vaesenc %zmm18, %zmm3, %zmm3
+ vaesenc %zmm19, %zmm0, %zmm0
+ vaesenc %zmm19, %zmm1, %zmm1
+ vaesenc %zmm19, %zmm2, %zmm2
+ vaesenc %zmm19, %zmm3, %zmm3
+ cmpl $13, %r8d
+ vmovdqa64 %zmm20, %zmm7
+ jl L_AES_ECB_encrypt_avx512_256_aes_enc_block_last
+ vaesenc %zmm20, %zmm0, %zmm0
+ vaesenc %zmm20, %zmm1, %zmm1
+ vaesenc %zmm20, %zmm2, %zmm2
+ vaesenc %zmm20, %zmm3, %zmm3
+ vaesenc %zmm21, %zmm0, %zmm0
+ vaesenc %zmm21, %zmm1, %zmm1
+ vaesenc %zmm21, %zmm2, %zmm2
+ vaesenc %zmm21, %zmm3, %zmm3
+ vmovdqa64 %zmm22, %zmm7
+L_AES_ECB_encrypt_avx512_256_aes_enc_block_last:
+ vaesenclast %zmm7, %zmm0, %zmm0
+ vaesenclast %zmm7, %zmm1, %zmm1
+ vaesenclast %zmm7, %zmm2, %zmm2
+ vaesenclast %zmm7, %zmm3, %zmm3
+ vmovdqu64 %zmm0, (%r11)
+ vmovdqu64 %zmm1, 64(%r11)
+ vmovdqu64 %zmm2, 128(%r11)
+ vmovdqu64 %zmm3, 192(%r11)
+ addl $0x100, %eax
+ cmpl %r9d, %eax
+ jl L_AES_ECB_encrypt_avx512_enc_256
+L_AES_ECB_encrypt_avx512_done_256:
+ # r9d = length rounded down to a multiple of 64
+ movl %edx, %r9d
+ andl $0xffffffc0, %r9d
+ cmpl %r9d, %eax
+ je L_AES_ECB_encrypt_avx512_done_64
+L_AES_ECB_encrypt_avx512_enc_64:
+ # 64 bytes of input
+ # aes_ecb_enc_64
+ leaq (%rdi,%rax,1), %r10
+ leaq (%rsi,%rax,1), %r11
+ vmovdqu64 (%r10), %zmm0
+ # aes_enc_block
+ vpxorq %zmm8, %zmm0, %zmm0
+ vaesenc %zmm9, %zmm0, %zmm0
+ vaesenc %zmm10, %zmm0, %zmm0
+ vaesenc %zmm11, %zmm0, %zmm0
+ vaesenc %zmm12, %zmm0, %zmm0
+ vaesenc %zmm13, %zmm0, %zmm0
+ vaesenc %zmm14, %zmm0, %zmm0
+ vaesenc %zmm15, %zmm0, %zmm0
+ vaesenc %zmm16, %zmm0, %zmm0
+ vaesenc %zmm17, %zmm0, %zmm0
+ cmpl $11, %r8d
+ vmovdqa64 %zmm18, %zmm7
+ jl L_AES_ECB_encrypt_avx512_64_aes_enc_block_last
+ vaesenc %zmm18, %zmm0, %zmm0
+ vaesenc %zmm19, %zmm0, %zmm0
+ cmpl $13, %r8d
+ vmovdqa64 %zmm20, %zmm7
+ jl L_AES_ECB_encrypt_avx512_64_aes_enc_block_last
+ vaesenc %zmm20, %zmm0, %zmm0
+ vaesenc %zmm21, %zmm0, %zmm0
+ vmovdqa64 %zmm22, %zmm7
+L_AES_ECB_encrypt_avx512_64_aes_enc_block_last:
+ vaesenclast %zmm7, %zmm0, %zmm0
+ vmovdqu64 %zmm0, (%r11)
+ addl $0x40, %eax
+ cmpl %r9d, %eax
+ jl L_AES_ECB_encrypt_avx512_enc_64
+L_AES_ECB_encrypt_avx512_done_64:
+ # finished when the offset equals the length; otherwise run the 16-byte loop
+ cmpl %edx, %eax
+ movl %edx, %r9d
+ je L_AES_ECB_encrypt_avx512_done_enc
+ andl $0xfffffff0, %r9d
+L_AES_ECB_encrypt_avx512_enc_16:
+ # 16 bytes of input
+ leaq (%rdi,%rax,1), %r10
+ vmovdqu (%r10), %xmm0
+ # aes_enc_block
+ # single block: round keys are loaded from memory into xmm5/xmm6
+ vpxor (%rcx), %xmm0, %xmm0
+ vmovdqu 16(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 32(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 48(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 64(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 80(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 96(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 112(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 128(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 144(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ cmpl $11, %r8d
+ vmovdqu 160(%rcx), %xmm5
+ jl L_AES_ECB_encrypt_avx512_16_aes_enc_block_last
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 176(%rcx), %xmm6
+ vaesenc %xmm6, %xmm0, %xmm0
+ cmpl $13, %r8d
+ vmovdqu 192(%rcx), %xmm5
+ jl L_AES_ECB_encrypt_avx512_16_aes_enc_block_last
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 208(%rcx), %xmm6
+ vaesenc %xmm6, %xmm0, %xmm0
+ vmovdqu 224(%rcx), %xmm5
+L_AES_ECB_encrypt_avx512_16_aes_enc_block_last:
+ vaesenclast %xmm5, %xmm0, %xmm0
+ leaq (%rsi,%rax,1), %r10
+ vmovdqu %xmm0, (%r10)
+ addl $16, %eax
+ cmpl %r9d, %eax
+ jl L_AES_ECB_encrypt_avx512_enc_16
+L_AES_ECB_encrypt_avx512_done_enc:
+ repz retq
+#ifndef __APPLE__
+.size AES_ECB_encrypt_avx512,.-AES_ECB_encrypt_avx512
+#endif /* __APPLE__ */
+/* AES_ECB_decrypt_avx512
+ *
+ * AES decryption of independent 16-byte blocks using 512-bit VAES
+ * instructions (four AES blocks per zmm register).
+ *
+ * Register use on entry, as read by the code below:
+ *   %rdi  base of the input
+ *   %rsi  base of the output
+ *   %edx  length in bytes
+ *   %rcx  round keys, 16 bytes each, applied in table order starting at
+ *         offset 0 (presumably a decryption key schedule - confirm against
+ *         the caller)
+ *   %r8d  compared against 11 and 13 to select the last round key at offset
+ *         160, 192 or 224 (presumably the round count 10, 12 or 14)
+ *
+ * When at least 64 bytes are supplied the round keys are broadcast once into
+ * zmm8 and up; data is then handled in 256-byte, 64-byte and 16-byte steps.
+ * The 16-byte loop reads the round keys from memory.
+ * NOTE(review): the 16-byte loop always processes a whole block once any
+ * bytes remain, so the length is presumably a multiple of 16 - confirm with
+ * callers.
+ * NOTE(review): the length comparisons are signed (jl) - confirm lengths stay
+ * below 2^31.
+ */
+#ifndef __APPLE__
+.text
+.globl AES_ECB_decrypt_avx512
+.type AES_ECB_decrypt_avx512,@function
+.align 16
+AES_ECB_decrypt_avx512:
+#else
+.section __TEXT,__text
+.globl _AES_ECB_decrypt_avx512
+.p2align 4
+_AES_ECB_decrypt_avx512:
+#endif /* __APPLE__ */
+ # eax = byte offset processed so far
+ xorl %eax, %eax
+ # fewer than 64 bytes: skip the key broadcast and go to the 16-byte loop
+ cmpl $0x40, %edx
+ jl L_AES_ECB_decrypt_avx512_done_64
+ # zmm8..zmm18 = round keys 0..10, each copied to all four 128-bit lanes
+ vbroadcasti32x4 (%rcx), %zmm8
+ vbroadcasti32x4 16(%rcx), %zmm9
+ vbroadcasti32x4 32(%rcx), %zmm10
+ vbroadcasti32x4 48(%rcx), %zmm11
+ vbroadcasti32x4 64(%rcx), %zmm12
+ vbroadcasti32x4 80(%rcx), %zmm13
+ vbroadcasti32x4 96(%rcx), %zmm14
+ vbroadcasti32x4 112(%rcx), %zmm15
+ vbroadcasti32x4 128(%rcx), %zmm16
+ vbroadcasti32x4 144(%rcx), %zmm17
+ vbroadcasti32x4 160(%rcx), %zmm18
+ # round keys 11..14 are only loaded when r8d asks for them
+ cmpl $11, %r8d
+ jl L_AES_ECB_decrypt_avx512_key_cached
+ vbroadcasti32x4 176(%rcx), %zmm19
+ vbroadcasti32x4 192(%rcx), %zmm20
+ cmpl $13, %r8d
+ jl L_AES_ECB_decrypt_avx512_key_cached
+ vbroadcasti32x4 208(%rcx), %zmm21
+ vbroadcasti32x4 224(%rcx), %zmm22
+L_AES_ECB_decrypt_avx512_key_cached:
+ cmpl $0x100, %edx
+ movl %edx, %r9d
+ jl L_AES_ECB_decrypt_avx512_done_256
+ # r9d = length rounded down to a multiple of 256
+ andl $0xffffff00, %r9d
+L_AES_ECB_decrypt_avx512_dec_256:
+ # 256 bytes of input
+ # aes_ecb_dec_256
+ # r10 = current input position, r11 = current output position
+ leaq (%rdi,%rax,1), %r10
+ leaq (%rsi,%rax,1), %r11
+ vmovdqu64 (%r10), %zmm0
+ vmovdqu64 64(%r10), %zmm1
+ vmovdqu64 128(%r10), %zmm2
+ vmovdqu64 192(%r10), %zmm3
+ # aes_dec_block
+ vpxorq %zmm8, %zmm0, %zmm0
+ vpxorq %zmm8, %zmm1, %zmm1
+ vpxorq %zmm8, %zmm2, %zmm2
+ vpxorq %zmm8, %zmm3, %zmm3
+ vaesdec %zmm9, %zmm0, %zmm0
+ vaesdec %zmm9, %zmm1, %zmm1
+ vaesdec %zmm9, %zmm2, %zmm2
+ vaesdec %zmm9, %zmm3, %zmm3
+ vaesdec %zmm10, %zmm0, %zmm0
+ vaesdec %zmm10, %zmm1, %zmm1
+ vaesdec %zmm10, %zmm2, %zmm2
+ vaesdec %zmm10, %zmm3, %zmm3
+ vaesdec %zmm11, %zmm0, %zmm0
+ vaesdec %zmm11, %zmm1, %zmm1
+ vaesdec %zmm11, %zmm2, %zmm2
+ vaesdec %zmm11, %zmm3, %zmm3
+ vaesdec %zmm12, %zmm0, %zmm0
+ vaesdec %zmm12, %zmm1, %zmm1
+ vaesdec %zmm12, %zmm2, %zmm2
+ vaesdec %zmm12, %zmm3, %zmm3
+ vaesdec %zmm13, %zmm0, %zmm0
+ vaesdec %zmm13, %zmm1, %zmm1
+ vaesdec %zmm13, %zmm2, %zmm2
+ vaesdec %zmm13, %zmm3, %zmm3
+ vaesdec %zmm14, %zmm0, %zmm0
+ vaesdec %zmm14, %zmm1, %zmm1
+ vaesdec %zmm14, %zmm2, %zmm2
+ vaesdec %zmm14, %zmm3, %zmm3
+ vaesdec %zmm15, %zmm0, %zmm0
+ vaesdec %zmm15, %zmm1, %zmm1
+ vaesdec %zmm15, %zmm2, %zmm2
+ vaesdec %zmm15, %zmm3, %zmm3
+ vaesdec %zmm16, %zmm0, %zmm0
+ vaesdec %zmm16, %zmm1, %zmm1
+ vaesdec %zmm16, %zmm2, %zmm2
+ vaesdec %zmm16, %zmm3, %zmm3
+ vaesdec %zmm17, %zmm0, %zmm0
+ vaesdec %zmm17, %zmm1, %zmm1
+ vaesdec %zmm17, %zmm2, %zmm2
+ vaesdec %zmm17, %zmm3, %zmm3
+ # zmm7 receives the candidate final round key; the move leaves the flags
+ # from the compare intact for the jl that follows
+ cmpl $11, %r8d
+ vmovdqa64 %zmm18, %zmm7
+ jl L_AES_ECB_decrypt_avx512_256_aes_dec_block_last
+ vaesdec %zmm18, %zmm0, %zmm0
+ vaesdec %zmm18, %zmm1, %zmm1
+ vaesdec %zmm18, %zmm2, %zmm2
+ vaesdec %zmm18, %zmm3, %zmm3
+ vaesdec %zmm19, %zmm0, %zmm0
+ vaesdec %zmm19, %zmm1, %zmm1
+ vaesdec %zmm19, %zmm2, %zmm2
+ vaesdec %zmm19, %zmm3, %zmm3
+ cmpl $13, %r8d
+ vmovdqa64 %zmm20, %zmm7
+ jl L_AES_ECB_decrypt_avx512_256_aes_dec_block_last
+ vaesdec %zmm20, %zmm0, %zmm0
+ vaesdec %zmm20, %zmm1, %zmm1
+ vaesdec %zmm20, %zmm2, %zmm2
+ vaesdec %zmm20, %zmm3, %zmm3
+ vaesdec %zmm21, %zmm0, %zmm0
+ vaesdec %zmm21, %zmm1, %zmm1
+ vaesdec %zmm21, %zmm2, %zmm2
+ vaesdec %zmm21, %zmm3, %zmm3
+ vmovdqa64 %zmm22, %zmm7
+L_AES_ECB_decrypt_avx512_256_aes_dec_block_last:
+ vaesdeclast %zmm7, %zmm0, %zmm0
+ vaesdeclast %zmm7, %zmm1, %zmm1
+ vaesdeclast %zmm7, %zmm2, %zmm2
+ vaesdeclast %zmm7, %zmm3, %zmm3
+ vmovdqu64 %zmm0, (%r11)
+ vmovdqu64 %zmm1, 64(%r11)
+ vmovdqu64 %zmm2, 128(%r11)
+ vmovdqu64 %zmm3, 192(%r11)
+ addl $0x100, %eax
+ cmpl %r9d, %eax
+ jl L_AES_ECB_decrypt_avx512_dec_256
+L_AES_ECB_decrypt_avx512_done_256:
+ # r9d = length rounded down to a multiple of 64
+ movl %edx, %r9d
+ andl $0xffffffc0, %r9d
+ cmpl %r9d, %eax
+ je L_AES_ECB_decrypt_avx512_done_64
+L_AES_ECB_decrypt_avx512_dec_64:
+ # 64 bytes of input
+ # aes_ecb_dec_64
+ leaq (%rdi,%rax,1), %r10
+ leaq (%rsi,%rax,1), %r11
+ vmovdqu64 (%r10), %zmm0
+ # aes_dec_block
+ vpxorq %zmm8, %zmm0, %zmm0
+ vaesdec %zmm9, %zmm0, %zmm0
+ vaesdec %zmm10, %zmm0, %zmm0
+ vaesdec %zmm11, %zmm0, %zmm0
+ vaesdec %zmm12, %zmm0, %zmm0
+ vaesdec %zmm13, %zmm0, %zmm0
+ vaesdec %zmm14, %zmm0, %zmm0
+ vaesdec %zmm15, %zmm0, %zmm0
+ vaesdec %zmm16, %zmm0, %zmm0
+ vaesdec %zmm17, %zmm0, %zmm0
+ cmpl $11, %r8d
+ vmovdqa64 %zmm18, %zmm7
+ jl L_AES_ECB_decrypt_avx512_64_aes_dec_block_last
+ vaesdec %zmm18, %zmm0, %zmm0
+ vaesdec %zmm19, %zmm0, %zmm0
+ cmpl $13, %r8d
+ vmovdqa64 %zmm20, %zmm7
+ jl L_AES_ECB_decrypt_avx512_64_aes_dec_block_last
+ vaesdec %zmm20, %zmm0, %zmm0
+ vaesdec %zmm21, %zmm0, %zmm0
+ vmovdqa64 %zmm22, %zmm7
+L_AES_ECB_decrypt_avx512_64_aes_dec_block_last:
+ vaesdeclast %zmm7, %zmm0, %zmm0
+ vmovdqu64 %zmm0, (%r11)
+ addl $0x40, %eax
+ cmpl %r9d, %eax
+ jl L_AES_ECB_decrypt_avx512_dec_64
+L_AES_ECB_decrypt_avx512_done_64:
+ # finished when the offset equals the length; otherwise run the 16-byte loop
+ cmpl %edx, %eax
+ movl %edx, %r9d
+ je L_AES_ECB_decrypt_avx512_done_dec
+ andl $0xfffffff0, %r9d
+L_AES_ECB_decrypt_avx512_dec_16:
+ # 16 bytes of input
+ leaq (%rdi,%rax,1), %r10
+ vmovdqu (%r10), %xmm0
+ # aes_dec_block
+ # single block: round keys are loaded from memory into xmm5/xmm6
+ vpxor (%rcx), %xmm0, %xmm0
+ vmovdqu 16(%rcx), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 32(%rcx), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 48(%rcx), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 64(%rcx), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 80(%rcx), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 96(%rcx), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 112(%rcx), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 128(%rcx), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 144(%rcx), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ cmpl $11, %r8d
+ vmovdqu 160(%rcx), %xmm5
+ jl L_AES_ECB_decrypt_avx512_16_aes_dec_block_last
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 176(%rcx), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ cmpl $13, %r8d
+ vmovdqu 192(%rcx), %xmm5
+ jl L_AES_ECB_decrypt_avx512_16_aes_dec_block_last
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 208(%rcx), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ vmovdqu 224(%rcx), %xmm5
+L_AES_ECB_decrypt_avx512_16_aes_dec_block_last:
+ vaesdeclast %xmm5, %xmm0, %xmm0
+ leaq (%rsi,%rax,1), %r10
+ vmovdqu %xmm0, (%r10)
+ addl $16, %eax
+ cmpl %r9d, %eax
+ jl L_AES_ECB_decrypt_avx512_dec_16
+L_AES_ECB_decrypt_avx512_done_dec:
+ repz retq
+#ifndef __APPLE__
+.size AES_ECB_decrypt_avx512,.-AES_ECB_decrypt_avx512
+#endif /* __APPLE__ */
+/* AES_CBC_encrypt_avx512
+ *
+ * AES-CBC encryption, one 16-byte block per iteration (each block depends on
+ * the previous ciphertext block held in xmm0).
+ *
+ * Register use on entry, as read by the code below:
+ *   %rdi  base of the input
+ *   %rsi  base of the output
+ *   %rdx  16-byte chaining value; read on entry, overwritten on exit with the
+ *         last ciphertext block produced
+ *   %ecx  length in bytes
+ *   %r8   round keys, 16 bytes each, starting with round 0
+ *   %r9d  compared against 11 and 13 to select the last round key at offset
+ *         160, 192 or 224 (presumably the round count 10, 12 or 14)
+ *
+ * NOTE(review): the loop always processes a whole block once the length is
+ * non-zero, so the length is presumably a multiple of 16 - confirm with
+ * callers.
+ */
+#ifndef __APPLE__
+.text
+.globl AES_CBC_encrypt_avx512
+.type AES_CBC_encrypt_avx512,@function
+.align 16
+AES_CBC_encrypt_avx512:
+#else
+.section __TEXT,__text
+.globl _AES_CBC_encrypt_avx512
+.p2align 4
+_AES_CBC_encrypt_avx512:
+#endif /* __APPLE__ */
+ # xmm0 = chaining value; eax = byte offset processed so far
+ vmovdqu (%rdx), %xmm0
+ xorl %eax, %eax
+ # zero length: nothing to do
+ cmpl %ecx, %eax
+ je L_AES_CBC_encrypt_avx512_done
+L_AES_CBC_encrypt_avx512_loop:
+ # 16 bytes of input
+ leaq (%rdi,%rax,1), %r10
+ vmovdqu (%r10), %xmm1
+ # truth table 0x96 is a three-input XOR: xmm1 = input ^ chaining ^ round key 0
+ vpternlogq $0x96, (%r8), %xmm0, %xmm1
+ # aes_enc_block
+ vmovdqu 16(%r8), %xmm3
+ vaesenc %xmm3, %xmm1, %xmm1
+ vmovdqu 32(%r8), %xmm3
+ vaesenc %xmm3, %xmm1, %xmm1
+ vmovdqu 48(%r8), %xmm3
+ vaesenc %xmm3, %xmm1, %xmm1
+ vmovdqu 64(%r8), %xmm3
+ vaesenc %xmm3, %xmm1, %xmm1
+ vmovdqu 80(%r8), %xmm3
+ vaesenc %xmm3, %xmm1, %xmm1
+ vmovdqu 96(%r8), %xmm3
+ vaesenc %xmm3, %xmm1, %xmm1
+ vmovdqu 112(%r8), %xmm3
+ vaesenc %xmm3, %xmm1, %xmm1
+ vmovdqu 128(%r8), %xmm3
+ vaesenc %xmm3, %xmm1, %xmm1
+ vmovdqu 144(%r8), %xmm3
+ vaesenc %xmm3, %xmm1, %xmm1
+ # flags from this compare are consumed by the jl after the key load
+ cmpl $11, %r9d
+ vmovdqu 160(%r8), %xmm3
+ jl L_AES_CBC_encrypt_avx512_aes_enc_block_last
+ vaesenc %xmm3, %xmm1, %xmm1
+ vmovdqu 176(%r8), %xmm4
+ vaesenc %xmm4, %xmm1, %xmm1
+ cmpl $13, %r9d
+ vmovdqu 192(%r8), %xmm3
+ jl L_AES_CBC_encrypt_avx512_aes_enc_block_last
+ vaesenc %xmm3, %xmm1, %xmm1
+ vmovdqu 208(%r8), %xmm4
+ vaesenc %xmm4, %xmm1, %xmm1
+ vmovdqu 224(%r8), %xmm3
+L_AES_CBC_encrypt_avx512_aes_enc_block_last:
+ vaesenclast %xmm3, %xmm1, %xmm1
+ leaq (%rsi,%rax,1), %r11
+ vmovdqu %xmm1, (%r11)
+ # the ciphertext just produced becomes the chaining value for the next block
+ vmovdqa %xmm1, %xmm0
+ addl $16, %eax
+ cmpl %ecx, %eax
+ jl L_AES_CBC_encrypt_avx512_loop
+L_AES_CBC_encrypt_avx512_done:
+ # store the chaining value back through rdx
+ vmovdqu %xmm0, (%rdx)
+ repz retq
+#ifndef __APPLE__
+.size AES_CBC_encrypt_avx512,.-AES_CBC_encrypt_avx512
+#endif /* __APPLE__ */
+/* AES_CBC_decrypt_avx512
+ *
+ * AES-CBC decryption using 512-bit VAES instructions (four AES blocks per
+ * zmm register).
+ *
+ * Register use on entry, as read by the code below:
+ *   %rdi  base of the input (ciphertext)
+ *   %rsi  base of the output
+ *   %rdx  16-byte chaining value; read on entry, overwritten on exit with the
+ *         last ciphertext block consumed
+ *   %ecx  length in bytes
+ *   %r8   round keys, 16 bytes each, applied in table order starting at
+ *         offset 0 (presumably a decryption key schedule - confirm against
+ *         the caller)
+ *   %r9d  compared against 11 and 13 to select the last round key at offset
+ *         160, 192 or 224 (presumably the round count 10, 12 or 14)
+ *
+ * xmm8 carries the previous ciphertext block between iterations.  When at
+ * least 64 bytes are supplied the round keys are broadcast once into zmm14
+ * and up; data is then handled in 256-byte, 64-byte and 16-byte steps.  In
+ * each step every ciphertext block needed for chaining is loaded before that
+ * step stores its output.
+ * NOTE(review): the 16-byte loop always processes a whole block once any
+ * bytes remain, so the length is presumably a multiple of 16 - confirm with
+ * callers.
+ * NOTE(review): the length comparisons are signed (jl) - confirm lengths stay
+ * below 2^31.
+ */
+#ifndef __APPLE__
+.text
+.globl AES_CBC_decrypt_avx512
+.type AES_CBC_decrypt_avx512,@function
+.align 16
+AES_CBC_decrypt_avx512:
+#else
+.section __TEXT,__text
+.globl _AES_CBC_decrypt_avx512
+.p2align 4
+_AES_CBC_decrypt_avx512:
+#endif /* __APPLE__ */
+ # r12 is used as the output pointer in the wide loops, so save it
+ pushq %r12
+ # xmm8 = chaining value; eax = byte offset processed so far
+ vmovdqu (%rdx), %xmm8
+ xorl %eax, %eax
+ # fewer than 64 bytes: skip the key broadcast and go to the 16-byte loop
+ cmpl $0x40, %ecx
+ jl L_AES_CBC_decrypt_avx512_done_64
+ # zmm14..zmm24 = round keys 0..10, each copied to all four 128-bit lanes
+ vbroadcasti32x4 (%r8), %zmm14
+ vbroadcasti32x4 16(%r8), %zmm15
+ vbroadcasti32x4 32(%r8), %zmm16
+ vbroadcasti32x4 48(%r8), %zmm17
+ vbroadcasti32x4 64(%r8), %zmm18
+ vbroadcasti32x4 80(%r8), %zmm19
+ vbroadcasti32x4 96(%r8), %zmm20
+ vbroadcasti32x4 112(%r8), %zmm21
+ vbroadcasti32x4 128(%r8), %zmm22
+ vbroadcasti32x4 144(%r8), %zmm23
+ vbroadcasti32x4 160(%r8), %zmm24
+ # round keys 11..14 are only loaded when r9d asks for them
+ cmpl $11, %r9d
+ jl L_AES_CBC_decrypt_avx512_key_cached
+ vbroadcasti32x4 176(%r8), %zmm25
+ vbroadcasti32x4 192(%r8), %zmm26
+ cmpl $13, %r9d
+ jl L_AES_CBC_decrypt_avx512_key_cached
+ vbroadcasti32x4 208(%r8), %zmm27
+ vbroadcasti32x4 224(%r8), %zmm28
+L_AES_CBC_decrypt_avx512_key_cached:
+ cmpl $0x100, %ecx
+ movl %ecx, %r10d
+ jl L_AES_CBC_decrypt_avx512_done_256
+ # r10d = length rounded down to a multiple of 256
+ andl $0xffffff00, %r10d
+L_AES_CBC_decrypt_avx512_dec_256:
+ # 256 bytes of input
+ # aes_cbc_dec_256
+ # r11 = current input position, r12 = current output position
+ leaq (%rdi,%rax,1), %r11
+ leaq (%rsi,%rax,1), %r12
+ vmovdqu64 (%r11), %zmm0
+ vmovdqu64 64(%r11), %zmm1
+ vmovdqu64 128(%r11), %zmm2
+ vmovdqu64 192(%r11), %zmm3
+ # zmm10 = {previous block (xmm8), block 0, block 1, block 2}: lanes of zmm0
+ # moved up by one, then lane 0 replaced by the carried chaining value
+ vshufi64x2 $0x90, %zmm0, %zmm0, %zmm10
+ vinserti32x4 $0x00, %xmm8, %zmm10, %zmm10
+ # zmm11..zmm13 = input read 16 bytes earlier than zmm1..zmm3, i.e. the
+ # preceding ciphertext block for each lane
+ vmovdqu64 48(%r11), %zmm11
+ vmovdqu64 112(%r11), %zmm12
+ vmovdqu64 176(%r11), %zmm13
+ # xmm8 = last ciphertext block of this group, carried to the next iteration
+ vextracti32x4 $3, %zmm3, %xmm8
+ # aes_dec_block
+ vpxorq %zmm14, %zmm0, %zmm0
+ vpxorq %zmm14, %zmm1, %zmm1
+ vpxorq %zmm14, %zmm2, %zmm2
+ vpxorq %zmm14, %zmm3, %zmm3
+ vaesdec %zmm15, %zmm0, %zmm0
+ vaesdec %zmm15, %zmm1, %zmm1
+ vaesdec %zmm15, %zmm2, %zmm2
+ vaesdec %zmm15, %zmm3, %zmm3
+ vaesdec %zmm16, %zmm0, %zmm0
+ vaesdec %zmm16, %zmm1, %zmm1
+ vaesdec %zmm16, %zmm2, %zmm2
+ vaesdec %zmm16, %zmm3, %zmm3
+ vaesdec %zmm17, %zmm0, %zmm0
+ vaesdec %zmm17, %zmm1, %zmm1
+ vaesdec %zmm17, %zmm2, %zmm2
+ vaesdec %zmm17, %zmm3, %zmm3
+ vaesdec %zmm18, %zmm0, %zmm0
+ vaesdec %zmm18, %zmm1, %zmm1
+ vaesdec %zmm18, %zmm2, %zmm2
+ vaesdec %zmm18, %zmm3, %zmm3
+ vaesdec %zmm19, %zmm0, %zmm0
+ vaesdec %zmm19, %zmm1, %zmm1
+ vaesdec %zmm19, %zmm2, %zmm2
+ vaesdec %zmm19, %zmm3, %zmm3
+ vaesdec %zmm20, %zmm0, %zmm0
+ vaesdec %zmm20, %zmm1, %zmm1
+ vaesdec %zmm20, %zmm2, %zmm2
+ vaesdec %zmm20, %zmm3, %zmm3
+ vaesdec %zmm21, %zmm0, %zmm0
+ vaesdec %zmm21, %zmm1, %zmm1
+ vaesdec %zmm21, %zmm2, %zmm2
+ vaesdec %zmm21, %zmm3, %zmm3
+ vaesdec %zmm22, %zmm0, %zmm0
+ vaesdec %zmm22, %zmm1, %zmm1
+ vaesdec %zmm22, %zmm2, %zmm2
+ vaesdec %zmm22, %zmm3, %zmm3
+ vaesdec %zmm23, %zmm0, %zmm0
+ vaesdec %zmm23, %zmm1, %zmm1
+ vaesdec %zmm23, %zmm2, %zmm2
+ vaesdec %zmm23, %zmm3, %zmm3
+ # zmm9 receives the candidate final round key; the move leaves the flags
+ # from the compare intact for the jl that follows
+ cmpl $11, %r9d
+ vmovdqa64 %zmm24, %zmm9
+ jl L_AES_CBC_decrypt_avx512_256_aes_dec_block_last
+ vaesdec %zmm24, %zmm0, %zmm0
+ vaesdec %zmm24, %zmm1, %zmm1
+ vaesdec %zmm24, %zmm2, %zmm2
+ vaesdec %zmm24, %zmm3, %zmm3
+ vaesdec %zmm25, %zmm0, %zmm0
+ vaesdec %zmm25, %zmm1, %zmm1
+ vaesdec %zmm25, %zmm2, %zmm2
+ vaesdec %zmm25, %zmm3, %zmm3
+ cmpl $13, %r9d
+ vmovdqa64 %zmm26, %zmm9
+ jl L_AES_CBC_decrypt_avx512_256_aes_dec_block_last
+ vaesdec %zmm26, %zmm0, %zmm0
+ vaesdec %zmm26, %zmm1, %zmm1
+ vaesdec %zmm26, %zmm2, %zmm2
+ vaesdec %zmm26, %zmm3, %zmm3
+ vaesdec %zmm27, %zmm0, %zmm0
+ vaesdec %zmm27, %zmm1, %zmm1
+ vaesdec %zmm27, %zmm2, %zmm2
+ vaesdec %zmm27, %zmm3, %zmm3
+ vmovdqa64 %zmm28, %zmm9
+L_AES_CBC_decrypt_avx512_256_aes_dec_block_last:
+ vaesdeclast %zmm9, %zmm0, %zmm0
+ vaesdeclast %zmm9, %zmm1, %zmm1
+ vaesdeclast %zmm9, %zmm2, %zmm2
+ vaesdeclast %zmm9, %zmm3, %zmm3
+ # XOR each decrypted block with the ciphertext block that preceded it
+ vpxorq %zmm10, %zmm0, %zmm0
+ vpxorq %zmm11, %zmm1, %zmm1
+ vpxorq %zmm12, %zmm2, %zmm2
+ vpxorq %zmm13, %zmm3, %zmm3
+ vmovdqu64 %zmm0, (%r12)
+ vmovdqu64 %zmm1, 64(%r12)
+ vmovdqu64 %zmm2, 128(%r12)
+ vmovdqu64 %zmm3, 192(%r12)
+ addl $0x100, %eax
+ cmpl %r10d, %eax
+ jl L_AES_CBC_decrypt_avx512_dec_256
+L_AES_CBC_decrypt_avx512_done_256:
+ # r10d = length rounded down to a multiple of 64
+ movl %ecx, %r10d
+ andl $0xffffffc0, %r10d
+ cmpl %r10d, %eax
+ je L_AES_CBC_decrypt_avx512_done_64
+L_AES_CBC_decrypt_avx512_dec_64:
+ # 64 bytes of input
+ # aes_cbc_dec_64
+ leaq (%rdi,%rax,1), %r11
+ leaq (%rsi,%rax,1), %r12
+ vmovdqu64 (%r11), %zmm0
+ # zmm10 = {previous block (xmm8), block 0, block 1, block 2};
+ # xmm8 = block 3, carried to the next iteration
+ vshufi64x2 $0x90, %zmm0, %zmm0, %zmm10
+ vinserti32x4 $0x00, %xmm8, %zmm10, %zmm10
+ vextracti32x4 $3, %zmm0, %xmm8
+ # aes_dec_block
+ vpxorq %zmm14, %zmm0, %zmm0
+ vaesdec %zmm15, %zmm0, %zmm0
+ vaesdec %zmm16, %zmm0, %zmm0
+ vaesdec %zmm17, %zmm0, %zmm0
+ vaesdec %zmm18, %zmm0, %zmm0
+ vaesdec %zmm19, %zmm0, %zmm0
+ vaesdec %zmm20, %zmm0, %zmm0
+ vaesdec %zmm21, %zmm0, %zmm0
+ vaesdec %zmm22, %zmm0, %zmm0
+ vaesdec %zmm23, %zmm0, %zmm0
+ cmpl $11, %r9d
+ vmovdqa64 %zmm24, %zmm9
+ jl L_AES_CBC_decrypt_avx512_64_aes_dec_block_last
+ vaesdec %zmm24, %zmm0, %zmm0
+ vaesdec %zmm25, %zmm0, %zmm0
+ cmpl $13, %r9d
+ vmovdqa64 %zmm26, %zmm9
+ jl L_AES_CBC_decrypt_avx512_64_aes_dec_block_last
+ vaesdec %zmm26, %zmm0, %zmm0
+ vaesdec %zmm27, %zmm0, %zmm0
+ vmovdqa64 %zmm28, %zmm9
+L_AES_CBC_decrypt_avx512_64_aes_dec_block_last:
+ vaesdeclast %zmm9, %zmm0, %zmm0
+ vpxorq %zmm10, %zmm0, %zmm0
+ vmovdqu64 %zmm0, (%r12)
+ addl $0x40, %eax
+ cmpl %r10d, %eax
+ jl L_AES_CBC_decrypt_avx512_dec_64
+L_AES_CBC_decrypt_avx512_done_64:
+ # finished when the offset equals the length; otherwise run the 16-byte loop
+ cmpl %ecx, %eax
+ movl %ecx, %r10d
+ je L_AES_CBC_decrypt_avx512_done_dec
+ andl $0xfffffff0, %r10d
+L_AES_CBC_decrypt_avx512_dec_16:
+ # 16 bytes of input
+ leaq (%rdi,%rax,1), %r11
+ vmovdqu (%r11), %xmm0
+ # keep a copy of the ciphertext block; it becomes the next chaining value
+ vmovdqa %xmm0, %xmm7
+ # aes_dec_block
+ # single block: round keys are loaded from memory into xmm5/xmm6
+ vpxor (%r8), %xmm0, %xmm0
+ vmovdqu 16(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 32(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 48(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 64(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 80(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 96(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 112(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 128(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 144(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ cmpl $11, %r9d
+ vmovdqu 160(%r8), %xmm5
+ jl L_AES_CBC_decrypt_avx512_16_aes_dec_block_last
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 176(%r8), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ cmpl $13, %r9d
+ vmovdqu 192(%r8), %xmm5
+ jl L_AES_CBC_decrypt_avx512_16_aes_dec_block_last
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 208(%r8), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ vmovdqu 224(%r8), %xmm5
+L_AES_CBC_decrypt_avx512_16_aes_dec_block_last:
+ vaesdeclast %xmm5, %xmm0, %xmm0
+ # XOR with the previous ciphertext block, then advance the chaining value
+ vpxor %xmm8, %xmm0, %xmm0
+ vmovdqa %xmm7, %xmm8
+ leaq (%rsi,%rax,1), %r11
+ vmovdqu %xmm0, (%r11)
+ addl $16, %eax
+ cmpl %r10d, %eax
+ jl L_AES_CBC_decrypt_avx512_dec_16
+L_AES_CBC_decrypt_avx512_done_dec:
+ # store the chaining value back through rdx
+ vmovdqu %xmm8, (%rdx)
+ popq %r12
+ repz retq
+#ifndef __APPLE__
+.size AES_CBC_decrypt_avx512,.-AES_CBC_decrypt_avx512
+#endif /* __APPLE__ */
+/* Byte-shuffle control used by AES_CTR_encrypt_avx512.
+ *
+ * Stored little-endian, the sixteen index bytes run 15, 14, ..., 0, so a
+ * vpshufb with this mask reverses the byte order of a 128-bit lane.  The
+ * function broadcasts it to every lane of zmm8.
+ */
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_aes_ctr_bswap_avx512:
+.quad 0x08090a0b0c0d0e0f,0x0001020304050607
+/* Counter increment table used by AES_CTR_encrypt_avx512.
+ *
+ * Seventeen 128-bit entries; entry k (at byte offset 16 * k) holds k in its
+ * low quadword and zero in its high quadword.  The function reads groups of
+ * four adjacent entries as 512-bit operands at offsets 0, 64, 128 and 192,
+ * and broadcasts the single entries at offsets 16, 64 and 256.
+ */
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 32
+#else
+.p2align 5
+#endif /* __APPLE__ */
+L_aes_ctr_inc_avx512:
+.quad 0x0000000000000000,0x0000000000000000
+.quad 0x0000000000000001,0x0000000000000000
+.quad 0x0000000000000002,0x0000000000000000
+.quad 0x0000000000000003,0x0000000000000000
+.quad 0x0000000000000004,0x0000000000000000
+.quad 0x0000000000000005,0x0000000000000000
+.quad 0x0000000000000006,0x0000000000000000
+.quad 0x0000000000000007,0x0000000000000000
+.quad 0x0000000000000008,0x0000000000000000
+.quad 0x0000000000000009,0x0000000000000000
+.quad 0x000000000000000a,0x0000000000000000
+.quad 0x000000000000000b,0x0000000000000000
+.quad 0x000000000000000c,0x0000000000000000
+.quad 0x000000000000000d,0x0000000000000000
+.quad 0x000000000000000e,0x0000000000000000
+.quad 0x000000000000000f,0x0000000000000000
+.quad 0x0000000000000010,0x0000000000000000
+/*
+ * AES_CTR_encrypt_avx512
+ *
+ * XORs the input with AES-encrypted counter blocks and writes the result to
+ * the output. Data is consumed 256 bytes per iteration, then 64 bytes, then
+ * 16 bytes.
+ *
+ * Registers on entry, as used by the code below:
+ *   %rdi  input buffer
+ *   %rsi  output buffer
+ *   %edx  length in bytes
+ *   %rcx  round keys, 16 bytes each
+ *   %r8d  round count (compared against 11 and 13 to pick the last round key)
+ *   %r9   16-byte counter block; read on entry, next counter stored on exit
+ *
+ * Internal register use:
+ *   %eax        running byte offset into input and output
+ *   %r10d       loop bound for the current block size
+ *   %zmm7       counter, byte-reversed so that vpaddq can increment it
+ *   %zmm8       byte-reversal shuffle mask
+ *   %zmm10-12   per-lane increments of 16, 4 and 1
+ *   %zmm14-28   round keys broadcast to all lanes (only loaded when the
+ *               length is at least 64 bytes)
+ *
+ * NOTE(review): the 16-byte loop tests its bound after the body, so it
+ * processes a full block whenever any bytes remain. The length is presumably
+ * always a multiple of 16 - confirm against the callers.
+ */
+#ifndef __APPLE__
+.text
+.globl AES_CTR_encrypt_avx512
+.type AES_CTR_encrypt_avx512,@function
+.align 16
+AES_CTR_encrypt_avx512:
+#else
+.section __TEXT,__text
+.globl _AES_CTR_encrypt_avx512
+.p2align 4
+_AES_CTR_encrypt_avx512:
+#endif /* __APPLE__ */
+ pushq %rbx
+ # Load the shuffle mask and the counter, then byte-reverse the counter
+ vbroadcasti32x4 L_aes_ctr_bswap_avx512(%rip), %zmm8
+ vbroadcasti32x4 (%r9), %zmm7
+ vpshufb %zmm8, %zmm7, %zmm7
+ # Increments: table entry 16 (+16), entry 4 (+4), entry 1 (+1)
+ vbroadcasti32x4 256+L_aes_ctr_inc_avx512(%rip), %zmm10
+ vbroadcasti32x4 64+L_aes_ctr_inc_avx512(%rip), %zmm11
+ vbroadcasti32x4 16+L_aes_ctr_inc_avx512(%rip), %zmm12
+ xorl %eax, %eax
+ cmpl $0x40, %edx
+ jl L_AES_CTR_encrypt_avx512_done_64
+ # Cache the round keys in registers for the 256 and 64 byte loops
+ vbroadcasti32x4 (%rcx), %zmm14
+ vbroadcasti32x4 16(%rcx), %zmm15
+ vbroadcasti32x4 32(%rcx), %zmm16
+ vbroadcasti32x4 48(%rcx), %zmm17
+ vbroadcasti32x4 64(%rcx), %zmm18
+ vbroadcasti32x4 80(%rcx), %zmm19
+ vbroadcasti32x4 96(%rcx), %zmm20
+ vbroadcasti32x4 112(%rcx), %zmm21
+ vbroadcasti32x4 128(%rcx), %zmm22
+ vbroadcasti32x4 144(%rcx), %zmm23
+ vbroadcasti32x4 160(%rcx), %zmm24
+ cmpl $11, %r8d
+ jl L_AES_CTR_encrypt_avx512_key_cached
+ vbroadcasti32x4 176(%rcx), %zmm25
+ vbroadcasti32x4 192(%rcx), %zmm26
+ cmpl $13, %r8d
+ jl L_AES_CTR_encrypt_avx512_key_cached
+ vbroadcasti32x4 208(%rcx), %zmm27
+ vbroadcasti32x4 224(%rcx), %zmm28
+L_AES_CTR_encrypt_avx512_key_cached:
+ cmpl $0x100, %edx
+ movl %edx, %r10d
+ jl L_AES_CTR_encrypt_avx512_done_256
+ andl $0xffffff00, %r10d
+ # Build sixteen consecutive counters: zmm4 = ctr+0..3, zmm5 = ctr+4..7,
+ # zmm6 = ctr+8..11, zmm7 = ctr+12..15.
+ # Each add is a 128-bit add made from 64-bit lanes: vpternlogq 0xb2
+ # computes the per-bit carry-out from (old value, sum, increment), bit 63
+ # is the carry out of the quadword, and vpsrlq/vpslldq move that carry
+ # from the low quadword into the high quadword of the same 128-bit lane.
+ vmovdqa64 %zmm7, %zmm9
+ vpaddq 0+L_aes_ctr_inc_avx512(%rip), %zmm7, %zmm4
+ vpternlogq $0xb2, 0+L_aes_ctr_inc_avx512(%rip), %zmm4, %zmm9
+ vpsrlq $63, %zmm9, %zmm9
+ vpslldq $8, %zmm9, %zmm9
+ vpaddq %zmm9, %zmm4, %zmm4
+ vmovdqa64 %zmm7, %zmm9
+ vpaddq 64+L_aes_ctr_inc_avx512(%rip), %zmm7, %zmm5
+ vpternlogq $0xb2, 64+L_aes_ctr_inc_avx512(%rip), %zmm5, %zmm9
+ vpsrlq $63, %zmm9, %zmm9
+ vpslldq $8, %zmm9, %zmm9
+ vpaddq %zmm9, %zmm5, %zmm5
+ vmovdqa64 %zmm7, %zmm9
+ vpaddq 128+L_aes_ctr_inc_avx512(%rip), %zmm7, %zmm6
+ vpternlogq $0xb2, 128+L_aes_ctr_inc_avx512(%rip), %zmm6, %zmm9
+ vpsrlq $63, %zmm9, %zmm9
+ vpslldq $8, %zmm9, %zmm9
+ vpaddq %zmm9, %zmm6, %zmm6
+ vmovdqa64 %zmm7, %zmm9
+ vpaddq 192+L_aes_ctr_inc_avx512(%rip), %zmm7, %zmm7
+ vpternlogq $0xb2, 192+L_aes_ctr_inc_avx512(%rip), %zmm7, %zmm9
+ vpsrlq $63, %zmm9, %zmm9
+ vpslldq $8, %zmm9, %zmm9
+ vpaddq %zmm9, %zmm7, %zmm7
+L_AES_CTR_encrypt_avx512_enc_256:
+ # 256 bytes of input
+ leaq (%rdi,%rax,1), %r11
+ leaq (%rsi,%rax,1), %rbx
+ # Byte-reverse the counters back into block order for encryption
+ vpshufb %zmm8, %zmm4, %zmm0
+ vpshufb %zmm8, %zmm5, %zmm1
+ vpshufb %zmm8, %zmm6, %zmm2
+ vpshufb %zmm8, %zmm7, %zmm3
+ # Advance all sixteen counters by 16 for the next iteration
+ vmovdqa64 %zmm4, %zmm9
+ vpaddq %zmm10, %zmm4, %zmm4
+ vpternlogq $0xb2, %zmm10, %zmm4, %zmm9
+ vpsrlq $63, %zmm9, %zmm9
+ vpslldq $8, %zmm9, %zmm9
+ vpaddq %zmm9, %zmm4, %zmm4
+ vmovdqa64 %zmm5, %zmm9
+ vpaddq %zmm10, %zmm5, %zmm5
+ vpternlogq $0xb2, %zmm10, %zmm5, %zmm9
+ vpsrlq $63, %zmm9, %zmm9
+ vpslldq $8, %zmm9, %zmm9
+ vpaddq %zmm9, %zmm5, %zmm5
+ vmovdqa64 %zmm6, %zmm9
+ vpaddq %zmm10, %zmm6, %zmm6
+ vpternlogq $0xb2, %zmm10, %zmm6, %zmm9
+ vpsrlq $63, %zmm9, %zmm9
+ vpslldq $8, %zmm9, %zmm9
+ vpaddq %zmm9, %zmm6, %zmm6
+ vmovdqa64 %zmm7, %zmm9
+ vpaddq %zmm10, %zmm7, %zmm7
+ vpternlogq $0xb2, %zmm10, %zmm7, %zmm9
+ vpsrlq $63, %zmm9, %zmm9
+ vpslldq $8, %zmm9, %zmm9
+ vpaddq %zmm9, %zmm7, %zmm7
+ # aes_enc_block
+ vpxorq %zmm14, %zmm0, %zmm0
+ vpxorq %zmm14, %zmm1, %zmm1
+ vpxorq %zmm14, %zmm2, %zmm2
+ vpxorq %zmm14, %zmm3, %zmm3
+ vaesenc %zmm15, %zmm0, %zmm0
+ vaesenc %zmm15, %zmm1, %zmm1
+ vaesenc %zmm15, %zmm2, %zmm2
+ vaesenc %zmm15, %zmm3, %zmm3
+ vaesenc %zmm16, %zmm0, %zmm0
+ vaesenc %zmm16, %zmm1, %zmm1
+ vaesenc %zmm16, %zmm2, %zmm2
+ vaesenc %zmm16, %zmm3, %zmm3
+ vaesenc %zmm17, %zmm0, %zmm0
+ vaesenc %zmm17, %zmm1, %zmm1
+ vaesenc %zmm17, %zmm2, %zmm2
+ vaesenc %zmm17, %zmm3, %zmm3
+ vaesenc %zmm18, %zmm0, %zmm0
+ vaesenc %zmm18, %zmm1, %zmm1
+ vaesenc %zmm18, %zmm2, %zmm2
+ vaesenc %zmm18, %zmm3, %zmm3
+ vaesenc %zmm19, %zmm0, %zmm0
+ vaesenc %zmm19, %zmm1, %zmm1
+ vaesenc %zmm19, %zmm2, %zmm2
+ vaesenc %zmm19, %zmm3, %zmm3
+ vaesenc %zmm20, %zmm0, %zmm0
+ vaesenc %zmm20, %zmm1, %zmm1
+ vaesenc %zmm20, %zmm2, %zmm2
+ vaesenc %zmm20, %zmm3, %zmm3
+ vaesenc %zmm21, %zmm0, %zmm0
+ vaesenc %zmm21, %zmm1, %zmm1
+ vaesenc %zmm21, %zmm2, %zmm2
+ vaesenc %zmm21, %zmm3, %zmm3
+ vaesenc %zmm22, %zmm0, %zmm0
+ vaesenc %zmm22, %zmm1, %zmm1
+ vaesenc %zmm22, %zmm2, %zmm2
+ vaesenc %zmm22, %zmm3, %zmm3
+ vaesenc %zmm23, %zmm0, %zmm0
+ vaesenc %zmm23, %zmm1, %zmm1
+ vaesenc %zmm23, %zmm2, %zmm2
+ vaesenc %zmm23, %zmm3, %zmm3
+ # zmm13 receives whichever round key is the last one for this round count
+ cmpl $11, %r8d
+ vmovdqa64 %zmm24, %zmm13
+ jl L_AES_CTR_encrypt_avx512_256_aes_enc_block_last
+ vaesenc %zmm24, %zmm0, %zmm0
+ vaesenc %zmm24, %zmm1, %zmm1
+ vaesenc %zmm24, %zmm2, %zmm2
+ vaesenc %zmm24, %zmm3, %zmm3
+ vaesenc %zmm25, %zmm0, %zmm0
+ vaesenc %zmm25, %zmm1, %zmm1
+ vaesenc %zmm25, %zmm2, %zmm2
+ vaesenc %zmm25, %zmm3, %zmm3
+ cmpl $13, %r8d
+ vmovdqa64 %zmm26, %zmm13
+ jl L_AES_CTR_encrypt_avx512_256_aes_enc_block_last
+ vaesenc %zmm26, %zmm0, %zmm0
+ vaesenc %zmm26, %zmm1, %zmm1
+ vaesenc %zmm26, %zmm2, %zmm2
+ vaesenc %zmm26, %zmm3, %zmm3
+ vaesenc %zmm27, %zmm0, %zmm0
+ vaesenc %zmm27, %zmm1, %zmm1
+ vaesenc %zmm27, %zmm2, %zmm2
+ vaesenc %zmm27, %zmm3, %zmm3
+ vmovdqa64 %zmm28, %zmm13
+L_AES_CTR_encrypt_avx512_256_aes_enc_block_last:
+ vaesenclast %zmm13, %zmm0, %zmm0
+ vaesenclast %zmm13, %zmm1, %zmm1
+ vaesenclast %zmm13, %zmm2, %zmm2
+ vaesenclast %zmm13, %zmm3, %zmm3
+ # XOR the encrypted counters with the input and store the result
+ vpxorq (%r11), %zmm0, %zmm0
+ vpxorq 64(%r11), %zmm1, %zmm1
+ vpxorq 128(%r11), %zmm2, %zmm2
+ vpxorq 192(%r11), %zmm3, %zmm3
+ vmovdqu64 %zmm0, (%rbx)
+ vmovdqu64 %zmm1, 64(%rbx)
+ vmovdqu64 %zmm2, 128(%rbx)
+ vmovdqu64 %zmm3, 192(%rbx)
+ addl $0x100, %eax
+ cmpl %r10d, %eax
+ jl L_AES_CTR_encrypt_avx512_enc_256
+ # Lane 0 of zmm4 is the next unused counter; broadcast it into zmm7
+ vshufi64x2 $0x00, %zmm4, %zmm4, %zmm7
+L_AES_CTR_encrypt_avx512_done_256:
+ movl %edx, %r10d
+ andl $0xffffffc0, %r10d
+ cmpl %r10d, %eax
+ je L_AES_CTR_encrypt_avx512_done_64
+L_AES_CTR_encrypt_avx512_enc_64:
+ # 64 bytes of input
+ # aes_ctr_enc_64
+ leaq (%rdi,%rax,1), %r11
+ leaq (%rsi,%rax,1), %rbx
+ # zmm0 = ctr+0..3 in block byte order; zmm7 then advances by 4
+ vpaddq 0+L_aes_ctr_inc_avx512(%rip), %zmm7, %zmm0
+ vmovdqa64 %zmm7, %zmm9
+ vpternlogq $0xb2, 0+L_aes_ctr_inc_avx512(%rip), %zmm0, %zmm9
+ vpsrlq $63, %zmm9, %zmm9
+ vpslldq $8, %zmm9, %zmm9
+ vpaddq %zmm9, %zmm0, %zmm0
+ vpshufb %zmm8, %zmm0, %zmm0
+ vmovdqa64 %zmm7, %zmm9
+ vpaddq %zmm11, %zmm7, %zmm7
+ vpternlogq $0xb2, %zmm11, %zmm7, %zmm9
+ vpsrlq $63, %zmm9, %zmm9
+ vpslldq $8, %zmm9, %zmm9
+ vpaddq %zmm9, %zmm7, %zmm7
+ # aes_enc_block
+ vpxorq %zmm14, %zmm0, %zmm0
+ vaesenc %zmm15, %zmm0, %zmm0
+ vaesenc %zmm16, %zmm0, %zmm0
+ vaesenc %zmm17, %zmm0, %zmm0
+ vaesenc %zmm18, %zmm0, %zmm0
+ vaesenc %zmm19, %zmm0, %zmm0
+ vaesenc %zmm20, %zmm0, %zmm0
+ vaesenc %zmm21, %zmm0, %zmm0
+ vaesenc %zmm22, %zmm0, %zmm0
+ vaesenc %zmm23, %zmm0, %zmm0
+ cmpl $11, %r8d
+ vmovdqa64 %zmm24, %zmm13
+ jl L_AES_CTR_encrypt_avx512_64_aes_enc_block_last
+ vaesenc %zmm24, %zmm0, %zmm0
+ vaesenc %zmm25, %zmm0, %zmm0
+ cmpl $13, %r8d
+ vmovdqa64 %zmm26, %zmm13
+ jl L_AES_CTR_encrypt_avx512_64_aes_enc_block_last
+ vaesenc %zmm26, %zmm0, %zmm0
+ vaesenc %zmm27, %zmm0, %zmm0
+ vmovdqa64 %zmm28, %zmm13
+L_AES_CTR_encrypt_avx512_64_aes_enc_block_last:
+ vaesenclast %zmm13, %zmm0, %zmm0
+ vpxorq (%r11), %zmm0, %zmm0
+ vmovdqu64 %zmm0, (%rbx)
+ addl $0x40, %eax
+ cmpl %r10d, %eax
+ jl L_AES_CTR_encrypt_avx512_enc_64
+L_AES_CTR_encrypt_avx512_done_64:
+ cmpl %edx, %eax
+ movl %edx, %r10d
+ je L_AES_CTR_encrypt_avx512_done_enc
+ andl $0xfffffff0, %r10d
+L_AES_CTR_encrypt_avx512_enc_16:
+ # 16 bytes of input
+ # xmm0 = current counter in block byte order; zmm7 then advances by 1
+ vpshufb %xmm8, %xmm7, %xmm0
+ vmovdqa64 %zmm7, %zmm9
+ vpaddq %zmm12, %zmm7, %zmm7
+ vpternlogq $0xb2, %zmm12, %zmm7, %zmm9
+ vpsrlq $63, %zmm9, %zmm9
+ vpslldq $8, %zmm9, %zmm9
+ vpaddq %zmm9, %zmm7, %zmm7
+ # aes_enc_block
+ # Round keys are read from memory here; the cached copies are not
+ # loaded when the total length is below 64 bytes.
+ vpxor (%rcx), %xmm0, %xmm0
+ vmovdqu 16(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 32(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 48(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 64(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 80(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 96(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 112(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 128(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 144(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ cmpl $11, %r8d
+ vmovdqu 160(%rcx), %xmm5
+ jl L_AES_CTR_encrypt_avx512_16_aes_enc_block_last
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 176(%rcx), %xmm6
+ vaesenc %xmm6, %xmm0, %xmm0
+ cmpl $13, %r8d
+ vmovdqu 192(%rcx), %xmm5
+ jl L_AES_CTR_encrypt_avx512_16_aes_enc_block_last
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 208(%rcx), %xmm6
+ vaesenc %xmm6, %xmm0, %xmm0
+ vmovdqu 224(%rcx), %xmm5
+L_AES_CTR_encrypt_avx512_16_aes_enc_block_last:
+ vaesenclast %xmm5, %xmm0, %xmm0
+ leaq (%rdi,%rax,1), %r11
+ vpxor (%r11), %xmm0, %xmm0
+ leaq (%rsi,%rax,1), %r11
+ vmovdqu %xmm0, (%r11)
+ addl $16, %eax
+ cmpl %r10d, %eax
+ jl L_AES_CTR_encrypt_avx512_enc_16
+L_AES_CTR_encrypt_avx512_done_enc:
+ # Store the next counter back in block byte order
+ vpshufb %xmm8, %xmm7, %xmm0
+ vmovdqu %xmm0, (%r9)
+ popq %rbx
+ repz retq
+#ifndef __APPLE__
+.size AES_CTR_encrypt_avx512,.-AES_CTR_encrypt_avx512
+#endif /* __APPLE__ */
+#endif /* HAVE_INTEL_AVX512 */
+#endif /* WOLFSSL_X86_64_BUILD */
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/wolfcrypt/src/aes_x86_64_asm.asm b/wolfcrypt/src/aes_x86_64_asm.asm
new file mode 100644
index 00000000000..26ccbb5ee8e
--- /dev/null
+++ b/wolfcrypt/src/aes_x86_64_asm.asm
@@ -0,0 +1,4283 @@
+; /* aes_x86_64_asm.asm */
+; /*
+; * Copyright (C) 2006-2026 wolfSSL Inc.
+; *
+; * This file is part of wolfSSL.
+; *
+; * wolfSSL is free software; you can redistribute it and/or modify
+; * it under the terms of the GNU General Public License as published by
+; * the Free Software Foundation; either version 3 of the License, or
+; * (at your option) any later version.
+; *
+; * wolfSSL is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; * GNU General Public License for more details.
+; *
+; * You should have received a copy of the GNU General Public License
+; * along with this program; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+; */
+
+; Assembler feature selection. Each symbol is only defined here when the
+; build has not already defined it.
+IF @Version LT 1200
+; AVX2 instructions not recognized by old versions of MASM
+IFNDEF NO_AVX2_SUPPORT
+NO_AVX2_SUPPORT = 1
+ENDIF
+; MOVBE instruction not recognized by old versions of MASM
+IFNDEF NO_MOVBE_SUPPORT
+NO_MOVBE_SUPPORT = 1
+ENDIF
+ENDIF
+
+; AVX1 is enabled by default; AVX2 is enabled unless NO_AVX2_SUPPORT is set
+; (by the build or by the version check above).
+IFNDEF HAVE_INTEL_AVX1
+HAVE_INTEL_AVX1 = 1
+ENDIF
+IFNDEF NO_AVX2_SUPPORT
+HAVE_INTEL_AVX2 = 1
+ENDIF
+
+IFNDEF _WIN64
+_WIN64 = 1
+ENDIF
+
+; AES_128_Key_Expansion_AESNI
+;
+; Expands a 16-byte key into eleven 16-byte round keys.
+;   rcx = key to expand (16 bytes read)
+;   rdx = destination; round key k is written at offset 16*k, k = 0..10
+;
+; Each step: aeskeygenassist + pshufd 255 broadcasts the transformed last
+; word of the previous round key, and the three pslldq/pxor pairs XOR each
+; word of the previous key with all lower words, giving the next round key.
+; Uses xmm0-xmm2 only.
+_TEXT SEGMENT READONLY PARA
+AES_128_Key_Expansion_AESNI PROC
+ movdqu xmm0, OWORD PTR [rcx]
+ movdqu OWORD PTR [rdx], xmm0
+ ; round key 1, round constant 1
+ aeskeygenassist xmm1, xmm0, 1
+ pshufd xmm1, xmm1, 255
+ movdqa xmm2, xmm0
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pxor xmm0, xmm1
+ movdqu OWORD PTR [rdx+16], xmm0
+ ; round key 2, round constant 2
+ aeskeygenassist xmm1, xmm0, 2
+ pshufd xmm1, xmm1, 255
+ movdqa xmm2, xmm0
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pxor xmm0, xmm1
+ movdqu OWORD PTR [rdx+32], xmm0
+ ; round key 3, round constant 4
+ aeskeygenassist xmm1, xmm0, 4
+ pshufd xmm1, xmm1, 255
+ movdqa xmm2, xmm0
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pxor xmm0, xmm1
+ movdqu OWORD PTR [rdx+48], xmm0
+ ; round key 4, round constant 8
+ aeskeygenassist xmm1, xmm0, 8
+ pshufd xmm1, xmm1, 255
+ movdqa xmm2, xmm0
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pxor xmm0, xmm1
+ movdqu OWORD PTR [rdx+64], xmm0
+ ; round key 5, round constant 16
+ aeskeygenassist xmm1, xmm0, 16
+ pshufd xmm1, xmm1, 255
+ movdqa xmm2, xmm0
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pxor xmm0, xmm1
+ movdqu OWORD PTR [rdx+80], xmm0
+ ; round key 6, round constant 32
+ aeskeygenassist xmm1, xmm0, 32
+ pshufd xmm1, xmm1, 255
+ movdqa xmm2, xmm0
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pxor xmm0, xmm1
+ movdqu OWORD PTR [rdx+96], xmm0
+ ; round key 7, round constant 64
+ aeskeygenassist xmm1, xmm0, 64
+ pshufd xmm1, xmm1, 255
+ movdqa xmm2, xmm0
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pxor xmm0, xmm1
+ movdqu OWORD PTR [rdx+112], xmm0
+ ; round key 8, round constant 128
+ aeskeygenassist xmm1, xmm0, 128
+ pshufd xmm1, xmm1, 255
+ movdqa xmm2, xmm0
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pxor xmm0, xmm1
+ movdqu OWORD PTR [rdx+128], xmm0
+ ; round key 9, round constant 27 (0x1b)
+ aeskeygenassist xmm1, xmm0, 27
+ pshufd xmm1, xmm1, 255
+ movdqa xmm2, xmm0
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pxor xmm0, xmm1
+ movdqu OWORD PTR [rdx+144], xmm0
+ ; round key 10, round constant 54 (0x36)
+ aeskeygenassist xmm1, xmm0, 54
+ pshufd xmm1, xmm1, 255
+ movdqa xmm2, xmm0
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pxor xmm0, xmm1
+ movdqu OWORD PTR [rdx+160], xmm0
+ ret
+AES_128_Key_Expansion_AESNI ENDP
+_TEXT ENDS
+; AES_192_Key_Expansion_AESNI
+;
+; Expands a 24-byte key into the AES-192 key schedule.
+;   rcx = key to expand (24 bytes read)
+;   rdx = destination; 16-byte stores are made at offsets 0..208
+;
+; xmm0 holds four schedule words and the low half of xmm1 holds the next
+; two. Every step produces six new words (four in xmm0, two in xmm1), so
+; steps alternate between storing two round keys assembled with shufpd and
+; storing one round key directly from xmm0.
+; The high half of xmm1 is scratch; the final store at offset 208 writes a
+; full 16 bytes, so the destination must have room up to offset 224.
+; Uses xmm0-xmm5.
+_TEXT SEGMENT READONLY PARA
+AES_192_Key_Expansion_AESNI PROC
+ movdqu xmm0, OWORD PTR [rcx]
+ pxor xmm1, xmm1
+ pinsrq xmm1, QWORD PTR [rcx+16], 0
+ movdqu OWORD PTR [rdx], xmm0
+ ; round constant 1: stores at offsets 16 and 32
+ movdqa xmm4, xmm1
+ aeskeygenassist xmm2, xmm1, 1
+ ; selector 85 broadcasts dword 1, derived from the last key word in xmm1
+ pshufd xmm2, xmm2, 85
+ movdqa xmm3, xmm0
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pxor xmm0, xmm2
+ ; the two words in xmm1 continue the chain from the top word of xmm0
+ pshufd xmm2, xmm0, 255
+ movdqa xmm3, xmm1
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+ ; previous two words followed by the low half of xmm0
+ shufpd xmm4, xmm0, 0
+ movdqu OWORD PTR [rdx+16], xmm4
+ ; high half of xmm0 followed by the two new words in xmm1
+ movdqa xmm5, xmm0
+ shufpd xmm5, xmm1, 1
+ movdqu OWORD PTR [rdx+32], xmm5
+ ; round constant 2: store at offset 48
+ aeskeygenassist xmm2, xmm1, 2
+ pshufd xmm2, xmm2, 85
+ movdqa xmm3, xmm0
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pxor xmm0, xmm2
+ pshufd xmm2, xmm0, 255
+ movdqa xmm3, xmm1
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+ movdqu OWORD PTR [rdx+48], xmm0
+ ; round constant 4: stores at offsets 64 and 80
+ movdqa xmm4, xmm1
+ aeskeygenassist xmm2, xmm1, 4
+ pshufd xmm2, xmm2, 85
+ movdqa xmm3, xmm0
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pxor xmm0, xmm2
+ pshufd xmm2, xmm0, 255
+ movdqa xmm3, xmm1
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+ shufpd xmm4, xmm0, 0
+ movdqu OWORD PTR [rdx+64], xmm4
+ movdqa xmm5, xmm0
+ shufpd xmm5, xmm1, 1
+ movdqu OWORD PTR [rdx+80], xmm5
+ ; round constant 8: store at offset 96
+ aeskeygenassist xmm2, xmm1, 8
+ pshufd xmm2, xmm2, 85
+ movdqa xmm3, xmm0
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pxor xmm0, xmm2
+ pshufd xmm2, xmm0, 255
+ movdqa xmm3, xmm1
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+ movdqu OWORD PTR [rdx+96], xmm0
+ ; round constant 16: stores at offsets 112 and 128
+ movdqa xmm4, xmm1
+ aeskeygenassist xmm2, xmm1, 16
+ pshufd xmm2, xmm2, 85
+ movdqa xmm3, xmm0
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pxor xmm0, xmm2
+ pshufd xmm2, xmm0, 255
+ movdqa xmm3, xmm1
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+ shufpd xmm4, xmm0, 0
+ movdqu OWORD PTR [rdx+112], xmm4
+ movdqa xmm5, xmm0
+ shufpd xmm5, xmm1, 1
+ movdqu OWORD PTR [rdx+128], xmm5
+ ; round constant 32: store at offset 144
+ aeskeygenassist xmm2, xmm1, 32
+ pshufd xmm2, xmm2, 85
+ movdqa xmm3, xmm0
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pxor xmm0, xmm2
+ pshufd xmm2, xmm0, 255
+ movdqa xmm3, xmm1
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+ movdqu OWORD PTR [rdx+144], xmm0
+ ; round constant 64: stores at offsets 160 and 176
+ movdqa xmm4, xmm1
+ aeskeygenassist xmm2, xmm1, 64
+ pshufd xmm2, xmm2, 85
+ movdqa xmm3, xmm0
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pxor xmm0, xmm2
+ pshufd xmm2, xmm0, 255
+ movdqa xmm3, xmm1
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+ shufpd xmm4, xmm0, 0
+ movdqu OWORD PTR [rdx+160], xmm4
+ movdqa xmm5, xmm0
+ shufpd xmm5, xmm1, 1
+ movdqu OWORD PTR [rdx+176], xmm5
+ ; round constant 128: stores at offsets 192 and 208
+ aeskeygenassist xmm2, xmm1, 128
+ pshufd xmm2, xmm2, 85
+ movdqa xmm3, xmm0
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pxor xmm0, xmm2
+ pshufd xmm2, xmm0, 255
+ movdqa xmm3, xmm1
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+ movdqu OWORD PTR [rdx+192], xmm0
+ movdqu OWORD PTR [rdx+208], xmm1
+ ret
+AES_192_Key_Expansion_AESNI ENDP
+_TEXT ENDS
+; AES_256_Key_Expansion_AESNI
+;
+; Expands a 32-byte key into fifteen 16-byte round keys.
+;   rcx = key to expand (32 bytes read)
+;   rdx = destination; round key k is written at offset 16*k, k = 0..14
+;
+; xmm0 and xmm1 hold the two most recent round keys. Even round keys are
+; derived from xmm0 using aeskeygenassist on xmm1 with a round constant and
+; selector 255; odd round keys are derived from xmm1 using aeskeygenassist
+; on xmm0 with constant 0 and selector 170 (substitution without rotation).
+; Uses xmm0-xmm3.
+_TEXT SEGMENT READONLY PARA
+AES_256_Key_Expansion_AESNI PROC
+ movdqu xmm0, OWORD PTR [rcx]
+ movdqu xmm1, OWORD PTR [rcx+16]
+ movdqu OWORD PTR [rdx], xmm0
+ movdqu OWORD PTR [rdx+16], xmm1
+ ; round keys 2 and 3, round constant 1
+ aeskeygenassist xmm2, xmm1, 1
+ pshufd xmm2, xmm2, 255
+ movdqa xmm3, xmm0
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pxor xmm0, xmm2
+ movdqu OWORD PTR [rdx+32], xmm0
+ aeskeygenassist xmm2, xmm0, 0
+ pshufd xmm2, xmm2, 170
+ movdqa xmm3, xmm1
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+ movdqu OWORD PTR [rdx+48], xmm1
+ ; round keys 4 and 5, round constant 2
+ aeskeygenassist xmm2, xmm1, 2
+ pshufd xmm2, xmm2, 255
+ movdqa xmm3, xmm0
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pxor xmm0, xmm2
+ movdqu OWORD PTR [rdx+64], xmm0
+ aeskeygenassist xmm2, xmm0, 0
+ pshufd xmm2, xmm2, 170
+ movdqa xmm3, xmm1
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+ movdqu OWORD PTR [rdx+80], xmm1
+ ; round keys 6 and 7, round constant 4
+ aeskeygenassist xmm2, xmm1, 4
+ pshufd xmm2, xmm2, 255
+ movdqa xmm3, xmm0
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pxor xmm0, xmm2
+ movdqu OWORD PTR [rdx+96], xmm0
+ aeskeygenassist xmm2, xmm0, 0
+ pshufd xmm2, xmm2, 170
+ movdqa xmm3, xmm1
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+ movdqu OWORD PTR [rdx+112], xmm1
+ ; round keys 8 and 9, round constant 8
+ aeskeygenassist xmm2, xmm1, 8
+ pshufd xmm2, xmm2, 255
+ movdqa xmm3, xmm0
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pxor xmm0, xmm2
+ movdqu OWORD PTR [rdx+128], xmm0
+ aeskeygenassist xmm2, xmm0, 0
+ pshufd xmm2, xmm2, 170
+ movdqa xmm3, xmm1
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+ movdqu OWORD PTR [rdx+144], xmm1
+ ; round keys 10 and 11, round constant 16
+ aeskeygenassist xmm2, xmm1, 16
+ pshufd xmm2, xmm2, 255
+ movdqa xmm3, xmm0
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pxor xmm0, xmm2
+ movdqu OWORD PTR [rdx+160], xmm0
+ aeskeygenassist xmm2, xmm0, 0
+ pshufd xmm2, xmm2, 170
+ movdqa xmm3, xmm1
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+ movdqu OWORD PTR [rdx+176], xmm1
+ ; round keys 12 and 13, round constant 32
+ aeskeygenassist xmm2, xmm1, 32
+ pshufd xmm2, xmm2, 255
+ movdqa xmm3, xmm0
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pxor xmm0, xmm2
+ movdqu OWORD PTR [rdx+192], xmm0
+ aeskeygenassist xmm2, xmm0, 0
+ pshufd xmm2, xmm2, 170
+ movdqa xmm3, xmm1
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+ movdqu OWORD PTR [rdx+208], xmm1
+ ; round key 14, round constant 64
+ aeskeygenassist xmm2, xmm1, 64
+ pshufd xmm2, xmm2, 255
+ movdqa xmm3, xmm0
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pxor xmm0, xmm2
+ movdqu OWORD PTR [rdx+224], xmm0
+ ret
+AES_256_Key_Expansion_AESNI ENDP
+_TEXT ENDS
+; AES_ECB_encrypt_AESNI
+;
+; Encrypts each 16-byte block of the input independently with AES-NI,
+; four blocks per iteration while at least 64 bytes remain, then one block
+; per iteration.
+;
+; Arguments (Windows x64 calling convention):
+;   rcx      = input buffer
+;   rdx      = output buffer
+;   r8d      = length in bytes
+;   r9       = round keys, 16 bytes each
+;   [rsp+40] = number of rounds (fifth argument, passed on the stack)
+;
+; Register use:
+;   eax  = byte offset into input and output
+;   r10d = number of rounds
+;   r11d = loop bound for the current block size
+; The round count and the round-key pointer are needed on every iteration,
+; so the offset and the loop bound must not share registers with them.
+; xmm6 is callee-saved in this ABI and is preserved on the stack.
+;
+; NOTE(review): the 16-byte loop tests its bound after the body, so the
+; length is presumably always a multiple of 16 - confirm against callers.
+_TEXT SEGMENT READONLY PARA
+AES_ECB_encrypt_AESNI PROC
+ ; fetch the round count before rsp moves
+ mov r10d, DWORD PTR [rsp+40]
+ sub rsp, 16
+ movdqu OWORD PTR [rsp], xmm6
+ xor eax, eax
+ cmp r8d, 64
+ mov r11d, r8d
+ jl L_AES_ECB_encrypt_AESNI_done_64
+ ; r11d = length rounded down to a multiple of 64
+ and r11d, 4294967232
+L_AES_ECB_encrypt_AESNI_enc_64:
+ ; 64 bytes of input
+ ; aes_ecb_enc_64
+ movdqu xmm0, OWORD PTR [rcx+rax]
+ movdqu xmm1, OWORD PTR [rcx+rax+16]
+ movdqu xmm2, OWORD PTR [rcx+rax+32]
+ movdqu xmm3, OWORD PTR [rcx+rax+48]
+ ; aes_enc_block
+ movdqu xmm4, OWORD PTR [r9]
+ pxor xmm0, xmm4
+ pxor xmm1, xmm4
+ pxor xmm2, xmm4
+ pxor xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+16]
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+32]
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+48]
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+64]
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+80]
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+96]
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+112]
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+128]
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+144]
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ ; fewer than 11 rounds: the key at offset 160 is the last round key
+ cmp r10d, 11
+ movdqu xmm4, OWORD PTR [r9+160]
+ jl L_AES_ECB_encrypt_AESNI_64_aes_enc_block_last
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+176]
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ ; fewer than 13 rounds: the key at offset 192 is the last round key
+ cmp r10d, 13
+ movdqu xmm4, OWORD PTR [r9+192]
+ jl L_AES_ECB_encrypt_AESNI_64_aes_enc_block_last
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+208]
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+224]
+L_AES_ECB_encrypt_AESNI_64_aes_enc_block_last:
+ aesenclast xmm0, xmm4
+ aesenclast xmm1, xmm4
+ aesenclast xmm2, xmm4
+ aesenclast xmm3, xmm4
+ movdqu OWORD PTR [rdx+rax], xmm0
+ movdqu OWORD PTR [rdx+rax+16], xmm1
+ movdqu OWORD PTR [rdx+rax+32], xmm2
+ movdqu OWORD PTR [rdx+rax+48], xmm3
+ add eax, 64
+ cmp eax, r11d
+ jl L_AES_ECB_encrypt_AESNI_enc_64
+L_AES_ECB_encrypt_AESNI_done_64:
+ cmp eax, r8d
+ mov r11d, r8d
+ je L_AES_ECB_encrypt_AESNI_done_enc
+ ; r11d = length rounded down to a multiple of 16
+ and r11d, 4294967280
+L_AES_ECB_encrypt_AESNI_enc_16:
+ ; 16 bytes of input
+ movdqu xmm0, OWORD PTR [rcx+rax]
+ ; aes_enc_block
+ ; unaligned load: a memory operand on pxor would require 16-byte alignment
+ movdqu xmm5, OWORD PTR [r9]
+ pxor xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+16]
+ aesenc xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+32]
+ aesenc xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+48]
+ aesenc xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+64]
+ aesenc xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+80]
+ aesenc xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+96]
+ aesenc xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+112]
+ aesenc xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+128]
+ aesenc xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+144]
+ aesenc xmm0, xmm5
+ cmp r10d, 11
+ movdqu xmm5, OWORD PTR [r9+160]
+ jl L_AES_ECB_encrypt_AESNI_16_aes_enc_block_last
+ aesenc xmm0, xmm5
+ movdqu xmm6, OWORD PTR [r9+176]
+ aesenc xmm0, xmm6
+ cmp r10d, 13
+ movdqu xmm5, OWORD PTR [r9+192]
+ jl L_AES_ECB_encrypt_AESNI_16_aes_enc_block_last
+ aesenc xmm0, xmm5
+ movdqu xmm6, OWORD PTR [r9+208]
+ aesenc xmm0, xmm6
+ movdqu xmm5, OWORD PTR [r9+224]
+L_AES_ECB_encrypt_AESNI_16_aes_enc_block_last:
+ aesenclast xmm0, xmm5
+ movdqu OWORD PTR [rdx+rax], xmm0
+ add eax, 16
+ cmp eax, r11d
+ jl L_AES_ECB_encrypt_AESNI_enc_16
+L_AES_ECB_encrypt_AESNI_done_enc:
+ movdqu xmm6, OWORD PTR [rsp]
+ add rsp, 16
+ ret
+AES_ECB_encrypt_AESNI ENDP
+_TEXT ENDS
+; AES_ECB_decrypt_AESNI
+;
+; Decrypts each 16-byte block of the input independently with AES-NI,
+; four blocks per iteration while at least 64 bytes remain, then one block
+; per iteration.
+;
+; Arguments (Windows x64 calling convention):
+;   rcx      = input buffer
+;   rdx      = output buffer
+;   r8d      = length in bytes
+;   r9       = round keys, 16 bytes each, applied in ascending offset order
+;   [rsp+40] = number of rounds (fifth argument, passed on the stack)
+;
+; Register use:
+;   eax  = byte offset into input and output
+;   r10d = number of rounds
+;   r11d = loop bound for the current block size
+; The round count and the round-key pointer are needed on every iteration,
+; so the offset and the loop bound must not share registers with them.
+; xmm6 is callee-saved in this ABI and is preserved on the stack.
+;
+; NOTE(review): the round keys are fed straight to aesdec, so the schedule
+; is presumably already in decryption form - confirm against the caller.
+; NOTE(review): the 16-byte loop tests its bound after the body, so the
+; length is presumably always a multiple of 16 - confirm against callers.
+_TEXT SEGMENT READONLY PARA
+AES_ECB_decrypt_AESNI PROC
+ ; fetch the round count before rsp moves
+ mov r10d, DWORD PTR [rsp+40]
+ sub rsp, 16
+ movdqu OWORD PTR [rsp], xmm6
+ xor eax, eax
+ cmp r8d, 64
+ mov r11d, r8d
+ jl L_AES_ECB_decrypt_AESNI_done_64
+ ; r11d = length rounded down to a multiple of 64
+ and r11d, 4294967232
+L_AES_ECB_decrypt_AESNI_dec_64:
+ ; 64 bytes of input
+ ; aes_ecb_dec_64
+ movdqu xmm0, OWORD PTR [rcx+rax]
+ movdqu xmm1, OWORD PTR [rcx+rax+16]
+ movdqu xmm2, OWORD PTR [rcx+rax+32]
+ movdqu xmm3, OWORD PTR [rcx+rax+48]
+ ; aes_dec_block
+ movdqu xmm4, OWORD PTR [r9]
+ pxor xmm0, xmm4
+ pxor xmm1, xmm4
+ pxor xmm2, xmm4
+ pxor xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+16]
+ aesdec xmm0, xmm4
+ aesdec xmm1, xmm4
+ aesdec xmm2, xmm4
+ aesdec xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+32]
+ aesdec xmm0, xmm4
+ aesdec xmm1, xmm4
+ aesdec xmm2, xmm4
+ aesdec xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+48]
+ aesdec xmm0, xmm4
+ aesdec xmm1, xmm4
+ aesdec xmm2, xmm4
+ aesdec xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+64]
+ aesdec xmm0, xmm4
+ aesdec xmm1, xmm4
+ aesdec xmm2, xmm4
+ aesdec xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+80]
+ aesdec xmm0, xmm4
+ aesdec xmm1, xmm4
+ aesdec xmm2, xmm4
+ aesdec xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+96]
+ aesdec xmm0, xmm4
+ aesdec xmm1, xmm4
+ aesdec xmm2, xmm4
+ aesdec xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+112]
+ aesdec xmm0, xmm4
+ aesdec xmm1, xmm4
+ aesdec xmm2, xmm4
+ aesdec xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+128]
+ aesdec xmm0, xmm4
+ aesdec xmm1, xmm4
+ aesdec xmm2, xmm4
+ aesdec xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+144]
+ aesdec xmm0, xmm4
+ aesdec xmm1, xmm4
+ aesdec xmm2, xmm4
+ aesdec xmm3, xmm4
+ ; fewer than 11 rounds: the key at offset 160 is the last round key
+ cmp r10d, 11
+ movdqu xmm4, OWORD PTR [r9+160]
+ jl L_AES_ECB_decrypt_AESNI_64_aes_dec_block_last
+ aesdec xmm0, xmm4
+ aesdec xmm1, xmm4
+ aesdec xmm2, xmm4
+ aesdec xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+176]
+ aesdec xmm0, xmm4
+ aesdec xmm1, xmm4
+ aesdec xmm2, xmm4
+ aesdec xmm3, xmm4
+ ; fewer than 13 rounds: the key at offset 192 is the last round key
+ cmp r10d, 13
+ movdqu xmm4, OWORD PTR [r9+192]
+ jl L_AES_ECB_decrypt_AESNI_64_aes_dec_block_last
+ aesdec xmm0, xmm4
+ aesdec xmm1, xmm4
+ aesdec xmm2, xmm4
+ aesdec xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+208]
+ aesdec xmm0, xmm4
+ aesdec xmm1, xmm4
+ aesdec xmm2, xmm4
+ aesdec xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+224]
+L_AES_ECB_decrypt_AESNI_64_aes_dec_block_last:
+ aesdeclast xmm0, xmm4
+ aesdeclast xmm1, xmm4
+ aesdeclast xmm2, xmm4
+ aesdeclast xmm3, xmm4
+ movdqu OWORD PTR [rdx+rax], xmm0
+ movdqu OWORD PTR [rdx+rax+16], xmm1
+ movdqu OWORD PTR [rdx+rax+32], xmm2
+ movdqu OWORD PTR [rdx+rax+48], xmm3
+ add eax, 64
+ cmp eax, r11d
+ jl L_AES_ECB_decrypt_AESNI_dec_64
+L_AES_ECB_decrypt_AESNI_done_64:
+ cmp eax, r8d
+ mov r11d, r8d
+ je L_AES_ECB_decrypt_AESNI_done_dec
+ ; r11d = length rounded down to a multiple of 16
+ and r11d, 4294967280
+L_AES_ECB_decrypt_AESNI_dec_16:
+ ; 16 bytes of input
+ movdqu xmm0, OWORD PTR [rcx+rax]
+ ; aes_dec_block
+ ; unaligned load: a memory operand on pxor would require 16-byte alignment
+ movdqu xmm5, OWORD PTR [r9]
+ pxor xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+16]
+ aesdec xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+32]
+ aesdec xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+48]
+ aesdec xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+64]
+ aesdec xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+80]
+ aesdec xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+96]
+ aesdec xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+112]
+ aesdec xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+128]
+ aesdec xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+144]
+ aesdec xmm0, xmm5
+ cmp r10d, 11
+ movdqu xmm5, OWORD PTR [r9+160]
+ jl L_AES_ECB_decrypt_AESNI_16_aes_dec_block_last
+ aesdec xmm0, xmm5
+ movdqu xmm6, OWORD PTR [r9+176]
+ aesdec xmm0, xmm6
+ cmp r10d, 13
+ movdqu xmm5, OWORD PTR [r9+192]
+ jl L_AES_ECB_decrypt_AESNI_16_aes_dec_block_last
+ aesdec xmm0, xmm5
+ movdqu xmm6, OWORD PTR [r9+208]
+ aesdec xmm0, xmm6
+ movdqu xmm5, OWORD PTR [r9+224]
+L_AES_ECB_decrypt_AESNI_16_aes_dec_block_last:
+ aesdeclast xmm0, xmm5
+ movdqu OWORD PTR [rdx+rax], xmm0
+ add eax, 16
+ cmp eax, r11d
+ jl L_AES_ECB_decrypt_AESNI_dec_16
+L_AES_ECB_decrypt_AESNI_done_dec:
+ movdqu xmm6, OWORD PTR [rsp]
+ add rsp, 16
+ ret
+AES_ECB_decrypt_AESNI ENDP
+_TEXT ENDS
+; AES_CBC_encrypt_AESNI
+;
+; CBC encryption with AES-NI, one 16-byte block per iteration: each input
+; block is XORed with the previous output block (initially the IV) before
+; being encrypted.
+;
+; Arguments (Windows x64 calling convention):
+;   rcx      = input buffer
+;   rdx      = output buffer
+;   r8       = 16-byte IV; read on entry, last output block stored on exit
+;   r9d      = length in bytes
+;   [rsp+40] = round keys, 16 bytes each (fifth argument)
+;   [rsp+48] = number of rounds (sixth argument)
+;
+; Register use:
+;   eax  = byte offset into input and output
+;   r10  = round-key pointer
+;   r11d = number of rounds
+; The round-key pointer and round count are needed on every iteration, so
+; the offset must not share a register with either of them.
+; Only volatile registers (rax, r10, r11, xmm0-xmm4) are modified.
+_TEXT SEGMENT READONLY PARA
+AES_CBC_encrypt_AESNI PROC
+ mov r10, QWORD PTR [rsp+40]
+ mov r11d, DWORD PTR [rsp+48]
+ movdqu xmm0, OWORD PTR [r8]
+ xor eax, eax
+ cmp eax, r9d
+ je L_AES_CBC_encrypt_AESNI_done
+L_AES_CBC_encrypt_AESNI_loop:
+ ; 16 bytes of input
+ movdqu xmm1, OWORD PTR [rcx+rax]
+ ; chain with the previous output block
+ pxor xmm1, xmm0
+ ; aes_enc_block
+ ; unaligned load: a memory operand on pxor would require 16-byte alignment
+ movdqu xmm3, OWORD PTR [r10]
+ pxor xmm1, xmm3
+ movdqu xmm3, OWORD PTR [r10+16]
+ aesenc xmm1, xmm3
+ movdqu xmm3, OWORD PTR [r10+32]
+ aesenc xmm1, xmm3
+ movdqu xmm3, OWORD PTR [r10+48]
+ aesenc xmm1, xmm3
+ movdqu xmm3, OWORD PTR [r10+64]
+ aesenc xmm1, xmm3
+ movdqu xmm3, OWORD PTR [r10+80]
+ aesenc xmm1, xmm3
+ movdqu xmm3, OWORD PTR [r10+96]
+ aesenc xmm1, xmm3
+ movdqu xmm3, OWORD PTR [r10+112]
+ aesenc xmm1, xmm3
+ movdqu xmm3, OWORD PTR [r10+128]
+ aesenc xmm1, xmm3
+ movdqu xmm3, OWORD PTR [r10+144]
+ aesenc xmm1, xmm3
+ ; fewer than 11 rounds: the key at offset 160 is the last round key
+ cmp r11d, 11
+ movdqu xmm3, OWORD PTR [r10+160]
+ jl L_AES_CBC_encrypt_AESNI_aes_enc_block_last
+ aesenc xmm1, xmm3
+ movdqu xmm4, OWORD PTR [r10+176]
+ aesenc xmm1, xmm4
+ ; fewer than 13 rounds: the key at offset 192 is the last round key
+ cmp r11d, 13
+ movdqu xmm3, OWORD PTR [r10+192]
+ jl L_AES_CBC_encrypt_AESNI_aes_enc_block_last
+ aesenc xmm1, xmm3
+ movdqu xmm4, OWORD PTR [r10+208]
+ aesenc xmm1, xmm4
+ movdqu xmm3, OWORD PTR [r10+224]
+L_AES_CBC_encrypt_AESNI_aes_enc_block_last:
+ aesenclast xmm1, xmm3
+ movdqu OWORD PTR [rdx+rax], xmm1
+ ; this output block is the chaining value for the next block
+ movdqa xmm0, xmm1
+ add eax, 16
+ cmp eax, r9d
+ jl L_AES_CBC_encrypt_AESNI_loop
+L_AES_CBC_encrypt_AESNI_done:
+ movdqu OWORD PTR [r8], xmm0
+ ret
+AES_CBC_encrypt_AESNI ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_CBC_decrypt_AESNI PROC
+ push r12
+ mov rax, QWORD PTR [rsp+48]
+ mov r10d, DWORD PTR [rsp+56]
+ sub rsp, 48
+ movdqu OWORD PTR [rsp], xmm6
+ movdqu OWORD PTR [rsp+16], xmm7
+ movdqu OWORD PTR [rsp+32], xmm8
+ movdqu xmm4, OWORD PTR [r8]
+ xor eax, eax
+ cmp r9d, 64
+ mov r10d, r9d
+ jl L_AES_CBC_decrypt_AESNI_done_64
+ and r10d, 4294967232
+L_AES_CBC_decrypt_AESNI_dec_64:
+ ; 64 bytes of input
+ ; aes_cbc_dec_64
+ lea r11, QWORD PTR [rcx+rax]
+ lea r12, QWORD PTR [rdx+rax]
+ movdqu xmm0, OWORD PTR [r11]
+ movdqu xmm1, OWORD PTR [r11+16]
+ movdqu xmm2, OWORD PTR [r11+32]
+ movdqu xmm3, OWORD PTR [r11+48]
+ ; aes_dec_block
+ movdqu xmm5, OWORD PTR [rax]
+ pxor xmm0, xmm5
+ pxor xmm1, xmm5
+ pxor xmm2, xmm5
+ pxor xmm3, xmm5
+ movdqu xmm5, OWORD PTR [rax+16]
+ aesdec xmm0, xmm5
+ aesdec xmm1, xmm5
+ aesdec xmm2, xmm5
+ aesdec xmm3, xmm5
+ movdqu xmm5, OWORD PTR [rax+32]
+ aesdec xmm0, xmm5
+ aesdec xmm1, xmm5
+ aesdec xmm2, xmm5
+ aesdec xmm3, xmm5
+ movdqu xmm5, OWORD PTR [rax+48]
+ aesdec xmm0, xmm5
+ aesdec xmm1, xmm5
+ aesdec xmm2, xmm5
+ aesdec xmm3, xmm5
+ movdqu xmm5, OWORD PTR [rax+64]
+ aesdec xmm0, xmm5
+ aesdec xmm1, xmm5
+ aesdec xmm2, xmm5
+ aesdec xmm3, xmm5
+ movdqu xmm5, OWORD PTR [rax+80]
+ aesdec xmm0, xmm5
+ aesdec xmm1, xmm5
+ aesdec xmm2, xmm5
+ aesdec xmm3, xmm5
+ movdqu xmm5, OWORD PTR [rax+96]
+ aesdec xmm0, xmm5
+ aesdec xmm1, xmm5
+ aesdec xmm2, xmm5
+ aesdec xmm3, xmm5
+ movdqu xmm5, OWORD PTR [rax+112]
+ aesdec xmm0, xmm5
+ aesdec xmm1, xmm5
+ aesdec xmm2, xmm5
+ aesdec xmm3, xmm5
+ movdqu xmm5, OWORD PTR [rax+128]
+ aesdec xmm0, xmm5
+ aesdec xmm1, xmm5
+ aesdec xmm2, xmm5
+ aesdec xmm3, xmm5
+ movdqu xmm5, OWORD PTR [rax+144]
+ aesdec xmm0, xmm5
+ aesdec xmm1, xmm5
+ aesdec xmm2, xmm5
+ aesdec xmm3, xmm5
+ cmp r10d, 11
+ movdqu xmm5, OWORD PTR [rax+160]
+ jl L_AES_CBC_decrypt_AESNI_64_aes_dec_block_last
+ aesdec xmm0, xmm5
+ aesdec xmm1, xmm5
+ aesdec xmm2, xmm5
+ aesdec xmm3, xmm5
+ movdqu xmm5, OWORD PTR [rax+176]
+ aesdec xmm0, xmm5
+ aesdec xmm1, xmm5
+ aesdec xmm2, xmm5
+ aesdec xmm3, xmm5
+ cmp r10d, 13
+ movdqu xmm5, OWORD PTR [rax+192]
+ jl L_AES_CBC_decrypt_AESNI_64_aes_dec_block_last
+ aesdec xmm0, xmm5
+ aesdec xmm1, xmm5
+ aesdec xmm2, xmm5
+ aesdec xmm3, xmm5
+ movdqu xmm5, OWORD PTR [rax+208]
+ aesdec xmm0, xmm5
+ aesdec xmm1, xmm5
+ aesdec xmm2, xmm5
+ aesdec xmm3, xmm5
+ movdqu xmm5, OWORD PTR [rax+224]
+L_AES_CBC_decrypt_AESNI_64_aes_dec_block_last:
+ aesdeclast xmm0, xmm5
+ aesdeclast xmm1, xmm5
+ aesdeclast xmm2, xmm5
+ aesdeclast xmm3, xmm5
+ pxor xmm0, xmm4
+ movdqu xmm5, OWORD PTR [r11]
+ pxor xmm1, xmm5
+ movdqu xmm5, OWORD PTR [r11+16]
+ pxor xmm2, xmm5
+ movdqu xmm5, OWORD PTR [r11+32]
+ pxor xmm3, xmm5
+ movdqu xmm4, OWORD PTR [r11+48]
+ movdqu OWORD PTR [r12], xmm0
+ movdqu OWORD PTR [r12+16], xmm1
+ movdqu OWORD PTR [r12+32], xmm2
+ movdqu OWORD PTR [r12+48], xmm3
+ add eax, 64
+ cmp eax, r10d
+ jl L_AES_CBC_decrypt_AESNI_dec_64
+L_AES_CBC_decrypt_AESNI_done_64:
+ cmp eax, r9d
+ mov r10d, r9d
+ je L_AES_CBC_decrypt_AESNI_done_dec
+ and r10d, 4294967280
+L_AES_CBC_decrypt_AESNI_dec_16:
+ ; 16 bytes of input
+ lea r11, QWORD PTR [rcx+rax]
+ movdqu xmm0, OWORD PTR [r11]
+ movdqa xmm8, xmm0
+ ; aes_dec_block
+ pxor xmm0, [rax]
+ movdqu xmm6, OWORD PTR [rax+16]
+ aesdec xmm0, xmm6
+ movdqu xmm6, OWORD PTR [rax+32]
+ aesdec xmm0, xmm6
+ movdqu xmm6, OWORD PTR [rax+48]
+ aesdec xmm0, xmm6
+ movdqu xmm6, OWORD PTR [rax+64]
+ aesdec xmm0, xmm6
+ movdqu xmm6, OWORD PTR [rax+80]
+ aesdec xmm0, xmm6
+ movdqu xmm6, OWORD PTR [rax+96]
+ aesdec xmm0, xmm6
+ movdqu xmm6, OWORD PTR [rax+112]
+ aesdec xmm0, xmm6
+ movdqu xmm6, OWORD PTR [rax+128]
+ aesdec xmm0, xmm6
+ movdqu xmm6, OWORD PTR [rax+144]
+ aesdec xmm0, xmm6
+ cmp r10d, 11
+ movdqu xmm6, OWORD PTR [rax+160]
+ jl L_AES_CBC_decrypt_AESNI_16_aes_dec_block_last
+ aesdec xmm0, xmm6
+ movdqu xmm7, OWORD PTR [rax+176]
+ aesdec xmm0, xmm7
+ cmp r10d, 13
+ movdqu xmm6, OWORD PTR [rax+192]
+ jl L_AES_CBC_decrypt_AESNI_16_aes_dec_block_last
+ aesdec xmm0, xmm6
+ movdqu xmm7, OWORD PTR [rax+208]
+ aesdec xmm0, xmm7
+ movdqu xmm6, OWORD PTR [rax+224]
+L_AES_CBC_decrypt_AESNI_16_aes_dec_block_last:
+ aesdeclast xmm0, xmm6
+ pxor xmm0, xmm4
+ movdqa xmm4, xmm8
+ lea r11, QWORD PTR [rdx+rax]
+ movdqu OWORD PTR [r11], xmm0
+ add eax, 16
+ cmp eax, r10d
+ jl L_AES_CBC_decrypt_AESNI_dec_16
+L_AES_CBC_decrypt_AESNI_done_dec:
+ movdqu OWORD PTR [r8], xmm4
+ movdqu xmm6, OWORD PTR [rsp]
+ movdqu xmm7, OWORD PTR [rsp+16]
+ movdqu xmm8, OWORD PTR [rsp+32]
+ add rsp, 48
+ pop r12
+ ret
+AES_CBC_decrypt_AESNI ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+ ; 16-byte PSHUFB control mask for AES_CTR_encrypt_AESNI. In memory the bytes
+ ; are 0f,0e,...,01,00, i.e. the mask reverses all 16 bytes of a register.
+L_aes_ctr_aesni_bswap QWORD \
+ 08090a0b0c0d0e0fh, 0001020304050607h
+ ; QWORD holding the address of the mask above (not referenced by the code
+ ; visible in this part of the file).
+ptr_L_aes_ctr_aesni_bswap QWORD L_aes_ctr_aesni_bswap
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+ ; 128-bit constant with 1 in the low quadword and 0 in the high quadword;
+ ; AES_CTR_encrypt_AESNI adds it with PADDQ to step the byte-reversed counter.
+L_aes_ctr_aesni_one QWORD \
+ 0000000000000001h, 0000000000000000h
+ ; QWORD holding the address of the constant above (not referenced by the
+ ; code visible in this part of the file).
+ptr_L_aes_ctr_aesni_one QWORD L_aes_ctr_aesni_one
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+ ; AES_CTR_encrypt_AESNI - AES counter-mode keystream XOR using AES-NI.
+ ;
+ ; Windows x64 arguments:
+ ;   rcx        input buffer
+ ;   rdx        output buffer
+ ;   r8d        length in bytes
+ ;   r9         expanded key schedule (16 bytes per round key)
+ ;   [rsp+40]   number of rounds (DWORD)
+ ;   [rsp+48]   16-byte counter block, read on entry and written on exit
+ ;
+ ; Register use inside the routine:
+ ;   eax  byte offset into input/output      ebx  loop limit
+ ;   r10  counter block pointer               r11d number of rounds
+ ;   xmm7 counter (byte-reversed)             xmm8 byte-reverse mask
+ ;   xmm9 constant one                        xmm10 zero
+ ;
+ ; The arguments must stay live for the whole routine, so the offset, limit
+ ; and round count each get their own register and the buffers are addressed
+ ; as [rcx+rax] / [rdx+rax] rather than through extra pointer temporaries.
+ ; As with the other routines here, the 16-byte loop is entered whenever the
+ ; remaining length is non-zero and always processes a whole block.
+AES_CTR_encrypt_AESNI PROC
+ push rbx
+ ; Stack arguments are 8 bytes further away because of the push above.
+ mov r11d, DWORD PTR [rsp+48]
+ mov r10, QWORD PTR [rsp+56]
+ sub rsp, 96
+ ; xmm6-xmm11 are callee-saved in the Windows x64 ABI.
+ movdqu OWORD PTR [rsp], xmm6
+ movdqu OWORD PTR [rsp+16], xmm7
+ movdqu OWORD PTR [rsp+32], xmm8
+ movdqu OWORD PTR [rsp+48], xmm9
+ movdqu OWORD PTR [rsp+64], xmm10
+ movdqu OWORD PTR [rsp+80], xmm11
+ movdqu xmm8, OWORD PTR L_aes_ctr_aesni_bswap
+ movdqu xmm9, OWORD PTR L_aes_ctr_aesni_one
+ pxor xmm10, xmm10
+ ; Load the counter and byte-reverse it so it can be stepped with PADDQ.
+ movdqu xmm7, OWORD PTR [r10]
+ pshufb xmm7, xmm8
+ xor eax, eax
+ cmp r8d, 64
+ mov ebx, r8d
+ jl L_AES_CTR_encrypt_AESNI_done_64
+ ; Limit for the 4-block loop: length rounded down to a multiple of 64.
+ and ebx, 4294967232
+L_AES_CTR_encrypt_AESNI_enc_64:
+ ; 64 bytes of input
+ ; aes_ctr_enc_64
+ ; Build four consecutive counter blocks in xmm0-xmm3. After each low-quadword
+ ; increment, a low quadword of zero means it wrapped, so 1 is added to the
+ ; high quadword.
+ movdqa xmm0, xmm7
+ pshufb xmm0, xmm8
+ paddq xmm7, xmm9
+ movdqa xmm11, xmm7
+ pcmpeqq xmm11, xmm10
+ pslldq xmm11, 8
+ psrlq xmm11, 63
+ paddq xmm7, xmm11
+ movdqa xmm1, xmm7
+ pshufb xmm1, xmm8
+ paddq xmm7, xmm9
+ movdqa xmm11, xmm7
+ pcmpeqq xmm11, xmm10
+ pslldq xmm11, 8
+ psrlq xmm11, 63
+ paddq xmm7, xmm11
+ movdqa xmm2, xmm7
+ pshufb xmm2, xmm8
+ paddq xmm7, xmm9
+ movdqa xmm11, xmm7
+ pcmpeqq xmm11, xmm10
+ pslldq xmm11, 8
+ psrlq xmm11, 63
+ paddq xmm7, xmm11
+ movdqa xmm3, xmm7
+ pshufb xmm3, xmm8
+ paddq xmm7, xmm9
+ movdqa xmm11, xmm7
+ pcmpeqq xmm11, xmm10
+ pslldq xmm11, 8
+ psrlq xmm11, 63
+ paddq xmm7, xmm11
+ ; aes_enc_block
+ movdqu xmm4, OWORD PTR [r9]
+ pxor xmm0, xmm4
+ pxor xmm1, xmm4
+ pxor xmm2, xmm4
+ pxor xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+16]
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+32]
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+48]
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+64]
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+80]
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+96]
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+112]
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+128]
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+144]
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ ; Fewer than 11 rounds: round key 10 is the last one.
+ cmp r11d, 11
+ movdqu xmm4, OWORD PTR [r9+160]
+ jl L_AES_CTR_encrypt_AESNI_64_aes_enc_block_last
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+176]
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ ; Fewer than 13 rounds: round key 12 is the last one.
+ cmp r11d, 13
+ movdqu xmm4, OWORD PTR [r9+192]
+ jl L_AES_CTR_encrypt_AESNI_64_aes_enc_block_last
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+208]
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+224]
+L_AES_CTR_encrypt_AESNI_64_aes_enc_block_last:
+ aesenclast xmm0, xmm4
+ aesenclast xmm1, xmm4
+ aesenclast xmm2, xmm4
+ aesenclast xmm3, xmm4
+ ; XOR the keystream with the input and store to the output.
+ movdqu xmm4, OWORD PTR [rcx+rax]
+ pxor xmm0, xmm4
+ movdqu xmm4, OWORD PTR [rcx+rax+16]
+ pxor xmm1, xmm4
+ movdqu xmm4, OWORD PTR [rcx+rax+32]
+ pxor xmm2, xmm4
+ movdqu xmm4, OWORD PTR [rcx+rax+48]
+ pxor xmm3, xmm4
+ movdqu OWORD PTR [rdx+rax], xmm0
+ movdqu OWORD PTR [rdx+rax+16], xmm1
+ movdqu OWORD PTR [rdx+rax+32], xmm2
+ movdqu OWORD PTR [rdx+rax+48], xmm3
+ add eax, 64
+ cmp eax, ebx
+ jl L_AES_CTR_encrypt_AESNI_enc_64
+L_AES_CTR_encrypt_AESNI_done_64:
+ cmp eax, r8d
+ mov ebx, r8d
+ je L_AES_CTR_encrypt_AESNI_done_enc
+ ; Limit for the single-block loop: length rounded down to a multiple of 16.
+ and ebx, 4294967280
+L_AES_CTR_encrypt_AESNI_enc_16:
+ ; 16 bytes of input
+ movdqa xmm0, xmm7
+ pshufb xmm0, xmm8
+ paddq xmm7, xmm9
+ movdqa xmm11, xmm7
+ pcmpeqq xmm11, xmm10
+ pslldq xmm11, 8
+ psrlq xmm11, 63
+ paddq xmm7, xmm11
+ ; aes_enc_block
+ ; Unaligned load of round key 0, matching the 64-byte path, so the key
+ ; schedule does not have to be 16-byte aligned.
+ movdqu xmm5, OWORD PTR [r9]
+ pxor xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+16]
+ aesenc xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+32]
+ aesenc xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+48]
+ aesenc xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+64]
+ aesenc xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+80]
+ aesenc xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+96]
+ aesenc xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+112]
+ aesenc xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+128]
+ aesenc xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+144]
+ aesenc xmm0, xmm5
+ cmp r11d, 11
+ movdqu xmm5, OWORD PTR [r9+160]
+ jl L_AES_CTR_encrypt_AESNI_16_aes_enc_block_last
+ aesenc xmm0, xmm5
+ movdqu xmm6, OWORD PTR [r9+176]
+ aesenc xmm0, xmm6
+ cmp r11d, 13
+ movdqu xmm5, OWORD PTR [r9+192]
+ jl L_AES_CTR_encrypt_AESNI_16_aes_enc_block_last
+ aesenc xmm0, xmm5
+ movdqu xmm6, OWORD PTR [r9+208]
+ aesenc xmm0, xmm6
+ movdqu xmm5, OWORD PTR [r9+224]
+L_AES_CTR_encrypt_AESNI_16_aes_enc_block_last:
+ aesenclast xmm0, xmm5
+ movdqu xmm4, OWORD PTR [rcx+rax]
+ pxor xmm0, xmm4
+ movdqu OWORD PTR [rdx+rax], xmm0
+ add eax, 16
+ cmp eax, ebx
+ jl L_AES_CTR_encrypt_AESNI_enc_16
+L_AES_CTR_encrypt_AESNI_done_enc:
+ ; Restore the counter's original byte order and write it back.
+ pshufb xmm7, xmm8
+ movdqu OWORD PTR [r10], xmm7
+ movdqu xmm6, OWORD PTR [rsp]
+ movdqu xmm7, OWORD PTR [rsp+16]
+ movdqu xmm8, OWORD PTR [rsp+32]
+ movdqu xmm9, OWORD PTR [rsp+48]
+ movdqu xmm10, OWORD PTR [rsp+64]
+ movdqu xmm11, OWORD PTR [rsp+80]
+ add rsp, 96
+ pop rbx
+ ret
+AES_CTR_encrypt_AESNI ENDP
+_TEXT ENDS
+IFDEF HAVE_INTEL_AVX1
+_TEXT SEGMENT READONLY PARA
+ ; AES_ECB_encrypt_avx1 - AES ECB encryption using VEX-encoded AES-NI.
+ ;
+ ; Windows x64 arguments:
+ ;   rcx        input buffer
+ ;   rdx        output buffer
+ ;   r8d        length in bytes
+ ;   r9         expanded key schedule (16 bytes per round key)
+ ;   [rsp+40]   number of rounds (DWORD)
+ ;
+ ; Register use inside the routine:
+ ;   eax  byte offset into input/output      r10d number of rounds
+ ;   r11d loop limit
+ ;
+ ; r9 and the round count must stay live for the whole routine, so the offset
+ ; and limit use separate registers and the buffers are addressed as
+ ; [rcx+rax] / [rdx+rax] instead of through pointer temporaries.
+ ; The 16-byte loop is entered whenever the remaining length is non-zero and
+ ; always processes a whole block.
+AES_ECB_encrypt_avx1 PROC
+ mov r10d, DWORD PTR [rsp+40]
+ sub rsp, 16
+ ; xmm6 is callee-saved in the Windows x64 ABI.
+ vmovdqu OWORD PTR [rsp], xmm6
+ xor eax, eax
+ cmp r8d, 64
+ mov r11d, r8d
+ jl L_AES_ECB_encrypt_avx1_done_64
+ ; Limit for the 4-block loop: length rounded down to a multiple of 64.
+ and r11d, 4294967232
+L_AES_ECB_encrypt_avx1_enc_64:
+ ; 64 bytes of input
+ ; aes_ecb_enc_64
+ vmovdqu xmm0, OWORD PTR [rcx+rax]
+ vmovdqu xmm1, OWORD PTR [rcx+rax+16]
+ vmovdqu xmm2, OWORD PTR [rcx+rax+32]
+ vmovdqu xmm3, OWORD PTR [rcx+rax+48]
+ ; aes_enc_block
+ vmovdqu xmm4, OWORD PTR [r9]
+ vpxor xmm0, xmm0, xmm4
+ vpxor xmm1, xmm1, xmm4
+ vpxor xmm2, xmm2, xmm4
+ vpxor xmm3, xmm3, xmm4
+ vmovdqu xmm4, OWORD PTR [r9+16]
+ vaesenc xmm0, xmm0, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vaesenc xmm2, xmm2, xmm4
+ vaesenc xmm3, xmm3, xmm4
+ vmovdqu xmm4, OWORD PTR [r9+32]
+ vaesenc xmm0, xmm0, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vaesenc xmm2, xmm2, xmm4
+ vaesenc xmm3, xmm3, xmm4
+ vmovdqu xmm4, OWORD PTR [r9+48]
+ vaesenc xmm0, xmm0, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vaesenc xmm2, xmm2, xmm4
+ vaesenc xmm3, xmm3, xmm4
+ vmovdqu xmm4, OWORD PTR [r9+64]
+ vaesenc xmm0, xmm0, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vaesenc xmm2, xmm2, xmm4
+ vaesenc xmm3, xmm3, xmm4
+ vmovdqu xmm4, OWORD PTR [r9+80]
+ vaesenc xmm0, xmm0, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vaesenc xmm2, xmm2, xmm4
+ vaesenc xmm3, xmm3, xmm4
+ vmovdqu xmm4, OWORD PTR [r9+96]
+ vaesenc xmm0, xmm0, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vaesenc xmm2, xmm2, xmm4
+ vaesenc xmm3, xmm3, xmm4
+ vmovdqu xmm4, OWORD PTR [r9+112]
+ vaesenc xmm0, xmm0, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vaesenc xmm2, xmm2, xmm4
+ vaesenc xmm3, xmm3, xmm4
+ vmovdqu xmm4, OWORD PTR [r9+128]
+ vaesenc xmm0, xmm0, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vaesenc xmm2, xmm2, xmm4
+ vaesenc xmm3, xmm3, xmm4
+ vmovdqu xmm4, OWORD PTR [r9+144]
+ vaesenc xmm0, xmm0, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vaesenc xmm2, xmm2, xmm4
+ vaesenc xmm3, xmm3, xmm4
+ ; Fewer than 11 rounds: round key 10 is the last one.
+ cmp r10d, 11
+ vmovdqu xmm4, OWORD PTR [r9+160]
+ jl L_AES_ECB_encrypt_avx1_64_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vaesenc xmm2, xmm2, xmm4
+ vaesenc xmm3, xmm3, xmm4
+ vmovdqu xmm4, OWORD PTR [r9+176]
+ vaesenc xmm0, xmm0, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vaesenc xmm2, xmm2, xmm4
+ vaesenc xmm3, xmm3, xmm4
+ ; Fewer than 13 rounds: round key 12 is the last one.
+ cmp r10d, 13
+ vmovdqu xmm4, OWORD PTR [r9+192]
+ jl L_AES_ECB_encrypt_avx1_64_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vaesenc xmm2, xmm2, xmm4
+ vaesenc xmm3, xmm3, xmm4
+ vmovdqu xmm4, OWORD PTR [r9+208]
+ vaesenc xmm0, xmm0, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vaesenc xmm2, xmm2, xmm4
+ vaesenc xmm3, xmm3, xmm4
+ vmovdqu xmm4, OWORD PTR [r9+224]
+L_AES_ECB_encrypt_avx1_64_aes_enc_block_last:
+ vaesenclast xmm0, xmm0, xmm4
+ vaesenclast xmm1, xmm1, xmm4
+ vaesenclast xmm2, xmm2, xmm4
+ vaesenclast xmm3, xmm3, xmm4
+ vmovdqu OWORD PTR [rdx+rax], xmm0
+ vmovdqu OWORD PTR [rdx+rax+16], xmm1
+ vmovdqu OWORD PTR [rdx+rax+32], xmm2
+ vmovdqu OWORD PTR [rdx+rax+48], xmm3
+ add eax, 64
+ cmp eax, r11d
+ jl L_AES_ECB_encrypt_avx1_enc_64
+L_AES_ECB_encrypt_avx1_done_64:
+ cmp eax, r8d
+ mov r11d, r8d
+ je L_AES_ECB_encrypt_avx1_done_enc
+ ; Limit for the single-block loop: length rounded down to a multiple of 16.
+ and r11d, 4294967280
+L_AES_ECB_encrypt_avx1_enc_16:
+ ; 16 bytes of input
+ vmovdqu xmm0, OWORD PTR [rcx+rax]
+ ; aes_enc_block
+ vpxor xmm0, xmm0, [r9]
+ vmovdqu xmm5, OWORD PTR [r9+16]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+32]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+48]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+64]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+80]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+96]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+112]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+128]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+144]
+ vaesenc xmm0, xmm0, xmm5
+ cmp r10d, 11
+ vmovdqu xmm5, OWORD PTR [r9+160]
+ jl L_AES_ECB_encrypt_avx1_16_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r9+176]
+ vaesenc xmm0, xmm0, xmm6
+ cmp r10d, 13
+ vmovdqu xmm5, OWORD PTR [r9+192]
+ jl L_AES_ECB_encrypt_avx1_16_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r9+208]
+ vaesenc xmm0, xmm0, xmm6
+ vmovdqu xmm5, OWORD PTR [r9+224]
+L_AES_ECB_encrypt_avx1_16_aes_enc_block_last:
+ vaesenclast xmm0, xmm0, xmm5
+ vmovdqu OWORD PTR [rdx+rax], xmm0
+ add eax, 16
+ cmp eax, r11d
+ jl L_AES_ECB_encrypt_avx1_enc_16
+L_AES_ECB_encrypt_avx1_done_enc:
+ vmovdqu xmm6, OWORD PTR [rsp]
+ add rsp, 16
+ ret
+AES_ECB_encrypt_avx1 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+ ; AES_ECB_decrypt_avx1 - AES ECB decryption using VEX-encoded AES-NI.
+ ;
+ ; Windows x64 arguments:
+ ;   rcx        input buffer
+ ;   rdx        output buffer
+ ;   r8d        length in bytes
+ ;   r9         expanded decryption key schedule (16 bytes per round key)
+ ;   [rsp+40]   number of rounds (DWORD)
+ ;
+ ; Register use inside the routine:
+ ;   eax  byte offset into input/output      r10d number of rounds
+ ;   r11d loop limit
+ ;
+ ; r9 and the round count must stay live for the whole routine, so the offset
+ ; and limit use separate registers and the buffers are addressed as
+ ; [rcx+rax] / [rdx+rax] instead of through pointer temporaries.
+ ; The 16-byte loop is entered whenever the remaining length is non-zero and
+ ; always processes a whole block.
+AES_ECB_decrypt_avx1 PROC
+ mov r10d, DWORD PTR [rsp+40]
+ sub rsp, 16
+ ; xmm6 is callee-saved in the Windows x64 ABI.
+ vmovdqu OWORD PTR [rsp], xmm6
+ xor eax, eax
+ cmp r8d, 64
+ mov r11d, r8d
+ jl L_AES_ECB_decrypt_avx1_done_64
+ ; Limit for the 4-block loop: length rounded down to a multiple of 64.
+ and r11d, 4294967232
+L_AES_ECB_decrypt_avx1_dec_64:
+ ; 64 bytes of input
+ ; aes_ecb_dec_64
+ vmovdqu xmm0, OWORD PTR [rcx+rax]
+ vmovdqu xmm1, OWORD PTR [rcx+rax+16]
+ vmovdqu xmm2, OWORD PTR [rcx+rax+32]
+ vmovdqu xmm3, OWORD PTR [rcx+rax+48]
+ ; aes_dec_block
+ vmovdqu xmm4, OWORD PTR [r9]
+ vpxor xmm0, xmm0, xmm4
+ vpxor xmm1, xmm1, xmm4
+ vpxor xmm2, xmm2, xmm4
+ vpxor xmm3, xmm3, xmm4
+ vmovdqu xmm4, OWORD PTR [r9+16]
+ vaesdec xmm0, xmm0, xmm4
+ vaesdec xmm1, xmm1, xmm4
+ vaesdec xmm2, xmm2, xmm4
+ vaesdec xmm3, xmm3, xmm4
+ vmovdqu xmm4, OWORD PTR [r9+32]
+ vaesdec xmm0, xmm0, xmm4
+ vaesdec xmm1, xmm1, xmm4
+ vaesdec xmm2, xmm2, xmm4
+ vaesdec xmm3, xmm3, xmm4
+ vmovdqu xmm4, OWORD PTR [r9+48]
+ vaesdec xmm0, xmm0, xmm4
+ vaesdec xmm1, xmm1, xmm4
+ vaesdec xmm2, xmm2, xmm4
+ vaesdec xmm3, xmm3, xmm4
+ vmovdqu xmm4, OWORD PTR [r9+64]
+ vaesdec xmm0, xmm0, xmm4
+ vaesdec xmm1, xmm1, xmm4
+ vaesdec xmm2, xmm2, xmm4
+ vaesdec xmm3, xmm3, xmm4
+ vmovdqu xmm4, OWORD PTR [r9+80]
+ vaesdec xmm0, xmm0, xmm4
+ vaesdec xmm1, xmm1, xmm4
+ vaesdec xmm2, xmm2, xmm4
+ vaesdec xmm3, xmm3, xmm4
+ vmovdqu xmm4, OWORD PTR [r9+96]
+ vaesdec xmm0, xmm0, xmm4
+ vaesdec xmm1, xmm1, xmm4
+ vaesdec xmm2, xmm2, xmm4
+ vaesdec xmm3, xmm3, xmm4
+ vmovdqu xmm4, OWORD PTR [r9+112]
+ vaesdec xmm0, xmm0, xmm4
+ vaesdec xmm1, xmm1, xmm4
+ vaesdec xmm2, xmm2, xmm4
+ vaesdec xmm3, xmm3, xmm4
+ vmovdqu xmm4, OWORD PTR [r9+128]
+ vaesdec xmm0, xmm0, xmm4
+ vaesdec xmm1, xmm1, xmm4
+ vaesdec xmm2, xmm2, xmm4
+ vaesdec xmm3, xmm3, xmm4
+ vmovdqu xmm4, OWORD PTR [r9+144]
+ vaesdec xmm0, xmm0, xmm4
+ vaesdec xmm1, xmm1, xmm4
+ vaesdec xmm2, xmm2, xmm4
+ vaesdec xmm3, xmm3, xmm4
+ ; Fewer than 11 rounds: round key 10 is the last one.
+ cmp r10d, 11
+ vmovdqu xmm4, OWORD PTR [r9+160]
+ jl L_AES_ECB_decrypt_avx1_64_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm4
+ vaesdec xmm1, xmm1, xmm4
+ vaesdec xmm2, xmm2, xmm4
+ vaesdec xmm3, xmm3, xmm4
+ vmovdqu xmm4, OWORD PTR [r9+176]
+ vaesdec xmm0, xmm0, xmm4
+ vaesdec xmm1, xmm1, xmm4
+ vaesdec xmm2, xmm2, xmm4
+ vaesdec xmm3, xmm3, xmm4
+ ; Fewer than 13 rounds: round key 12 is the last one.
+ cmp r10d, 13
+ vmovdqu xmm4, OWORD PTR [r9+192]
+ jl L_AES_ECB_decrypt_avx1_64_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm4
+ vaesdec xmm1, xmm1, xmm4
+ vaesdec xmm2, xmm2, xmm4
+ vaesdec xmm3, xmm3, xmm4
+ vmovdqu xmm4, OWORD PTR [r9+208]
+ vaesdec xmm0, xmm0, xmm4
+ vaesdec xmm1, xmm1, xmm4
+ vaesdec xmm2, xmm2, xmm4
+ vaesdec xmm3, xmm3, xmm4
+ vmovdqu xmm4, OWORD PTR [r9+224]
+L_AES_ECB_decrypt_avx1_64_aes_dec_block_last:
+ vaesdeclast xmm0, xmm0, xmm4
+ vaesdeclast xmm1, xmm1, xmm4
+ vaesdeclast xmm2, xmm2, xmm4
+ vaesdeclast xmm3, xmm3, xmm4
+ vmovdqu OWORD PTR [rdx+rax], xmm0
+ vmovdqu OWORD PTR [rdx+rax+16], xmm1
+ vmovdqu OWORD PTR [rdx+rax+32], xmm2
+ vmovdqu OWORD PTR [rdx+rax+48], xmm3
+ add eax, 64
+ cmp eax, r11d
+ jl L_AES_ECB_decrypt_avx1_dec_64
+L_AES_ECB_decrypt_avx1_done_64:
+ cmp eax, r8d
+ mov r11d, r8d
+ je L_AES_ECB_decrypt_avx1_done_dec
+ ; Limit for the single-block loop: length rounded down to a multiple of 16.
+ and r11d, 4294967280
+L_AES_ECB_decrypt_avx1_dec_16:
+ ; 16 bytes of input
+ vmovdqu xmm0, OWORD PTR [rcx+rax]
+ ; aes_dec_block
+ vpxor xmm0, xmm0, [r9]
+ vmovdqu xmm5, OWORD PTR [r9+16]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+32]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+48]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+64]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+80]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+96]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+112]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+128]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+144]
+ vaesdec xmm0, xmm0, xmm5
+ cmp r10d, 11
+ vmovdqu xmm5, OWORD PTR [r9+160]
+ jl L_AES_ECB_decrypt_avx1_16_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r9+176]
+ vaesdec xmm0, xmm0, xmm6
+ cmp r10d, 13
+ vmovdqu xmm5, OWORD PTR [r9+192]
+ jl L_AES_ECB_decrypt_avx1_16_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r9+208]
+ vaesdec xmm0, xmm0, xmm6
+ vmovdqu xmm5, OWORD PTR [r9+224]
+L_AES_ECB_decrypt_avx1_16_aes_dec_block_last:
+ vaesdeclast xmm0, xmm0, xmm5
+ vmovdqu OWORD PTR [rdx+rax], xmm0
+ add eax, 16
+ cmp eax, r11d
+ jl L_AES_ECB_decrypt_avx1_dec_16
+L_AES_ECB_decrypt_avx1_done_dec:
+ vmovdqu xmm6, OWORD PTR [rsp]
+ add rsp, 16
+ ret
+AES_ECB_decrypt_avx1 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+ ; AES_CBC_encrypt_avx1 - AES CBC encryption using VEX-encoded AES-NI.
+ ;
+ ; Windows x64 arguments:
+ ;   rcx        input buffer
+ ;   rdx        output buffer
+ ;   r8         16-byte IV / chaining block, read on entry, written on exit
+ ;   r9d        length in bytes
+ ;   [rsp+40]   expanded key schedule (16 bytes per round key)
+ ;   [rsp+48]   number of rounds (DWORD)
+ ;
+ ; Register use inside the routine:
+ ;   eax  byte offset into input/output      r10  key schedule pointer
+ ;   r11d number of rounds                   xmm0 previous ciphertext block
+ ;
+ ; The key pointer and round count are kept in r10/r11d, which nothing else
+ ; writes, and the buffers are addressed as [rcx+rax] / [rdx+rax].
+ ; Only volatile xmm registers (xmm0-xmm4) are used, so nothing is saved.
+AES_CBC_encrypt_avx1 PROC
+ mov r10, QWORD PTR [rsp+40]
+ mov r11d, DWORD PTR [rsp+48]
+ vmovdqu xmm0, OWORD PTR [r8]
+ xor eax, eax
+ cmp eax, r9d
+ je L_AES_CBC_encrypt_avx1_done
+L_AES_CBC_encrypt_avx1_loop:
+ ; 16 bytes of input
+ vmovdqu xmm1, OWORD PTR [rcx+rax]
+ ; Chain: XOR the block with the previous ciphertext (or the IV).
+ vpxor xmm1, xmm1, xmm0
+ ; aes_enc_block
+ vpxor xmm1, xmm1, [r10]
+ vmovdqu xmm3, OWORD PTR [r10+16]
+ vaesenc xmm1, xmm1, xmm3
+ vmovdqu xmm3, OWORD PTR [r10+32]
+ vaesenc xmm1, xmm1, xmm3
+ vmovdqu xmm3, OWORD PTR [r10+48]
+ vaesenc xmm1, xmm1, xmm3
+ vmovdqu xmm3, OWORD PTR [r10+64]
+ vaesenc xmm1, xmm1, xmm3
+ vmovdqu xmm3, OWORD PTR [r10+80]
+ vaesenc xmm1, xmm1, xmm3
+ vmovdqu xmm3, OWORD PTR [r10+96]
+ vaesenc xmm1, xmm1, xmm3
+ vmovdqu xmm3, OWORD PTR [r10+112]
+ vaesenc xmm1, xmm1, xmm3
+ vmovdqu xmm3, OWORD PTR [r10+128]
+ vaesenc xmm1, xmm1, xmm3
+ vmovdqu xmm3, OWORD PTR [r10+144]
+ vaesenc xmm1, xmm1, xmm3
+ ; Fewer than 11 rounds: round key 10 is the last one.
+ cmp r11d, 11
+ vmovdqu xmm3, OWORD PTR [r10+160]
+ jl L_AES_CBC_encrypt_avx1_aes_enc_block_last
+ vaesenc xmm1, xmm1, xmm3
+ vmovdqu xmm4, OWORD PTR [r10+176]
+ vaesenc xmm1, xmm1, xmm4
+ ; Fewer than 13 rounds: round key 12 is the last one.
+ cmp r11d, 13
+ vmovdqu xmm3, OWORD PTR [r10+192]
+ jl L_AES_CBC_encrypt_avx1_aes_enc_block_last
+ vaesenc xmm1, xmm1, xmm3
+ vmovdqu xmm4, OWORD PTR [r10+208]
+ vaesenc xmm1, xmm1, xmm4
+ vmovdqu xmm3, OWORD PTR [r10+224]
+L_AES_CBC_encrypt_avx1_aes_enc_block_last:
+ vaesenclast xmm1, xmm1, xmm3
+ vmovdqu OWORD PTR [rdx+rax], xmm1
+ ; This ciphertext block is the chaining value for the next block.
+ vmovdqa xmm0, xmm1
+ add eax, 16
+ cmp eax, r9d
+ jl L_AES_CBC_encrypt_avx1_loop
+L_AES_CBC_encrypt_avx1_done:
+ ; Write the final chaining value back to the IV buffer.
+ vmovdqu OWORD PTR [r8], xmm0
+ ret
+AES_CBC_encrypt_avx1 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+ ; AES_CBC_decrypt_avx1 - AES CBC decryption using VEX-encoded AES-NI.
+ ;
+ ; Windows x64 arguments:
+ ;   rcx        input buffer
+ ;   rdx        output buffer
+ ;   r8         16-byte IV / chaining block, read on entry, written on exit
+ ;   r9d        length in bytes
+ ;   [rsp+40]   expanded decryption key schedule (16 bytes per round key)
+ ;   [rsp+48]   number of rounds (DWORD)
+ ;
+ ; Register use inside the routine:
+ ;   eax  byte offset into input/output      r10  key schedule pointer
+ ;   r11d number of rounds                   r12d loop limit
+ ;   xmm4 previous ciphertext block (chaining value)
+ ;
+ ; The key pointer and round count must stay live for the whole routine, so
+ ; the offset and limit use separate registers and the buffers are addressed
+ ; as [rcx+rax] / [rdx+rax]. Ciphertext needed for chaining is read before
+ ; the output is stored, so the order of loads and stores below matters when
+ ; input and output are the same buffer.
+ ; The 16-byte loop is entered whenever the remaining length is non-zero and
+ ; always processes a whole block.
+AES_CBC_decrypt_avx1 PROC
+ push r12
+ ; Stack arguments are 8 bytes further away because of the push above.
+ mov r10, QWORD PTR [rsp+48]
+ mov r11d, DWORD PTR [rsp+56]
+ sub rsp, 48
+ ; xmm6-xmm8 are callee-saved in the Windows x64 ABI.
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu xmm4, OWORD PTR [r8]
+ xor eax, eax
+ cmp r9d, 64
+ mov r12d, r9d
+ jl L_AES_CBC_decrypt_avx1_done_64
+ ; Limit for the 4-block loop: length rounded down to a multiple of 64.
+ and r12d, 4294967232
+L_AES_CBC_decrypt_avx1_dec_64:
+ ; 64 bytes of input
+ ; aes_cbc_dec_64
+ vmovdqu xmm0, OWORD PTR [rcx+rax]
+ vmovdqu xmm1, OWORD PTR [rcx+rax+16]
+ vmovdqu xmm2, OWORD PTR [rcx+rax+32]
+ vmovdqu xmm3, OWORD PTR [rcx+rax+48]
+ ; aes_dec_block
+ vmovdqu xmm5, OWORD PTR [r10]
+ vpxor xmm0, xmm0, xmm5
+ vpxor xmm1, xmm1, xmm5
+ vpxor xmm2, xmm2, xmm5
+ vpxor xmm3, xmm3, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+16]
+ vaesdec xmm0, xmm0, xmm5
+ vaesdec xmm1, xmm1, xmm5
+ vaesdec xmm2, xmm2, xmm5
+ vaesdec xmm3, xmm3, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+32]
+ vaesdec xmm0, xmm0, xmm5
+ vaesdec xmm1, xmm1, xmm5
+ vaesdec xmm2, xmm2, xmm5
+ vaesdec xmm3, xmm3, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+48]
+ vaesdec xmm0, xmm0, xmm5
+ vaesdec xmm1, xmm1, xmm5
+ vaesdec xmm2, xmm2, xmm5
+ vaesdec xmm3, xmm3, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+64]
+ vaesdec xmm0, xmm0, xmm5
+ vaesdec xmm1, xmm1, xmm5
+ vaesdec xmm2, xmm2, xmm5
+ vaesdec xmm3, xmm3, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+80]
+ vaesdec xmm0, xmm0, xmm5
+ vaesdec xmm1, xmm1, xmm5
+ vaesdec xmm2, xmm2, xmm5
+ vaesdec xmm3, xmm3, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+96]
+ vaesdec xmm0, xmm0, xmm5
+ vaesdec xmm1, xmm1, xmm5
+ vaesdec xmm2, xmm2, xmm5
+ vaesdec xmm3, xmm3, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+112]
+ vaesdec xmm0, xmm0, xmm5
+ vaesdec xmm1, xmm1, xmm5
+ vaesdec xmm2, xmm2, xmm5
+ vaesdec xmm3, xmm3, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+128]
+ vaesdec xmm0, xmm0, xmm5
+ vaesdec xmm1, xmm1, xmm5
+ vaesdec xmm2, xmm2, xmm5
+ vaesdec xmm3, xmm3, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+144]
+ vaesdec xmm0, xmm0, xmm5
+ vaesdec xmm1, xmm1, xmm5
+ vaesdec xmm2, xmm2, xmm5
+ vaesdec xmm3, xmm3, xmm5
+ ; Fewer than 11 rounds: round key 10 is the last one.
+ cmp r11d, 11
+ vmovdqu xmm5, OWORD PTR [r10+160]
+ jl L_AES_CBC_decrypt_avx1_64_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm5
+ vaesdec xmm1, xmm1, xmm5
+ vaesdec xmm2, xmm2, xmm5
+ vaesdec xmm3, xmm3, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+176]
+ vaesdec xmm0, xmm0, xmm5
+ vaesdec xmm1, xmm1, xmm5
+ vaesdec xmm2, xmm2, xmm5
+ vaesdec xmm3, xmm3, xmm5
+ ; Fewer than 13 rounds: round key 12 is the last one.
+ cmp r11d, 13
+ vmovdqu xmm5, OWORD PTR [r10+192]
+ jl L_AES_CBC_decrypt_avx1_64_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm5
+ vaesdec xmm1, xmm1, xmm5
+ vaesdec xmm2, xmm2, xmm5
+ vaesdec xmm3, xmm3, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+208]
+ vaesdec xmm0, xmm0, xmm5
+ vaesdec xmm1, xmm1, xmm5
+ vaesdec xmm2, xmm2, xmm5
+ vaesdec xmm3, xmm3, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+224]
+L_AES_CBC_decrypt_avx1_64_aes_dec_block_last:
+ vaesdeclast xmm0, xmm0, xmm5
+ vaesdeclast xmm1, xmm1, xmm5
+ vaesdeclast xmm2, xmm2, xmm5
+ vaesdeclast xmm3, xmm3, xmm5
+ ; Undo chaining: block 0 uses the carried value, blocks 1-3 use the
+ ; preceding ciphertext blocks; the 4th ciphertext block is carried forward.
+ vpxor xmm0, xmm0, xmm4
+ vpxor xmm1, xmm1, [rcx+rax]
+ vpxor xmm2, xmm2, [rcx+rax+16]
+ vpxor xmm3, xmm3, [rcx+rax+32]
+ vmovdqu xmm4, OWORD PTR [rcx+rax+48]
+ vmovdqu OWORD PTR [rdx+rax], xmm0
+ vmovdqu OWORD PTR [rdx+rax+16], xmm1
+ vmovdqu OWORD PTR [rdx+rax+32], xmm2
+ vmovdqu OWORD PTR [rdx+rax+48], xmm3
+ add eax, 64
+ cmp eax, r12d
+ jl L_AES_CBC_decrypt_avx1_dec_64
+L_AES_CBC_decrypt_avx1_done_64:
+ cmp eax, r9d
+ mov r12d, r9d
+ je L_AES_CBC_decrypt_avx1_done_dec
+ ; Limit for the single-block loop: length rounded down to a multiple of 16.
+ and r12d, 4294967280
+L_AES_CBC_decrypt_avx1_dec_16:
+ ; 16 bytes of input
+ vmovdqu xmm0, OWORD PTR [rcx+rax]
+ ; Keep the ciphertext; it becomes the next chaining value.
+ vmovdqa xmm8, xmm0
+ ; aes_dec_block
+ vpxor xmm0, xmm0, [r10]
+ vmovdqu xmm6, OWORD PTR [r10+16]
+ vaesdec xmm0, xmm0, xmm6
+ vmovdqu xmm6, OWORD PTR [r10+32]
+ vaesdec xmm0, xmm0, xmm6
+ vmovdqu xmm6, OWORD PTR [r10+48]
+ vaesdec xmm0, xmm0, xmm6
+ vmovdqu xmm6, OWORD PTR [r10+64]
+ vaesdec xmm0, xmm0, xmm6
+ vmovdqu xmm6, OWORD PTR [r10+80]
+ vaesdec xmm0, xmm0, xmm6
+ vmovdqu xmm6, OWORD PTR [r10+96]
+ vaesdec xmm0, xmm0, xmm6
+ vmovdqu xmm6, OWORD PTR [r10+112]
+ vaesdec xmm0, xmm0, xmm6
+ vmovdqu xmm6, OWORD PTR [r10+128]
+ vaesdec xmm0, xmm0, xmm6
+ vmovdqu xmm6, OWORD PTR [r10+144]
+ vaesdec xmm0, xmm0, xmm6
+ cmp r11d, 11
+ vmovdqu xmm6, OWORD PTR [r10+160]
+ jl L_AES_CBC_decrypt_avx1_16_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm6
+ vmovdqu xmm7, OWORD PTR [r10+176]
+ vaesdec xmm0, xmm0, xmm7
+ cmp r11d, 13
+ vmovdqu xmm6, OWORD PTR [r10+192]
+ jl L_AES_CBC_decrypt_avx1_16_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm6
+ vmovdqu xmm7, OWORD PTR [r10+208]
+ vaesdec xmm0, xmm0, xmm7
+ vmovdqu xmm6, OWORD PTR [r10+224]
+L_AES_CBC_decrypt_avx1_16_aes_dec_block_last:
+ vaesdeclast xmm0, xmm0, xmm6
+ vpxor xmm0, xmm0, xmm4
+ vmovdqa xmm4, xmm8
+ vmovdqu OWORD PTR [rdx+rax], xmm0
+ add eax, 16
+ cmp eax, r12d
+ jl L_AES_CBC_decrypt_avx1_dec_16
+L_AES_CBC_decrypt_avx1_done_dec:
+ ; Write the final chaining value back to the IV buffer.
+ vmovdqu OWORD PTR [r8], xmm4
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ add rsp, 48
+ pop r12
+ ret
+AES_CBC_decrypt_avx1 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+ ; 16-byte VPSHUFB control mask for AES_CTR_encrypt_avx1. In memory the bytes
+ ; are 0f,0e,...,01,00, i.e. the mask reverses all 16 bytes of a register.
+L_aes_ctr_avx1_bswap QWORD \
+ 08090a0b0c0d0e0fh, 0001020304050607h
+ ; QWORD holding the address of the mask above (not referenced by the code
+ ; visible in this part of the file).
+ptr_L_aes_ctr_avx1_bswap QWORD L_aes_ctr_avx1_bswap
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+ ; 128-bit constant with 1 in the low quadword and 0 in the high quadword;
+ ; AES_CTR_encrypt_avx1 adds it with VPADDQ to step the byte-reversed counter.
+L_aes_ctr_avx1_one QWORD \
+ 0000000000000001h, 0000000000000000h
+ ; QWORD holding the address of the constant above (not referenced by the
+ ; code visible in this part of the file).
+ptr_L_aes_ctr_avx1_one QWORD L_aes_ctr_avx1_one
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+ ; AES_CTR_encrypt_avx1 - AES counter-mode keystream XOR using VEX-encoded
+ ; AES-NI.
+ ;
+ ; Windows x64 arguments:
+ ;   rcx        input buffer
+ ;   rdx        output buffer
+ ;   r8d        length in bytes
+ ;   r9         expanded key schedule (16 bytes per round key)
+ ;   [rsp+40]   number of rounds (DWORD)
+ ;   [rsp+48]   16-byte counter block, read on entry and written on exit
+ ;
+ ; Register use inside the routine:
+ ;   eax  byte offset into input/output      ebx  loop limit
+ ;   r10  counter block pointer               r11d number of rounds
+ ;   xmm7 counter (byte-reversed)             xmm8 byte-reverse mask
+ ;   xmm9 constant one                        xmm10 zero
+ ;
+ ; The arguments must stay live for the whole routine, so the offset, limit
+ ; and round count each get their own register and the buffers are addressed
+ ; as [rcx+rax] / [rdx+rax] rather than through extra pointer temporaries.
+ ; The 16-byte loop is entered whenever the remaining length is non-zero and
+ ; always processes a whole block.
+AES_CTR_encrypt_avx1 PROC
+ push rbx
+ ; Stack arguments are 8 bytes further away because of the push above.
+ mov r11d, DWORD PTR [rsp+48]
+ mov r10, QWORD PTR [rsp+56]
+ sub rsp, 96
+ ; xmm6-xmm11 are callee-saved in the Windows x64 ABI.
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu xmm8, OWORD PTR L_aes_ctr_avx1_bswap
+ vmovdqu xmm9, OWORD PTR L_aes_ctr_avx1_one
+ vpxor xmm10, xmm10, xmm10
+ ; Load the counter and byte-reverse it so it can be stepped with VPADDQ.
+ vmovdqu xmm7, OWORD PTR [r10]
+ vpshufb xmm7, xmm7, xmm8
+ xor eax, eax
+ cmp r8d, 64
+ mov ebx, r8d
+ jl L_AES_CTR_encrypt_avx1_done_64
+ ; Limit for the 4-block loop: length rounded down to a multiple of 64.
+ and ebx, 4294967232
+L_AES_CTR_encrypt_avx1_enc_64:
+ ; 64 bytes of input
+ ; aes_ctr_enc_64
+ ; Build four consecutive counter blocks in xmm0-xmm3. After each low-quadword
+ ; increment, a low quadword of zero means it wrapped, so 1 is added to the
+ ; high quadword.
+ vpshufb xmm0, xmm7, xmm8
+ vpaddq xmm7, xmm7, xmm9
+ vpcmpeqq xmm11, xmm7, xmm10
+ vpslldq xmm11, xmm11, 8
+ vpsrlq xmm11, xmm11, 63
+ vpaddq xmm7, xmm7, xmm11
+ vpshufb xmm1, xmm7, xmm8
+ vpaddq xmm7, xmm7, xmm9
+ vpcmpeqq xmm11, xmm7, xmm10
+ vpslldq xmm11, xmm11, 8
+ vpsrlq xmm11, xmm11, 63
+ vpaddq xmm7, xmm7, xmm11
+ vpshufb xmm2, xmm7, xmm8
+ vpaddq xmm7, xmm7, xmm9
+ vpcmpeqq xmm11, xmm7, xmm10
+ vpslldq xmm11, xmm11, 8
+ vpsrlq xmm11, xmm11, 63
+ vpaddq xmm7, xmm7, xmm11
+ vpshufb xmm3, xmm7, xmm8
+ vpaddq xmm7, xmm7, xmm9
+ vpcmpeqq xmm11, xmm7, xmm10
+ vpslldq xmm11, xmm11, 8
+ vpsrlq xmm11, xmm11, 63
+ vpaddq xmm7, xmm7, xmm11
+ ; aes_enc_block
+ vmovdqu xmm4, OWORD PTR [r9]
+ vpxor xmm0, xmm0, xmm4
+ vpxor xmm1, xmm1, xmm4
+ vpxor xmm2, xmm2, xmm4
+ vpxor xmm3, xmm3, xmm4
+ vmovdqu xmm4, OWORD PTR [r9+16]
+ vaesenc xmm0, xmm0, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vaesenc xmm2, xmm2, xmm4
+ vaesenc xmm3, xmm3, xmm4
+ vmovdqu xmm4, OWORD PTR [r9+32]
+ vaesenc xmm0, xmm0, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vaesenc xmm2, xmm2, xmm4
+ vaesenc xmm3, xmm3, xmm4
+ vmovdqu xmm4, OWORD PTR [r9+48]
+ vaesenc xmm0, xmm0, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vaesenc xmm2, xmm2, xmm4
+ vaesenc xmm3, xmm3, xmm4
+ vmovdqu xmm4, OWORD PTR [r9+64]
+ vaesenc xmm0, xmm0, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vaesenc xmm2, xmm2, xmm4
+ vaesenc xmm3, xmm3, xmm4
+ vmovdqu xmm4, OWORD PTR [r9+80]
+ vaesenc xmm0, xmm0, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vaesenc xmm2, xmm2, xmm4
+ vaesenc xmm3, xmm3, xmm4
+ vmovdqu xmm4, OWORD PTR [r9+96]
+ vaesenc xmm0, xmm0, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vaesenc xmm2, xmm2, xmm4
+ vaesenc xmm3, xmm3, xmm4
+ vmovdqu xmm4, OWORD PTR [r9+112]
+ vaesenc xmm0, xmm0, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vaesenc xmm2, xmm2, xmm4
+ vaesenc xmm3, xmm3, xmm4
+ vmovdqu xmm4, OWORD PTR [r9+128]
+ vaesenc xmm0, xmm0, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vaesenc xmm2, xmm2, xmm4
+ vaesenc xmm3, xmm3, xmm4
+ vmovdqu xmm4, OWORD PTR [r9+144]
+ vaesenc xmm0, xmm0, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vaesenc xmm2, xmm2, xmm4
+ vaesenc xmm3, xmm3, xmm4
+ ; Fewer than 11 rounds: round key 10 is the last one.
+ cmp r11d, 11
+ vmovdqu xmm4, OWORD PTR [r9+160]
+ jl L_AES_CTR_encrypt_avx1_64_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vaesenc xmm2, xmm2, xmm4
+ vaesenc xmm3, xmm3, xmm4
+ vmovdqu xmm4, OWORD PTR [r9+176]
+ vaesenc xmm0, xmm0, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vaesenc xmm2, xmm2, xmm4
+ vaesenc xmm3, xmm3, xmm4
+ ; Fewer than 13 rounds: round key 12 is the last one.
+ cmp r11d, 13
+ vmovdqu xmm4, OWORD PTR [r9+192]
+ jl L_AES_CTR_encrypt_avx1_64_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vaesenc xmm2, xmm2, xmm4
+ vaesenc xmm3, xmm3, xmm4
+ vmovdqu xmm4, OWORD PTR [r9+208]
+ vaesenc xmm0, xmm0, xmm4
+ vaesenc xmm1, xmm1, xmm4
+ vaesenc xmm2, xmm2, xmm4
+ vaesenc xmm3, xmm3, xmm4
+ vmovdqu xmm4, OWORD PTR [r9+224]
+L_AES_CTR_encrypt_avx1_64_aes_enc_block_last:
+ vaesenclast xmm0, xmm0, xmm4
+ vaesenclast xmm1, xmm1, xmm4
+ vaesenclast xmm2, xmm2, xmm4
+ vaesenclast xmm3, xmm3, xmm4
+ ; XOR the keystream with the input and store to the output.
+ vpxor xmm0, xmm0, [rcx+rax]
+ vpxor xmm1, xmm1, [rcx+rax+16]
+ vpxor xmm2, xmm2, [rcx+rax+32]
+ vpxor xmm3, xmm3, [rcx+rax+48]
+ vmovdqu OWORD PTR [rdx+rax], xmm0
+ vmovdqu OWORD PTR [rdx+rax+16], xmm1
+ vmovdqu OWORD PTR [rdx+rax+32], xmm2
+ vmovdqu OWORD PTR [rdx+rax+48], xmm3
+ add eax, 64
+ cmp eax, ebx
+ jl L_AES_CTR_encrypt_avx1_enc_64
+L_AES_CTR_encrypt_avx1_done_64:
+ cmp eax, r8d
+ mov ebx, r8d
+ je L_AES_CTR_encrypt_avx1_done_enc
+ ; Limit for the single-block loop: length rounded down to a multiple of 16.
+ and ebx, 4294967280
+L_AES_CTR_encrypt_avx1_enc_16:
+ ; 16 bytes of input
+ vpshufb xmm0, xmm7, xmm8
+ vpaddq xmm7, xmm7, xmm9
+ vpcmpeqq xmm11, xmm7, xmm10
+ vpslldq xmm11, xmm11, 8
+ vpsrlq xmm11, xmm11, 63
+ vpaddq xmm7, xmm7, xmm11
+ ; aes_enc_block
+ vpxor xmm0, xmm0, [r9]
+ vmovdqu xmm5, OWORD PTR [r9+16]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+32]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+48]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+64]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+80]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+96]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+112]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+128]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+144]
+ vaesenc xmm0, xmm0, xmm5
+ cmp r11d, 11
+ vmovdqu xmm5, OWORD PTR [r9+160]
+ jl L_AES_CTR_encrypt_avx1_16_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r9+176]
+ vaesenc xmm0, xmm0, xmm6
+ cmp r11d, 13
+ vmovdqu xmm5, OWORD PTR [r9+192]
+ jl L_AES_CTR_encrypt_avx1_16_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r9+208]
+ vaesenc xmm0, xmm0, xmm6
+ vmovdqu xmm5, OWORD PTR [r9+224]
+L_AES_CTR_encrypt_avx1_16_aes_enc_block_last:
+ vaesenclast xmm0, xmm0, xmm5
+ vpxor xmm0, xmm0, [rcx+rax]
+ vmovdqu OWORD PTR [rdx+rax], xmm0
+ add eax, 16
+ cmp eax, ebx
+ jl L_AES_CTR_encrypt_avx1_enc_16
+L_AES_CTR_encrypt_avx1_done_enc:
+ ; Restore the counter's original byte order and write it back.
+ vpshufb xmm7, xmm7, xmm8
+ vmovdqu OWORD PTR [r10], xmm7
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ add rsp, 96
+ pop rbx
+ ret
+AES_CTR_encrypt_avx1 ENDP
+_TEXT ENDS
+ENDIF
+IFDEF HAVE_INTEL_VAES
+_TEXT SEGMENT READONLY PARA
+AES_ECB_encrypt_vaes PROC ; AES-ECB encrypt with VAES. As used below: rcx=in, rdx=out, r8d=length in bytes, r9=round keys, [rsp+40]=nr
+ mov r11d, DWORD PTR [rsp+40] ; r11d = nr (round count); kept apart from the byte offset in eax
+ sub rsp, 32
+ vmovdqu OWORD PTR [rsp], xmm6 ; xmm6/xmm7 are written below, so they are saved and restored
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ xor eax, eax ; eax = byte offset into in/out
+ cmp r8d, 128
+ mov r10d, r8d ; r10d = loop bound; r9 must keep pointing at the round keys
+ jl L_AES_ECB_encrypt_vaes_done_128
+ and r10d, 4294967168 ; length rounded down to a multiple of 128
+L_AES_ECB_encrypt_vaes_enc_128:
+ ; 128 bytes of input
+ ; aes_ecb_enc_128
+ ; in/out are addressed directly as [rcx+rax] / [rdx+rax] so that
+ ; r10d and r11d stay free for the loop bound and nr
+ vmovdqu ymm0, YMMWORD PTR [rcx+rax]
+ vmovdqu ymm1, YMMWORD PTR [rcx+rax+32]
+ vmovdqu ymm2, YMMWORD PTR [rcx+rax+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+rax+96]
+ ; aes_enc_block
+ vbroadcasti128 ymm7, [r9] ; round key 0 copied to both 128-bit lanes
+ vpxor ymm0, ymm0, ymm7
+ vpxor ymm1, ymm1, ymm7
+ vpxor ymm2, ymm2, ymm7
+ vpxor ymm3, ymm3, ymm7
+ vbroadcasti128 ymm7, [r9+16]
+ vaesenc ymm0, ymm0, ymm7
+ vaesenc ymm1, ymm1, ymm7
+ vaesenc ymm2, ymm2, ymm7
+ vaesenc ymm3, ymm3, ymm7
+ vbroadcasti128 ymm7, [r9+32]
+ vaesenc ymm0, ymm0, ymm7
+ vaesenc ymm1, ymm1, ymm7
+ vaesenc ymm2, ymm2, ymm7
+ vaesenc ymm3, ymm3, ymm7
+ vbroadcasti128 ymm7, [r9+48]
+ vaesenc ymm0, ymm0, ymm7
+ vaesenc ymm1, ymm1, ymm7
+ vaesenc ymm2, ymm2, ymm7
+ vaesenc ymm3, ymm3, ymm7
+ vbroadcasti128 ymm7, [r9+64]
+ vaesenc ymm0, ymm0, ymm7
+ vaesenc ymm1, ymm1, ymm7
+ vaesenc ymm2, ymm2, ymm7
+ vaesenc ymm3, ymm3, ymm7
+ vbroadcasti128 ymm7, [r9+80]
+ vaesenc ymm0, ymm0, ymm7
+ vaesenc ymm1, ymm1, ymm7
+ vaesenc ymm2, ymm2, ymm7
+ vaesenc ymm3, ymm3, ymm7
+ vbroadcasti128 ymm7, [r9+96]
+ vaesenc ymm0, ymm0, ymm7
+ vaesenc ymm1, ymm1, ymm7
+ vaesenc ymm2, ymm2, ymm7
+ vaesenc ymm3, ymm3, ymm7
+ vbroadcasti128 ymm7, [r9+112]
+ vaesenc ymm0, ymm0, ymm7
+ vaesenc ymm1, ymm1, ymm7
+ vaesenc ymm2, ymm2, ymm7
+ vaesenc ymm3, ymm3, ymm7
+ vbroadcasti128 ymm7, [r9+128]
+ vaesenc ymm0, ymm0, ymm7
+ vaesenc ymm1, ymm1, ymm7
+ vaesenc ymm2, ymm2, ymm7
+ vaesenc ymm3, ymm3, ymm7
+ vbroadcasti128 ymm7, [r9+144]
+ vaesenc ymm0, ymm0, ymm7
+ vaesenc ymm1, ymm1, ymm7
+ vaesenc ymm2, ymm2, ymm7
+ vaesenc ymm3, ymm3, ymm7
+ cmp r11d, 11 ; nr < 11: key at +160 is the last round key
+ vbroadcasti128 ymm7, [r9+160]
+ jl L_AES_ECB_encrypt_vaes_128_aes_enc_block_last
+ vaesenc ymm0, ymm0, ymm7
+ vaesenc ymm1, ymm1, ymm7
+ vaesenc ymm2, ymm2, ymm7
+ vaesenc ymm3, ymm3, ymm7
+ vbroadcasti128 ymm7, [r9+176]
+ vaesenc ymm0, ymm0, ymm7
+ vaesenc ymm1, ymm1, ymm7
+ vaesenc ymm2, ymm2, ymm7
+ vaesenc ymm3, ymm3, ymm7
+ cmp r11d, 13 ; nr < 13: key at +192 is the last round key
+ vbroadcasti128 ymm7, [r9+192]
+ jl L_AES_ECB_encrypt_vaes_128_aes_enc_block_last
+ vaesenc ymm0, ymm0, ymm7
+ vaesenc ymm1, ymm1, ymm7
+ vaesenc ymm2, ymm2, ymm7
+ vaesenc ymm3, ymm3, ymm7
+ vbroadcasti128 ymm7, [r9+208]
+ vaesenc ymm0, ymm0, ymm7
+ vaesenc ymm1, ymm1, ymm7
+ vaesenc ymm2, ymm2, ymm7
+ vaesenc ymm3, ymm3, ymm7
+ vbroadcasti128 ymm7, [r9+224]
+L_AES_ECB_encrypt_vaes_128_aes_enc_block_last:
+ vaesenclast ymm0, ymm0, ymm7
+ vaesenclast ymm1, ymm1, ymm7
+ vaesenclast ymm2, ymm2, ymm7
+ vaesenclast ymm3, ymm3, ymm7
+ vmovdqu YMMWORD PTR [rdx+rax], ymm0
+ vmovdqu YMMWORD PTR [rdx+rax+32], ymm1
+ vmovdqu YMMWORD PTR [rdx+rax+64], ymm2
+ vmovdqu YMMWORD PTR [rdx+rax+96], ymm3
+ add eax, 128
+ cmp eax, r10d
+ jl L_AES_ECB_encrypt_vaes_enc_128
+L_AES_ECB_encrypt_vaes_done_128:
+ mov r10d, r8d
+ and r10d, 4294967264 ; length rounded down to a multiple of 32
+ cmp eax, r10d
+ je L_AES_ECB_encrypt_vaes_done_32
+L_AES_ECB_encrypt_vaes_enc_32:
+ ; 32 bytes of input
+ ; aes_ecb_enc_32
+ ; two blocks are read from [rcx+rax] and written to [rdx+rax];
+ ; no pointer temporaries are needed
+ vmovdqu ymm0, YMMWORD PTR [rcx+rax]
+ ; aes_enc_block
+ vbroadcasti128 ymm7, [r9]
+ vpxor ymm0, ymm0, ymm7
+ vbroadcasti128 ymm7, [r9+16]
+ vaesenc ymm0, ymm0, ymm7
+ vbroadcasti128 ymm7, [r9+32]
+ vaesenc ymm0, ymm0, ymm7
+ vbroadcasti128 ymm7, [r9+48]
+ vaesenc ymm0, ymm0, ymm7
+ vbroadcasti128 ymm7, [r9+64]
+ vaesenc ymm0, ymm0, ymm7
+ vbroadcasti128 ymm7, [r9+80]
+ vaesenc ymm0, ymm0, ymm7
+ vbroadcasti128 ymm7, [r9+96]
+ vaesenc ymm0, ymm0, ymm7
+ vbroadcasti128 ymm7, [r9+112]
+ vaesenc ymm0, ymm0, ymm7
+ vbroadcasti128 ymm7, [r9+128]
+ vaesenc ymm0, ymm0, ymm7
+ vbroadcasti128 ymm7, [r9+144]
+ vaesenc ymm0, ymm0, ymm7
+ cmp r11d, 11
+ vbroadcasti128 ymm7, [r9+160]
+ jl L_AES_ECB_encrypt_vaes_32_aes_enc_block_last
+ vaesenc ymm0, ymm0, ymm7
+ vbroadcasti128 ymm7, [r9+176]
+ vaesenc ymm0, ymm0, ymm7
+ cmp r11d, 13
+ vbroadcasti128 ymm7, [r9+192]
+ jl L_AES_ECB_encrypt_vaes_32_aes_enc_block_last
+ vaesenc ymm0, ymm0, ymm7
+ vbroadcasti128 ymm7, [r9+208]
+ vaesenc ymm0, ymm0, ymm7
+ vbroadcasti128 ymm7, [r9+224]
+L_AES_ECB_encrypt_vaes_32_aes_enc_block_last:
+ vaesenclast ymm0, ymm0, ymm7
+ vmovdqu YMMWORD PTR [rdx+rax], ymm0
+ add eax, 32
+ cmp eax, r10d
+ jl L_AES_ECB_encrypt_vaes_enc_32
+L_AES_ECB_encrypt_vaes_done_32:
+ cmp eax, r8d
+ mov r10d, r8d
+ je L_AES_ECB_encrypt_vaes_done_enc
+ and r10d, 4294967280 ; length rounded down to a multiple of 16
+L_AES_ECB_encrypt_vaes_enc_16:
+ ; 16 bytes of input
+ ; NOTE(review): the body runs once before the bound is checked, so the length is assumed to be a multiple of 16 - confirm callers guarantee this
+ vmovdqu xmm0, OWORD PTR [rcx+rax]
+ ; aes_enc_block
+ vpxor xmm0, xmm0, [r9]
+ vmovdqu xmm5, OWORD PTR [r9+16]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+32]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+48]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+64]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+80]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+96]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+112]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+128]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+144]
+ vaesenc xmm0, xmm0, xmm5
+ cmp r11d, 11
+ vmovdqu xmm5, OWORD PTR [r9+160]
+ jl L_AES_ECB_encrypt_vaes_16_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r9+176]
+ vaesenc xmm0, xmm0, xmm6
+ cmp r11d, 13
+ vmovdqu xmm5, OWORD PTR [r9+192]
+ jl L_AES_ECB_encrypt_vaes_16_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r9+208]
+ vaesenc xmm0, xmm0, xmm6
+ vmovdqu xmm5, OWORD PTR [r9+224]
+L_AES_ECB_encrypt_vaes_16_aes_enc_block_last:
+ vaesenclast xmm0, xmm0, xmm5
+ ; ciphertext block goes to [rdx+rax]
+ vmovdqu OWORD PTR [rdx+rax], xmm0
+ add eax, 16
+ cmp eax, r10d
+ jl L_AES_ECB_encrypt_vaes_enc_16
+L_AES_ECB_encrypt_vaes_done_enc:
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ add rsp, 32
+ ret
+AES_ECB_encrypt_vaes ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_ECB_decrypt_vaes PROC ; AES-ECB decrypt with VAES. As used below: rcx=in, rdx=out, r8d=length in bytes, r9=round keys, [rsp+40]=nr
+ mov r11d, DWORD PTR [rsp+40] ; r11d = nr (round count); kept apart from the byte offset in eax
+ sub rsp, 32
+ vmovdqu OWORD PTR [rsp], xmm6 ; xmm6/xmm7 are written below, so they are saved and restored
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ xor eax, eax ; eax = byte offset into in/out
+ cmp r8d, 128
+ mov r10d, r8d ; r10d = loop bound; r9 must keep pointing at the round keys
+ jl L_AES_ECB_decrypt_vaes_done_128
+ and r10d, 4294967168 ; length rounded down to a multiple of 128
+L_AES_ECB_decrypt_vaes_dec_128:
+ ; 128 bytes of input
+ ; aes_ecb_dec_128
+ ; in/out are addressed directly as [rcx+rax] / [rdx+rax] so that
+ ; r10d and r11d stay free for the loop bound and nr
+ vmovdqu ymm0, YMMWORD PTR [rcx+rax]
+ vmovdqu ymm1, YMMWORD PTR [rcx+rax+32]
+ vmovdqu ymm2, YMMWORD PTR [rcx+rax+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+rax+96]
+ ; aes_dec_block
+ vbroadcasti128 ymm7, [r9] ; NOTE(review): keys are consumed in ascending order from r9; presumably r9 is a schedule prepared for vaesdec - confirm
+ vpxor ymm0, ymm0, ymm7
+ vpxor ymm1, ymm1, ymm7
+ vpxor ymm2, ymm2, ymm7
+ vpxor ymm3, ymm3, ymm7
+ vbroadcasti128 ymm7, [r9+16]
+ vaesdec ymm0, ymm0, ymm7
+ vaesdec ymm1, ymm1, ymm7
+ vaesdec ymm2, ymm2, ymm7
+ vaesdec ymm3, ymm3, ymm7
+ vbroadcasti128 ymm7, [r9+32]
+ vaesdec ymm0, ymm0, ymm7
+ vaesdec ymm1, ymm1, ymm7
+ vaesdec ymm2, ymm2, ymm7
+ vaesdec ymm3, ymm3, ymm7
+ vbroadcasti128 ymm7, [r9+48]
+ vaesdec ymm0, ymm0, ymm7
+ vaesdec ymm1, ymm1, ymm7
+ vaesdec ymm2, ymm2, ymm7
+ vaesdec ymm3, ymm3, ymm7
+ vbroadcasti128 ymm7, [r9+64]
+ vaesdec ymm0, ymm0, ymm7
+ vaesdec ymm1, ymm1, ymm7
+ vaesdec ymm2, ymm2, ymm7
+ vaesdec ymm3, ymm3, ymm7
+ vbroadcasti128 ymm7, [r9+80]
+ vaesdec ymm0, ymm0, ymm7
+ vaesdec ymm1, ymm1, ymm7
+ vaesdec ymm2, ymm2, ymm7
+ vaesdec ymm3, ymm3, ymm7
+ vbroadcasti128 ymm7, [r9+96]
+ vaesdec ymm0, ymm0, ymm7
+ vaesdec ymm1, ymm1, ymm7
+ vaesdec ymm2, ymm2, ymm7
+ vaesdec ymm3, ymm3, ymm7
+ vbroadcasti128 ymm7, [r9+112]
+ vaesdec ymm0, ymm0, ymm7
+ vaesdec ymm1, ymm1, ymm7
+ vaesdec ymm2, ymm2, ymm7
+ vaesdec ymm3, ymm3, ymm7
+ vbroadcasti128 ymm7, [r9+128]
+ vaesdec ymm0, ymm0, ymm7
+ vaesdec ymm1, ymm1, ymm7
+ vaesdec ymm2, ymm2, ymm7
+ vaesdec ymm3, ymm3, ymm7
+ vbroadcasti128 ymm7, [r9+144]
+ vaesdec ymm0, ymm0, ymm7
+ vaesdec ymm1, ymm1, ymm7
+ vaesdec ymm2, ymm2, ymm7
+ vaesdec ymm3, ymm3, ymm7
+ cmp r11d, 11 ; nr < 11: key at +160 is the last round key
+ vbroadcasti128 ymm7, [r9+160]
+ jl L_AES_ECB_decrypt_vaes_128_aes_dec_block_last
+ vaesdec ymm0, ymm0, ymm7
+ vaesdec ymm1, ymm1, ymm7
+ vaesdec ymm2, ymm2, ymm7
+ vaesdec ymm3, ymm3, ymm7
+ vbroadcasti128 ymm7, [r9+176]
+ vaesdec ymm0, ymm0, ymm7
+ vaesdec ymm1, ymm1, ymm7
+ vaesdec ymm2, ymm2, ymm7
+ vaesdec ymm3, ymm3, ymm7
+ cmp r11d, 13 ; nr < 13: key at +192 is the last round key
+ vbroadcasti128 ymm7, [r9+192]
+ jl L_AES_ECB_decrypt_vaes_128_aes_dec_block_last
+ vaesdec ymm0, ymm0, ymm7
+ vaesdec ymm1, ymm1, ymm7
+ vaesdec ymm2, ymm2, ymm7
+ vaesdec ymm3, ymm3, ymm7
+ vbroadcasti128 ymm7, [r9+208]
+ vaesdec ymm0, ymm0, ymm7
+ vaesdec ymm1, ymm1, ymm7
+ vaesdec ymm2, ymm2, ymm7
+ vaesdec ymm3, ymm3, ymm7
+ vbroadcasti128 ymm7, [r9+224]
+L_AES_ECB_decrypt_vaes_128_aes_dec_block_last:
+ vaesdeclast ymm0, ymm0, ymm7
+ vaesdeclast ymm1, ymm1, ymm7
+ vaesdeclast ymm2, ymm2, ymm7
+ vaesdeclast ymm3, ymm3, ymm7
+ vmovdqu YMMWORD PTR [rdx+rax], ymm0
+ vmovdqu YMMWORD PTR [rdx+rax+32], ymm1
+ vmovdqu YMMWORD PTR [rdx+rax+64], ymm2
+ vmovdqu YMMWORD PTR [rdx+rax+96], ymm3
+ add eax, 128
+ cmp eax, r10d
+ jl L_AES_ECB_decrypt_vaes_dec_128
+L_AES_ECB_decrypt_vaes_done_128:
+ mov r10d, r8d
+ and r10d, 4294967264 ; length rounded down to a multiple of 32
+ cmp eax, r10d
+ je L_AES_ECB_decrypt_vaes_done_32
+L_AES_ECB_decrypt_vaes_dec_32:
+ ; 32 bytes of input
+ ; aes_ecb_dec_32
+ ; two blocks are read from [rcx+rax] and written to [rdx+rax];
+ ; no pointer temporaries are needed
+ vmovdqu ymm0, YMMWORD PTR [rcx+rax]
+ ; aes_dec_block
+ vbroadcasti128 ymm7, [r9]
+ vpxor ymm0, ymm0, ymm7
+ vbroadcasti128 ymm7, [r9+16]
+ vaesdec ymm0, ymm0, ymm7
+ vbroadcasti128 ymm7, [r9+32]
+ vaesdec ymm0, ymm0, ymm7
+ vbroadcasti128 ymm7, [r9+48]
+ vaesdec ymm0, ymm0, ymm7
+ vbroadcasti128 ymm7, [r9+64]
+ vaesdec ymm0, ymm0, ymm7
+ vbroadcasti128 ymm7, [r9+80]
+ vaesdec ymm0, ymm0, ymm7
+ vbroadcasti128 ymm7, [r9+96]
+ vaesdec ymm0, ymm0, ymm7
+ vbroadcasti128 ymm7, [r9+112]
+ vaesdec ymm0, ymm0, ymm7
+ vbroadcasti128 ymm7, [r9+128]
+ vaesdec ymm0, ymm0, ymm7
+ vbroadcasti128 ymm7, [r9+144]
+ vaesdec ymm0, ymm0, ymm7
+ cmp r11d, 11
+ vbroadcasti128 ymm7, [r9+160]
+ jl L_AES_ECB_decrypt_vaes_32_aes_dec_block_last
+ vaesdec ymm0, ymm0, ymm7
+ vbroadcasti128 ymm7, [r9+176]
+ vaesdec ymm0, ymm0, ymm7
+ cmp r11d, 13
+ vbroadcasti128 ymm7, [r9+192]
+ jl L_AES_ECB_decrypt_vaes_32_aes_dec_block_last
+ vaesdec ymm0, ymm0, ymm7
+ vbroadcasti128 ymm7, [r9+208]
+ vaesdec ymm0, ymm0, ymm7
+ vbroadcasti128 ymm7, [r9+224]
+L_AES_ECB_decrypt_vaes_32_aes_dec_block_last:
+ vaesdeclast ymm0, ymm0, ymm7
+ vmovdqu YMMWORD PTR [rdx+rax], ymm0
+ add eax, 32
+ cmp eax, r10d
+ jl L_AES_ECB_decrypt_vaes_dec_32
+L_AES_ECB_decrypt_vaes_done_32:
+ cmp eax, r8d
+ mov r10d, r8d
+ je L_AES_ECB_decrypt_vaes_done_dec
+ and r10d, 4294967280 ; length rounded down to a multiple of 16
+L_AES_ECB_decrypt_vaes_dec_16:
+ ; 16 bytes of input
+ ; NOTE(review): the body runs once before the bound is checked, so the length is assumed to be a multiple of 16 - confirm callers guarantee this
+ vmovdqu xmm0, OWORD PTR [rcx+rax]
+ ; aes_dec_block
+ vpxor xmm0, xmm0, [r9]
+ vmovdqu xmm5, OWORD PTR [r9+16]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+32]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+48]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+64]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+80]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+96]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+112]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+128]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+144]
+ vaesdec xmm0, xmm0, xmm5
+ cmp r11d, 11
+ vmovdqu xmm5, OWORD PTR [r9+160]
+ jl L_AES_ECB_decrypt_vaes_16_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r9+176]
+ vaesdec xmm0, xmm0, xmm6
+ cmp r11d, 13
+ vmovdqu xmm5, OWORD PTR [r9+192]
+ jl L_AES_ECB_decrypt_vaes_16_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r9+208]
+ vaesdec xmm0, xmm0, xmm6
+ vmovdqu xmm5, OWORD PTR [r9+224]
+L_AES_ECB_decrypt_vaes_16_aes_dec_block_last:
+ vaesdeclast xmm0, xmm0, xmm5
+ ; plaintext block goes to [rdx+rax]
+ vmovdqu OWORD PTR [rdx+rax], xmm0
+ add eax, 16
+ cmp eax, r10d
+ jl L_AES_ECB_decrypt_vaes_dec_16
+L_AES_ECB_decrypt_vaes_done_dec:
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ add rsp, 32
+ ret
+AES_ECB_decrypt_vaes ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_CBC_encrypt_vaes PROC ; AES-CBC encrypt, one block at a time. As used below: rcx=in, rdx=out, r8=16-byte chaining value (IV), r9d=length in bytes, [rsp+40]=round keys, [rsp+48]=nr
+ mov r10, QWORD PTR [rsp+40] ; r10 = round-key pointer; kept apart from the byte offset in rax
+ mov r11d, DWORD PTR [rsp+48] ; r11d = nr (round count)
+ vmovdqu xmm0, OWORD PTR [r8] ; xmm0 = chaining value, starts as the IV
+ xor eax, eax ; eax = byte offset into in/out
+ cmp eax, r9d
+ je L_AES_CBC_encrypt_vaes_done
+L_AES_CBC_encrypt_vaes_loop:
+ ; 16 bytes of input
+ ; input block is read directly from [rcx+rax] so r10 can keep the key pointer
+ vmovdqu xmm1, OWORD PTR [rcx+rax]
+ vpxor xmm1, xmm1, xmm0 ; plaintext XOR previous ciphertext (or IV)
+ ; aes_enc_block
+ vpxor xmm1, xmm1, [r10]
+ vmovdqu xmm3, OWORD PTR [r10+16]
+ vaesenc xmm1, xmm1, xmm3
+ vmovdqu xmm3, OWORD PTR [r10+32]
+ vaesenc xmm1, xmm1, xmm3
+ vmovdqu xmm3, OWORD PTR [r10+48]
+ vaesenc xmm1, xmm1, xmm3
+ vmovdqu xmm3, OWORD PTR [r10+64]
+ vaesenc xmm1, xmm1, xmm3
+ vmovdqu xmm3, OWORD PTR [r10+80]
+ vaesenc xmm1, xmm1, xmm3
+ vmovdqu xmm3, OWORD PTR [r10+96]
+ vaesenc xmm1, xmm1, xmm3
+ vmovdqu xmm3, OWORD PTR [r10+112]
+ vaesenc xmm1, xmm1, xmm3
+ vmovdqu xmm3, OWORD PTR [r10+128]
+ vaesenc xmm1, xmm1, xmm3
+ vmovdqu xmm3, OWORD PTR [r10+144]
+ vaesenc xmm1, xmm1, xmm3
+ cmp r11d, 11 ; nr < 11: key at +160 is the last round key
+ vmovdqu xmm3, OWORD PTR [r10+160]
+ jl L_AES_CBC_encrypt_vaes_aes_enc_block_last
+ vaesenc xmm1, xmm1, xmm3
+ vmovdqu xmm4, OWORD PTR [r10+176]
+ vaesenc xmm1, xmm1, xmm4
+ cmp r11d, 13 ; nr < 13: key at +192 is the last round key
+ vmovdqu xmm3, OWORD PTR [r10+192]
+ jl L_AES_CBC_encrypt_vaes_aes_enc_block_last
+ vaesenc xmm1, xmm1, xmm3
+ vmovdqu xmm4, OWORD PTR [r10+208]
+ vaesenc xmm1, xmm1, xmm4
+ vmovdqu xmm3, OWORD PTR [r10+224]
+L_AES_CBC_encrypt_vaes_aes_enc_block_last:
+ vaesenclast xmm1, xmm1, xmm3
+ ; ciphertext block goes to [rdx+rax]
+ vmovdqu OWORD PTR [rdx+rax], xmm1
+ vmovdqa xmm0, xmm1 ; ciphertext becomes the next chaining value
+ add eax, 16
+ cmp eax, r9d
+ jl L_AES_CBC_encrypt_vaes_loop
+L_AES_CBC_encrypt_vaes_done:
+ vmovdqu OWORD PTR [r8], xmm0 ; write the final chaining value back for the caller
+ ret
+AES_CBC_encrypt_vaes ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_CBC_decrypt_vaes PROC ; AES-CBC decrypt with VAES. As used below: rcx=in, rdx=out, r8=16-byte chaining value (IV), r9d=length in bytes, 5th arg=round keys, 6th arg=nr
+ push r12
+ mov r11, QWORD PTR [rsp+48] ; r11 = round-key pointer; kept apart from the byte offset in rax
+ mov r12d, DWORD PTR [rsp+56] ; r12d = nr (round count); r10d is needed for the loop bound
+ sub rsp, 128
+ vmovdqu OWORD PTR [rsp], xmm6 ; xmm6-xmm13 are written below, so they are saved and restored
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu xmm8, OWORD PTR [r8] ; xmm8 = previous ciphertext block, starts as the IV
+ xor eax, eax ; eax = byte offset into in/out
+ cmp r9d, 128
+ mov r10d, r9d ; r10d = loop bound
+ jl L_AES_CBC_decrypt_vaes_done_128
+ and r10d, 4294967168 ; length rounded down to a multiple of 128
+L_AES_CBC_decrypt_vaes_dec_128:
+ ; 128 bytes of input
+ ; aes_cbc_dec_128
+ ; in/out are addressed directly as [rcx+rax] / [rdx+rax] so that
+ ; r11 and r12d can keep the key pointer and nr
+ vmovdqu ymm0, YMMWORD PTR [rcx+rax]
+ vmovdqu ymm1, YMMWORD PTR [rcx+rax+32]
+ vmovdqu ymm2, YMMWORD PTR [rcx+rax+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+rax+96]
+ vinserti128 ymm10, ymm8, xmm0, 1 ; ymm10 = {previous block, c0}: XOR values for blocks 0 and 1
+ vmovdqu ymm11, YMMWORD PTR [rcx+rax+16] ; {c1, c2}: XOR values for blocks 2 and 3
+ vmovdqu ymm12, YMMWORD PTR [rcx+rax+48] ; {c3, c4}
+ vmovdqu ymm13, YMMWORD PTR [rcx+rax+80] ; {c5, c6}
+ vextracti128 xmm8, ymm3, 1 ; c7 is the chaining value for the next iteration
+ ; aes_dec_block
+ vbroadcasti128 ymm9, [r11] ; NOTE(review): keys are consumed in ascending order; presumably a schedule prepared for vaesdec - confirm
+ vpxor ymm0, ymm0, ymm9
+ vpxor ymm1, ymm1, ymm9
+ vpxor ymm2, ymm2, ymm9
+ vpxor ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r11+16]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r11+32]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r11+48]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r11+64]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r11+80]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r11+96]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r11+112]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r11+128]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r11+144]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ cmp r12d, 11 ; nr < 11: key at +160 is the last round key
+ vbroadcasti128 ymm9, [r11+160]
+ jl L_AES_CBC_decrypt_vaes_128_aes_dec_block_last
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r11+176]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ cmp r12d, 13 ; nr < 13: key at +192 is the last round key
+ vbroadcasti128 ymm9, [r11+192]
+ jl L_AES_CBC_decrypt_vaes_128_aes_dec_block_last
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r11+208]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r11+224]
+L_AES_CBC_decrypt_vaes_128_aes_dec_block_last:
+ vaesdeclast ymm0, ymm0, ymm9
+ vaesdeclast ymm1, ymm1, ymm9
+ vaesdeclast ymm2, ymm2, ymm9
+ vaesdeclast ymm3, ymm3, ymm9
+ vpxor ymm0, ymm0, ymm10 ; CBC: XOR each decrypted block with the preceding ciphertext block
+ vpxor ymm1, ymm1, ymm11
+ vpxor ymm2, ymm2, ymm12
+ vpxor ymm3, ymm3, ymm13
+ vmovdqu YMMWORD PTR [rdx+rax], ymm0
+ vmovdqu YMMWORD PTR [rdx+rax+32], ymm1
+ vmovdqu YMMWORD PTR [rdx+rax+64], ymm2
+ vmovdqu YMMWORD PTR [rdx+rax+96], ymm3
+ add eax, 128
+ cmp eax, r10d
+ jl L_AES_CBC_decrypt_vaes_dec_128
+L_AES_CBC_decrypt_vaes_done_128:
+ mov r10d, r9d
+ and r10d, 4294967264 ; length rounded down to a multiple of 32
+ cmp eax, r10d
+ je L_AES_CBC_decrypt_vaes_done_32
+L_AES_CBC_decrypt_vaes_dec_32:
+ ; 32 bytes of input
+ ; aes_cbc_dec_32
+ ; two blocks are read from [rcx+rax] and written to [rdx+rax];
+ ; no pointer temporaries are needed
+ vmovdqu ymm0, YMMWORD PTR [rcx+rax]
+ vinserti128 ymm10, ymm8, xmm0, 1 ; {previous block, c0}
+ vextracti128 xmm8, ymm0, 1 ; c1 is the next chaining value
+ ; aes_dec_block
+ vbroadcasti128 ymm9, [r11]
+ vpxor ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r11+16]
+ vaesdec ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r11+32]
+ vaesdec ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r11+48]
+ vaesdec ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r11+64]
+ vaesdec ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r11+80]
+ vaesdec ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r11+96]
+ vaesdec ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r11+112]
+ vaesdec ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r11+128]
+ vaesdec ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r11+144]
+ vaesdec ymm0, ymm0, ymm9
+ cmp r12d, 11
+ vbroadcasti128 ymm9, [r11+160]
+ jl L_AES_CBC_decrypt_vaes_32_aes_dec_block_last
+ vaesdec ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r11+176]
+ vaesdec ymm0, ymm0, ymm9
+ cmp r12d, 13
+ vbroadcasti128 ymm9, [r11+192]
+ jl L_AES_CBC_decrypt_vaes_32_aes_dec_block_last
+ vaesdec ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r11+208]
+ vaesdec ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r11+224]
+L_AES_CBC_decrypt_vaes_32_aes_dec_block_last:
+ vaesdeclast ymm0, ymm0, ymm9
+ vpxor ymm0, ymm0, ymm10
+ vmovdqu YMMWORD PTR [rdx+rax], ymm0
+ add eax, 32
+ cmp eax, r10d
+ jl L_AES_CBC_decrypt_vaes_dec_32
+L_AES_CBC_decrypt_vaes_done_32:
+ cmp eax, r9d
+ mov r10d, r9d
+ je L_AES_CBC_decrypt_vaes_done_dec
+ and r10d, 4294967280 ; length rounded down to a multiple of 16
+L_AES_CBC_decrypt_vaes_dec_16:
+ ; 16 bytes of input
+ ; NOTE(review): the body runs once before the bound is checked, so the length is assumed to be a multiple of 16 - confirm callers guarantee this
+ vmovdqu xmm0, OWORD PTR [rcx+rax]
+ vmovdqa xmm7, xmm0 ; keep the ciphertext: it is the next chaining value
+ ; aes_dec_block
+ vpxor xmm0, xmm0, [r11]
+ vmovdqu xmm5, OWORD PTR [r11+16]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r11+32]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r11+48]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r11+64]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r11+80]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r11+96]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r11+112]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r11+128]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r11+144]
+ vaesdec xmm0, xmm0, xmm5
+ cmp r12d, 11
+ vmovdqu xmm5, OWORD PTR [r11+160]
+ jl L_AES_CBC_decrypt_vaes_16_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r11+176]
+ vaesdec xmm0, xmm0, xmm6
+ cmp r12d, 13
+ vmovdqu xmm5, OWORD PTR [r11+192]
+ jl L_AES_CBC_decrypt_vaes_16_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r11+208]
+ vaesdec xmm0, xmm0, xmm6
+ vmovdqu xmm5, OWORD PTR [r11+224]
+L_AES_CBC_decrypt_vaes_16_aes_dec_block_last:
+ vaesdeclast xmm0, xmm0, xmm5
+ vpxor xmm0, xmm0, xmm8
+ vmovdqa xmm8, xmm7
+ ; plaintext block goes to [rdx+rax]
+ vmovdqu OWORD PTR [rdx+rax], xmm0
+ add eax, 16
+ cmp eax, r10d
+ jl L_AES_CBC_decrypt_vaes_dec_16
+L_AES_CBC_decrypt_vaes_done_dec:
+ vmovdqu OWORD PTR [r8], xmm8 ; write the last ciphertext block back as the new chaining value
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ add rsp, 128
+ pop r12
+ ret
+AES_CBC_decrypt_vaes ENDP
+_TEXT ENDS
+_DATA SEGMENT ; 16-byte vpshufb mask; stored little-endian its bytes are 0fh,0eh,...,00h, i.e. it reverses the byte order of a 128-bit lane
+ALIGN 16
+L_aes_ctr_bswap_vaes QWORD \
+ 08090a0b0c0d0e0fh, 0001020304050607h
+ptr_L_aes_ctr_bswap_vaes QWORD L_aes_ctr_bswap_vaes ; holds the ADDRESS of the mask; NOTE(review): an operand that wants the mask bytes must name L_aes_ctr_bswap_vaes, not this pointer
+_DATA ENDS
+_DATA SEGMENT ; 17 consecutive 128-bit little-endian integers 0..16; the entry for value k is at byte offset 16*k
+ALIGN 16
+L_aes_ctr_inc_vaes QWORD \
+ 0000000000000000h, 0000000000000000h,
+ 0000000000000001h, 0000000000000000h,
+ 0000000000000002h, 0000000000000000h,
+ 0000000000000003h, 0000000000000000h,
+ 0000000000000004h, 0000000000000000h,
+ 0000000000000005h, 0000000000000000h,
+ 0000000000000006h, 0000000000000000h,
+ 0000000000000007h, 0000000000000000h,
+ 0000000000000008h, 0000000000000000h,
+ 0000000000000009h, 0000000000000000h,
+ 000000000000000ah, 0000000000000000h,
+ 000000000000000bh, 0000000000000000h,
+ 000000000000000ch, 0000000000000000h,
+ 000000000000000dh, 0000000000000000h,
+ 000000000000000eh, 0000000000000000h,
+ 000000000000000fh, 0000000000000000h,
+ 0000000000000010h, 0000000000000000h
+ptr_L_aes_ctr_inc_vaes QWORD L_aes_ctr_inc_vaes ; holds the ADDRESS of the table; NOTE(review): an operand that wants table entries must name L_aes_ctr_inc_vaes, not this pointer
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+AES_CTR_encrypt_vaes PROC ; AES-CTR with VAES. As used below: rcx=in, rdx=out, r8d=length in bytes, r9=round keys, 5th arg=nr, 6th arg=16-byte counter block (updated on return)
+ push rbx
+ mov ebx, DWORD PTR [rsp+48] ; ebx = nr (round count); kept apart from the byte offset in eax
+ mov r10, QWORD PTR [rsp+56] ; r10 = counter block pointer; must survive until the final store
+ sub rsp, 144
+ vmovdqu OWORD PTR [rsp], xmm6 ; xmm6-xmm14 are written below, so they are saved and restored
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu OWORD PTR [rsp+128], xmm14
+ vbroadcasti128 ymm8, OWORD PTR L_aes_ctr_bswap_vaes ; load the mask itself, not the ptr_ variable that stores its address
+ vbroadcasti128 ymm7, [r10]
+ vpshufb ymm7, ymm7, ymm8 ; byte-reverse the counter so vpaddq can increment it
+ vbroadcasti128 ymm10, OWORD PTR [L_aes_ctr_inc_vaes+128] ; +8 in both lanes (128-byte loop step)
+ vbroadcasti128 ymm11, OWORD PTR [L_aes_ctr_inc_vaes+32] ; +2 in both lanes (32-byte loop step)
+ vbroadcasti128 ymm12, OWORD PTR [L_aes_ctr_inc_vaes+16] ; +1 in both lanes (16-byte loop step)
+ xor eax, eax ; eax = byte offset into in/out
+ cmp r8d, 128
+ mov r11d, r8d ; r11d = loop bound; r10 keeps the counter pointer
+ jl L_AES_CTR_encrypt_vaes_done_128
+ and r11d, 4294967168 ; length rounded down to a multiple of 128
+ vmovdqa ymm9, ymm7
+ vpaddq ymm4, ymm7, YMMWORD PTR [L_aes_ctr_inc_vaes] ; ymm4 = counter + {0,1}
+ vpand ymm14, ymm9, YMMWORD PTR [L_aes_ctr_inc_vaes] ; carry out of the low qword = ((a & b) | ((a | b) & ~sum)) >> 63
+ vpor ymm9, ymm9, YMMWORD PTR [L_aes_ctr_inc_vaes]
+ vpandn ymm9, ymm4, ymm9
+ vpor ymm9, ymm9, ymm14
+ vpsrlq ymm9, ymm9, 63
+ vpslldq ymm9, ymm9, 8 ; move the carry bit into the high qword of each lane
+ vpaddq ymm4, ymm4, ymm9
+ vmovdqa ymm9, ymm7
+ vpaddq ymm5, ymm7, YMMWORD PTR [L_aes_ctr_inc_vaes+32] ; ymm5 = counter + {2,3}
+ vpand ymm14, ymm9, YMMWORD PTR [L_aes_ctr_inc_vaes+32]
+ vpor ymm9, ymm9, YMMWORD PTR [L_aes_ctr_inc_vaes+32]
+ vpandn ymm9, ymm5, ymm9
+ vpor ymm9, ymm9, ymm14
+ vpsrlq ymm9, ymm9, 63
+ vpslldq ymm9, ymm9, 8
+ vpaddq ymm5, ymm5, ymm9
+ vmovdqa ymm9, ymm7
+ vpaddq ymm6, ymm7, YMMWORD PTR [L_aes_ctr_inc_vaes+64] ; ymm6 = counter + {4,5}
+ vpand ymm14, ymm9, YMMWORD PTR [L_aes_ctr_inc_vaes+64]
+ vpor ymm9, ymm9, YMMWORD PTR [L_aes_ctr_inc_vaes+64]
+ vpandn ymm9, ymm6, ymm9
+ vpor ymm9, ymm9, ymm14
+ vpsrlq ymm9, ymm9, 63
+ vpslldq ymm9, ymm9, 8
+ vpaddq ymm6, ymm6, ymm9
+ vmovdqa ymm9, ymm7
+ vpaddq ymm7, ymm7, YMMWORD PTR [L_aes_ctr_inc_vaes+96] ; ymm7 = counter + {6,7}
+ vpand ymm14, ymm9, YMMWORD PTR [L_aes_ctr_inc_vaes+96]
+ vpor ymm9, ymm9, YMMWORD PTR [L_aes_ctr_inc_vaes+96]
+ vpandn ymm9, ymm7, ymm9
+ vpor ymm9, ymm9, ymm14
+ vpsrlq ymm9, ymm9, 63
+ vpslldq ymm9, ymm9, 8
+ vpaddq ymm7, ymm7, ymm9
+L_AES_CTR_encrypt_vaes_enc_128:
+ ; 128 bytes of input
+ ; in/out are addressed directly as [rcx+rax] / [rdx+rax] so that
+ ; r11d and ebx stay free for the loop bound and nr
+ vpshufb ymm0, ymm4, ymm8
+ vpshufb ymm1, ymm5, ymm8
+ vpshufb ymm2, ymm6, ymm8
+ vpshufb ymm3, ymm7, ymm8
+ vmovdqa ymm9, ymm4
+ vpaddq ymm4, ymm4, ymm10 ; advance all eight counters by 8 for the next iteration
+ vpand ymm14, ymm9, ymm10
+ vpor ymm9, ymm9, ymm10
+ vpandn ymm9, ymm4, ymm9
+ vpor ymm9, ymm9, ymm14
+ vpsrlq ymm9, ymm9, 63
+ vpslldq ymm9, ymm9, 8
+ vpaddq ymm4, ymm4, ymm9
+ vmovdqa ymm9, ymm5
+ vpaddq ymm5, ymm5, ymm10
+ vpand ymm14, ymm9, ymm10
+ vpor ymm9, ymm9, ymm10
+ vpandn ymm9, ymm5, ymm9
+ vpor ymm9, ymm9, ymm14
+ vpsrlq ymm9, ymm9, 63
+ vpslldq ymm9, ymm9, 8
+ vpaddq ymm5, ymm5, ymm9
+ vmovdqa ymm9, ymm6
+ vpaddq ymm6, ymm6, ymm10
+ vpand ymm14, ymm9, ymm10
+ vpor ymm9, ymm9, ymm10
+ vpandn ymm9, ymm6, ymm9
+ vpor ymm9, ymm9, ymm14
+ vpsrlq ymm9, ymm9, 63
+ vpslldq ymm9, ymm9, 8
+ vpaddq ymm6, ymm6, ymm9
+ vmovdqa ymm9, ymm7
+ vpaddq ymm7, ymm7, ymm10
+ vpand ymm14, ymm9, ymm10
+ vpor ymm9, ymm9, ymm10
+ vpandn ymm9, ymm7, ymm9
+ vpor ymm9, ymm9, ymm14
+ vpsrlq ymm9, ymm9, 63
+ vpslldq ymm9, ymm9, 8
+ vpaddq ymm7, ymm7, ymm9
+ ; aes_enc_block
+ vbroadcasti128 ymm13, [r9]
+ vpxor ymm0, ymm0, ymm13
+ vpxor ymm1, ymm1, ymm13
+ vpxor ymm2, ymm2, ymm13
+ vpxor ymm3, ymm3, ymm13
+ vbroadcasti128 ymm13, [r9+16]
+ vaesenc ymm0, ymm0, ymm13
+ vaesenc ymm1, ymm1, ymm13
+ vaesenc ymm2, ymm2, ymm13
+ vaesenc ymm3, ymm3, ymm13
+ vbroadcasti128 ymm13, [r9+32]
+ vaesenc ymm0, ymm0, ymm13
+ vaesenc ymm1, ymm1, ymm13
+ vaesenc ymm2, ymm2, ymm13
+ vaesenc ymm3, ymm3, ymm13
+ vbroadcasti128 ymm13, [r9+48]
+ vaesenc ymm0, ymm0, ymm13
+ vaesenc ymm1, ymm1, ymm13
+ vaesenc ymm2, ymm2, ymm13
+ vaesenc ymm3, ymm3, ymm13
+ vbroadcasti128 ymm13, [r9+64]
+ vaesenc ymm0, ymm0, ymm13
+ vaesenc ymm1, ymm1, ymm13
+ vaesenc ymm2, ymm2, ymm13
+ vaesenc ymm3, ymm3, ymm13
+ vbroadcasti128 ymm13, [r9+80]
+ vaesenc ymm0, ymm0, ymm13
+ vaesenc ymm1, ymm1, ymm13
+ vaesenc ymm2, ymm2, ymm13
+ vaesenc ymm3, ymm3, ymm13
+ vbroadcasti128 ymm13, [r9+96]
+ vaesenc ymm0, ymm0, ymm13
+ vaesenc ymm1, ymm1, ymm13
+ vaesenc ymm2, ymm2, ymm13
+ vaesenc ymm3, ymm3, ymm13
+ vbroadcasti128 ymm13, [r9+112]
+ vaesenc ymm0, ymm0, ymm13
+ vaesenc ymm1, ymm1, ymm13
+ vaesenc ymm2, ymm2, ymm13
+ vaesenc ymm3, ymm3, ymm13
+ vbroadcasti128 ymm13, [r9+128]
+ vaesenc ymm0, ymm0, ymm13
+ vaesenc ymm1, ymm1, ymm13
+ vaesenc ymm2, ymm2, ymm13
+ vaesenc ymm3, ymm3, ymm13
+ vbroadcasti128 ymm13, [r9+144]
+ vaesenc ymm0, ymm0, ymm13
+ vaesenc ymm1, ymm1, ymm13
+ vaesenc ymm2, ymm2, ymm13
+ vaesenc ymm3, ymm3, ymm13
+ cmp ebx, 11 ; nr < 11: key at +160 is the last round key
+ vbroadcasti128 ymm13, [r9+160]
+ jl L_AES_CTR_encrypt_vaes_128_aes_enc_block_last
+ vaesenc ymm0, ymm0, ymm13
+ vaesenc ymm1, ymm1, ymm13
+ vaesenc ymm2, ymm2, ymm13
+ vaesenc ymm3, ymm3, ymm13
+ vbroadcasti128 ymm13, [r9+176]
+ vaesenc ymm0, ymm0, ymm13
+ vaesenc ymm1, ymm1, ymm13
+ vaesenc ymm2, ymm2, ymm13
+ vaesenc ymm3, ymm3, ymm13
+ cmp ebx, 13 ; nr < 13: key at +192 is the last round key
+ vbroadcasti128 ymm13, [r9+192]
+ jl L_AES_CTR_encrypt_vaes_128_aes_enc_block_last
+ vaesenc ymm0, ymm0, ymm13
+ vaesenc ymm1, ymm1, ymm13
+ vaesenc ymm2, ymm2, ymm13
+ vaesenc ymm3, ymm3, ymm13
+ vbroadcasti128 ymm13, [r9+208]
+ vaesenc ymm0, ymm0, ymm13
+ vaesenc ymm1, ymm1, ymm13
+ vaesenc ymm2, ymm2, ymm13
+ vaesenc ymm3, ymm3, ymm13
+ vbroadcasti128 ymm13, [r9+224]
+L_AES_CTR_encrypt_vaes_128_aes_enc_block_last:
+ vaesenclast ymm0, ymm0, ymm13
+ vaesenclast ymm1, ymm1, ymm13
+ vaesenclast ymm2, ymm2, ymm13
+ vaesenclast ymm3, ymm3, ymm13
+ vpxor ymm0, ymm0, YMMWORD PTR [rcx+rax] ; keystream XOR input
+ vpxor ymm1, ymm1, YMMWORD PTR [rcx+rax+32]
+ vpxor ymm2, ymm2, YMMWORD PTR [rcx+rax+64]
+ vpxor ymm3, ymm3, YMMWORD PTR [rcx+rax+96]
+ vmovdqu YMMWORD PTR [rdx+rax], ymm0
+ vmovdqu YMMWORD PTR [rdx+rax+32], ymm1
+ vmovdqu YMMWORD PTR [rdx+rax+64], ymm2
+ vmovdqu YMMWORD PTR [rdx+rax+96], ymm3
+ add eax, 128
+ cmp eax, r11d
+ jl L_AES_CTR_encrypt_vaes_enc_128
+ vperm2i128 ymm7, ymm4, ymm4, 0 ; next unused counter (low lane of ymm4) copied to both lanes of ymm7
+L_AES_CTR_encrypt_vaes_done_128:
+ mov r11d, r8d
+ and r11d, 4294967264 ; length rounded down to a multiple of 32
+ cmp eax, r11d
+ je L_AES_CTR_encrypt_vaes_done_32
+L_AES_CTR_encrypt_vaes_enc_32:
+ ; 32 bytes of input
+ ; aes_ctr_enc_32
+ ; two blocks are read from [rcx+rax] and written to [rdx+rax];
+ ; no pointer temporaries are needed
+ vpaddq ymm0, ymm7, YMMWORD PTR [L_aes_ctr_inc_vaes] ; ymm0 = counter + {0,1}
+ vmovdqa ymm9, ymm7
+ vpand ymm14, ymm9, YMMWORD PTR [L_aes_ctr_inc_vaes]
+ vpor ymm9, ymm9, YMMWORD PTR [L_aes_ctr_inc_vaes]
+ vpandn ymm9, ymm0, ymm9
+ vpor ymm9, ymm9, ymm14
+ vpsrlq ymm9, ymm9, 63
+ vpslldq ymm9, ymm9, 8
+ vpaddq ymm0, ymm0, ymm9
+ vpshufb ymm0, ymm0, ymm8
+ vmovdqa ymm9, ymm7
+ vpaddq ymm7, ymm7, ymm11 ; counter += 2
+ vpand ymm14, ymm9, ymm11
+ vpor ymm9, ymm9, ymm11
+ vpandn ymm9, ymm7, ymm9
+ vpor ymm9, ymm9, ymm14
+ vpsrlq ymm9, ymm9, 63
+ vpslldq ymm9, ymm9, 8
+ vpaddq ymm7, ymm7, ymm9
+ ; aes_enc_block
+ vbroadcasti128 ymm13, [r9]
+ vpxor ymm0, ymm0, ymm13
+ vbroadcasti128 ymm13, [r9+16]
+ vaesenc ymm0, ymm0, ymm13
+ vbroadcasti128 ymm13, [r9+32]
+ vaesenc ymm0, ymm0, ymm13
+ vbroadcasti128 ymm13, [r9+48]
+ vaesenc ymm0, ymm0, ymm13
+ vbroadcasti128 ymm13, [r9+64]
+ vaesenc ymm0, ymm0, ymm13
+ vbroadcasti128 ymm13, [r9+80]
+ vaesenc ymm0, ymm0, ymm13
+ vbroadcasti128 ymm13, [r9+96]
+ vaesenc ymm0, ymm0, ymm13
+ vbroadcasti128 ymm13, [r9+112]
+ vaesenc ymm0, ymm0, ymm13
+ vbroadcasti128 ymm13, [r9+128]
+ vaesenc ymm0, ymm0, ymm13
+ vbroadcasti128 ymm13, [r9+144]
+ vaesenc ymm0, ymm0, ymm13
+ cmp ebx, 11
+ vbroadcasti128 ymm13, [r9+160]
+ jl L_AES_CTR_encrypt_vaes_32_aes_enc_block_last
+ vaesenc ymm0, ymm0, ymm13
+ vbroadcasti128 ymm13, [r9+176]
+ vaesenc ymm0, ymm0, ymm13
+ cmp ebx, 13
+ vbroadcasti128 ymm13, [r9+192]
+ jl L_AES_CTR_encrypt_vaes_32_aes_enc_block_last
+ vaesenc ymm0, ymm0, ymm13
+ vbroadcasti128 ymm13, [r9+208]
+ vaesenc ymm0, ymm0, ymm13
+ vbroadcasti128 ymm13, [r9+224]
+L_AES_CTR_encrypt_vaes_32_aes_enc_block_last:
+ vaesenclast ymm0, ymm0, ymm13
+ vpxor ymm0, ymm0, YMMWORD PTR [rcx+rax]
+ vmovdqu YMMWORD PTR [rdx+rax], ymm0
+ add eax, 32
+ cmp eax, r11d
+ jl L_AES_CTR_encrypt_vaes_enc_32
+L_AES_CTR_encrypt_vaes_done_32:
+ cmp eax, r8d
+ mov r11d, r8d
+ je L_AES_CTR_encrypt_vaes_done_enc
+ and r11d, 4294967280 ; length rounded down to a multiple of 16
+L_AES_CTR_encrypt_vaes_enc_16:
+ ; 16 bytes of input
+ vpshufb xmm0, xmm7, xmm8 ; NOTE(review): the body runs once before the bound is checked, so the length is assumed to be a multiple of 16 - confirm
+ vmovdqa ymm9, ymm7
+ vpaddq ymm7, ymm7, ymm12 ; counter += 1
+ vpand ymm14, ymm9, ymm12
+ vpor ymm9, ymm9, ymm12
+ vpandn ymm9, ymm7, ymm9
+ vpor ymm9, ymm9, ymm14
+ vpsrlq ymm9, ymm9, 63
+ vpslldq ymm9, ymm9, 8
+ vpaddq ymm7, ymm7, ymm9
+ ; aes_enc_block
+ vpxor xmm0, xmm0, [r9]
+ vmovdqu xmm5, OWORD PTR [r9+16]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+32]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+48]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+64]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+80]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+96]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+112]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+128]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+144]
+ vaesenc xmm0, xmm0, xmm5
+ cmp ebx, 11
+ vmovdqu xmm5, OWORD PTR [r9+160]
+ jl L_AES_CTR_encrypt_vaes_16_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r9+176]
+ vaesenc xmm0, xmm0, xmm6
+ cmp ebx, 13
+ vmovdqu xmm5, OWORD PTR [r9+192]
+ jl L_AES_CTR_encrypt_vaes_16_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r9+208]
+ vaesenc xmm0, xmm0, xmm6
+ vmovdqu xmm5, OWORD PTR [r9+224]
+L_AES_CTR_encrypt_vaes_16_aes_enc_block_last:
+ vaesenclast xmm0, xmm0, xmm5
+ ; keystream block is XORed with the input at [rcx+rax]
+ vpxor xmm0, xmm0, OWORD PTR [rcx+rax]
+ ; and the result is stored at [rdx+rax]
+ vmovdqu OWORD PTR [rdx+rax], xmm0
+ add eax, 16
+ cmp eax, r11d
+ jl L_AES_CTR_encrypt_vaes_enc_16
+L_AES_CTR_encrypt_vaes_done_enc:
+ vpshufb xmm0, xmm7, xmm8 ; restore the original byte order of the next counter
+ vmovdqu OWORD PTR [r10], xmm0 ; and hand it back through the caller's counter block
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ vmovdqu xmm14, OWORD PTR [rsp+128]
+ add rsp, 144
+ pop rbx
+ ret
+AES_CTR_encrypt_vaes ENDP
+_TEXT ENDS
+ENDIF
+IFDEF HAVE_INTEL_AVX512
+_TEXT SEGMENT READONLY PARA
+AES_ECB_encrypt_avx512 PROC
+ ; AES-ECB encryption with 512-bit VAES, Windows x64 calling convention.
+ ; In: rcx = input, rdx = output, r8d = length in bytes, r9 = round keys,
+ ; [rsp+40] = number of rounds (compared against 11 and 13 below).
+ ; Locals: rax = byte offset, r10d = rounds, r11d = end offset of a stage.
+ ; The rounds and the key pointer stay in their own registers so the round
+ ; checks and the 16-byte tail never read the offset or a length instead.
+ mov r10d, DWORD PTR [rsp+40] ; 5th argument, read before rsp moves
+ sub rsp, 160
+ vmovdqu OWORD PTR [rsp], xmm6 ; xmm6-xmm15 are callee-saved on Windows
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu OWORD PTR [rsp+128], xmm14
+ vmovdqu OWORD PTR [rsp+144], xmm15
+ xor eax, eax ; byte offset = 0
+ cmp r8d, 64
+ jl L_AES_ECB_encrypt_avx512_done_64 ; short input: skip the key broadcast
+ vbroadcasti32x4 zmm8, [r9] ; each round key replicated into all 4 lanes
+ vbroadcasti32x4 zmm9, [r9+16]
+ vbroadcasti32x4 zmm10, [r9+32]
+ vbroadcasti32x4 zmm11, [r9+48]
+ vbroadcasti32x4 zmm12, [r9+64]
+ vbroadcasti32x4 zmm13, [r9+80]
+ vbroadcasti32x4 zmm14, [r9+96]
+ vbroadcasti32x4 zmm15, [r9+112]
+ vbroadcasti32x4 zmm16, [r9+128]
+ vbroadcasti32x4 zmm17, [r9+144]
+ vbroadcasti32x4 zmm18, [r9+160]
+ cmp r10d, 11 ; fewer than 11 rounds: keys 0..10 are enough
+ jl L_AES_ECB_encrypt_avx512_key_cached
+ vbroadcasti32x4 zmm19, [r9+176]
+ vbroadcasti32x4 zmm20, [r9+192]
+ cmp r10d, 13 ; fewer than 13 rounds: keys 0..12 are enough
+ jl L_AES_ECB_encrypt_avx512_key_cached
+ vbroadcasti32x4 zmm21, [r9+208]
+ vbroadcasti32x4 zmm22, [r9+224]
+L_AES_ECB_encrypt_avx512_key_cached:
+ cmp r8d, 256
+ mov r11d, r8d
+ jl L_AES_ECB_encrypt_avx512_done_256
+ and r11d, 4294967040 ; length rounded down to a multiple of 256
+L_AES_ECB_encrypt_avx512_enc_256:
+ ; 256 bytes of input
+ ; aes_ecb_enc_256
+ vmovdqu64 zmm0, [rcx+rax]
+ vmovdqu64 zmm1, [rcx+rax+64]
+ vmovdqu64 zmm2, [rcx+rax+128]
+ vmovdqu64 zmm3, [rcx+rax+192]
+ ; aes_enc_block
+ vpxorq zmm0, zmm0, zmm8
+ vpxorq zmm1, zmm1, zmm8
+ vpxorq zmm2, zmm2, zmm8
+ vpxorq zmm3, zmm3, zmm8
+ vaesenc zmm0, zmm0, zmm9
+ vaesenc zmm1, zmm1, zmm9
+ vaesenc zmm2, zmm2, zmm9
+ vaesenc zmm3, zmm3, zmm9
+ vaesenc zmm0, zmm0, zmm10
+ vaesenc zmm1, zmm1, zmm10
+ vaesenc zmm2, zmm2, zmm10
+ vaesenc zmm3, zmm3, zmm10
+ vaesenc zmm0, zmm0, zmm11
+ vaesenc zmm1, zmm1, zmm11
+ vaesenc zmm2, zmm2, zmm11
+ vaesenc zmm3, zmm3, zmm11
+ vaesenc zmm0, zmm0, zmm12
+ vaesenc zmm1, zmm1, zmm12
+ vaesenc zmm2, zmm2, zmm12
+ vaesenc zmm3, zmm3, zmm12
+ vaesenc zmm0, zmm0, zmm13
+ vaesenc zmm1, zmm1, zmm13
+ vaesenc zmm2, zmm2, zmm13
+ vaesenc zmm3, zmm3, zmm13
+ vaesenc zmm0, zmm0, zmm14
+ vaesenc zmm1, zmm1, zmm14
+ vaesenc zmm2, zmm2, zmm14
+ vaesenc zmm3, zmm3, zmm14
+ vaesenc zmm0, zmm0, zmm15
+ vaesenc zmm1, zmm1, zmm15
+ vaesenc zmm2, zmm2, zmm15
+ vaesenc zmm3, zmm3, zmm15
+ vaesenc zmm0, zmm0, zmm16
+ vaesenc zmm1, zmm1, zmm16
+ vaesenc zmm2, zmm2, zmm16
+ vaesenc zmm3, zmm3, zmm16
+ vaesenc zmm0, zmm0, zmm17
+ vaesenc zmm1, zmm1, zmm17
+ vaesenc zmm2, zmm2, zmm17
+ vaesenc zmm3, zmm3, zmm17
+ cmp r10d, 11
+ vmovdqa64 zmm7, zmm18 ; zmm7 = candidate last round key; flags untouched
+ jl L_AES_ECB_encrypt_avx512_256_aes_enc_block_last
+ vaesenc zmm0, zmm0, zmm18
+ vaesenc zmm1, zmm1, zmm18
+ vaesenc zmm2, zmm2, zmm18
+ vaesenc zmm3, zmm3, zmm18
+ vaesenc zmm0, zmm0, zmm19
+ vaesenc zmm1, zmm1, zmm19
+ vaesenc zmm2, zmm2, zmm19
+ vaesenc zmm3, zmm3, zmm19
+ cmp r10d, 13
+ vmovdqa64 zmm7, zmm20
+ jl L_AES_ECB_encrypt_avx512_256_aes_enc_block_last
+ vaesenc zmm0, zmm0, zmm20
+ vaesenc zmm1, zmm1, zmm20
+ vaesenc zmm2, zmm2, zmm20
+ vaesenc zmm3, zmm3, zmm20
+ vaesenc zmm0, zmm0, zmm21
+ vaesenc zmm1, zmm1, zmm21
+ vaesenc zmm2, zmm2, zmm21
+ vaesenc zmm3, zmm3, zmm21
+ vmovdqa64 zmm7, zmm22
+L_AES_ECB_encrypt_avx512_256_aes_enc_block_last:
+ vaesenclast zmm0, zmm0, zmm7
+ vaesenclast zmm1, zmm1, zmm7
+ vaesenclast zmm2, zmm2, zmm7
+ vaesenclast zmm3, zmm3, zmm7
+ vmovdqu64 [rdx+rax], zmm0
+ vmovdqu64 [rdx+rax+64], zmm1
+ vmovdqu64 [rdx+rax+128], zmm2
+ vmovdqu64 [rdx+rax+192], zmm3
+ add eax, 256
+ cmp eax, r11d
+ jl L_AES_ECB_encrypt_avx512_enc_256
+L_AES_ECB_encrypt_avx512_done_256:
+ mov r11d, r8d
+ and r11d, 4294967232 ; length rounded down to a multiple of 64
+ cmp eax, r11d
+ je L_AES_ECB_encrypt_avx512_done_64
+L_AES_ECB_encrypt_avx512_enc_64:
+ ; 64 bytes of input
+ ; aes_ecb_enc_64
+ vmovdqu64 zmm0, [rcx+rax]
+ ; aes_enc_block
+ vpxorq zmm0, zmm0, zmm8
+ vaesenc zmm0, zmm0, zmm9
+ vaesenc zmm0, zmm0, zmm10
+ vaesenc zmm0, zmm0, zmm11
+ vaesenc zmm0, zmm0, zmm12
+ vaesenc zmm0, zmm0, zmm13
+ vaesenc zmm0, zmm0, zmm14
+ vaesenc zmm0, zmm0, zmm15
+ vaesenc zmm0, zmm0, zmm16
+ vaesenc zmm0, zmm0, zmm17
+ cmp r10d, 11
+ vmovdqa64 zmm7, zmm18
+ jl L_AES_ECB_encrypt_avx512_64_aes_enc_block_last
+ vaesenc zmm0, zmm0, zmm18
+ vaesenc zmm0, zmm0, zmm19
+ cmp r10d, 13
+ vmovdqa64 zmm7, zmm20
+ jl L_AES_ECB_encrypt_avx512_64_aes_enc_block_last
+ vaesenc zmm0, zmm0, zmm20
+ vaesenc zmm0, zmm0, zmm21
+ vmovdqa64 zmm7, zmm22
+L_AES_ECB_encrypt_avx512_64_aes_enc_block_last:
+ vaesenclast zmm0, zmm0, zmm7
+ vmovdqu64 [rdx+rax], zmm0
+ add eax, 64
+ cmp eax, r11d
+ jl L_AES_ECB_encrypt_avx512_enc_64
+L_AES_ECB_encrypt_avx512_done_64:
+ cmp eax, r8d
+ mov r11d, r8d
+ je L_AES_ECB_encrypt_avx512_done_enc
+ and r11d, 4294967280 ; length rounded down to a multiple of 16
+L_AES_ECB_encrypt_avx512_enc_16:
+ ; 16 bytes of input
+ vmovdqu xmm0, OWORD PTR [rcx+rax]
+ ; aes_enc_block
+ vpxor xmm0, xmm0, [r9] ; tail reads round keys from memory via r9
+ vmovdqu xmm5, OWORD PTR [r9+16]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+32]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+48]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+64]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+80]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+96]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+112]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+128]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+144]
+ vaesenc xmm0, xmm0, xmm5
+ cmp r10d, 11
+ vmovdqu xmm5, OWORD PTR [r9+160]
+ jl L_AES_ECB_encrypt_avx512_16_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r9+176]
+ vaesenc xmm0, xmm0, xmm6
+ cmp r10d, 13
+ vmovdqu xmm5, OWORD PTR [r9+192]
+ jl L_AES_ECB_encrypt_avx512_16_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r9+208]
+ vaesenc xmm0, xmm0, xmm6
+ vmovdqu xmm5, OWORD PTR [r9+224]
+L_AES_ECB_encrypt_avx512_16_aes_enc_block_last:
+ vaesenclast xmm0, xmm0, xmm5
+ vmovdqu OWORD PTR [rdx+rax], xmm0
+ add eax, 16
+ cmp eax, r11d
+ jl L_AES_ECB_encrypt_avx512_enc_16
+L_AES_ECB_encrypt_avx512_done_enc:
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ vmovdqu xmm14, OWORD PTR [rsp+128]
+ vmovdqu xmm15, OWORD PTR [rsp+144]
+ add rsp, 160
+ ret
+AES_ECB_encrypt_avx512 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_ECB_decrypt_avx512 PROC
+ ; AES-ECB decryption with 512-bit VAES, Windows x64 calling convention.
+ ; In: rcx = input, rdx = output, r8d = length in bytes, r9 = round keys,
+ ; [rsp+40] = number of rounds (compared against 11 and 13 below).
+ ; Locals: rax = byte offset, r10d = rounds, r11d = end offset of a stage.
+ ; The rounds and the key pointer stay in their own registers so the round
+ ; checks and the 16-byte tail never read the offset or a length instead.
+ mov r10d, DWORD PTR [rsp+40] ; 5th argument, read before rsp moves
+ sub rsp, 160
+ vmovdqu OWORD PTR [rsp], xmm6 ; xmm6-xmm15 are callee-saved on Windows
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu OWORD PTR [rsp+128], xmm14
+ vmovdqu OWORD PTR [rsp+144], xmm15
+ xor eax, eax ; byte offset = 0
+ cmp r8d, 64
+ jl L_AES_ECB_decrypt_avx512_done_64 ; short input: skip the key broadcast
+ vbroadcasti32x4 zmm8, [r9] ; each round key replicated into all 4 lanes
+ vbroadcasti32x4 zmm9, [r9+16]
+ vbroadcasti32x4 zmm10, [r9+32]
+ vbroadcasti32x4 zmm11, [r9+48]
+ vbroadcasti32x4 zmm12, [r9+64]
+ vbroadcasti32x4 zmm13, [r9+80]
+ vbroadcasti32x4 zmm14, [r9+96]
+ vbroadcasti32x4 zmm15, [r9+112]
+ vbroadcasti32x4 zmm16, [r9+128]
+ vbroadcasti32x4 zmm17, [r9+144]
+ vbroadcasti32x4 zmm18, [r9+160]
+ cmp r10d, 11 ; fewer than 11 rounds: keys 0..10 are enough
+ jl L_AES_ECB_decrypt_avx512_key_cached
+ vbroadcasti32x4 zmm19, [r9+176]
+ vbroadcasti32x4 zmm20, [r9+192]
+ cmp r10d, 13 ; fewer than 13 rounds: keys 0..12 are enough
+ jl L_AES_ECB_decrypt_avx512_key_cached
+ vbroadcasti32x4 zmm21, [r9+208]
+ vbroadcasti32x4 zmm22, [r9+224]
+L_AES_ECB_decrypt_avx512_key_cached:
+ cmp r8d, 256
+ mov r11d, r8d
+ jl L_AES_ECB_decrypt_avx512_done_256
+ and r11d, 4294967040 ; length rounded down to a multiple of 256
+L_AES_ECB_decrypt_avx512_dec_256:
+ ; 256 bytes of input
+ ; aes_ecb_dec_256
+ vmovdqu64 zmm0, [rcx+rax]
+ vmovdqu64 zmm1, [rcx+rax+64]
+ vmovdqu64 zmm2, [rcx+rax+128]
+ vmovdqu64 zmm3, [rcx+rax+192]
+ ; aes_dec_block
+ vpxorq zmm0, zmm0, zmm8
+ vpxorq zmm1, zmm1, zmm8
+ vpxorq zmm2, zmm2, zmm8
+ vpxorq zmm3, zmm3, zmm8
+ vaesdec zmm0, zmm0, zmm9
+ vaesdec zmm1, zmm1, zmm9
+ vaesdec zmm2, zmm2, zmm9
+ vaesdec zmm3, zmm3, zmm9
+ vaesdec zmm0, zmm0, zmm10
+ vaesdec zmm1, zmm1, zmm10
+ vaesdec zmm2, zmm2, zmm10
+ vaesdec zmm3, zmm3, zmm10
+ vaesdec zmm0, zmm0, zmm11
+ vaesdec zmm1, zmm1, zmm11
+ vaesdec zmm2, zmm2, zmm11
+ vaesdec zmm3, zmm3, zmm11
+ vaesdec zmm0, zmm0, zmm12
+ vaesdec zmm1, zmm1, zmm12
+ vaesdec zmm2, zmm2, zmm12
+ vaesdec zmm3, zmm3, zmm12
+ vaesdec zmm0, zmm0, zmm13
+ vaesdec zmm1, zmm1, zmm13
+ vaesdec zmm2, zmm2, zmm13
+ vaesdec zmm3, zmm3, zmm13
+ vaesdec zmm0, zmm0, zmm14
+ vaesdec zmm1, zmm1, zmm14
+ vaesdec zmm2, zmm2, zmm14
+ vaesdec zmm3, zmm3, zmm14
+ vaesdec zmm0, zmm0, zmm15
+ vaesdec zmm1, zmm1, zmm15
+ vaesdec zmm2, zmm2, zmm15
+ vaesdec zmm3, zmm3, zmm15
+ vaesdec zmm0, zmm0, zmm16
+ vaesdec zmm1, zmm1, zmm16
+ vaesdec zmm2, zmm2, zmm16
+ vaesdec zmm3, zmm3, zmm16
+ vaesdec zmm0, zmm0, zmm17
+ vaesdec zmm1, zmm1, zmm17
+ vaesdec zmm2, zmm2, zmm17
+ vaesdec zmm3, zmm3, zmm17
+ cmp r10d, 11
+ vmovdqa64 zmm7, zmm18 ; zmm7 = candidate last round key; flags untouched
+ jl L_AES_ECB_decrypt_avx512_256_aes_dec_block_last
+ vaesdec zmm0, zmm0, zmm18
+ vaesdec zmm1, zmm1, zmm18
+ vaesdec zmm2, zmm2, zmm18
+ vaesdec zmm3, zmm3, zmm18
+ vaesdec zmm0, zmm0, zmm19
+ vaesdec zmm1, zmm1, zmm19
+ vaesdec zmm2, zmm2, zmm19
+ vaesdec zmm3, zmm3, zmm19
+ cmp r10d, 13
+ vmovdqa64 zmm7, zmm20
+ jl L_AES_ECB_decrypt_avx512_256_aes_dec_block_last
+ vaesdec zmm0, zmm0, zmm20
+ vaesdec zmm1, zmm1, zmm20
+ vaesdec zmm2, zmm2, zmm20
+ vaesdec zmm3, zmm3, zmm20
+ vaesdec zmm0, zmm0, zmm21
+ vaesdec zmm1, zmm1, zmm21
+ vaesdec zmm2, zmm2, zmm21
+ vaesdec zmm3, zmm3, zmm21
+ vmovdqa64 zmm7, zmm22
+L_AES_ECB_decrypt_avx512_256_aes_dec_block_last:
+ vaesdeclast zmm0, zmm0, zmm7
+ vaesdeclast zmm1, zmm1, zmm7
+ vaesdeclast zmm2, zmm2, zmm7
+ vaesdeclast zmm3, zmm3, zmm7
+ vmovdqu64 [rdx+rax], zmm0
+ vmovdqu64 [rdx+rax+64], zmm1
+ vmovdqu64 [rdx+rax+128], zmm2
+ vmovdqu64 [rdx+rax+192], zmm3
+ add eax, 256
+ cmp eax, r11d
+ jl L_AES_ECB_decrypt_avx512_dec_256
+L_AES_ECB_decrypt_avx512_done_256:
+ mov r11d, r8d
+ and r11d, 4294967232 ; length rounded down to a multiple of 64
+ cmp eax, r11d
+ je L_AES_ECB_decrypt_avx512_done_64
+L_AES_ECB_decrypt_avx512_dec_64:
+ ; 64 bytes of input
+ ; aes_ecb_dec_64
+ vmovdqu64 zmm0, [rcx+rax]
+ ; aes_dec_block
+ vpxorq zmm0, zmm0, zmm8
+ vaesdec zmm0, zmm0, zmm9
+ vaesdec zmm0, zmm0, zmm10
+ vaesdec zmm0, zmm0, zmm11
+ vaesdec zmm0, zmm0, zmm12
+ vaesdec zmm0, zmm0, zmm13
+ vaesdec zmm0, zmm0, zmm14
+ vaesdec zmm0, zmm0, zmm15
+ vaesdec zmm0, zmm0, zmm16
+ vaesdec zmm0, zmm0, zmm17
+ cmp r10d, 11
+ vmovdqa64 zmm7, zmm18
+ jl L_AES_ECB_decrypt_avx512_64_aes_dec_block_last
+ vaesdec zmm0, zmm0, zmm18
+ vaesdec zmm0, zmm0, zmm19
+ cmp r10d, 13
+ vmovdqa64 zmm7, zmm20
+ jl L_AES_ECB_decrypt_avx512_64_aes_dec_block_last
+ vaesdec zmm0, zmm0, zmm20
+ vaesdec zmm0, zmm0, zmm21
+ vmovdqa64 zmm7, zmm22
+L_AES_ECB_decrypt_avx512_64_aes_dec_block_last:
+ vaesdeclast zmm0, zmm0, zmm7
+ vmovdqu64 [rdx+rax], zmm0
+ add eax, 64
+ cmp eax, r11d
+ jl L_AES_ECB_decrypt_avx512_dec_64
+L_AES_ECB_decrypt_avx512_done_64:
+ cmp eax, r8d
+ mov r11d, r8d
+ je L_AES_ECB_decrypt_avx512_done_dec
+ and r11d, 4294967280 ; length rounded down to a multiple of 16
+L_AES_ECB_decrypt_avx512_dec_16:
+ ; 16 bytes of input
+ vmovdqu xmm0, OWORD PTR [rcx+rax]
+ ; aes_dec_block
+ vpxor xmm0, xmm0, [r9] ; tail reads round keys from memory via r9
+ vmovdqu xmm5, OWORD PTR [r9+16]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+32]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+48]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+64]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+80]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+96]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+112]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+128]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+144]
+ vaesdec xmm0, xmm0, xmm5
+ cmp r10d, 11
+ vmovdqu xmm5, OWORD PTR [r9+160]
+ jl L_AES_ECB_decrypt_avx512_16_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r9+176]
+ vaesdec xmm0, xmm0, xmm6
+ cmp r10d, 13
+ vmovdqu xmm5, OWORD PTR [r9+192]
+ jl L_AES_ECB_decrypt_avx512_16_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r9+208]
+ vaesdec xmm0, xmm0, xmm6
+ vmovdqu xmm5, OWORD PTR [r9+224]
+L_AES_ECB_decrypt_avx512_16_aes_dec_block_last:
+ vaesdeclast xmm0, xmm0, xmm5
+ vmovdqu OWORD PTR [rdx+rax], xmm0
+ add eax, 16
+ cmp eax, r11d
+ jl L_AES_ECB_decrypt_avx512_dec_16
+L_AES_ECB_decrypt_avx512_done_dec:
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ vmovdqu xmm14, OWORD PTR [rsp+128]
+ vmovdqu xmm15, OWORD PTR [rsp+144]
+ add rsp, 160
+ ret
+AES_ECB_decrypt_avx512 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_CBC_encrypt_avx512 PROC
+ ; AES-CBC encrypt. rcx = in, rdx = out, r8 = IV (updated), r9d = byte length.
+ ; r10 = round keys, r11d = rounds, rax = byte offset: one register per role.
+ mov r10, QWORD PTR [rsp+40] ; 5th argument: round key pointer
+ mov r11d, DWORD PTR [rsp+48] ; 6th argument: number of rounds
+ vmovdqu xmm0, OWORD PTR [r8] ; xmm0 = chaining value, starts as the IV
+ xor eax, eax
+ cmp eax, r9d
+ je L_AES_CBC_encrypt_avx512_done ; zero length: IV is written back as is
+L_AES_CBC_encrypt_avx512_loop:
+ ; 16 bytes of input
+ vmovdqu xmm1, OWORD PTR [rcx+rax]
+ vpternlogq xmm1, xmm0, [r10], 150 ; 150 = 3-way xor: input, chain, key 0
+ ; aes_enc_block
+ vmovdqu xmm3, OWORD PTR [r10+16]
+ vaesenc xmm1, xmm1, xmm3
+ vmovdqu xmm3, OWORD PTR [r10+32]
+ vaesenc xmm1, xmm1, xmm3
+ vmovdqu xmm3, OWORD PTR [r10+48]
+ vaesenc xmm1, xmm1, xmm3
+ vmovdqu xmm3, OWORD PTR [r10+64]
+ vaesenc xmm1, xmm1, xmm3
+ vmovdqu xmm3, OWORD PTR [r10+80]
+ vaesenc xmm1, xmm1, xmm3
+ vmovdqu xmm3, OWORD PTR [r10+96]
+ vaesenc xmm1, xmm1, xmm3
+ vmovdqu xmm3, OWORD PTR [r10+112]
+ vaesenc xmm1, xmm1, xmm3
+ vmovdqu xmm3, OWORD PTR [r10+128]
+ vaesenc xmm1, xmm1, xmm3
+ vmovdqu xmm3, OWORD PTR [r10+144]
+ vaesenc xmm1, xmm1, xmm3
+ cmp r11d, 11 ; fewer than 11 rounds: key 10 is the last key
+ vmovdqu xmm3, OWORD PTR [r10+160]
+ jl L_AES_CBC_encrypt_avx512_aes_enc_block_last
+ vaesenc xmm1, xmm1, xmm3
+ vmovdqu xmm4, OWORD PTR [r10+176]
+ vaesenc xmm1, xmm1, xmm4
+ cmp r11d, 13 ; fewer than 13 rounds: key 12 is the last key
+ vmovdqu xmm3, OWORD PTR [r10+192]
+ jl L_AES_CBC_encrypt_avx512_aes_enc_block_last
+ vaesenc xmm1, xmm1, xmm3
+ vmovdqu xmm4, OWORD PTR [r10+208]
+ vaesenc xmm1, xmm1, xmm4
+ vmovdqu xmm3, OWORD PTR [r10+224]
+L_AES_CBC_encrypt_avx512_aes_enc_block_last:
+ vaesenclast xmm1, xmm1, xmm3
+ vmovdqu OWORD PTR [rdx+rax], xmm1
+ vmovdqa xmm0, xmm1 ; ciphertext becomes the next chaining value
+ add eax, 16
+ cmp eax, r9d
+ jl L_AES_CBC_encrypt_avx512_loop
+L_AES_CBC_encrypt_avx512_done:
+ vmovdqu OWORD PTR [r8], xmm0 ; hand the chaining value back to the caller
+ ret
+AES_CBC_encrypt_avx512 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_CBC_decrypt_avx512 PROC
+ ; AES-CBC decryption with 512-bit VAES, Windows x64 calling convention.
+ ; In: rcx = input, rdx = output, r8 = IV (replaced by the final chaining
+ ; value on return), r9d = length in bytes, stack = round keys, rounds.
+ ; Locals: rax = byte offset, r10 = round keys, r11d = rounds,
+ ; r12d = end offset of a stage, xmm8 = previous ciphertext block.
+ ; Keys, rounds and offset use distinct registers so none is overwritten.
+ push r12
+ mov r10, QWORD PTR [rsp+48] ; 5th argument (offset includes the push)
+ mov r11d, DWORD PTR [rsp+56] ; 6th argument
+ sub rsp, 160
+ vmovdqu OWORD PTR [rsp], xmm6 ; xmm6-xmm15 are callee-saved on Windows
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu OWORD PTR [rsp+128], xmm14
+ vmovdqu OWORD PTR [rsp+144], xmm15
+ vmovdqu xmm8, OWORD PTR [r8] ; chaining value starts as the IV
+ xor eax, eax ; byte offset = 0
+ cmp r9d, 64
+ jl L_AES_CBC_decrypt_avx512_done_64 ; short input: skip the key broadcast
+ vbroadcasti32x4 zmm14, [r10] ; each round key replicated into all 4 lanes
+ vbroadcasti32x4 zmm15, [r10+16]
+ vbroadcasti32x4 zmm16, [r10+32]
+ vbroadcasti32x4 zmm17, [r10+48]
+ vbroadcasti32x4 zmm18, [r10+64]
+ vbroadcasti32x4 zmm19, [r10+80]
+ vbroadcasti32x4 zmm20, [r10+96]
+ vbroadcasti32x4 zmm21, [r10+112]
+ vbroadcasti32x4 zmm22, [r10+128]
+ vbroadcasti32x4 zmm23, [r10+144]
+ vbroadcasti32x4 zmm24, [r10+160]
+ cmp r11d, 11 ; fewer than 11 rounds: keys 0..10 are enough
+ jl L_AES_CBC_decrypt_avx512_key_cached
+ vbroadcasti32x4 zmm25, [r10+176]
+ vbroadcasti32x4 zmm26, [r10+192]
+ cmp r11d, 13 ; fewer than 13 rounds: keys 0..12 are enough
+ jl L_AES_CBC_decrypt_avx512_key_cached
+ vbroadcasti32x4 zmm27, [r10+208]
+ vbroadcasti32x4 zmm28, [r10+224]
+L_AES_CBC_decrypt_avx512_key_cached:
+ cmp r9d, 256
+ mov r12d, r9d
+ jl L_AES_CBC_decrypt_avx512_done_256
+ and r12d, 4294967040 ; length rounded down to a multiple of 256
+L_AES_CBC_decrypt_avx512_dec_256:
+ ; 256 bytes of input
+ ; aes_cbc_dec_256
+ vmovdqu64 zmm0, [rcx+rax]
+ vmovdqu64 zmm1, [rcx+rax+64]
+ vmovdqu64 zmm2, [rcx+rax+128]
+ vmovdqu64 zmm3, [rcx+rax+192]
+ vshufi64x2 zmm10, zmm0, zmm0, 144 ; lanes 1..3 = ciphertext blocks 0..2
+ vinserti32x4 zmm10, zmm10, xmm8, 0 ; lane 0 = previous chaining value
+ vmovdqu64 zmm11, [rcx+rax+48] ; ciphertext shifted back by one block
+ vmovdqu64 zmm12, [rcx+rax+112]
+ vmovdqu64 zmm13, [rcx+rax+176]
+ vextracti32x4 xmm8, zmm3, 3 ; last ciphertext block chains onward
+ ; aes_dec_block
+ vpxorq zmm0, zmm0, zmm14
+ vpxorq zmm1, zmm1, zmm14
+ vpxorq zmm2, zmm2, zmm14
+ vpxorq zmm3, zmm3, zmm14
+ vaesdec zmm0, zmm0, zmm15
+ vaesdec zmm1, zmm1, zmm15
+ vaesdec zmm2, zmm2, zmm15
+ vaesdec zmm3, zmm3, zmm15
+ vaesdec zmm0, zmm0, zmm16
+ vaesdec zmm1, zmm1, zmm16
+ vaesdec zmm2, zmm2, zmm16
+ vaesdec zmm3, zmm3, zmm16
+ vaesdec zmm0, zmm0, zmm17
+ vaesdec zmm1, zmm1, zmm17
+ vaesdec zmm2, zmm2, zmm17
+ vaesdec zmm3, zmm3, zmm17
+ vaesdec zmm0, zmm0, zmm18
+ vaesdec zmm1, zmm1, zmm18
+ vaesdec zmm2, zmm2, zmm18
+ vaesdec zmm3, zmm3, zmm18
+ vaesdec zmm0, zmm0, zmm19
+ vaesdec zmm1, zmm1, zmm19
+ vaesdec zmm2, zmm2, zmm19
+ vaesdec zmm3, zmm3, zmm19
+ vaesdec zmm0, zmm0, zmm20
+ vaesdec zmm1, zmm1, zmm20
+ vaesdec zmm2, zmm2, zmm20
+ vaesdec zmm3, zmm3, zmm20
+ vaesdec zmm0, zmm0, zmm21
+ vaesdec zmm1, zmm1, zmm21
+ vaesdec zmm2, zmm2, zmm21
+ vaesdec zmm3, zmm3, zmm21
+ vaesdec zmm0, zmm0, zmm22
+ vaesdec zmm1, zmm1, zmm22
+ vaesdec zmm2, zmm2, zmm22
+ vaesdec zmm3, zmm3, zmm22
+ vaesdec zmm0, zmm0, zmm23
+ vaesdec zmm1, zmm1, zmm23
+ vaesdec zmm2, zmm2, zmm23
+ vaesdec zmm3, zmm3, zmm23
+ cmp r11d, 11
+ vmovdqa64 zmm9, zmm24 ; zmm9 = candidate last round key; flags untouched
+ jl L_AES_CBC_decrypt_avx512_256_aes_dec_block_last
+ vaesdec zmm0, zmm0, zmm24
+ vaesdec zmm1, zmm1, zmm24
+ vaesdec zmm2, zmm2, zmm24
+ vaesdec zmm3, zmm3, zmm24
+ vaesdec zmm0, zmm0, zmm25
+ vaesdec zmm1, zmm1, zmm25
+ vaesdec zmm2, zmm2, zmm25
+ vaesdec zmm3, zmm3, zmm25
+ cmp r11d, 13
+ vmovdqa64 zmm9, zmm26
+ jl L_AES_CBC_decrypt_avx512_256_aes_dec_block_last
+ vaesdec zmm0, zmm0, zmm26
+ vaesdec zmm1, zmm1, zmm26
+ vaesdec zmm2, zmm2, zmm26
+ vaesdec zmm3, zmm3, zmm26
+ vaesdec zmm0, zmm0, zmm27
+ vaesdec zmm1, zmm1, zmm27
+ vaesdec zmm2, zmm2, zmm27
+ vaesdec zmm3, zmm3, zmm27
+ vmovdqa64 zmm9, zmm28
+L_AES_CBC_decrypt_avx512_256_aes_dec_block_last:
+ vaesdeclast zmm0, zmm0, zmm9
+ vaesdeclast zmm1, zmm1, zmm9
+ vaesdeclast zmm2, zmm2, zmm9
+ vaesdeclast zmm3, zmm3, zmm9
+ vpxorq zmm0, zmm0, zmm10 ; xor with the preceding ciphertext blocks
+ vpxorq zmm1, zmm1, zmm11
+ vpxorq zmm2, zmm2, zmm12
+ vpxorq zmm3, zmm3, zmm13
+ vmovdqu64 [rdx+rax], zmm0
+ vmovdqu64 [rdx+rax+64], zmm1
+ vmovdqu64 [rdx+rax+128], zmm2
+ vmovdqu64 [rdx+rax+192], zmm3
+ add eax, 256
+ cmp eax, r12d
+ jl L_AES_CBC_decrypt_avx512_dec_256
+L_AES_CBC_decrypt_avx512_done_256:
+ mov r12d, r9d
+ and r12d, 4294967232 ; length rounded down to a multiple of 64
+ cmp eax, r12d
+ je L_AES_CBC_decrypt_avx512_done_64
+L_AES_CBC_decrypt_avx512_dec_64:
+ ; 64 bytes of input
+ ; aes_cbc_dec_64
+ vmovdqu64 zmm0, [rcx+rax]
+ vshufi64x2 zmm10, zmm0, zmm0, 144
+ vinserti32x4 zmm10, zmm10, xmm8, 0
+ vextracti32x4 xmm8, zmm0, 3
+ ; aes_dec_block
+ vpxorq zmm0, zmm0, zmm14
+ vaesdec zmm0, zmm0, zmm15
+ vaesdec zmm0, zmm0, zmm16
+ vaesdec zmm0, zmm0, zmm17
+ vaesdec zmm0, zmm0, zmm18
+ vaesdec zmm0, zmm0, zmm19
+ vaesdec zmm0, zmm0, zmm20
+ vaesdec zmm0, zmm0, zmm21
+ vaesdec zmm0, zmm0, zmm22
+ vaesdec zmm0, zmm0, zmm23
+ cmp r11d, 11
+ vmovdqa64 zmm9, zmm24
+ jl L_AES_CBC_decrypt_avx512_64_aes_dec_block_last
+ vaesdec zmm0, zmm0, zmm24
+ vaesdec zmm0, zmm0, zmm25
+ cmp r11d, 13
+ vmovdqa64 zmm9, zmm26
+ jl L_AES_CBC_decrypt_avx512_64_aes_dec_block_last
+ vaesdec zmm0, zmm0, zmm26
+ vaesdec zmm0, zmm0, zmm27
+ vmovdqa64 zmm9, zmm28
+L_AES_CBC_decrypt_avx512_64_aes_dec_block_last:
+ vaesdeclast zmm0, zmm0, zmm9
+ vpxorq zmm0, zmm0, zmm10
+ vmovdqu64 [rdx+rax], zmm0
+ add eax, 64
+ cmp eax, r12d
+ jl L_AES_CBC_decrypt_avx512_dec_64
+L_AES_CBC_decrypt_avx512_done_64:
+ cmp eax, r9d
+ mov r12d, r9d
+ je L_AES_CBC_decrypt_avx512_done_dec
+ and r12d, 4294967280 ; length rounded down to a multiple of 16
+L_AES_CBC_decrypt_avx512_dec_16:
+ ; 16 bytes of input
+ vmovdqu xmm0, OWORD PTR [rcx+rax]
+ vmovdqa xmm7, xmm0 ; keep the ciphertext: it is the next chaining value
+ ; aes_dec_block
+ vpxor xmm0, xmm0, [r10] ; tail reads round keys from memory via r10
+ vmovdqu xmm5, OWORD PTR [r10+16]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+32]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+48]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+64]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+80]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+96]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+112]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+128]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+144]
+ vaesdec xmm0, xmm0, xmm5
+ cmp r11d, 11
+ vmovdqu xmm5, OWORD PTR [r10+160]
+ jl L_AES_CBC_decrypt_avx512_16_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r10+176]
+ vaesdec xmm0, xmm0, xmm6
+ cmp r11d, 13
+ vmovdqu xmm5, OWORD PTR [r10+192]
+ jl L_AES_CBC_decrypt_avx512_16_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r10+208]
+ vaesdec xmm0, xmm0, xmm6
+ vmovdqu xmm5, OWORD PTR [r10+224]
+L_AES_CBC_decrypt_avx512_16_aes_dec_block_last:
+ vaesdeclast xmm0, xmm0, xmm5
+ vpxor xmm0, xmm0, xmm8
+ vmovdqa xmm8, xmm7
+ vmovdqu OWORD PTR [rdx+rax], xmm0
+ add eax, 16
+ cmp eax, r12d
+ jl L_AES_CBC_decrypt_avx512_dec_16
+L_AES_CBC_decrypt_avx512_done_dec:
+ vmovdqu OWORD PTR [r8], xmm8 ; hand the chaining value back to the caller
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ vmovdqu xmm14, OWORD PTR [rsp+128]
+ vmovdqu xmm15, OWORD PTR [rsp+144]
+ add rsp, 160
+ pop r12
+ ret
+AES_CBC_decrypt_avx512 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16 ; vpshufb control below maps byte i to byte 15-i (full 16-byte reverse)
+L_aes_ctr_bswap_avx512 QWORD \
+ 08090a0b0c0d0e0fh, 0001020304050607h
+ptr_L_aes_ctr_bswap_avx512 QWORD L_aes_ctr_bswap_avx512 ; cell holding the mask address, not the mask
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16 ; table below: 17 little-endian 128-bit integers 0..16, entry k at byte offset 16*k
+L_aes_ctr_inc_avx512 QWORD \
+ 0000000000000000h, 0000000000000000h,
+ 0000000000000001h, 0000000000000000h,
+ 0000000000000002h, 0000000000000000h,
+ 0000000000000003h, 0000000000000000h,
+ 0000000000000004h, 0000000000000000h,
+ 0000000000000005h, 0000000000000000h,
+ 0000000000000006h, 0000000000000000h,
+ 0000000000000007h, 0000000000000000h,
+ 0000000000000008h, 0000000000000000h,
+ 0000000000000009h, 0000000000000000h,
+ 000000000000000ah, 0000000000000000h,
+ 000000000000000bh, 0000000000000000h,
+ 000000000000000ch, 0000000000000000h,
+ 000000000000000dh, 0000000000000000h,
+ 000000000000000eh, 0000000000000000h,
+ 000000000000000fh, 0000000000000000h,
+ 0000000000000010h, 0000000000000000h
+ptr_L_aes_ctr_inc_avx512 QWORD L_aes_ctr_inc_avx512 ; cell holding the table address, not the table
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+AES_CTR_encrypt_avx512 PROC
+ ; AES-CTR with 512-bit VAES, Windows x64 calling convention.
+ ; In: rcx = input, rdx = output, r8d = length in bytes, r9 = round keys,
+ ; 5th stack argument = number of rounds, 6th = 16-byte counter block.
+ ; Locals: rax = byte offset, r11d = rounds, r10 = counter pointer (kept
+ ; intact for the write-back at the end), ebx = end offset of a stage.
+ ; Constants are read from the tables themselves, not from the ptr_ cells.
+ push rbx
+ mov r11d, DWORD PTR [rsp+48] ; 5th argument (offset includes the push)
+ mov r10, QWORD PTR [rsp+56] ; 6th argument
+ sub rsp, 160
+ vmovdqu OWORD PTR [rsp], xmm6 ; xmm6-xmm15 are callee-saved on Windows
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu OWORD PTR [rsp+128], xmm14
+ vmovdqu OWORD PTR [rsp+144], xmm15
+ vbroadcasti32x4 zmm8, OWORD PTR [L_aes_ctr_bswap_avx512] ; byte-reverse mask
+ vbroadcasti32x4 zmm7, [r10] ; counter block in all 4 lanes
+ vpshufb zmm7, zmm7, zmm8 ; byte-reversed so vpaddq can increment it
+ vbroadcasti32x4 zmm10, OWORD PTR [L_aes_ctr_inc_avx512+256] ; +16 per lane
+ vbroadcasti32x4 zmm11, OWORD PTR [L_aes_ctr_inc_avx512+64] ; +4 per lane
+ vbroadcasti32x4 zmm12, OWORD PTR [L_aes_ctr_inc_avx512+16] ; +1 per lane
+ xor eax, eax ; byte offset = 0
+ cmp r8d, 64
+ jl L_AES_CTR_encrypt_avx512_done_64 ; short input: skip the key broadcast
+ vbroadcasti32x4 zmm14, [r9] ; each round key replicated into all 4 lanes
+ vbroadcasti32x4 zmm15, [r9+16]
+ vbroadcasti32x4 zmm16, [r9+32]
+ vbroadcasti32x4 zmm17, [r9+48]
+ vbroadcasti32x4 zmm18, [r9+64]
+ vbroadcasti32x4 zmm19, [r9+80]
+ vbroadcasti32x4 zmm20, [r9+96]
+ vbroadcasti32x4 zmm21, [r9+112]
+ vbroadcasti32x4 zmm22, [r9+128]
+ vbroadcasti32x4 zmm23, [r9+144]
+ vbroadcasti32x4 zmm24, [r9+160]
+ cmp r11d, 11 ; fewer than 11 rounds: keys 0..10 are enough
+ jl L_AES_CTR_encrypt_avx512_key_cached
+ vbroadcasti32x4 zmm25, [r9+176]
+ vbroadcasti32x4 zmm26, [r9+192]
+ cmp r11d, 13 ; fewer than 13 rounds: keys 0..12 are enough
+ jl L_AES_CTR_encrypt_avx512_key_cached
+ vbroadcasti32x4 zmm27, [r9+208]
+ vbroadcasti32x4 zmm28, [r9+224]
+L_AES_CTR_encrypt_avx512_key_cached:
+ cmp r8d, 256
+ mov ebx, r8d
+ jl L_AES_CTR_encrypt_avx512_done_256
+ and ebx, 4294967040 ; length rounded down to a multiple of 256
+ vmovdqa64 zmm9, zmm7
+ vpaddq zmm4, zmm7, ZMMWORD PTR [L_aes_ctr_inc_avx512] ; counters +0..+3
+ vpternlogq zmm9, zmm4, ZMMWORD PTR [L_aes_ctr_inc_avx512], 178 ; top bit = carry out of each qword
+ vpsrlq zmm9, zmm9, 63 ; carry moved down to bit 0
+ vpslldq zmm9, zmm9, 8 ; low-qword carry shifted into the high qword
+ vpaddq zmm4, zmm4, zmm9
+ vmovdqa64 zmm9, zmm7
+ vpaddq zmm5, zmm7, ZMMWORD PTR [L_aes_ctr_inc_avx512+64] ; counters +4..+7
+ vpternlogq zmm9, zmm5, ZMMWORD PTR [L_aes_ctr_inc_avx512+64], 178
+ vpsrlq zmm9, zmm9, 63
+ vpslldq zmm9, zmm9, 8
+ vpaddq zmm5, zmm5, zmm9
+ vmovdqa64 zmm9, zmm7
+ vpaddq zmm6, zmm7, ZMMWORD PTR [L_aes_ctr_inc_avx512+128] ; counters +8..+11
+ vpternlogq zmm9, zmm6, ZMMWORD PTR [L_aes_ctr_inc_avx512+128], 178
+ vpsrlq zmm9, zmm9, 63
+ vpslldq zmm9, zmm9, 8
+ vpaddq zmm6, zmm6, zmm9
+ vmovdqa64 zmm9, zmm7
+ vpaddq zmm7, zmm7, ZMMWORD PTR [L_aes_ctr_inc_avx512+192] ; counters +12..+15
+ vpternlogq zmm9, zmm7, ZMMWORD PTR [L_aes_ctr_inc_avx512+192], 178
+ vpsrlq zmm9, zmm9, 63
+ vpslldq zmm9, zmm9, 8
+ vpaddq zmm7, zmm7, zmm9
+L_AES_CTR_encrypt_avx512_enc_256:
+ ; 256 bytes of input
+ vpshufb zmm0, zmm4, zmm8
+ vpshufb zmm1, zmm5, zmm8
+ vpshufb zmm2, zmm6, zmm8
+ vpshufb zmm3, zmm7, zmm8
+ vmovdqa64 zmm9, zmm4
+ vpaddq zmm4, zmm4, zmm10 ; advance all 16 counters by 16 for the next pass
+ vpternlogq zmm9, zmm4, zmm10, 178
+ vpsrlq zmm9, zmm9, 63
+ vpslldq zmm9, zmm9, 8
+ vpaddq zmm4, zmm4, zmm9
+ vmovdqa64 zmm9, zmm5
+ vpaddq zmm5, zmm5, zmm10
+ vpternlogq zmm9, zmm5, zmm10, 178
+ vpsrlq zmm9, zmm9, 63
+ vpslldq zmm9, zmm9, 8
+ vpaddq zmm5, zmm5, zmm9
+ vmovdqa64 zmm9, zmm6
+ vpaddq zmm6, zmm6, zmm10
+ vpternlogq zmm9, zmm6, zmm10, 178
+ vpsrlq zmm9, zmm9, 63
+ vpslldq zmm9, zmm9, 8
+ vpaddq zmm6, zmm6, zmm9
+ vmovdqa64 zmm9, zmm7
+ vpaddq zmm7, zmm7, zmm10
+ vpternlogq zmm9, zmm7, zmm10, 178
+ vpsrlq zmm9, zmm9, 63
+ vpslldq zmm9, zmm9, 8
+ vpaddq zmm7, zmm7, zmm9
+ ; aes_enc_block
+ vpxorq zmm0, zmm0, zmm14
+ vpxorq zmm1, zmm1, zmm14
+ vpxorq zmm2, zmm2, zmm14
+ vpxorq zmm3, zmm3, zmm14
+ vaesenc zmm0, zmm0, zmm15
+ vaesenc zmm1, zmm1, zmm15
+ vaesenc zmm2, zmm2, zmm15
+ vaesenc zmm3, zmm3, zmm15
+ vaesenc zmm0, zmm0, zmm16
+ vaesenc zmm1, zmm1, zmm16
+ vaesenc zmm2, zmm2, zmm16
+ vaesenc zmm3, zmm3, zmm16
+ vaesenc zmm0, zmm0, zmm17
+ vaesenc zmm1, zmm1, zmm17
+ vaesenc zmm2, zmm2, zmm17
+ vaesenc zmm3, zmm3, zmm17
+ vaesenc zmm0, zmm0, zmm18
+ vaesenc zmm1, zmm1, zmm18
+ vaesenc zmm2, zmm2, zmm18
+ vaesenc zmm3, zmm3, zmm18
+ vaesenc zmm0, zmm0, zmm19
+ vaesenc zmm1, zmm1, zmm19
+ vaesenc zmm2, zmm2, zmm19
+ vaesenc zmm3, zmm3, zmm19
+ vaesenc zmm0, zmm0, zmm20
+ vaesenc zmm1, zmm1, zmm20
+ vaesenc zmm2, zmm2, zmm20
+ vaesenc zmm3, zmm3, zmm20
+ vaesenc zmm0, zmm0, zmm21
+ vaesenc zmm1, zmm1, zmm21
+ vaesenc zmm2, zmm2, zmm21
+ vaesenc zmm3, zmm3, zmm21
+ vaesenc zmm0, zmm0, zmm22
+ vaesenc zmm1, zmm1, zmm22
+ vaesenc zmm2, zmm2, zmm22
+ vaesenc zmm3, zmm3, zmm22
+ vaesenc zmm0, zmm0, zmm23
+ vaesenc zmm1, zmm1, zmm23
+ vaesenc zmm2, zmm2, zmm23
+ vaesenc zmm3, zmm3, zmm23
+ cmp r11d, 11
+ vmovdqa64 zmm13, zmm24 ; zmm13 = candidate last round key; flags untouched
+ jl L_AES_CTR_encrypt_avx512_256_aes_enc_block_last
+ vaesenc zmm0, zmm0, zmm24
+ vaesenc zmm1, zmm1, zmm24
+ vaesenc zmm2, zmm2, zmm24
+ vaesenc zmm3, zmm3, zmm24
+ vaesenc zmm0, zmm0, zmm25
+ vaesenc zmm1, zmm1, zmm25
+ vaesenc zmm2, zmm2, zmm25
+ vaesenc zmm3, zmm3, zmm25
+ cmp r11d, 13
+ vmovdqa64 zmm13, zmm26
+ jl L_AES_CTR_encrypt_avx512_256_aes_enc_block_last
+ vaesenc zmm0, zmm0, zmm26
+ vaesenc zmm1, zmm1, zmm26
+ vaesenc zmm2, zmm2, zmm26
+ vaesenc zmm3, zmm3, zmm26
+ vaesenc zmm0, zmm0, zmm27
+ vaesenc zmm1, zmm1, zmm27
+ vaesenc zmm2, zmm2, zmm27
+ vaesenc zmm3, zmm3, zmm27
+ vmovdqa64 zmm13, zmm28
+L_AES_CTR_encrypt_avx512_256_aes_enc_block_last:
+ vaesenclast zmm0, zmm0, zmm13
+ vaesenclast zmm1, zmm1, zmm13
+ vaesenclast zmm2, zmm2, zmm13
+ vaesenclast zmm3, zmm3, zmm13
+ vpxorq zmm0, zmm0, [rcx+rax] ; keystream xor input
+ vpxorq zmm1, zmm1, [rcx+rax+64]
+ vpxorq zmm2, zmm2, [rcx+rax+128]
+ vpxorq zmm3, zmm3, [rcx+rax+192]
+ vmovdqu64 [rdx+rax], zmm0
+ vmovdqu64 [rdx+rax+64], zmm1
+ vmovdqu64 [rdx+rax+128], zmm2
+ vmovdqu64 [rdx+rax+192], zmm3
+ add eax, 256
+ cmp eax, ebx
+ jl L_AES_CTR_encrypt_avx512_enc_256
+ vshufi64x2 zmm7, zmm4, zmm4, 0 ; next unused counter copied to all lanes
+L_AES_CTR_encrypt_avx512_done_256:
+ mov ebx, r8d
+ and ebx, 4294967232 ; length rounded down to a multiple of 64
+ cmp eax, ebx
+ je L_AES_CTR_encrypt_avx512_done_64
+L_AES_CTR_encrypt_avx512_enc_64:
+ ; 64 bytes of input
+ ; aes_ctr_enc_64
+ vpaddq zmm0, zmm7, ZMMWORD PTR [L_aes_ctr_inc_avx512] ; counters +0..+3
+ vmovdqa64 zmm9, zmm7
+ vpternlogq zmm9, zmm0, ZMMWORD PTR [L_aes_ctr_inc_avx512], 178
+ vpsrlq zmm9, zmm9, 63
+ vpslldq zmm9, zmm9, 8
+ vpaddq zmm0, zmm0, zmm9
+ vpshufb zmm0, zmm0, zmm8
+ vmovdqa64 zmm9, zmm7
+ vpaddq zmm7, zmm7, zmm11 ; base counter += 4
+ vpternlogq zmm9, zmm7, zmm11, 178
+ vpsrlq zmm9, zmm9, 63
+ vpslldq zmm9, zmm9, 8
+ vpaddq zmm7, zmm7, zmm9
+ ; aes_enc_block
+ vpxorq zmm0, zmm0, zmm14
+ vaesenc zmm0, zmm0, zmm15
+ vaesenc zmm0, zmm0, zmm16
+ vaesenc zmm0, zmm0, zmm17
+ vaesenc zmm0, zmm0, zmm18
+ vaesenc zmm0, zmm0, zmm19
+ vaesenc zmm0, zmm0, zmm20
+ vaesenc zmm0, zmm0, zmm21
+ vaesenc zmm0, zmm0, zmm22
+ vaesenc zmm0, zmm0, zmm23
+ cmp r11d, 11
+ vmovdqa64 zmm13, zmm24
+ jl L_AES_CTR_encrypt_avx512_64_aes_enc_block_last
+ vaesenc zmm0, zmm0, zmm24
+ vaesenc zmm0, zmm0, zmm25
+ cmp r11d, 13
+ vmovdqa64 zmm13, zmm26
+ jl L_AES_CTR_encrypt_avx512_64_aes_enc_block_last
+ vaesenc zmm0, zmm0, zmm26
+ vaesenc zmm0, zmm0, zmm27
+ vmovdqa64 zmm13, zmm28
+L_AES_CTR_encrypt_avx512_64_aes_enc_block_last:
+ vaesenclast zmm0, zmm0, zmm13
+ vpxorq zmm0, zmm0, [rcx+rax]
+ vmovdqu64 [rdx+rax], zmm0
+ add eax, 64
+ cmp eax, ebx
+ jl L_AES_CTR_encrypt_avx512_enc_64
+L_AES_CTR_encrypt_avx512_done_64:
+ cmp eax, r8d
+ mov ebx, r8d
+ je L_AES_CTR_encrypt_avx512_done_enc
+ and ebx, 4294967280 ; length rounded down to a multiple of 16
+L_AES_CTR_encrypt_avx512_enc_16:
+ ; 16 bytes of input
+ vpshufb xmm0, xmm7, xmm8
+ vmovdqa64 zmm9, zmm7
+ vpaddq zmm7, zmm7, zmm12 ; base counter += 1
+ vpternlogq zmm9, zmm7, zmm12, 178
+ vpsrlq zmm9, zmm9, 63
+ vpslldq zmm9, zmm9, 8
+ vpaddq zmm7, zmm7, zmm9
+ ; aes_enc_block
+ vpxor xmm0, xmm0, [r9] ; tail reads round keys from memory via r9
+ vmovdqu xmm5, OWORD PTR [r9+16]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+32]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+48]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+64]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+80]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+96]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+112]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+128]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+144]
+ vaesenc xmm0, xmm0, xmm5
+ cmp r11d, 11
+ vmovdqu xmm5, OWORD PTR [r9+160]
+ jl L_AES_CTR_encrypt_avx512_16_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r9+176]
+ vaesenc xmm0, xmm0, xmm6
+ cmp r11d, 13
+ vmovdqu xmm5, OWORD PTR [r9+192]
+ jl L_AES_CTR_encrypt_avx512_16_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r9+208]
+ vaesenc xmm0, xmm0, xmm6
+ vmovdqu xmm5, OWORD PTR [r9+224]
+L_AES_CTR_encrypt_avx512_16_aes_enc_block_last:
+ vaesenclast xmm0, xmm0, xmm5
+ vpxor xmm0, xmm0, [rcx+rax]
+ vmovdqu OWORD PTR [rdx+rax], xmm0
+ add eax, 16
+ cmp eax, ebx
+ jl L_AES_CTR_encrypt_avx512_enc_16
+L_AES_CTR_encrypt_avx512_done_enc:
+ vpshufb xmm0, xmm7, xmm8 ; back to the byte order of the counter block
+ vmovdqu OWORD PTR [r10], xmm0 ; store the advanced counter for the caller
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ vmovdqu xmm14, OWORD PTR [rsp+128]
+ vmovdqu xmm15, OWORD PTR [rsp+144]
+ add rsp, 160
+ pop rbx
+ ret
+AES_CTR_encrypt_avx512 ENDP
+_TEXT ENDS
+ENDIF
+END
diff --git a/wolfcrypt/src/aes_xts_asm.S b/wolfcrypt/src/aes_xts_asm.S
index 09045c6d8f7..29f3a0174b4 100644
--- a/wolfcrypt/src/aes_xts_asm.S
+++ b/wolfcrypt/src/aes_xts_asm.S
@@ -46,6 +46,16 @@
#define HAVE_INTEL_AVX2
#endif /* HAVE_INTEL_AVX2 */
#endif /* NO_AVX2_SUPPORT */
+#ifndef NO_VAES_SUPPORT
+#ifndef HAVE_INTEL_VAES
+#define HAVE_INTEL_VAES
+#endif /* HAVE_INTEL_VAES */
+#endif /* NO_VAES_SUPPORT */
+#ifndef NO_AVX512_SUPPORT
+#ifndef HAVE_INTEL_AVX512
+#define HAVE_INTEL_AVX512
+#endif /* HAVE_INTEL_AVX512 */
+#endif /* NO_AVX512_SUPPORT */
#ifdef WOLFSSL_AES_XTS
#ifdef WOLFSSL_X86_64_BUILD
@@ -2785,6 +2795,4408 @@ L_AES_XTS_decrypt_update_avx1_done_dec:
.size AES_XTS_decrypt_update_avx1,.-AES_XTS_decrypt_update_avx1
#endif /* __APPLE__ */
#endif /* HAVE_INTEL_AVX1 */
+#ifdef HAVE_INTEL_VAES
+#ifndef __APPLE__
+.text
+.globl AES_XTS_init_vaes
+.type AES_XTS_init_vaes,@function
+.align 16
+AES_XTS_init_vaes: /* AES-encrypts the 16-byte block at (%rdi) in place. In: rdi = block, rsi = round keys (16 bytes apart), edx = round count. */
+#else
+.section __TEXT,__text
+.globl _AES_XTS_init_vaes
+.p2align 4
+_AES_XTS_init_vaes: /* Mach-O spelling of the same entry point; the body below is shared. */
+#endif /* __APPLE__ */
+ vmovdqu (%rdi), %xmm0 /* xmm0 = block to encrypt; presumably the XTS tweak/sector value - confirm against the C caller */
+ # aes_enc_block
+ vpxor (%rsi), %xmm0, %xmm0 /* XOR with round key 0 */
+ vmovdqu 16(%rsi), %xmm2
+ vaesenc %xmm2, %xmm0, %xmm0 /* rounds 1-9 use the keys at offsets 16..144 */
+ vmovdqu 32(%rsi), %xmm2
+ vaesenc %xmm2, %xmm0, %xmm0
+ vmovdqu 48(%rsi), %xmm2
+ vaesenc %xmm2, %xmm0, %xmm0
+ vmovdqu 64(%rsi), %xmm2
+ vaesenc %xmm2, %xmm0, %xmm0
+ vmovdqu 80(%rsi), %xmm2
+ vaesenc %xmm2, %xmm0, %xmm0
+ vmovdqu 96(%rsi), %xmm2
+ vaesenc %xmm2, %xmm0, %xmm0
+ vmovdqu 112(%rsi), %xmm2
+ vaesenc %xmm2, %xmm0, %xmm0
+ vmovdqu 128(%rsi), %xmm2
+ vaesenc %xmm2, %xmm0, %xmm0
+ vmovdqu 144(%rsi), %xmm2
+ vaesenc %xmm2, %xmm0, %xmm0
+ cmpl $11, %edx /* round count below 11: the key at offset 160 is the final round key */
+ vmovdqu 160(%rsi), %xmm2 /* the load leaves the flags alone, so the jl below still tests the cmpl */
+ jl L_AES_XTS_init_vaes_tweak_aes_enc_block_last
+ vaesenc %xmm2, %xmm0, %xmm0
+ vmovdqu 176(%rsi), %xmm3
+ vaesenc %xmm3, %xmm0, %xmm0
+ cmpl $13, %edx /* round count below 13: the key at offset 192 is the final round key */
+ vmovdqu 192(%rsi), %xmm2
+ jl L_AES_XTS_init_vaes_tweak_aes_enc_block_last
+ vaesenc %xmm2, %xmm0, %xmm0
+ vmovdqu 208(%rsi), %xmm3
+ vaesenc %xmm3, %xmm0, %xmm0
+ vmovdqu 224(%rsi), %xmm2 /* otherwise the key at offset 224 is the final round key */
+L_AES_XTS_init_vaes_tweak_aes_enc_block_last:
+ vaesenclast %xmm2, %xmm0, %xmm0 /* final round with whichever key xmm2 holds on arrival */
+ vmovdqu %xmm0, (%rdi) /* write the encrypted block back over the input */
+ repz retq
+#ifndef __APPLE__
+.size AES_XTS_init_vaes,.-AES_XTS_init_vaes
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_vaes_aes_xts_gc_xts: /* AND-mask for the one-block tweak doubling (vpshufd $19 / vpsrad $31 / vpand / vpxor), loaded into xmm12 */
+.long 0x00000087,0x00000000,0x00000001,0x00000000 /* dword 0: 0x87 feedback applied when bit 127 is shifted out; dword 2: carry of bit 63 into bit 64 */
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_vaes_aes_xts_poly: /* carry-less multiplier for folding shifted-out tweak bits; broadcast to both lanes of ymm13 */
+.long 0x00000087,0x00000000,0x00000000,0x00000000 /* low qword = 0x87 (x^7 + x^2 + x + 1), high qword = 0 */
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_vaes_aes_xts_shl: /* per-qword left-shift counts for vpsllvq, loaded as 32 bytes into ymm14 */
+.long 0x00000000,0x00000000,0x00000000,0x00000000 /* low lane: shift by 0, the tweak is kept as is */
+.long 0x00000001,0x00000000,0x00000001,0x00000000 /* high lane: shift by 1, the tweak is doubled */
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_vaes_aes_xts_shr: /* per-qword right-shift counts for vpsrlvq, loaded as 32 bytes into ymm15 */
+.long 0x00000040,0x00000000,0x00000040,0x00000000 /* low lane: shift by 64, which clears the lane (nothing to carry) */
+.long 0x0000003f,0x00000000,0x0000003f,0x00000000 /* high lane: shift by 63, leaving only the top bit of each qword */
+#ifndef __APPLE__
+.text
+.globl AES_XTS_encrypt_vaes
+.type AES_XTS_encrypt_vaes,@function
+.align 16
+AES_XTS_encrypt_vaes: /* One-shot XTS encrypt. In: rdi = input, rsi = output, edx = byte count, rcx = 16-byte tweak block, r8 = data round keys, r9 = tweak round keys, first stack argument = round count. */
+#else
+.section __TEXT,__text
+.globl _AES_XTS_encrypt_vaes
+.p2align 4
+_AES_XTS_encrypt_vaes: /* Mach-O spelling of the same entry point; the body below is shared. */
+#endif /* __APPLE__ */
+ pushq %r12
+ pushq %r13
+ movq %rdx, %rax /* eax = total byte count; rdx becomes a scratch pointer/index below */
+ movq %rcx, %r12 /* r12 = tweak block pointer; rcx becomes a scratch pointer below */
+ movl 24(%rsp), %r10d /* r10d = round count: first stack argument, above the two pushes and the return address */
+ subq $0x40, %rsp /* 64 bytes of scratch; the last_15 path uses the first 16 */
+ vmovdqu L_vaes_aes_xts_gc_xts(%rip), %xmm12 /* xmm12 = mask for the one-block tweak doubling */
+ vbroadcasti128 L_vaes_aes_xts_poly(%rip), %ymm13 /* ymm13 = 0x87 multiplier in both lanes */
+ vmovdqu L_vaes_aes_xts_shl(%rip), %ymm14 /* ymm14 = left-shift counts {0,0,1,1} */
+ vmovdqu L_vaes_aes_xts_shr(%rip), %ymm15 /* ymm15 = right-shift counts {64,64,63,63} */
+ vmovdqu (%r12), %xmm8 /* xmm8 = tweak input; it is encrypted below with the keys at r9 */
+ # aes_enc_block
+ vpxor (%r9), %xmm8, %xmm8
+ vmovdqu 16(%r9), %xmm5
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 32(%r9), %xmm5
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 48(%r9), %xmm5
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 64(%r9), %xmm5
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 80(%r9), %xmm5
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 96(%r9), %xmm5
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 112(%r9), %xmm5
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 128(%r9), %xmm5
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 144(%r9), %xmm5
+ vaesenc %xmm5, %xmm8, %xmm8
+ cmpl $11, %r10d /* round count below 11: key at offset 160 is the final round key */
+ vmovdqu 160(%r9), %xmm5
+ jl L_AES_XTS_encrypt_vaes_tweak_aes_enc_block_last
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 176(%r9), %xmm6
+ vaesenc %xmm6, %xmm8, %xmm8
+ cmpl $13, %r10d /* round count below 13: key at offset 192 is the final round key */
+ vmovdqu 192(%r9), %xmm5
+ jl L_AES_XTS_encrypt_vaes_tweak_aes_enc_block_last
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 208(%r9), %xmm6
+ vaesenc %xmm6, %xmm8, %xmm8
+ vmovdqu 224(%r9), %xmm5
+L_AES_XTS_encrypt_vaes_tweak_aes_enc_block_last:
+ vaesenclast %xmm5, %xmm8, %xmm8 /* xmm8 = encrypted tweak T for the first block */
+ xorl %r13d, %r13d /* r13d = byte offset already processed */
+ cmpl $32, %eax
+ jl L_AES_XTS_encrypt_vaes_done_128
+ cmpl $0x80, %eax
+ movl %eax, %r11d /* mov leaves the flags alone, so the jl below still tests the cmpl */
+ jl L_AES_XTS_encrypt_vaes_done_128
+ andl $0xffffff80, %r11d /* r11d = byte count rounded down to a multiple of 128 */
+ vperm2i128 $0x00, %ymm8, %ymm8, %ymm5 /* copy T into both lanes */
+ vpsrlvq %ymm15, %ymm5, %ymm6 /* low lane cleared; high lane keeps the top bit of each qword */
+ vpclmulqdq $0x01, %ymm13, %ymm6, %ymm7 /* bit shifted out of the high qword times 0x87 */
+ vpslldq $8, %ymm6, %ymm6 /* bit shifted out of the low qword moves into the high qword */
+ vpsllvq %ymm14, %ymm5, %ymm4 /* low lane unchanged, high lane shifted left by one */
+ vpxor %ymm7, %ymm4, %ymm4
+ vpxor %ymm6, %ymm4, %ymm4 /* ymm4 = {T, T*x}: tweaks for blocks 0 and 1 */
+ vpsrlq $62, %ymm4, %ymm9
+ vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10
+ vpslldq $8, %ymm9, %ymm9
+ vpsllq $2, %ymm4, %ymm5
+ vpxor %ymm10, %ymm5, %ymm5
+ vpxor %ymm9, %ymm5, %ymm5 /* ymm5 = ymm4 * x^2: tweaks for blocks 2 and 3 */
+ vpsrlq $62, %ymm5, %ymm9
+ vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10
+ vpslldq $8, %ymm9, %ymm9
+ vpsllq $2, %ymm5, %ymm6
+ vpxor %ymm10, %ymm6, %ymm6
+ vpxor %ymm9, %ymm6, %ymm6 /* ymm6 = ymm5 * x^2: tweaks for blocks 4 and 5 */
+ vpsrlq $62, %ymm6, %ymm9
+ vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10
+ vpslldq $8, %ymm9, %ymm9
+ vpsllq $2, %ymm6, %ymm7
+ vpxor %ymm10, %ymm7, %ymm7
+ vpxor %ymm9, %ymm7, %ymm7 /* ymm7 = ymm6 * x^2: tweaks for blocks 6 and 7 */
+L_AES_XTS_encrypt_vaes_enc_128:
+ # 128 bytes of input
+ # aes_enc_128
+ leaq (%rdi,%r13,1), %rcx /* rcx = input + offset */
+ leaq (%rsi,%r13,1), %rdx /* rdx = output + offset */
+ vmovdqu (%rcx), %ymm0
+ vmovdqu 32(%rcx), %ymm1
+ vmovdqu 64(%rcx), %ymm2
+ vmovdqu 96(%rcx), %ymm3
+ # aes_enc_block
+ vbroadcasti128 (%r8), %ymm9 /* round key 0 in both lanes */
+ vpxor %ymm4, %ymm0, %ymm0 /* XOR each block pair with its tweak pair, then with round key 0 */
+ vpxor %ymm9, %ymm0, %ymm0
+ vpxor %ymm5, %ymm1, %ymm1
+ vpxor %ymm9, %ymm1, %ymm1
+ vpxor %ymm6, %ymm2, %ymm2
+ vpxor %ymm9, %ymm2, %ymm2
+ vpxor %ymm7, %ymm3, %ymm3
+ vpxor %ymm9, %ymm3, %ymm3
+ vbroadcasti128 16(%r8), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vaesenc %ymm9, %ymm2, %ymm2
+ vaesenc %ymm9, %ymm3, %ymm3
+ vbroadcasti128 32(%r8), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vaesenc %ymm9, %ymm2, %ymm2
+ vaesenc %ymm9, %ymm3, %ymm3
+ vbroadcasti128 48(%r8), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vaesenc %ymm9, %ymm2, %ymm2
+ vaesenc %ymm9, %ymm3, %ymm3
+ vbroadcasti128 64(%r8), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vaesenc %ymm9, %ymm2, %ymm2
+ vaesenc %ymm9, %ymm3, %ymm3
+ vbroadcasti128 80(%r8), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vaesenc %ymm9, %ymm2, %ymm2
+ vaesenc %ymm9, %ymm3, %ymm3
+ vbroadcasti128 96(%r8), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vaesenc %ymm9, %ymm2, %ymm2
+ vaesenc %ymm9, %ymm3, %ymm3
+ vbroadcasti128 112(%r8), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vaesenc %ymm9, %ymm2, %ymm2
+ vaesenc %ymm9, %ymm3, %ymm3
+ vbroadcasti128 128(%r8), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vaesenc %ymm9, %ymm2, %ymm2
+ vaesenc %ymm9, %ymm3, %ymm3
+ vbroadcasti128 144(%r8), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vaesenc %ymm9, %ymm2, %ymm2
+ vaesenc %ymm9, %ymm3, %ymm3
+ cmpl $11, %r10d /* round count below 11: key at offset 160 is the final round key */
+ vbroadcasti128 160(%r8), %ymm9
+ jl L_AES_XTS_encrypt_vaes_aes_enc_128_aes_enc_block_last
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vaesenc %ymm9, %ymm2, %ymm2
+ vaesenc %ymm9, %ymm3, %ymm3
+ vbroadcasti128 176(%r8), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vaesenc %ymm9, %ymm2, %ymm2
+ vaesenc %ymm9, %ymm3, %ymm3
+ cmpl $13, %r10d /* round count below 13: key at offset 192 is the final round key */
+ vbroadcasti128 192(%r8), %ymm9
+ jl L_AES_XTS_encrypt_vaes_aes_enc_128_aes_enc_block_last
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vaesenc %ymm9, %ymm2, %ymm2
+ vaesenc %ymm9, %ymm3, %ymm3
+ vbroadcasti128 208(%r8), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vaesenc %ymm9, %ymm2, %ymm2
+ vaesenc %ymm9, %ymm3, %ymm3
+ vbroadcasti128 224(%r8), %ymm9
+L_AES_XTS_encrypt_vaes_aes_enc_128_aes_enc_block_last:
+ vaesenclast %ymm9, %ymm0, %ymm0
+ vaesenclast %ymm9, %ymm1, %ymm1
+ vaesenclast %ymm9, %ymm2, %ymm2
+ vaesenclast %ymm9, %ymm3, %ymm3
+ vpxor %ymm4, %ymm0, %ymm0 /* XOR the tweak pair back in after encryption */
+ vmovdqu %ymm0, (%rdx)
+ vpsrlq $56, %ymm4, %ymm9 /* advance this tweak pair by x^8, i.e. 8 blocks, for the next iteration */
+ vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10
+ vpslldq $8, %ymm9, %ymm9
+ vpsllq $8, %ymm4, %ymm4
+ vpxor %ymm10, %ymm4, %ymm4
+ vpxor %ymm9, %ymm4, %ymm4
+ vpxor %ymm5, %ymm1, %ymm1
+ vmovdqu %ymm1, 32(%rdx)
+ vpsrlq $56, %ymm5, %ymm9
+ vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10
+ vpslldq $8, %ymm9, %ymm9
+ vpsllq $8, %ymm5, %ymm5
+ vpxor %ymm10, %ymm5, %ymm5
+ vpxor %ymm9, %ymm5, %ymm5
+ vpxor %ymm6, %ymm2, %ymm2
+ vmovdqu %ymm2, 64(%rdx)
+ vpsrlq $56, %ymm6, %ymm9
+ vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10
+ vpslldq $8, %ymm9, %ymm9
+ vpsllq $8, %ymm6, %ymm6
+ vpxor %ymm10, %ymm6, %ymm6
+ vpxor %ymm9, %ymm6, %ymm6
+ vpxor %ymm7, %ymm3, %ymm3
+ vmovdqu %ymm3, 96(%rdx)
+ vpsrlq $56, %ymm7, %ymm9
+ vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10
+ vpslldq $8, %ymm9, %ymm9
+ vpsllq $8, %ymm7, %ymm7
+ vpxor %ymm10, %ymm7, %ymm7
+ vpxor %ymm9, %ymm7, %ymm7
+ addl $0x80, %r13d
+ cmpl %r11d, %r13d
+ jl L_AES_XTS_encrypt_vaes_enc_128 /* loop while offset < 128-byte-aligned length */
+ vextracti128 $0x00, %ymm4, %xmm8 /* xmm8 = low lane of the advanced ymm4 = tweak for the next block */
+L_AES_XTS_encrypt_vaes_done_128:
+ movl %eax, %r11d
+ andl $0xffffffc0, %r11d /* r11d = byte count rounded down to a multiple of 64 */
+ cmpl %r11d, %r13d
+ je L_AES_XTS_encrypt_vaes_done_64 /* no 64-byte chunk left */
+ # 64 bytes of input
+ # aes_enc_64
+ leaq (%rdi,%r13,1), %rcx
+ leaq (%rsi,%r13,1), %rdx
+ vmovdqu (%rcx), %ymm0
+ vmovdqu 32(%rcx), %ymm1
+ vperm2i128 $0x00, %ymm8, %ymm8, %ymm5 /* rebuild the tweak pairs from xmm8, same steps as before the 128-byte loop */
+ vpsrlvq %ymm15, %ymm5, %ymm6
+ vpclmulqdq $0x01, %ymm13, %ymm6, %ymm7
+ vpslldq $8, %ymm6, %ymm6
+ vpsllvq %ymm14, %ymm5, %ymm4
+ vpxor %ymm7, %ymm4, %ymm4
+ vpxor %ymm6, %ymm4, %ymm4 /* ymm4 = {T, T*x} */
+ vpsrlq $62, %ymm4, %ymm9
+ vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10
+ vpslldq $8, %ymm9, %ymm9
+ vpsllq $2, %ymm4, %ymm5
+ vpxor %ymm10, %ymm5, %ymm5
+ vpxor %ymm9, %ymm5, %ymm5 /* ymm5 = {T*x^2, T*x^3} */
+ # aes_enc_block
+ vbroadcasti128 (%r8), %ymm9
+ vpxor %ymm4, %ymm0, %ymm0
+ vpxor %ymm9, %ymm0, %ymm0
+ vpxor %ymm5, %ymm1, %ymm1
+ vpxor %ymm9, %ymm1, %ymm1
+ vbroadcasti128 16(%r8), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vbroadcasti128 32(%r8), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vbroadcasti128 48(%r8), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vbroadcasti128 64(%r8), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vbroadcasti128 80(%r8), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vbroadcasti128 96(%r8), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vbroadcasti128 112(%r8), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vbroadcasti128 128(%r8), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vbroadcasti128 144(%r8), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ cmpl $11, %r10d
+ vbroadcasti128 160(%r8), %ymm9
+ jl L_AES_XTS_encrypt_vaes_aes_enc_64_aes_enc_block_last
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vbroadcasti128 176(%r8), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ cmpl $13, %r10d
+ vbroadcasti128 192(%r8), %ymm9
+ jl L_AES_XTS_encrypt_vaes_aes_enc_64_aes_enc_block_last
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vbroadcasti128 208(%r8), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vbroadcasti128 224(%r8), %ymm9
+L_AES_XTS_encrypt_vaes_aes_enc_64_aes_enc_block_last:
+ vaesenclast %ymm9, %ymm0, %ymm0
+ vaesenclast %ymm9, %ymm1, %ymm1
+ vpxor %ymm4, %ymm0, %ymm0
+ vmovdqu %ymm0, (%rdx)
+ vpxor %ymm5, %ymm1, %ymm1
+ vmovdqu %ymm1, 32(%rdx)
+ vextracti128 $0x01, %ymm5, %xmm8 /* take the last tweak used (high lane of ymm5) */
+ vpshufd $19, %xmm8, %xmm9 /* ...and double it: dword 0 <- old dword 3, dword 2 <- old dword 1 */
+ vpaddq %xmm8, %xmm8, %xmm8 /* shift each qword left by one */
+ vpsrad $31, %xmm9, %xmm9 /* turn the top bits into all-ones/all-zero dwords */
+ vpand %xmm12, %xmm9, %xmm9 /* keep 0x87 feedback (dword 0) and the bit-63 carry (dword 2) */
+ vpxor %xmm9, %xmm8, %xmm8 /* xmm8 = tweak for the next block */
+ addl $0x40, %r13d
+L_AES_XTS_encrypt_vaes_done_64:
+ movl %eax, %r11d
+ andl $0xffffffe0, %r11d /* r11d = byte count rounded down to a multiple of 32 */
+ cmpl %r11d, %r13d
+ je L_AES_XTS_encrypt_vaes_done_32 /* no 32-byte chunk left */
+ # 32 bytes of input
+ # aes_enc_32
+ leaq (%rdi,%r13,1), %rcx
+ leaq (%rsi,%r13,1), %rdx
+ vmovdqu (%rcx), %ymm0
+ vperm2i128 $0x00, %ymm8, %ymm8, %ymm5
+ vpsrlvq %ymm15, %ymm5, %ymm6
+ vpclmulqdq $0x01, %ymm13, %ymm6, %ymm7
+ vpslldq $8, %ymm6, %ymm6
+ vpsllvq %ymm14, %ymm5, %ymm4
+ vpxor %ymm7, %ymm4, %ymm4
+ vpxor %ymm6, %ymm4, %ymm4 /* ymm4 = {T, T*x} */
+ # aes_enc_block
+ vbroadcasti128 (%r8), %ymm9
+ vpxor %ymm4, %ymm0, %ymm0
+ vpxor %ymm9, %ymm0, %ymm0
+ vbroadcasti128 16(%r8), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vbroadcasti128 32(%r8), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vbroadcasti128 48(%r8), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vbroadcasti128 64(%r8), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vbroadcasti128 80(%r8), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vbroadcasti128 96(%r8), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vbroadcasti128 112(%r8), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vbroadcasti128 128(%r8), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vbroadcasti128 144(%r8), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ cmpl $11, %r10d
+ vbroadcasti128 160(%r8), %ymm9
+ jl L_AES_XTS_encrypt_vaes_aes_enc_32_aes_enc_block_last
+ vaesenc %ymm9, %ymm0, %ymm0
+ vbroadcasti128 176(%r8), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ cmpl $13, %r10d
+ vbroadcasti128 192(%r8), %ymm9
+ jl L_AES_XTS_encrypt_vaes_aes_enc_32_aes_enc_block_last
+ vaesenc %ymm9, %ymm0, %ymm0
+ vbroadcasti128 208(%r8), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vbroadcasti128 224(%r8), %ymm9
+L_AES_XTS_encrypt_vaes_aes_enc_32_aes_enc_block_last:
+ vaesenclast %ymm9, %ymm0, %ymm0
+ vpxor %ymm4, %ymm0, %ymm0
+ vmovdqu %ymm0, (%rdx)
+ vextracti128 $0x01, %ymm4, %xmm8 /* last tweak used (high lane), doubled below for the next block */
+ vpshufd $19, %xmm8, %xmm9
+ vpaddq %xmm8, %xmm8, %xmm8
+ vpsrad $31, %xmm9, %xmm9
+ vpand %xmm12, %xmm9, %xmm9
+ vpxor %xmm9, %xmm8, %xmm8
+ addl $32, %r13d
+L_AES_XTS_encrypt_vaes_done_32:
+ cmpl %eax, %r13d
+ movl %eax, %r11d
+ je L_AES_XTS_encrypt_vaes_done_enc /* everything consumed */
+ subl %r13d, %r11d /* r11d = bytes remaining */
+ cmpl $16, %r11d
+ movl %eax, %r11d
+ jl L_AES_XTS_encrypt_vaes_last_15 /* fewer than 16 bytes left: only a partial block */
+ andl $0xfffffff0, %r11d /* r11d = byte count rounded down to a multiple of 16 */
+ # 16 bytes of input
+L_AES_XTS_encrypt_vaes_enc_16:
+ leaq (%rdi,%r13,1), %rcx
+ vmovdqu (%rcx), %xmm0
+ vpxor %xmm8, %xmm0, %xmm0 /* XOR with the tweak before encryption */
+ # aes_enc_block
+ vpxor (%r8), %xmm0, %xmm0
+ vmovdqu 16(%r8), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 32(%r8), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 48(%r8), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 64(%r8), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 80(%r8), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 96(%r8), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 112(%r8), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 128(%r8), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 144(%r8), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ cmpl $11, %r10d
+ vmovdqu 160(%r8), %xmm5
+ jl L_AES_XTS_encrypt_vaes_aes_enc_block_last
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 176(%r8), %xmm6
+ vaesenc %xmm6, %xmm0, %xmm0
+ cmpl $13, %r10d
+ vmovdqu 192(%r8), %xmm5
+ jl L_AES_XTS_encrypt_vaes_aes_enc_block_last
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 208(%r8), %xmm6
+ vaesenc %xmm6, %xmm0, %xmm0
+ vmovdqu 224(%r8), %xmm5
+L_AES_XTS_encrypt_vaes_aes_enc_block_last:
+ vaesenclast %xmm5, %xmm0, %xmm0
+ vpxor %xmm8, %xmm0, %xmm0 /* XOR with the tweak after encryption */
+ leaq (%rsi,%r13,1), %rcx
+ vmovdqu %xmm0, (%rcx)
+ vpshufd $19, %xmm8, %xmm4 /* double the tweak for the next block */
+ vpaddq %xmm8, %xmm8, %xmm8
+ vpsrad $31, %xmm4, %xmm4
+ vpand %xmm12, %xmm4, %xmm4
+ vpxor %xmm4, %xmm8, %xmm8
+ addl $16, %r13d
+ cmpl %r11d, %r13d
+ jl L_AES_XTS_encrypt_vaes_enc_16
+ cmpl %eax, %r13d
+ je L_AES_XTS_encrypt_vaes_done_enc /* length was a multiple of 16: nothing to steal */
+L_AES_XTS_encrypt_vaes_last_15: /* partial final block. NOTE(review): reads the output block at offset r13-16, so this assumes total size >= 16 - confirm callers guarantee it */
+ subq $16, %r13
+ leaq (%rsi,%r13,1), %rcx
+ vmovdqu (%rcx), %xmm0 /* reload the previously written output block */
+ addq $16, %r13
+ vmovdqu %xmm0, (%rsp) /* ...into the stack scratch */
+ xorq %rdx, %rdx /* rdx = index into the scratch block */
+L_AES_XTS_encrypt_vaes_last_15_byte_loop:
+ movb (%rsp,%rdx,1), %r11b /* byte of the previous output block */
+ movb (%rdi,%r13,1), %cl /* trailing input byte */
+ movb %r11b, (%rsi,%r13,1) /* previous output byte becomes the trailing output byte */
+ movb %cl, (%rsp,%rdx,1) /* trailing input byte takes its place in the scratch block */
+ incl %r13d
+ incl %edx
+ cmpl %eax, %r13d
+ jl L_AES_XTS_encrypt_vaes_last_15_byte_loop
+ subq %rdx, %r13 /* rewind to the start of the partial block */
+ vmovdqu (%rsp), %xmm0 /* merged block: trailing input bytes followed by the rest of the previous output block */
+ subq $16, %r13 /* ...and back one more block: the result overwrites the previous output block */
+ vpxor %xmm8, %xmm0, %xmm0
+ # aes_enc_block
+ vpxor (%r8), %xmm0, %xmm0
+ vmovdqu 16(%r8), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 32(%r8), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 48(%r8), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 64(%r8), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 80(%r8), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 96(%r8), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 112(%r8), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 128(%r8), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 144(%r8), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ cmpl $11, %r10d
+ vmovdqu 160(%r8), %xmm5
+ jl L_AES_XTS_encrypt_vaes_last_15_aes_enc_block_last
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 176(%r8), %xmm6
+ vaesenc %xmm6, %xmm0, %xmm0
+ cmpl $13, %r10d
+ vmovdqu 192(%r8), %xmm5
+ jl L_AES_XTS_encrypt_vaes_last_15_aes_enc_block_last
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 208(%r8), %xmm6
+ vaesenc %xmm6, %xmm0, %xmm0
+ vmovdqu 224(%r8), %xmm5
+L_AES_XTS_encrypt_vaes_last_15_aes_enc_block_last:
+ vaesenclast %xmm5, %xmm0, %xmm0
+ vpxor %xmm8, %xmm0, %xmm0
+ leaq (%rsi,%r13,1), %rcx
+ vmovdqu %xmm0, (%rcx)
+L_AES_XTS_encrypt_vaes_done_enc:
+ addq $0x40, %rsp /* release the scratch area; its contents are not cleared */
+ popq %r13
+ popq %r12
+ repz retq /* NOTE(review): ymm registers are written above and no vzeroupper is issued before returning - confirm this is intended */
+#ifndef __APPLE__
+.size AES_XTS_encrypt_vaes,.-AES_XTS_encrypt_vaes
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl AES_XTS_encrypt_update_vaes
+.type AES_XTS_encrypt_update_vaes,@function
+.align 16
+AES_XTS_encrypt_update_vaes: /* Streaming XTS encrypt. In: rdi = input, rsi = output, edx = byte count, rcx = data round keys, r8 = 16-byte running tweak (read at entry, written at exit), r9d = round count. */
+#else
+.section __TEXT,__text
+.globl _AES_XTS_encrypt_update_vaes
+.p2align 4
+_AES_XTS_encrypt_update_vaes: /* Mach-O spelling of the same entry point; the body below is shared. */
+#endif /* __APPLE__ */
+ pushq %r12
+ movq %rdx, %rax /* eax = total byte count; rdx becomes a scratch pointer/index below */
+ movq %rcx, %r10 /* r10 = round keys; rcx becomes a scratch pointer below */
+ subq $0x40, %rsp /* 64 bytes of scratch; the last_15 path uses the first 16 */
+ vmovdqu L_vaes_aes_xts_gc_xts(%rip), %xmm12 /* xmm12 = mask for the one-block tweak doubling */
+ vbroadcasti128 L_vaes_aes_xts_poly(%rip), %ymm13 /* ymm13 = 0x87 multiplier in both lanes */
+ vmovdqu L_vaes_aes_xts_shl(%rip), %ymm14 /* ymm14 = left-shift counts {0,0,1,1} */
+ vmovdqu L_vaes_aes_xts_shr(%rip), %ymm15 /* ymm15 = right-shift counts {64,64,63,63} */
+ vmovdqu (%r8), %xmm8 /* xmm8 = running tweak T, used as loaded (no tweak encryption in this routine) */
+ xorl %r12d, %r12d /* r12d = byte offset already processed */
+ cmpl $32, %eax
+ jl L_AES_XTS_encrypt_update_vaes_done_128
+ cmpl $0x80, %eax
+ movl %eax, %r11d /* mov leaves the flags alone, so the jl below still tests the cmpl */
+ jl L_AES_XTS_encrypt_update_vaes_done_128
+ andl $0xffffff80, %r11d /* r11d = byte count rounded down to a multiple of 128 */
+ vperm2i128 $0x00, %ymm8, %ymm8, %ymm5 /* copy T into both lanes */
+ vpsrlvq %ymm15, %ymm5, %ymm6 /* low lane cleared; high lane keeps the top bit of each qword */
+ vpclmulqdq $0x01, %ymm13, %ymm6, %ymm7 /* bit shifted out of the high qword times 0x87 */
+ vpslldq $8, %ymm6, %ymm6 /* bit shifted out of the low qword moves into the high qword */
+ vpsllvq %ymm14, %ymm5, %ymm4 /* low lane unchanged, high lane shifted left by one */
+ vpxor %ymm7, %ymm4, %ymm4
+ vpxor %ymm6, %ymm4, %ymm4 /* ymm4 = {T, T*x}: tweaks for blocks 0 and 1 */
+ vpsrlq $62, %ymm4, %ymm9
+ vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10
+ vpslldq $8, %ymm9, %ymm9
+ vpsllq $2, %ymm4, %ymm5
+ vpxor %ymm10, %ymm5, %ymm5
+ vpxor %ymm9, %ymm5, %ymm5 /* ymm5 = ymm4 * x^2: tweaks for blocks 2 and 3 */
+ vpsrlq $62, %ymm5, %ymm9
+ vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10
+ vpslldq $8, %ymm9, %ymm9
+ vpsllq $2, %ymm5, %ymm6
+ vpxor %ymm10, %ymm6, %ymm6
+ vpxor %ymm9, %ymm6, %ymm6 /* ymm6 = ymm5 * x^2: tweaks for blocks 4 and 5 */
+ vpsrlq $62, %ymm6, %ymm9
+ vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10
+ vpslldq $8, %ymm9, %ymm9
+ vpsllq $2, %ymm6, %ymm7
+ vpxor %ymm10, %ymm7, %ymm7
+ vpxor %ymm9, %ymm7, %ymm7 /* ymm7 = ymm6 * x^2: tweaks for blocks 6 and 7 */
+L_AES_XTS_encrypt_update_vaes_enc_128:
+ # 128 bytes of input
+ # aes_enc_128
+ leaq (%rdi,%r12,1), %rcx /* rcx = input + offset */
+ leaq (%rsi,%r12,1), %rdx /* rdx = output + offset */
+ vmovdqu (%rcx), %ymm0
+ vmovdqu 32(%rcx), %ymm1
+ vmovdqu 64(%rcx), %ymm2
+ vmovdqu 96(%rcx), %ymm3
+ # aes_enc_block
+ vbroadcasti128 (%r10), %ymm9 /* round key 0 in both lanes */
+ vpxor %ymm4, %ymm0, %ymm0 /* XOR each block pair with its tweak pair, then with round key 0 */
+ vpxor %ymm9, %ymm0, %ymm0
+ vpxor %ymm5, %ymm1, %ymm1
+ vpxor %ymm9, %ymm1, %ymm1
+ vpxor %ymm6, %ymm2, %ymm2
+ vpxor %ymm9, %ymm2, %ymm2
+ vpxor %ymm7, %ymm3, %ymm3
+ vpxor %ymm9, %ymm3, %ymm3
+ vbroadcasti128 16(%r10), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vaesenc %ymm9, %ymm2, %ymm2
+ vaesenc %ymm9, %ymm3, %ymm3
+ vbroadcasti128 32(%r10), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vaesenc %ymm9, %ymm2, %ymm2
+ vaesenc %ymm9, %ymm3, %ymm3
+ vbroadcasti128 48(%r10), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vaesenc %ymm9, %ymm2, %ymm2
+ vaesenc %ymm9, %ymm3, %ymm3
+ vbroadcasti128 64(%r10), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vaesenc %ymm9, %ymm2, %ymm2
+ vaesenc %ymm9, %ymm3, %ymm3
+ vbroadcasti128 80(%r10), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vaesenc %ymm9, %ymm2, %ymm2
+ vaesenc %ymm9, %ymm3, %ymm3
+ vbroadcasti128 96(%r10), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vaesenc %ymm9, %ymm2, %ymm2
+ vaesenc %ymm9, %ymm3, %ymm3
+ vbroadcasti128 112(%r10), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vaesenc %ymm9, %ymm2, %ymm2
+ vaesenc %ymm9, %ymm3, %ymm3
+ vbroadcasti128 128(%r10), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vaesenc %ymm9, %ymm2, %ymm2
+ vaesenc %ymm9, %ymm3, %ymm3
+ vbroadcasti128 144(%r10), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vaesenc %ymm9, %ymm2, %ymm2
+ vaesenc %ymm9, %ymm3, %ymm3
+ cmpl $11, %r9d /* round count below 11: key at offset 160 is the final round key */
+ vbroadcasti128 160(%r10), %ymm9
+ jl L_AES_XTS_encrypt_update_vaes_aes_enc_128_aes_enc_block_last
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vaesenc %ymm9, %ymm2, %ymm2
+ vaesenc %ymm9, %ymm3, %ymm3
+ vbroadcasti128 176(%r10), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vaesenc %ymm9, %ymm2, %ymm2
+ vaesenc %ymm9, %ymm3, %ymm3
+ cmpl $13, %r9d /* round count below 13: key at offset 192 is the final round key */
+ vbroadcasti128 192(%r10), %ymm9
+ jl L_AES_XTS_encrypt_update_vaes_aes_enc_128_aes_enc_block_last
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vaesenc %ymm9, %ymm2, %ymm2
+ vaesenc %ymm9, %ymm3, %ymm3
+ vbroadcasti128 208(%r10), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vaesenc %ymm9, %ymm2, %ymm2
+ vaesenc %ymm9, %ymm3, %ymm3
+ vbroadcasti128 224(%r10), %ymm9
+L_AES_XTS_encrypt_update_vaes_aes_enc_128_aes_enc_block_last:
+ vaesenclast %ymm9, %ymm0, %ymm0
+ vaesenclast %ymm9, %ymm1, %ymm1
+ vaesenclast %ymm9, %ymm2, %ymm2
+ vaesenclast %ymm9, %ymm3, %ymm3
+ vpxor %ymm4, %ymm0, %ymm0 /* XOR the tweak pair back in after encryption */
+ vmovdqu %ymm0, (%rdx)
+ vpsrlq $56, %ymm4, %ymm9 /* advance this tweak pair by x^8, i.e. 8 blocks, for the next iteration */
+ vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10
+ vpslldq $8, %ymm9, %ymm9
+ vpsllq $8, %ymm4, %ymm4
+ vpxor %ymm10, %ymm4, %ymm4
+ vpxor %ymm9, %ymm4, %ymm4
+ vpxor %ymm5, %ymm1, %ymm1
+ vmovdqu %ymm1, 32(%rdx)
+ vpsrlq $56, %ymm5, %ymm9
+ vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10
+ vpslldq $8, %ymm9, %ymm9
+ vpsllq $8, %ymm5, %ymm5
+ vpxor %ymm10, %ymm5, %ymm5
+ vpxor %ymm9, %ymm5, %ymm5
+ vpxor %ymm6, %ymm2, %ymm2
+ vmovdqu %ymm2, 64(%rdx)
+ vpsrlq $56, %ymm6, %ymm9
+ vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10
+ vpslldq $8, %ymm9, %ymm9
+ vpsllq $8, %ymm6, %ymm6
+ vpxor %ymm10, %ymm6, %ymm6
+ vpxor %ymm9, %ymm6, %ymm6
+ vpxor %ymm7, %ymm3, %ymm3
+ vmovdqu %ymm3, 96(%rdx)
+ vpsrlq $56, %ymm7, %ymm9
+ vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10
+ vpslldq $8, %ymm9, %ymm9
+ vpsllq $8, %ymm7, %ymm7
+ vpxor %ymm10, %ymm7, %ymm7
+ vpxor %ymm9, %ymm7, %ymm7
+ addl $0x80, %r12d
+ cmpl %r11d, %r12d
+ jl L_AES_XTS_encrypt_update_vaes_enc_128 /* loop while offset < 128-byte-aligned length */
+ vextracti128 $0x00, %ymm4, %xmm8 /* xmm8 = low lane of the advanced ymm4 = tweak for the next block */
+L_AES_XTS_encrypt_update_vaes_done_128:
+ movl %eax, %r11d
+ andl $0xffffffc0, %r11d /* r11d = byte count rounded down to a multiple of 64 */
+ cmpl %r11d, %r12d
+ je L_AES_XTS_encrypt_update_vaes_done_64 /* no 64-byte chunk left */
+ # 64 bytes of input
+ # aes_enc_64
+ leaq (%rdi,%r12,1), %rcx
+ leaq (%rsi,%r12,1), %rdx
+ vmovdqu (%rcx), %ymm0
+ vmovdqu 32(%rcx), %ymm1
+ vperm2i128 $0x00, %ymm8, %ymm8, %ymm5 /* rebuild the tweak pairs from xmm8, same steps as before the 128-byte loop */
+ vpsrlvq %ymm15, %ymm5, %ymm6
+ vpclmulqdq $0x01, %ymm13, %ymm6, %ymm7
+ vpslldq $8, %ymm6, %ymm6
+ vpsllvq %ymm14, %ymm5, %ymm4
+ vpxor %ymm7, %ymm4, %ymm4
+ vpxor %ymm6, %ymm4, %ymm4 /* ymm4 = {T, T*x} */
+ vpsrlq $62, %ymm4, %ymm9
+ vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10
+ vpslldq $8, %ymm9, %ymm9
+ vpsllq $2, %ymm4, %ymm5
+ vpxor %ymm10, %ymm5, %ymm5
+ vpxor %ymm9, %ymm5, %ymm5 /* ymm5 = {T*x^2, T*x^3} */
+ # aes_enc_block
+ vbroadcasti128 (%r10), %ymm9
+ vpxor %ymm4, %ymm0, %ymm0
+ vpxor %ymm9, %ymm0, %ymm0
+ vpxor %ymm5, %ymm1, %ymm1
+ vpxor %ymm9, %ymm1, %ymm1
+ vbroadcasti128 16(%r10), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vbroadcasti128 32(%r10), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vbroadcasti128 48(%r10), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vbroadcasti128 64(%r10), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vbroadcasti128 80(%r10), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vbroadcasti128 96(%r10), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vbroadcasti128 112(%r10), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vbroadcasti128 128(%r10), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vbroadcasti128 144(%r10), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ cmpl $11, %r9d
+ vbroadcasti128 160(%r10), %ymm9
+ jl L_AES_XTS_encrypt_update_vaes_aes_enc_64_aes_enc_block_last
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vbroadcasti128 176(%r10), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ cmpl $13, %r9d
+ vbroadcasti128 192(%r10), %ymm9
+ jl L_AES_XTS_encrypt_update_vaes_aes_enc_64_aes_enc_block_last
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vbroadcasti128 208(%r10), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vaesenc %ymm9, %ymm1, %ymm1
+ vbroadcasti128 224(%r10), %ymm9
+L_AES_XTS_encrypt_update_vaes_aes_enc_64_aes_enc_block_last:
+ vaesenclast %ymm9, %ymm0, %ymm0
+ vaesenclast %ymm9, %ymm1, %ymm1
+ vpxor %ymm4, %ymm0, %ymm0
+ vmovdqu %ymm0, (%rdx)
+ vpxor %ymm5, %ymm1, %ymm1
+ vmovdqu %ymm1, 32(%rdx)
+ vextracti128 $0x01, %ymm5, %xmm8 /* take the last tweak used (high lane of ymm5) */
+ vpshufd $19, %xmm8, %xmm9 /* ...and double it: dword 0 <- old dword 3, dword 2 <- old dword 1 */
+ vpaddq %xmm8, %xmm8, %xmm8 /* shift each qword left by one */
+ vpsrad $31, %xmm9, %xmm9 /* turn the top bits into all-ones/all-zero dwords */
+ vpand %xmm12, %xmm9, %xmm9 /* keep 0x87 feedback (dword 0) and the bit-63 carry (dword 2) */
+ vpxor %xmm9, %xmm8, %xmm8 /* xmm8 = tweak for the next block */
+ addl $0x40, %r12d
+L_AES_XTS_encrypt_update_vaes_done_64:
+ movl %eax, %r11d
+ andl $0xffffffe0, %r11d /* r11d = byte count rounded down to a multiple of 32 */
+ cmpl %r11d, %r12d
+ je L_AES_XTS_encrypt_update_vaes_done_32 /* no 32-byte chunk left */
+ # 32 bytes of input
+ # aes_enc_32
+ leaq (%rdi,%r12,1), %rcx
+ leaq (%rsi,%r12,1), %rdx
+ vmovdqu (%rcx), %ymm0
+ vperm2i128 $0x00, %ymm8, %ymm8, %ymm5
+ vpsrlvq %ymm15, %ymm5, %ymm6
+ vpclmulqdq $0x01, %ymm13, %ymm6, %ymm7
+ vpslldq $8, %ymm6, %ymm6
+ vpsllvq %ymm14, %ymm5, %ymm4
+ vpxor %ymm7, %ymm4, %ymm4
+ vpxor %ymm6, %ymm4, %ymm4 /* ymm4 = {T, T*x} */
+ # aes_enc_block
+ vbroadcasti128 (%r10), %ymm9
+ vpxor %ymm4, %ymm0, %ymm0
+ vpxor %ymm9, %ymm0, %ymm0
+ vbroadcasti128 16(%r10), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vbroadcasti128 32(%r10), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vbroadcasti128 48(%r10), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vbroadcasti128 64(%r10), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vbroadcasti128 80(%r10), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vbroadcasti128 96(%r10), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vbroadcasti128 112(%r10), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vbroadcasti128 128(%r10), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vbroadcasti128 144(%r10), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ cmpl $11, %r9d
+ vbroadcasti128 160(%r10), %ymm9
+ jl L_AES_XTS_encrypt_update_vaes_aes_enc_32_aes_enc_block_last
+ vaesenc %ymm9, %ymm0, %ymm0
+ vbroadcasti128 176(%r10), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ cmpl $13, %r9d
+ vbroadcasti128 192(%r10), %ymm9
+ jl L_AES_XTS_encrypt_update_vaes_aes_enc_32_aes_enc_block_last
+ vaesenc %ymm9, %ymm0, %ymm0
+ vbroadcasti128 208(%r10), %ymm9
+ vaesenc %ymm9, %ymm0, %ymm0
+ vbroadcasti128 224(%r10), %ymm9
+L_AES_XTS_encrypt_update_vaes_aes_enc_32_aes_enc_block_last:
+ vaesenclast %ymm9, %ymm0, %ymm0
+ vpxor %ymm4, %ymm0, %ymm0
+ vmovdqu %ymm0, (%rdx)
+ vextracti128 $0x01, %ymm4, %xmm8 /* last tweak used (high lane), doubled below for the next block */
+ vpshufd $19, %xmm8, %xmm9
+ vpaddq %xmm8, %xmm8, %xmm8
+ vpsrad $31, %xmm9, %xmm9
+ vpand %xmm12, %xmm9, %xmm9
+ vpxor %xmm9, %xmm8, %xmm8
+ addl $32, %r12d
+L_AES_XTS_encrypt_update_vaes_done_32:
+ cmpl %eax, %r12d
+ movl %eax, %r11d
+ je L_AES_XTS_encrypt_update_vaes_done_enc /* everything consumed */
+ subl %r12d, %r11d /* r11d = bytes remaining */
+ cmpl $16, %r11d
+ movl %eax, %r11d
+ jl L_AES_XTS_encrypt_update_vaes_last_15 /* fewer than 16 bytes left: only a partial block */
+ andl $0xfffffff0, %r11d /* r11d = byte count rounded down to a multiple of 16 */
+ # 16 bytes of input
+L_AES_XTS_encrypt_update_vaes_enc_16:
+ leaq (%rdi,%r12,1), %rcx
+ vmovdqu (%rcx), %xmm0
+ vpxor %xmm8, %xmm0, %xmm0 /* XOR with the tweak before encryption */
+ # aes_enc_block
+ vpxor (%r10), %xmm0, %xmm0
+ vmovdqu 16(%r10), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 32(%r10), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 48(%r10), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 64(%r10), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 80(%r10), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 96(%r10), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 112(%r10), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 128(%r10), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 144(%r10), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ cmpl $11, %r9d
+ vmovdqu 160(%r10), %xmm5
+ jl L_AES_XTS_encrypt_update_vaes_aes_enc_block_last
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 176(%r10), %xmm6
+ vaesenc %xmm6, %xmm0, %xmm0
+ cmpl $13, %r9d
+ vmovdqu 192(%r10), %xmm5
+ jl L_AES_XTS_encrypt_update_vaes_aes_enc_block_last
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 208(%r10), %xmm6
+ vaesenc %xmm6, %xmm0, %xmm0
+ vmovdqu 224(%r10), %xmm5
+L_AES_XTS_encrypt_update_vaes_aes_enc_block_last:
+ vaesenclast %xmm5, %xmm0, %xmm0
+ vpxor %xmm8, %xmm0, %xmm0 /* XOR with the tweak after encryption */
+ leaq (%rsi,%r12,1), %rcx
+ vmovdqu %xmm0, (%rcx)
+ vpshufd $19, %xmm8, %xmm4 /* double the tweak for the next block */
+ vpaddq %xmm8, %xmm8, %xmm8
+ vpsrad $31, %xmm4, %xmm4
+ vpand %xmm12, %xmm4, %xmm4
+ vpxor %xmm4, %xmm8, %xmm8
+ addl $16, %r12d
+ cmpl %r11d, %r12d
+ jl L_AES_XTS_encrypt_update_vaes_enc_16
+ cmpl %eax, %r12d
+ je L_AES_XTS_encrypt_update_vaes_done_enc /* length was a multiple of 16: nothing to steal */
+L_AES_XTS_encrypt_update_vaes_last_15: /* partial final block. NOTE(review): reads the output block at offset r12-16, so this assumes a full block was already written by this call - confirm callers guarantee it */
+ subq $16, %r12
+ leaq (%rsi,%r12,1), %rcx
+ vmovdqu (%rcx), %xmm0 /* reload the previously written output block */
+ addq $16, %r12
+ vmovdqu %xmm0, (%rsp) /* ...into the stack scratch */
+ xorq %rdx, %rdx /* rdx = index into the scratch block */
+L_AES_XTS_encrypt_update_vaes_last_15_byte_loop:
+ movb (%rsp,%rdx,1), %r11b /* byte of the previous output block */
+ movb (%rdi,%r12,1), %cl /* trailing input byte */
+ movb %r11b, (%rsi,%r12,1) /* previous output byte becomes the trailing output byte */
+ movb %cl, (%rsp,%rdx,1) /* trailing input byte takes its place in the scratch block */
+ incl %r12d
+ incl %edx
+ cmpl %eax, %r12d
+ jl L_AES_XTS_encrypt_update_vaes_last_15_byte_loop
+ subq %rdx, %r12 /* rewind to the start of the partial block */
+ vmovdqu (%rsp), %xmm0 /* merged block: trailing input bytes followed by the rest of the previous output block */
+ subq $16, %r12 /* ...and back one more block: the result overwrites the previous output block */
+ vpxor %xmm8, %xmm0, %xmm0
+ # aes_enc_block
+ vpxor (%r10), %xmm0, %xmm0
+ vmovdqu 16(%r10), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 32(%r10), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 48(%r10), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 64(%r10), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 80(%r10), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 96(%r10), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 112(%r10), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 128(%r10), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 144(%r10), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ cmpl $11, %r9d
+ vmovdqu 160(%r10), %xmm5
+ jl L_AES_XTS_encrypt_update_vaes_last_15_aes_enc_block_last
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 176(%r10), %xmm6
+ vaesenc %xmm6, %xmm0, %xmm0
+ cmpl $13, %r9d
+ vmovdqu 192(%r10), %xmm5
+ jl L_AES_XTS_encrypt_update_vaes_last_15_aes_enc_block_last
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 208(%r10), %xmm6
+ vaesenc %xmm6, %xmm0, %xmm0
+ vmovdqu 224(%r10), %xmm5
+L_AES_XTS_encrypt_update_vaes_last_15_aes_enc_block_last:
+ vaesenclast %xmm5, %xmm0, %xmm0
+ vpxor %xmm8, %xmm0, %xmm0
+ leaq (%rsi,%r12,1), %rcx
+ vmovdqu %xmm0, (%rcx)
+L_AES_XTS_encrypt_update_vaes_done_enc:
+ vmovdqu %xmm8, (%r8) /* store the running tweak back so a later call can continue from it */
+ addq $0x40, %rsp /* release the scratch area; its contents are not cleared */
+ popq %r12
+ repz retq /* NOTE(review): ymm registers are written above and no vzeroupper is issued before returning - confirm this is intended */
+#ifndef __APPLE__
+.size AES_XTS_encrypt_update_vaes,.-AES_XTS_encrypt_update_vaes
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text /* AES_XTS_decrypt_vaes: AES-XTS decryption of a buffer, up to 8 blocks per iteration with 256-bit vaesdec. */
+.globl AES_XTS_decrypt_vaes /* Registers as read below: rdi = input base, rsi = output base, rdx = byte count, */
+.type AES_XTS_decrypt_vaes,@function /* rcx = pointer to the 16-byte tweak input, r8 = round keys used by vaesdec, */
+.align 16 /* r9 = round keys used to encrypt the tweak, first stack argument = round count selector. */
+AES_XTS_decrypt_vaes: /* Internally: rax = byte count, r13 = bytes done so far, r11 = loop limit, xmm8 = running tweak. */
+#else
+.section __TEXT,__text
+.globl _AES_XTS_decrypt_vaes
+.p2align 4
+_AES_XTS_decrypt_vaes:
+#endif /* __APPLE__ */
+ pushq %r12 /* r12 and r13 are used as scratch and restored before return */
+ pushq %r13
+ movq %rdx, %rax /* rax = byte count; rdx is reused as a pointer/index later */
+ movq %rcx, %r12 /* r12 = tweak input pointer; rcx is reused as a pointer later */
+ movl 24(%rsp), %r10d /* r10d = stack argument (above return address and the two pushes); compared with 11 and 13 below */
+ subq $0x40, %rsp /* 64 bytes of stack scratch; (%rsp) is used for the partial-block tail */
+ vmovdqu L_vaes_aes_xts_gc_xts(%rip), %xmm12 /* xmm12 = mask for the one-block tweak update (vpand operand) */
+ vbroadcasti128 L_vaes_aes_xts_poly(%rip), %ymm13 /* ymm13 = 128-bit constant copied to both lanes (vpclmulqdq operand) */
+ vmovdqu L_vaes_aes_xts_shl(%rip), %ymm14 /* ymm14 = per-qword counts for vpsllvq */
+ vmovdqu L_vaes_aes_xts_shr(%rip), %ymm15 /* ymm15 = per-qword counts for vpsrlvq */
+ vmovdqu (%r12), %xmm8 /* xmm8 = 16-byte tweak input */
+ # aes_enc_block
+ vpxor (%r9), %xmm8, %xmm8 /* encrypt the tweak with the key schedule at r9: round key 0 first */
+ vmovdqu 16(%r9), %xmm5
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 32(%r9), %xmm5
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 48(%r9), %xmm5
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 64(%r9), %xmm5
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 80(%r9), %xmm5
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 96(%r9), %xmm5
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 112(%r9), %xmm5
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 128(%r9), %xmm5
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 144(%r9), %xmm5
+ vaesenc %xmm5, %xmm8, %xmm8
+ cmpl $11, %r10d /* selector below 11: the key at offset 160 is the final round key */
+ vmovdqu 160(%r9), %xmm5
+ jl L_AES_XTS_decrypt_vaes_tweak_aes_enc_block_last
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 176(%r9), %xmm6
+ vaesenc %xmm6, %xmm8, %xmm8
+ cmpl $13, %r10d /* selector below 13: the key at offset 192 is the final round key, else the one at 224 */
+ vmovdqu 192(%r9), %xmm5
+ jl L_AES_XTS_decrypt_vaes_tweak_aes_enc_block_last
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 208(%r9), %xmm6
+ vaesenc %xmm6, %xmm8, %xmm8
+ vmovdqu 224(%r9), %xmm5
+L_AES_XTS_decrypt_vaes_tweak_aes_enc_block_last:
+ vaesenclast %xmm5, %xmm8, %xmm8 /* xmm8 = encrypted tweak for the first block */
+ xorl %r13d, %r13d /* r13 = 0 bytes processed */
+ movl %eax, %r11d
+ andl $0xfffffff0, %r11d /* r11 = byte count rounded down to a multiple of 16 */
+ cmpl %eax, %r11d
+ je L_AES_XTS_decrypt_vaes_mul16_128 /* count is a multiple of 16: no partial tail */
+ subl $16, %r11d /* partial tail present: hold the last full block back for the tail code */
+ cmpl $16, %r11d
+ jl L_AES_XTS_decrypt_vaes_last_31_start /* nothing but the held-back block and the tail remain */
+L_AES_XTS_decrypt_vaes_mul16_128:
+ cmpl $32, %r11d /* same target as the 0x80 test below, which subsumes this one */
+ jl L_AES_XTS_decrypt_vaes_done_128
+ cmpl $0x80, %r11d
+ jl L_AES_XTS_decrypt_vaes_done_128 /* fewer than 128 bytes: skip the 8-block loop */
+ andl $0xffffff80, %r11d /* r11 = loop limit, a multiple of 128 */
+ vperm2i128 $0x00, %ymm8, %ymm8, %ymm5 /* copy the low 128 bits of ymm8 (the tweak) into both lanes */
+ vpsrlvq %ymm15, %ymm5, %ymm6 /* per-qword right shift by the shr table: the bits the left shift drops */
+ vpclmulqdq $0x01, %ymm13, %ymm6, %ymm7 /* high qword of ymm6 times low qword of ymm13, per lane */
+ vpslldq $8, %ymm6, %ymm6 /* move low-qword spill into the high qword of each lane */
+ vpsllvq %ymm14, %ymm5, %ymm4 /* per-qword left shift by the shl table */
+ vpxor %ymm7, %ymm4, %ymm4
+ vpxor %ymm6, %ymm4, %ymm4 /* ymm4 = tweaks for blocks 0 and 1 (NOTE(review): shl/shr tables are outside this view; presumably shifts of 0 and 1) */
+ vpsrlq $62, %ymm4, %ymm9 /* top 2 bits of each qword */
+ vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10
+ vpslldq $8, %ymm9, %ymm9
+ vpsllq $2, %ymm4, %ymm5
+ vpxor %ymm10, %ymm5, %ymm5
+ vpxor %ymm9, %ymm5, %ymm5 /* ymm5 = ymm4 shifted left 2 bits per lane with the spill folded back: tweaks for blocks 2 and 3 */
+ vpsrlq $62, %ymm5, %ymm9
+ vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10
+ vpslldq $8, %ymm9, %ymm9
+ vpsllq $2, %ymm5, %ymm6
+ vpxor %ymm10, %ymm6, %ymm6
+ vpxor %ymm9, %ymm6, %ymm6 /* ymm6 = same step applied to ymm5: tweaks for blocks 4 and 5 */
+ vpsrlq $62, %ymm6, %ymm9
+ vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10
+ vpslldq $8, %ymm9, %ymm9
+ vpsllq $2, %ymm6, %ymm7
+ vpxor %ymm10, %ymm7, %ymm7
+ vpxor %ymm9, %ymm7, %ymm7 /* ymm7 = same step applied to ymm6: tweaks for blocks 6 and 7 */
+L_AES_XTS_decrypt_vaes_dec_128: /* loop body: 8 blocks per iteration in ymm0..ymm3 */
+ # 128 bytes of input
+ # aes_dec_128
+ leaq (%rdi,%r13,1), %rcx /* rcx = input + offset */
+ leaq (%rsi,%r13,1), %rdx /* rdx = output + offset */
+ vmovdqu (%rcx), %ymm0
+ vmovdqu 32(%rcx), %ymm1
+ vmovdqu 64(%rcx), %ymm2
+ vmovdqu 96(%rcx), %ymm3
+ # aes_dec_block
+ vbroadcasti128 (%r8), %ymm9 /* round key 0 in both lanes */
+ vpxor %ymm4, %ymm0, %ymm0 /* xor each block pair with its tweaks, then with round key 0 */
+ vpxor %ymm9, %ymm0, %ymm0
+ vpxor %ymm5, %ymm1, %ymm1
+ vpxor %ymm9, %ymm1, %ymm1
+ vpxor %ymm6, %ymm2, %ymm2
+ vpxor %ymm9, %ymm2, %ymm2
+ vpxor %ymm7, %ymm3, %ymm3
+ vpxor %ymm9, %ymm3, %ymm3
+ vbroadcasti128 16(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ vbroadcasti128 32(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ vbroadcasti128 48(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ vbroadcasti128 64(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ vbroadcasti128 80(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ vbroadcasti128 96(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ vbroadcasti128 112(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ vbroadcasti128 128(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ vbroadcasti128 144(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ cmpl $11, %r10d /* same round-count selection as for the tweak encryption above */
+ vbroadcasti128 160(%r8), %ymm9
+ jl L_AES_XTS_decrypt_vaes_aes_dec_128_aes_dec_block_last
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ vbroadcasti128 176(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ cmpl $13, %r10d
+ vbroadcasti128 192(%r8), %ymm9
+ jl L_AES_XTS_decrypt_vaes_aes_dec_128_aes_dec_block_last
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ vbroadcasti128 208(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ vbroadcasti128 224(%r8), %ymm9
+L_AES_XTS_decrypt_vaes_aes_dec_128_aes_dec_block_last:
+ vaesdeclast %ymm9, %ymm0, %ymm0
+ vaesdeclast %ymm9, %ymm1, %ymm1
+ vaesdeclast %ymm9, %ymm2, %ymm2
+ vaesdeclast %ymm9, %ymm3, %ymm3
+ vpxor %ymm4, %ymm0, %ymm0 /* second tweak xor, then store; repeated for each block pair */
+ vmovdqu %ymm0, (%rdx)
+ vpsrlq $56, %ymm4, %ymm9 /* advance ymm4 by 8 bits (8 blocks) for the next iteration, spill folded back as above */
+ vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10
+ vpslldq $8, %ymm9, %ymm9
+ vpsllq $8, %ymm4, %ymm4
+ vpxor %ymm10, %ymm4, %ymm4
+ vpxor %ymm9, %ymm4, %ymm4
+ vpxor %ymm5, %ymm1, %ymm1
+ vmovdqu %ymm1, 32(%rdx)
+ vpsrlq $56, %ymm5, %ymm9
+ vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10
+ vpslldq $8, %ymm9, %ymm9
+ vpsllq $8, %ymm5, %ymm5
+ vpxor %ymm10, %ymm5, %ymm5
+ vpxor %ymm9, %ymm5, %ymm5
+ vpxor %ymm6, %ymm2, %ymm2
+ vmovdqu %ymm2, 64(%rdx)
+ vpsrlq $56, %ymm6, %ymm9
+ vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10
+ vpslldq $8, %ymm9, %ymm9
+ vpsllq $8, %ymm6, %ymm6
+ vpxor %ymm10, %ymm6, %ymm6
+ vpxor %ymm9, %ymm6, %ymm6
+ vpxor %ymm7, %ymm3, %ymm3
+ vmovdqu %ymm3, 96(%rdx)
+ vpsrlq $56, %ymm7, %ymm9
+ vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10
+ vpslldq $8, %ymm9, %ymm9
+ vpsllq $8, %ymm7, %ymm7
+ vpxor %ymm10, %ymm7, %ymm7
+ vpxor %ymm9, %ymm7, %ymm7
+ addl $0x80, %r13d /* 128 more bytes done */
+ cmpl %r11d, %r13d
+ jl L_AES_XTS_decrypt_vaes_dec_128
+ vextracti128 $0x00, %ymm4, %xmm8 /* xmm8 = low lane of the advanced ymm4: tweak for the next unprocessed block */
+L_AES_XTS_decrypt_vaes_done_128:
+ cmpl %eax, %r13d
+ movl %eax, %r11d
+ je L_AES_XTS_decrypt_vaes_done_dec /* everything processed */
+ andl $0xfffffff0, %r11d
+ cmpl %eax, %r11d
+ je L_AES_XTS_decrypt_vaes_mul16_64
+ subl $16, %r11d /* partial tail: hold one full block back again */
+ subl %r13d, %r11d /* r11 = remaining full-block bytes excluding the held-back block */
+ cmpl $16, %r11d
+ jl L_AES_XTS_decrypt_vaes_last_31_start
+ addl %r13d, %r11d /* back to an absolute end offset */
+L_AES_XTS_decrypt_vaes_mul16_64:
+ andl $0xffffffc0, %r11d /* round the end offset down to a multiple of 64 */
+ cmpl %r11d, %r13d
+ je L_AES_XTS_decrypt_vaes_done_64 /* no full 64-byte chunk left */
+ # 64 bytes of input
+ # aes_dec_64
+ leaq (%rdi,%r13,1), %rcx
+ leaq (%rsi,%r13,1), %rdx
+ vmovdqu (%rcx), %ymm0
+ vmovdqu 32(%rcx), %ymm1
+ vperm2i128 $0x00, %ymm8, %ymm8, %ymm5 /* rebuild the tweak pairs from xmm8 as before the 128-byte loop */
+ vpsrlvq %ymm15, %ymm5, %ymm6
+ vpclmulqdq $0x01, %ymm13, %ymm6, %ymm7
+ vpslldq $8, %ymm6, %ymm6
+ vpsllvq %ymm14, %ymm5, %ymm4
+ vpxor %ymm7, %ymm4, %ymm4
+ vpxor %ymm6, %ymm4, %ymm4 /* ymm4 = tweaks for blocks 0 and 1 of this chunk */
+ vpsrlq $62, %ymm4, %ymm9
+ vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10
+ vpslldq $8, %ymm9, %ymm9
+ vpsllq $2, %ymm4, %ymm5
+ vpxor %ymm10, %ymm5, %ymm5
+ vpxor %ymm9, %ymm5, %ymm5 /* ymm5 = tweaks for blocks 2 and 3 of this chunk */
+ # aes_dec_block
+ vbroadcasti128 (%r8), %ymm9
+ vpxor %ymm4, %ymm0, %ymm0
+ vpxor %ymm9, %ymm0, %ymm0
+ vpxor %ymm5, %ymm1, %ymm1
+ vpxor %ymm9, %ymm1, %ymm1
+ vbroadcasti128 16(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vbroadcasti128 32(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vbroadcasti128 48(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vbroadcasti128 64(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vbroadcasti128 80(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vbroadcasti128 96(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vbroadcasti128 112(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vbroadcasti128 128(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vbroadcasti128 144(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ cmpl $11, %r10d
+ vbroadcasti128 160(%r8), %ymm9
+ jl L_AES_XTS_decrypt_vaes_aes_dec_64_aes_dec_block_last
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vbroadcasti128 176(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ cmpl $13, %r10d
+ vbroadcasti128 192(%r8), %ymm9
+ jl L_AES_XTS_decrypt_vaes_aes_dec_64_aes_dec_block_last
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vbroadcasti128 208(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vbroadcasti128 224(%r8), %ymm9
+L_AES_XTS_decrypt_vaes_aes_dec_64_aes_dec_block_last:
+ vaesdeclast %ymm9, %ymm0, %ymm0
+ vaesdeclast %ymm9, %ymm1, %ymm1
+ vpxor %ymm4, %ymm0, %ymm0
+ vmovdqu %ymm0, (%rdx)
+ vpxor %ymm5, %ymm1, %ymm1
+ vmovdqu %ymm1, 32(%rdx)
+ vextracti128 $0x01, %ymm5, %xmm8 /* xmm8 = tweak of the last block just written (high lane of ymm5) */
+ vpshufd $19, %xmm8, %xmm9 /* one-block tweak update: spread the top bits of each qword into position */
+ vpaddq %xmm8, %xmm8, %xmm8 /* shift each qword left by one */
+ vpsrad $31, %xmm9, %xmm9 /* all-ones dwords where a top bit was set */
+ vpand %xmm12, %xmm9, %xmm9 /* keep only the mask constant bits */
+ vpxor %xmm9, %xmm8, %xmm8 /* xmm8 = tweak for the next block */
+ addl $0x40, %r13d /* 64 more bytes done */
+L_AES_XTS_decrypt_vaes_done_64:
+ cmpl %eax, %r13d
+ movl %eax, %r11d
+ je L_AES_XTS_decrypt_vaes_done_dec
+ andl $0xfffffff0, %r11d
+ cmpl %eax, %r11d
+ je L_AES_XTS_decrypt_vaes_mul16_32
+ subl $16, %r11d
+ subl %r13d, %r11d
+ cmpl $16, %r11d
+ jl L_AES_XTS_decrypt_vaes_last_31_start
+ addl %r13d, %r11d
+L_AES_XTS_decrypt_vaes_mul16_32:
+ andl $0xffffffe0, %r11d /* round the end offset down to a multiple of 32 */
+ cmpl %r11d, %r13d
+ je L_AES_XTS_decrypt_vaes_done_32 /* no full 32-byte chunk left */
+ # 32 bytes of input
+ # aes_dec_32
+ leaq (%rdi,%r13,1), %rcx
+ leaq (%rsi,%r13,1), %rdx
+ vmovdqu (%rcx), %ymm0
+ vperm2i128 $0x00, %ymm8, %ymm8, %ymm5
+ vpsrlvq %ymm15, %ymm5, %ymm6
+ vpclmulqdq $0x01, %ymm13, %ymm6, %ymm7
+ vpslldq $8, %ymm6, %ymm6
+ vpsllvq %ymm14, %ymm5, %ymm4
+ vpxor %ymm7, %ymm4, %ymm4
+ vpxor %ymm6, %ymm4, %ymm4 /* ymm4 = tweaks for the two blocks of this chunk */
+ # aes_dec_block
+ vbroadcasti128 (%r8), %ymm9
+ vpxor %ymm4, %ymm0, %ymm0
+ vpxor %ymm9, %ymm0, %ymm0
+ vbroadcasti128 16(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vbroadcasti128 32(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vbroadcasti128 48(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vbroadcasti128 64(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vbroadcasti128 80(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vbroadcasti128 96(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vbroadcasti128 112(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vbroadcasti128 128(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vbroadcasti128 144(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ cmpl $11, %r10d
+ vbroadcasti128 160(%r8), %ymm9
+ jl L_AES_XTS_decrypt_vaes_aes_dec_32_aes_dec_block_last
+ vaesdec %ymm9, %ymm0, %ymm0
+ vbroadcasti128 176(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ cmpl $13, %r10d
+ vbroadcasti128 192(%r8), %ymm9
+ jl L_AES_XTS_decrypt_vaes_aes_dec_32_aes_dec_block_last
+ vaesdec %ymm9, %ymm0, %ymm0
+ vbroadcasti128 208(%r8), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vbroadcasti128 224(%r8), %ymm9
+L_AES_XTS_decrypt_vaes_aes_dec_32_aes_dec_block_last:
+ vaesdeclast %ymm9, %ymm0, %ymm0
+ vpxor %ymm4, %ymm0, %ymm0
+ vmovdqu %ymm0, (%rdx)
+ vextracti128 $0x01, %ymm4, %xmm8 /* xmm8 = tweak of the last block just written (high lane of ymm4) */
+ vpshufd $19, %xmm8, %xmm9 /* one-block tweak update, same sequence as after the 64-byte chunk */
+ vpaddq %xmm8, %xmm8, %xmm8
+ vpsrad $31, %xmm9, %xmm9
+ vpand %xmm12, %xmm9, %xmm9
+ vpxor %xmm9, %xmm8, %xmm8
+ addl $32, %r13d /* 32 more bytes done */
+L_AES_XTS_decrypt_vaes_done_32:
+ cmpl %eax, %r13d
+ movl %eax, %r11d
+ je L_AES_XTS_decrypt_vaes_done_dec
+ andl $0xfffffff0, %r11d
+ cmpl %eax, %r11d
+ je L_AES_XTS_decrypt_vaes_mul16
+ subl $16, %r11d
+ subl %r13d, %r11d
+ cmpl $16, %r11d
+ jl L_AES_XTS_decrypt_vaes_last_31_start
+ addl %r13d, %r11d
+L_AES_XTS_decrypt_vaes_mul16:
+L_AES_XTS_decrypt_vaes_dec_16: /* one block per iteration until offset r13 reaches r11 */
+ # 16 bytes of input
+ leaq (%rdi,%r13,1), %rcx
+ vmovdqu (%rcx), %xmm0
+ vpxor %xmm8, %xmm0, %xmm0 /* first tweak xor */
+ # aes_dec_block
+ vpxor (%r8), %xmm0, %xmm0
+ vmovdqu 16(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 32(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 48(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 64(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 80(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 96(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 112(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 128(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 144(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ cmpl $11, %r10d
+ vmovdqu 160(%r8), %xmm5
+ jl L_AES_XTS_decrypt_vaes_aes_dec_block_last
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 176(%r8), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ cmpl $13, %r10d
+ vmovdqu 192(%r8), %xmm5
+ jl L_AES_XTS_decrypt_vaes_aes_dec_block_last
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 208(%r8), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ vmovdqu 224(%r8), %xmm5
+L_AES_XTS_decrypt_vaes_aes_dec_block_last:
+ vaesdeclast %xmm5, %xmm0, %xmm0
+ vpxor %xmm8, %xmm0, %xmm0 /* second tweak xor */
+ leaq (%rsi,%r13,1), %rcx
+ vmovdqu %xmm0, (%rcx)
+ vpshufd $19, %xmm8, %xmm4 /* one-block tweak update into xmm8 */
+ vpaddq %xmm8, %xmm8, %xmm8
+ vpsrad $31, %xmm4, %xmm4
+ vpand %xmm12, %xmm4, %xmm4
+ vpxor %xmm4, %xmm8, %xmm8
+ addl $16, %r13d
+ cmpl %r11d, %r13d
+ jl L_AES_XTS_decrypt_vaes_dec_16
+ cmpl %eax, %r13d
+ je L_AES_XTS_decrypt_vaes_done_dec /* no partial tail: finished */
+L_AES_XTS_decrypt_vaes_last_31_start: /* tail: one held-back full block plus 1..15 trailing bytes */
+ vpshufd $19, %xmm8, %xmm4 /* xmm7 = xmm8 advanced by one block; xmm8 itself is kept for the final block */
+ vpaddq %xmm8, %xmm8, %xmm7
+ vpsrad $31, %xmm4, %xmm4
+ vpand %xmm12, %xmm4, %xmm4
+ vpxor %xmm4, %xmm7, %xmm7
+ leaq (%rdi,%r13,1), %rcx
+ vmovdqu (%rcx), %xmm0 /* held-back full block, decrypted with the later tweak xmm7 */
+ vpxor %xmm7, %xmm0, %xmm0
+ # aes_dec_block
+ vpxor (%r8), %xmm0, %xmm0
+ vmovdqu 16(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 32(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 48(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 64(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 80(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 96(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 112(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 128(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 144(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ cmpl $11, %r10d
+ vmovdqu 160(%r8), %xmm5
+ jl L_AES_XTS_decrypt_vaes_last_31_aes_dec_block_last
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 176(%r8), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ cmpl $13, %r10d
+ vmovdqu 192(%r8), %xmm5
+ jl L_AES_XTS_decrypt_vaes_last_31_aes_dec_block_last
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 208(%r8), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ vmovdqu 224(%r8), %xmm5
+L_AES_XTS_decrypt_vaes_last_31_aes_dec_block_last:
+ vaesdeclast %xmm5, %xmm0, %xmm0
+ vpxor %xmm7, %xmm0, %xmm0
+ vmovdqu %xmm0, (%rsp) /* park the decrypted block in the stack scratch */
+ addq $16, %r13 /* r13 = offset of the trailing partial bytes */
+ xorq %rdx, %rdx /* rdx = index into the scratch block */
+L_AES_XTS_decrypt_vaes_last_31_byte_loop: /* per byte: scratch byte goes to the output tail, input tail byte replaces it in scratch */
+ movb (%rsp,%rdx,1), %r11b
+ movb (%rdi,%r13,1), %cl
+ movb %r11b, (%rsi,%r13,1)
+ movb %cl, (%rsp,%rdx,1)
+ incl %r13d
+ incl %edx
+ cmpl %eax, %r13d
+ jl L_AES_XTS_decrypt_vaes_last_31_byte_loop
+ subq %rdx, %r13 /* rewind r13 to the start of the trailing bytes */
+ vmovdqu (%rsp), %xmm0 /* merged block: input tail bytes followed by the rest of the parked block */
+ vpxor %xmm8, %xmm0, %xmm0 /* decrypted with the earlier tweak xmm8 */
+ # aes_dec_block
+ vpxor (%r8), %xmm0, %xmm0
+ vmovdqu 16(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 32(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 48(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 64(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 80(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 96(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 112(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 128(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 144(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ cmpl $11, %r10d
+ vmovdqu 160(%r8), %xmm5
+ jl L_AES_XTS_decrypt_vaes_last_31_2_aes_dec_block_last
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 176(%r8), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ cmpl $13, %r10d
+ vmovdqu 192(%r8), %xmm5
+ jl L_AES_XTS_decrypt_vaes_last_31_2_aes_dec_block_last
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 208(%r8), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ vmovdqu 224(%r8), %xmm5
+L_AES_XTS_decrypt_vaes_last_31_2_aes_dec_block_last:
+ vaesdeclast %xmm5, %xmm0, %xmm0
+ vpxor %xmm8, %xmm0, %xmm0
+ subq $16, %r13 /* back to the offset of the held-back full block */
+ leaq (%rsi,%r13,1), %rcx
+ vmovdqu %xmm0, (%rcx) /* write the final full block */
+L_AES_XTS_decrypt_vaes_done_dec:
+ addq $0x40, %rsp /* release the stack scratch */
+ popq %r13
+ popq %r12
+ repz retq /* NOTE(review): ymm registers were written and no vzeroupper is issued before returning; confirm that is intended */
+#ifndef __APPLE__
+.size AES_XTS_decrypt_vaes,.-AES_XTS_decrypt_vaes
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text /* AES_XTS_decrypt_update_vaes: AES-XTS decryption that starts from a caller-held tweak and writes the running tweak back. */
+.globl AES_XTS_decrypt_update_vaes /* Registers as read below: rdi = input base, rsi = output base, rdx = byte count, */
+.type AES_XTS_decrypt_update_vaes,@function /* rcx = round keys used by vaesdec, r8 = pointer to the 16-byte tweak (read, then written at exit), */
+.align 16 /* r9d = round count selector (compared with 11 and 13). */
+AES_XTS_decrypt_update_vaes: /* Internally: rax = byte count, r10 = round keys, r12 = bytes done so far, r11 = loop limit, xmm8 = running tweak. */
+#else
+.section __TEXT,__text
+.globl _AES_XTS_decrypt_update_vaes
+.p2align 4
+_AES_XTS_decrypt_update_vaes:
+#endif /* __APPLE__ */
+ pushq %r12 /* r12 is used as the byte offset and restored before return */
+ movq %rdx, %rax /* rax = byte count; rdx is reused as a pointer/index later */
+ movq %rcx, %r10 /* r10 = round keys; rcx is reused as a pointer later */
+ subq $0x40, %rsp /* 64 bytes of stack scratch; (%rsp) is used for the partial-block tail */
+ vmovdqu L_vaes_aes_xts_gc_xts(%rip), %xmm12 /* xmm12 = mask for the one-block tweak update (vpand operand) */
+ vbroadcasti128 L_vaes_aes_xts_poly(%rip), %ymm13 /* ymm13 = 128-bit constant copied to both lanes (vpclmulqdq operand) */
+ vmovdqu L_vaes_aes_xts_shl(%rip), %ymm14 /* ymm14 = per-qword counts for vpsllvq */
+ vmovdqu L_vaes_aes_xts_shr(%rip), %ymm15 /* ymm15 = per-qword counts for vpsrlvq */
+ vmovdqu (%r8), %xmm8 /* xmm8 = current tweak, used as is (no tweak encryption in this routine) */
+ xorl %r12d, %r12d /* r12 = 0 bytes processed */
+ movl %eax, %r11d
+ andl $0xfffffff0, %r11d /* r11 = byte count rounded down to a multiple of 16 */
+ cmpl %eax, %r11d
+ je L_AES_XTS_decrypt_update_vaes_mul16_128 /* count is a multiple of 16: no partial tail */
+ subl $16, %r11d /* partial tail present: hold the last full block back for the tail code */
+ cmpl $16, %r11d
+ jl L_AES_XTS_decrypt_update_vaes_last_31_start /* nothing but the held-back block and the tail remain */
+L_AES_XTS_decrypt_update_vaes_mul16_128:
+ cmpl $32, %r11d /* same target as the 0x80 test below, which subsumes this one */
+ jl L_AES_XTS_decrypt_update_vaes_done_128
+ cmpl $0x80, %r11d
+ jl L_AES_XTS_decrypt_update_vaes_done_128 /* fewer than 128 bytes: skip the 8-block loop */
+ andl $0xffffff80, %r11d /* r11 = loop limit, a multiple of 128 */
+ vperm2i128 $0x00, %ymm8, %ymm8, %ymm5 /* copy the low 128 bits of ymm8 (the tweak) into both lanes */
+ vpsrlvq %ymm15, %ymm5, %ymm6 /* per-qword right shift by the shr table: the bits the left shift drops */
+ vpclmulqdq $0x01, %ymm13, %ymm6, %ymm7 /* high qword of ymm6 times low qword of ymm13, per lane */
+ vpslldq $8, %ymm6, %ymm6 /* move low-qword spill into the high qword of each lane */
+ vpsllvq %ymm14, %ymm5, %ymm4 /* per-qword left shift by the shl table */
+ vpxor %ymm7, %ymm4, %ymm4
+ vpxor %ymm6, %ymm4, %ymm4 /* ymm4 = tweaks for blocks 0 and 1 (NOTE(review): shl/shr tables are outside this view; presumably shifts of 0 and 1) */
+ vpsrlq $62, %ymm4, %ymm9 /* top 2 bits of each qword */
+ vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10
+ vpslldq $8, %ymm9, %ymm9
+ vpsllq $2, %ymm4, %ymm5
+ vpxor %ymm10, %ymm5, %ymm5
+ vpxor %ymm9, %ymm5, %ymm5 /* ymm5 = ymm4 shifted left 2 bits per lane with the spill folded back: tweaks for blocks 2 and 3 */
+ vpsrlq $62, %ymm5, %ymm9
+ vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10
+ vpslldq $8, %ymm9, %ymm9
+ vpsllq $2, %ymm5, %ymm6
+ vpxor %ymm10, %ymm6, %ymm6
+ vpxor %ymm9, %ymm6, %ymm6 /* ymm6 = same step applied to ymm5: tweaks for blocks 4 and 5 */
+ vpsrlq $62, %ymm6, %ymm9
+ vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10
+ vpslldq $8, %ymm9, %ymm9
+ vpsllq $2, %ymm6, %ymm7
+ vpxor %ymm10, %ymm7, %ymm7
+ vpxor %ymm9, %ymm7, %ymm7 /* ymm7 = same step applied to ymm6: tweaks for blocks 6 and 7 */
+L_AES_XTS_decrypt_update_vaes_dec_128: /* loop body: 8 blocks per iteration in ymm0..ymm3 */
+ # 128 bytes of input
+ # aes_dec_128
+ leaq (%rdi,%r12,1), %rcx /* rcx = input + offset */
+ leaq (%rsi,%r12,1), %rdx /* rdx = output + offset */
+ vmovdqu (%rcx), %ymm0
+ vmovdqu 32(%rcx), %ymm1
+ vmovdqu 64(%rcx), %ymm2
+ vmovdqu 96(%rcx), %ymm3
+ # aes_dec_block
+ vbroadcasti128 (%r10), %ymm9 /* round key 0 in both lanes */
+ vpxor %ymm4, %ymm0, %ymm0 /* xor each block pair with its tweaks, then with round key 0 */
+ vpxor %ymm9, %ymm0, %ymm0
+ vpxor %ymm5, %ymm1, %ymm1
+ vpxor %ymm9, %ymm1, %ymm1
+ vpxor %ymm6, %ymm2, %ymm2
+ vpxor %ymm9, %ymm2, %ymm2
+ vpxor %ymm7, %ymm3, %ymm3
+ vpxor %ymm9, %ymm3, %ymm3
+ vbroadcasti128 16(%r10), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ vbroadcasti128 32(%r10), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ vbroadcasti128 48(%r10), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ vbroadcasti128 64(%r10), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ vbroadcasti128 80(%r10), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ vbroadcasti128 96(%r10), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ vbroadcasti128 112(%r10), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ vbroadcasti128 128(%r10), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ vbroadcasti128 144(%r10), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ cmpl $11, %r9d /* selector below 11: the key at offset 160 is the final round key */
+ vbroadcasti128 160(%r10), %ymm9
+ jl L_AES_XTS_decrypt_update_vaes_aes_dec_128_aes_dec_block_last
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ vbroadcasti128 176(%r10), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ cmpl $13, %r9d /* selector below 13: the key at offset 192 is the final round key, else the one at 224 */
+ vbroadcasti128 192(%r10), %ymm9
+ jl L_AES_XTS_decrypt_update_vaes_aes_dec_128_aes_dec_block_last
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ vbroadcasti128 208(%r10), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vaesdec %ymm9, %ymm2, %ymm2
+ vaesdec %ymm9, %ymm3, %ymm3
+ vbroadcasti128 224(%r10), %ymm9
+L_AES_XTS_decrypt_update_vaes_aes_dec_128_aes_dec_block_last:
+ vaesdeclast %ymm9, %ymm0, %ymm0
+ vaesdeclast %ymm9, %ymm1, %ymm1
+ vaesdeclast %ymm9, %ymm2, %ymm2
+ vaesdeclast %ymm9, %ymm3, %ymm3
+ vpxor %ymm4, %ymm0, %ymm0 /* second tweak xor, then store; repeated for each block pair */
+ vmovdqu %ymm0, (%rdx)
+ vpsrlq $56, %ymm4, %ymm9 /* advance ymm4 by 8 bits (8 blocks) for the next iteration, spill folded back as above */
+ vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10
+ vpslldq $8, %ymm9, %ymm9
+ vpsllq $8, %ymm4, %ymm4
+ vpxor %ymm10, %ymm4, %ymm4
+ vpxor %ymm9, %ymm4, %ymm4
+ vpxor %ymm5, %ymm1, %ymm1
+ vmovdqu %ymm1, 32(%rdx)
+ vpsrlq $56, %ymm5, %ymm9
+ vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10
+ vpslldq $8, %ymm9, %ymm9
+ vpsllq $8, %ymm5, %ymm5
+ vpxor %ymm10, %ymm5, %ymm5
+ vpxor %ymm9, %ymm5, %ymm5
+ vpxor %ymm6, %ymm2, %ymm2
+ vmovdqu %ymm2, 64(%rdx)
+ vpsrlq $56, %ymm6, %ymm9
+ vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10
+ vpslldq $8, %ymm9, %ymm9
+ vpsllq $8, %ymm6, %ymm6
+ vpxor %ymm10, %ymm6, %ymm6
+ vpxor %ymm9, %ymm6, %ymm6
+ vpxor %ymm7, %ymm3, %ymm3
+ vmovdqu %ymm3, 96(%rdx)
+ vpsrlq $56, %ymm7, %ymm9
+ vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10
+ vpslldq $8, %ymm9, %ymm9
+ vpsllq $8, %ymm7, %ymm7
+ vpxor %ymm10, %ymm7, %ymm7
+ vpxor %ymm9, %ymm7, %ymm7
+ addl $0x80, %r12d /* 128 more bytes done */
+ cmpl %r11d, %r12d
+ jl L_AES_XTS_decrypt_update_vaes_dec_128
+ vextracti128 $0x00, %ymm4, %xmm8 /* xmm8 = low lane of the advanced ymm4: tweak for the next unprocessed block */
+L_AES_XTS_decrypt_update_vaes_done_128:
+ cmpl %eax, %r12d
+ movl %eax, %r11d
+ je L_AES_XTS_decrypt_update_vaes_done_dec /* everything processed */
+ andl $0xfffffff0, %r11d
+ cmpl %eax, %r11d
+ je L_AES_XTS_decrypt_update_vaes_mul16_64
+ subl $16, %r11d /* partial tail: hold one full block back again */
+ subl %r12d, %r11d /* r11 = remaining full-block bytes excluding the held-back block */
+ cmpl $16, %r11d
+ jl L_AES_XTS_decrypt_update_vaes_last_31_start
+ addl %r12d, %r11d /* back to an absolute end offset */
+L_AES_XTS_decrypt_update_vaes_mul16_64:
+ andl $0xffffffc0, %r11d /* round the end offset down to a multiple of 64 */
+ cmpl %r11d, %r12d
+ je L_AES_XTS_decrypt_update_vaes_done_64 /* no full 64-byte chunk left */
+ # 64 bytes of input
+ # aes_dec_64
+ leaq (%rdi,%r12,1), %rcx
+ leaq (%rsi,%r12,1), %rdx
+ vmovdqu (%rcx), %ymm0
+ vmovdqu 32(%rcx), %ymm1
+ vperm2i128 $0x00, %ymm8, %ymm8, %ymm5 /* rebuild the tweak pairs from xmm8 as before the 128-byte loop */
+ vpsrlvq %ymm15, %ymm5, %ymm6
+ vpclmulqdq $0x01, %ymm13, %ymm6, %ymm7
+ vpslldq $8, %ymm6, %ymm6
+ vpsllvq %ymm14, %ymm5, %ymm4
+ vpxor %ymm7, %ymm4, %ymm4
+ vpxor %ymm6, %ymm4, %ymm4 /* ymm4 = tweaks for blocks 0 and 1 of this chunk */
+ vpsrlq $62, %ymm4, %ymm9
+ vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10
+ vpslldq $8, %ymm9, %ymm9
+ vpsllq $2, %ymm4, %ymm5
+ vpxor %ymm10, %ymm5, %ymm5
+ vpxor %ymm9, %ymm5, %ymm5 /* ymm5 = tweaks for blocks 2 and 3 of this chunk */
+ # aes_dec_block
+ vbroadcasti128 (%r10), %ymm9
+ vpxor %ymm4, %ymm0, %ymm0
+ vpxor %ymm9, %ymm0, %ymm0
+ vpxor %ymm5, %ymm1, %ymm1
+ vpxor %ymm9, %ymm1, %ymm1
+ vbroadcasti128 16(%r10), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vbroadcasti128 32(%r10), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vbroadcasti128 48(%r10), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vbroadcasti128 64(%r10), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vbroadcasti128 80(%r10), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vbroadcasti128 96(%r10), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vbroadcasti128 112(%r10), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vbroadcasti128 128(%r10), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vbroadcasti128 144(%r10), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ cmpl $11, %r9d
+ vbroadcasti128 160(%r10), %ymm9
+ jl L_AES_XTS_decrypt_update_vaes_aes_dec_64_aes_dec_block_last
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vbroadcasti128 176(%r10), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ cmpl $13, %r9d
+ vbroadcasti128 192(%r10), %ymm9
+ jl L_AES_XTS_decrypt_update_vaes_aes_dec_64_aes_dec_block_last
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vbroadcasti128 208(%r10), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vaesdec %ymm9, %ymm1, %ymm1
+ vbroadcasti128 224(%r10), %ymm9
+L_AES_XTS_decrypt_update_vaes_aes_dec_64_aes_dec_block_last:
+ vaesdeclast %ymm9, %ymm0, %ymm0
+ vaesdeclast %ymm9, %ymm1, %ymm1
+ vpxor %ymm4, %ymm0, %ymm0
+ vmovdqu %ymm0, (%rdx)
+ vpxor %ymm5, %ymm1, %ymm1
+ vmovdqu %ymm1, 32(%rdx)
+ vextracti128 $0x01, %ymm5, %xmm8 /* xmm8 = tweak of the last block just written (high lane of ymm5) */
+ vpshufd $19, %xmm8, %xmm9 /* one-block tweak update: spread the top bits of each qword into position */
+ vpaddq %xmm8, %xmm8, %xmm8 /* shift each qword left by one */
+ vpsrad $31, %xmm9, %xmm9 /* all-ones dwords where a top bit was set */
+ vpand %xmm12, %xmm9, %xmm9 /* keep only the mask constant bits */
+ vpxor %xmm9, %xmm8, %xmm8 /* xmm8 = tweak for the next block */
+ addl $0x40, %r12d /* 64 more bytes done */
+L_AES_XTS_decrypt_update_vaes_done_64:
+ cmpl %eax, %r12d
+ movl %eax, %r11d
+ je L_AES_XTS_decrypt_update_vaes_done_dec
+ andl $0xfffffff0, %r11d
+ cmpl %eax, %r11d
+ je L_AES_XTS_decrypt_update_vaes_mul16_32
+ subl $16, %r11d
+ subl %r12d, %r11d
+ cmpl $16, %r11d
+ jl L_AES_XTS_decrypt_update_vaes_last_31_start
+ addl %r12d, %r11d
+L_AES_XTS_decrypt_update_vaes_mul16_32:
+ andl $0xffffffe0, %r11d /* round the end offset down to a multiple of 32 */
+ cmpl %r11d, %r12d
+ je L_AES_XTS_decrypt_update_vaes_done_32 /* no full 32-byte chunk left */
+ # 32 bytes of input
+ # aes_dec_32
+ leaq (%rdi,%r12,1), %rcx
+ leaq (%rsi,%r12,1), %rdx
+ vmovdqu (%rcx), %ymm0
+ vperm2i128 $0x00, %ymm8, %ymm8, %ymm5
+ vpsrlvq %ymm15, %ymm5, %ymm6
+ vpclmulqdq $0x01, %ymm13, %ymm6, %ymm7
+ vpslldq $8, %ymm6, %ymm6
+ vpsllvq %ymm14, %ymm5, %ymm4
+ vpxor %ymm7, %ymm4, %ymm4
+ vpxor %ymm6, %ymm4, %ymm4 /* ymm4 = tweaks for the two blocks of this chunk */
+ # aes_dec_block
+ vbroadcasti128 (%r10), %ymm9
+ vpxor %ymm4, %ymm0, %ymm0
+ vpxor %ymm9, %ymm0, %ymm0
+ vbroadcasti128 16(%r10), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vbroadcasti128 32(%r10), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vbroadcasti128 48(%r10), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vbroadcasti128 64(%r10), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vbroadcasti128 80(%r10), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vbroadcasti128 96(%r10), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vbroadcasti128 112(%r10), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vbroadcasti128 128(%r10), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vbroadcasti128 144(%r10), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ cmpl $11, %r9d
+ vbroadcasti128 160(%r10), %ymm9
+ jl L_AES_XTS_decrypt_update_vaes_aes_dec_32_aes_dec_block_last
+ vaesdec %ymm9, %ymm0, %ymm0
+ vbroadcasti128 176(%r10), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ cmpl $13, %r9d
+ vbroadcasti128 192(%r10), %ymm9
+ jl L_AES_XTS_decrypt_update_vaes_aes_dec_32_aes_dec_block_last
+ vaesdec %ymm9, %ymm0, %ymm0
+ vbroadcasti128 208(%r10), %ymm9
+ vaesdec %ymm9, %ymm0, %ymm0
+ vbroadcasti128 224(%r10), %ymm9
+L_AES_XTS_decrypt_update_vaes_aes_dec_32_aes_dec_block_last:
+ vaesdeclast %ymm9, %ymm0, %ymm0
+ vpxor %ymm4, %ymm0, %ymm0
+ vmovdqu %ymm0, (%rdx)
+ vextracti128 $0x01, %ymm4, %xmm8 /* xmm8 = tweak of the last block just written (high lane of ymm4) */
+ vpshufd $19, %xmm8, %xmm9 /* one-block tweak update, same sequence as after the 64-byte chunk */
+ vpaddq %xmm8, %xmm8, %xmm8
+ vpsrad $31, %xmm9, %xmm9
+ vpand %xmm12, %xmm9, %xmm9
+ vpxor %xmm9, %xmm8, %xmm8
+ addl $32, %r12d /* 32 more bytes done */
+L_AES_XTS_decrypt_update_vaes_done_32:
+ cmpl %eax, %r12d
+ movl %eax, %r11d
+ je L_AES_XTS_decrypt_update_vaes_done_dec
+ andl $0xfffffff0, %r11d
+ cmpl %eax, %r11d
+ je L_AES_XTS_decrypt_update_vaes_mul16
+ subl $16, %r11d
+ subl %r12d, %r11d
+ cmpl $16, %r11d
+ jl L_AES_XTS_decrypt_update_vaes_last_31_start
+ addl %r12d, %r11d
+L_AES_XTS_decrypt_update_vaes_mul16:
+L_AES_XTS_decrypt_update_vaes_dec_16: /* one block per iteration until offset r12 reaches r11 */
+ # 16 bytes of input
+ leaq (%rdi,%r12,1), %rcx
+ vmovdqu (%rcx), %xmm0
+ vpxor %xmm8, %xmm0, %xmm0 /* first tweak xor */
+ # aes_dec_block
+ vpxor (%r10), %xmm0, %xmm0
+ vmovdqu 16(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 32(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 48(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 64(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 80(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 96(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 112(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 128(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 144(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ cmpl $11, %r9d
+ vmovdqu 160(%r10), %xmm5
+ jl L_AES_XTS_decrypt_update_vaes_aes_dec_block_last
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 176(%r10), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ cmpl $13, %r9d
+ vmovdqu 192(%r10), %xmm5
+ jl L_AES_XTS_decrypt_update_vaes_aes_dec_block_last
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 208(%r10), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ vmovdqu 224(%r10), %xmm5
+L_AES_XTS_decrypt_update_vaes_aes_dec_block_last:
+ vaesdeclast %xmm5, %xmm0, %xmm0
+ vpxor %xmm8, %xmm0, %xmm0 /* second tweak xor */
+ leaq (%rsi,%r12,1), %rcx
+ vmovdqu %xmm0, (%rcx)
+ vpshufd $19, %xmm8, %xmm4 /* one-block tweak update into xmm8 */
+ vpaddq %xmm8, %xmm8, %xmm8
+ vpsrad $31, %xmm4, %xmm4
+ vpand %xmm12, %xmm4, %xmm4
+ vpxor %xmm4, %xmm8, %xmm8
+ addl $16, %r12d
+ cmpl %r11d, %r12d
+ jl L_AES_XTS_decrypt_update_vaes_dec_16
+ cmpl %eax, %r12d
+ je L_AES_XTS_decrypt_update_vaes_done_dec /* no partial tail: finished */
+L_AES_XTS_decrypt_update_vaes_last_31_start: /* tail: one held-back full block plus 1..15 trailing bytes */
+ vpshufd $19, %xmm8, %xmm4 /* xmm7 = xmm8 advanced by one block; xmm8 itself is kept for the final block */
+ vpaddq %xmm8, %xmm8, %xmm7
+ vpsrad $31, %xmm4, %xmm4
+ vpand %xmm12, %xmm4, %xmm4
+ vpxor %xmm4, %xmm7, %xmm7
+ leaq (%rdi,%r12,1), %rcx
+ vmovdqu (%rcx), %xmm0 /* held-back full block, decrypted with the later tweak xmm7 */
+ vpxor %xmm7, %xmm0, %xmm0
+ # aes_dec_block
+ vpxor (%r10), %xmm0, %xmm0
+ vmovdqu 16(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 32(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 48(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 64(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 80(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 96(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 112(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 128(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 144(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ cmpl $11, %r9d
+ vmovdqu 160(%r10), %xmm5
+ jl L_AES_XTS_decrypt_update_vaes_last_31_aes_dec_block_last
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 176(%r10), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ cmpl $13, %r9d
+ vmovdqu 192(%r10), %xmm5
+ jl L_AES_XTS_decrypt_update_vaes_last_31_aes_dec_block_last
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 208(%r10), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ vmovdqu 224(%r10), %xmm5
+L_AES_XTS_decrypt_update_vaes_last_31_aes_dec_block_last:
+ vaesdeclast %xmm5, %xmm0, %xmm0
+ vpxor %xmm7, %xmm0, %xmm0
+ vmovdqu %xmm0, (%rsp) /* park the decrypted block in the stack scratch */
+ addq $16, %r12 /* r12 = offset of the trailing partial bytes */
+ xorq %rdx, %rdx /* rdx = index into the scratch block */
+L_AES_XTS_decrypt_update_vaes_last_31_byte_loop: /* per byte: scratch byte goes to the output tail, input tail byte replaces it in scratch */
+ movb (%rsp,%rdx,1), %r11b
+ movb (%rdi,%r12,1), %cl
+ movb %r11b, (%rsi,%r12,1)
+ movb %cl, (%rsp,%rdx,1)
+ incl %r12d
+ incl %edx
+ cmpl %eax, %r12d
+ jl L_AES_XTS_decrypt_update_vaes_last_31_byte_loop
+ subq %rdx, %r12 /* rewind r12 to the start of the trailing bytes */
+ vmovdqu (%rsp), %xmm0 /* merged block: input tail bytes followed by the rest of the parked block */
+ vpxor %xmm8, %xmm0, %xmm0 /* decrypted with the earlier tweak xmm8 */
+ # aes_dec_block
+ vpxor (%r10), %xmm0, %xmm0
+ vmovdqu 16(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 32(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 48(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 64(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 80(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 96(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 112(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 128(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 144(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ cmpl $11, %r9d
+ vmovdqu 160(%r10), %xmm5
+ jl L_AES_XTS_decrypt_update_vaes_last_31_2_aes_dec_block_last
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 176(%r10), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ cmpl $13, %r9d
+ vmovdqu 192(%r10), %xmm5
+ jl L_AES_XTS_decrypt_update_vaes_last_31_2_aes_dec_block_last
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 208(%r10), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ vmovdqu 224(%r10), %xmm5
+L_AES_XTS_decrypt_update_vaes_last_31_2_aes_dec_block_last:
+ vaesdeclast %xmm5, %xmm0, %xmm0
+ vpxor %xmm8, %xmm0, %xmm0
+ subq $16, %r12 /* back to the offset of the held-back full block */
+ leaq (%rsi,%r12,1), %rcx
+ vmovdqu %xmm0, (%rcx) /* write the final full block */
+L_AES_XTS_decrypt_update_vaes_done_dec:
+ vmovdqu %xmm8, (%r8) /* write the running tweak back through the tweak pointer */
+ addq $0x40, %rsp /* release the stack scratch */
+ popq %r12
+ repz retq /* NOTE(review): ymm registers were written and no vzeroupper is issued before returning; confirm that is intended */
+#ifndef __APPLE__
+.size AES_XTS_decrypt_update_vaes,.-AES_XTS_decrypt_update_vaes
+#endif /* __APPLE__ */
+#endif /* HAVE_INTEL_VAES */
+#ifdef HAVE_INTEL_AVX512
+#ifndef __APPLE__
+.text /* AES_XTS_init_avx512: AES-encrypts one 16-byte block in place. */
+.globl AES_XTS_init_avx512 /* Registers as read below: rdi = pointer to the 16-byte block (read, then overwritten), */
+.type AES_XTS_init_avx512,@function /* rsi = round keys used by vaesenc, edx = round count selector (compared with 11 and 13). */
+.align 16 /* Only xmm0, xmm2 and xmm3 are written; no stack is used. */
+AES_XTS_init_avx512: /* NOTE(review): by its name this prepares the XTS tweak for the AVX512 routines; those callers are outside this view. */
+#else
+.section __TEXT,__text
+.globl _AES_XTS_init_avx512
+.p2align 4
+_AES_XTS_init_avx512:
+#endif /* __APPLE__ */
+ vmovdqu (%rdi), %xmm0 /* xmm0 = the 16-byte block to encrypt */
+ # aes_enc_block
+ vpxor (%rsi), %xmm0, %xmm0 /* xor with round key 0 */
+ vmovdqu 16(%rsi), %xmm2
+ vaesenc %xmm2, %xmm0, %xmm0
+ vmovdqu 32(%rsi), %xmm2
+ vaesenc %xmm2, %xmm0, %xmm0
+ vmovdqu 48(%rsi), %xmm2
+ vaesenc %xmm2, %xmm0, %xmm0
+ vmovdqu 64(%rsi), %xmm2
+ vaesenc %xmm2, %xmm0, %xmm0
+ vmovdqu 80(%rsi), %xmm2
+ vaesenc %xmm2, %xmm0, %xmm0
+ vmovdqu 96(%rsi), %xmm2
+ vaesenc %xmm2, %xmm0, %xmm0
+ vmovdqu 112(%rsi), %xmm2
+ vaesenc %xmm2, %xmm0, %xmm0
+ vmovdqu 128(%rsi), %xmm2
+ vaesenc %xmm2, %xmm0, %xmm0
+ vmovdqu 144(%rsi), %xmm2
+ vaesenc %xmm2, %xmm0, %xmm0
+ cmpl $11, %edx /* selector below 11: the key at offset 160 is the final round key */
+ vmovdqu 160(%rsi), %xmm2
+ jl L_AES_XTS_init_avx512_tweak_aes_enc_block_last
+ vaesenc %xmm2, %xmm0, %xmm0
+ vmovdqu 176(%rsi), %xmm3
+ vaesenc %xmm3, %xmm0, %xmm0
+ cmpl $13, %edx /* selector below 13: the key at offset 192 is the final round key, else the one at 224 */
+ vmovdqu 192(%rsi), %xmm2
+ jl L_AES_XTS_init_avx512_tweak_aes_enc_block_last
+ vaesenc %xmm2, %xmm0, %xmm0
+ vmovdqu 208(%rsi), %xmm3
+ vaesenc %xmm3, %xmm0, %xmm0
+ vmovdqu 224(%rsi), %xmm2
+L_AES_XTS_init_avx512_tweak_aes_enc_block_last:
+ vaesenclast %xmm2, %xmm0, %xmm0 /* final round with whichever key xmm2 holds on arrival */
+ vmovdqu %xmm0, (%rdi) /* overwrite the input block with the encrypted result */
+ repz retq
+#ifndef __APPLE__
+.size AES_XTS_init_avx512,.-AES_XTS_init_avx512
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_avx512_aes_xts_gc_xts: /* 128-bit mask loaded into xmm12 for the one-step tweak doubling (vpshufd, vpsrad, vpternlogd sequence). */
+.long 0x00000087,0x00000000,0x00000001,0x00000000 /* dword 0 = 0x87 reduction value, dword 2 = 1 for the carry from the low qword into the high qword */
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_avx512_aes_xts_poly: /* 128-bit constant broadcast into every lane of zmm13 and used as the vpclmulqdq multiplier. */
+.long 0x00000087,0x00000000,0x00000000,0x00000000 /* low qword = 0x87, high qword = 0 */
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_avx512_aes_xts_shl: /* 64 bytes loaded into zmm14: per-qword left-shift counts for vpsllvq, one 128-bit lane per row. */
+.long 0x00000000,0x00000000,0x00000000,0x00000000 /* lane 0: shift both qwords by 0 */
+.long 0x00000001,0x00000000,0x00000001,0x00000000 /* lane 1: shift both qwords by 1 */
+.long 0x00000002,0x00000000,0x00000002,0x00000000 /* lane 2: shift both qwords by 2 */
+.long 0x00000003,0x00000000,0x00000003,0x00000000 /* lane 3: shift both qwords by 3 */
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_avx512_aes_xts_shr: /* 64 bytes loaded into zmm15: per-qword right-shift counts for vpsrlvq, each equal to 64 minus the matching left shift. */
+.long 0x00000040,0x00000000,0x00000040,0x00000000 /* lane 0: 64, which yields zero (nothing is pushed out by a left shift of 0) */
+.long 0x0000003f,0x00000000,0x0000003f,0x00000000 /* lane 1: 63 keeps the top 1 bit */
+.long 0x0000003e,0x00000000,0x0000003e,0x00000000 /* lane 2: 62 keeps the top 2 bits */
+.long 0x0000003d,0x00000000,0x0000003d,0x00000000 /* lane 3: 61 keeps the top 3 bits */
+#ifndef __APPLE__
+.text
+.globl AES_XTS_encrypt_avx512 /* AES-XTS encryption: encrypts the tweak block with the key at (%r9), then encrypts %edx bytes from (%rdi) to (%rsi) with the key at (%r8). */
+.type AES_XTS_encrypt_avx512,@function
+.align 16
+AES_XTS_encrypt_avx512: /* In: %rdi = input, %rsi = output, %rdx = byte count, %rcx = 16-byte tweak input, %r8 = data round keys, %r9 = tweak round keys, stack = round count. */
+#else
+.section __TEXT,__text
+.globl _AES_XTS_encrypt_avx512
+.p2align 4
+_AES_XTS_encrypt_avx512:
+#endif /* __APPLE__ */
+ pushq %r12
+ pushq %r13
+ movq %rdx, %rax /* rax = byte count; only %eax is consulted below */
+ movq %rcx, %r12 /* r12 = pointer to the 16-byte tweak input block */
+ movl 24(%rsp), %r10d /* r10d = round count: first stack argument, above the return address and the two pushes */
+ subq $0x40, %rsp /* scratch area; (%rsp) later stages the partial final block */
+ vmovdqu L_avx512_aes_xts_gc_xts(%rip), %xmm12 /* xmm12 = mask for the one-step tweak doubling */
+ vbroadcasti32x4 L_avx512_aes_xts_poly(%rip), %zmm13 /* zmm13 = 0x87 in the low qword of every 128-bit lane */
+ vmovdqu64 L_avx512_aes_xts_shl(%rip), %zmm14 /* zmm14 = per-lane left-shift counts 0,1,2,3 */
+ vmovdqu64 L_avx512_aes_xts_shr(%rip), %zmm15 /* zmm15 = per-lane right-shift counts 64,63,62,61 */
+ vmovdqu (%r12), %xmm8 /* xmm8 = tweak input block */
+ # aes_enc_block
+ vpxor (%r9), %xmm8, %xmm8 /* encrypt the tweak with the round keys at (%r9) */
+ vmovdqu 16(%r9), %xmm5
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 32(%r9), %xmm5
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 48(%r9), %xmm5
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 64(%r9), %xmm5
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 80(%r9), %xmm5
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 96(%r9), %xmm5
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 112(%r9), %xmm5
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 128(%r9), %xmm5
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 144(%r9), %xmm5
+ vaesenc %xmm5, %xmm8, %xmm8
+ cmpl $11, %r10d /* the load below leaves flags intact, so this compare still drives the jl */
+ vmovdqu 160(%r9), %xmm5
+ jl L_AES_XTS_encrypt_avx512_tweak_aes_enc_block_last /* under 11 rounds: the key at offset 160 is the final round key */
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 176(%r9), %xmm6
+ vaesenc %xmm6, %xmm8, %xmm8
+ cmpl $13, %r10d
+ vmovdqu 192(%r9), %xmm5
+ jl L_AES_XTS_encrypt_avx512_tweak_aes_enc_block_last /* under 13 rounds: the key at offset 192 is the final round key */
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 208(%r9), %xmm6
+ vaesenc %xmm6, %xmm8, %xmm8
+ vmovdqu 224(%r9), %xmm5
+L_AES_XTS_encrypt_avx512_tweak_aes_enc_block_last:
+ vaesenclast %xmm5, %xmm8, %xmm8 /* xmm8 = encrypted tweak T for the first data block */
+ xorl %r13d, %r13d /* r13 = byte offset of the next unprocessed data */
+ cmpl $32, %eax
+ jl L_AES_XTS_encrypt_avx512_done_128 /* under 32 bytes: no wide path will run, so skip loading round keys into ZMM registers */
+ vbroadcasti32x4 (%r8), %zmm16 /* zmm16.. = data round keys from (%r8), each copied into all four 128-bit lanes */
+ vbroadcasti32x4 16(%r8), %zmm17
+ vbroadcasti32x4 32(%r8), %zmm18
+ vbroadcasti32x4 48(%r8), %zmm19
+ vbroadcasti32x4 64(%r8), %zmm20
+ vbroadcasti32x4 80(%r8), %zmm21
+ vbroadcasti32x4 96(%r8), %zmm22
+ vbroadcasti32x4 112(%r8), %zmm23
+ vbroadcasti32x4 128(%r8), %zmm24
+ vbroadcasti32x4 144(%r8), %zmm25
+ vbroadcasti32x4 160(%r8), %zmm26
+ cmpl $11, %r10d /* load only as many round keys as the round count requires */
+ jl L_AES_XTS_encrypt_avx512_key_cached
+ vbroadcasti32x4 176(%r8), %zmm27
+ vbroadcasti32x4 192(%r8), %zmm28
+ cmpl $13, %r10d
+ jl L_AES_XTS_encrypt_avx512_key_cached
+ vbroadcasti32x4 208(%r8), %zmm29
+ vbroadcasti32x4 224(%r8), %zmm30
+L_AES_XTS_encrypt_avx512_key_cached:
+ cmpl $0x100, %eax
+ movl %eax, %r11d
+ jl L_AES_XTS_encrypt_avx512_done_256 /* under 256 bytes: skip the 16-block loop */
+ andl $0xffffff00, %r11d /* r11d = byte count rounded down to a multiple of 256 */
+ vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 /* zmm5 = T copied into all four lanes */
+ vpsrlvq %zmm15, %zmm5, %zmm6 /* zmm6 = bits the per-lane left shift pushes out of each qword */
+ vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7 /* zmm7 = bits pushed out of the high qword, carry-less multiplied by 0x87 */
+ vpslldq $8, %zmm6, %zmm6 /* move bits pushed out of the low qword up into the high qword */
+ vpsllvq %zmm14, %zmm5, %zmm4 /* zmm4 = each qword of T shifted left by the lane index */
+ vpternlogq $0x96, %zmm6, %zmm7, %zmm4 /* 0x96 is a three-way XOR: lane n of zmm4 = T times x^n, the tweaks for blocks 0..3 */
+ vpsrlq $60, %zmm4, %zmm9 /* same shift-and-reduce pattern with a fixed shift of 4 */
+ vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10
+ vpslldq $8, %zmm9, %zmm9
+ vpsllq $4, %zmm4, %zmm5
+ vpternlogq $0x96, %zmm9, %zmm10, %zmm5 /* zmm5 = zmm4 times x^4, the tweaks for blocks 4..7 */
+ vpsrlq $60, %zmm5, %zmm9
+ vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10
+ vpslldq $8, %zmm9, %zmm9
+ vpsllq $4, %zmm5, %zmm6
+ vpternlogq $0x96, %zmm9, %zmm10, %zmm6 /* zmm6 = zmm5 times x^4, the tweaks for blocks 8..11 */
+ vpsrlq $60, %zmm6, %zmm9
+ vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10
+ vpslldq $8, %zmm9, %zmm9
+ vpsllq $4, %zmm6, %zmm7
+ vpternlogq $0x96, %zmm9, %zmm10, %zmm7 /* zmm7 = zmm6 times x^4, the tweaks for blocks 12..15 */
+L_AES_XTS_encrypt_avx512_enc_256:
+ # 256 bytes of input
+ # aes_enc_256
+ leaq (%rdi,%r13,1), %rcx /* rcx = input cursor */
+ leaq (%rsi,%r13,1), %rdx /* rdx = output cursor */
+ vmovdqu64 (%rcx), %zmm0
+ vmovdqu64 64(%rcx), %zmm1
+ vmovdqu64 128(%rcx), %zmm2
+ vmovdqu64 192(%rcx), %zmm3
+ # aes_enc_block
+ vpternlogq $0x96, %zmm4, %zmm16, %zmm0 /* input XOR round key 0 XOR tweak in a single operation */
+ vpternlogq $0x96, %zmm5, %zmm16, %zmm1
+ vpternlogq $0x96, %zmm6, %zmm16, %zmm2
+ vpternlogq $0x96, %zmm7, %zmm16, %zmm3
+ vaesenc %zmm17, %zmm0, %zmm0
+ vaesenc %zmm17, %zmm1, %zmm1
+ vaesenc %zmm17, %zmm2, %zmm2
+ vaesenc %zmm17, %zmm3, %zmm3
+ vaesenc %zmm18, %zmm0, %zmm0
+ vaesenc %zmm18, %zmm1, %zmm1
+ vaesenc %zmm18, %zmm2, %zmm2
+ vaesenc %zmm18, %zmm3, %zmm3
+ vaesenc %zmm19, %zmm0, %zmm0
+ vaesenc %zmm19, %zmm1, %zmm1
+ vaesenc %zmm19, %zmm2, %zmm2
+ vaesenc %zmm19, %zmm3, %zmm3
+ vaesenc %zmm20, %zmm0, %zmm0
+ vaesenc %zmm20, %zmm1, %zmm1
+ vaesenc %zmm20, %zmm2, %zmm2
+ vaesenc %zmm20, %zmm3, %zmm3
+ vaesenc %zmm21, %zmm0, %zmm0
+ vaesenc %zmm21, %zmm1, %zmm1
+ vaesenc %zmm21, %zmm2, %zmm2
+ vaesenc %zmm21, %zmm3, %zmm3
+ vaesenc %zmm22, %zmm0, %zmm0
+ vaesenc %zmm22, %zmm1, %zmm1
+ vaesenc %zmm22, %zmm2, %zmm2
+ vaesenc %zmm22, %zmm3, %zmm3
+ vaesenc %zmm23, %zmm0, %zmm0
+ vaesenc %zmm23, %zmm1, %zmm1
+ vaesenc %zmm23, %zmm2, %zmm2
+ vaesenc %zmm23, %zmm3, %zmm3
+ vaesenc %zmm24, %zmm0, %zmm0
+ vaesenc %zmm24, %zmm1, %zmm1
+ vaesenc %zmm24, %zmm2, %zmm2
+ vaesenc %zmm24, %zmm3, %zmm3
+ vaesenc %zmm25, %zmm0, %zmm0
+ vaesenc %zmm25, %zmm1, %zmm1
+ vaesenc %zmm25, %zmm2, %zmm2
+ vaesenc %zmm25, %zmm3, %zmm3
+ cmpl $11, %r10d
+ vmovdqa64 %zmm26, %zmm9 /* zmm9 = candidate final round key; the move leaves flags intact for the jl */
+ jl L_AES_XTS_encrypt_avx512_aes_enc_256_aes_enc_block_last
+ vaesenc %zmm26, %zmm0, %zmm0
+ vaesenc %zmm26, %zmm1, %zmm1
+ vaesenc %zmm26, %zmm2, %zmm2
+ vaesenc %zmm26, %zmm3, %zmm3
+ vaesenc %zmm27, %zmm0, %zmm0
+ vaesenc %zmm27, %zmm1, %zmm1
+ vaesenc %zmm27, %zmm2, %zmm2
+ vaesenc %zmm27, %zmm3, %zmm3
+ cmpl $13, %r10d
+ vmovdqa64 %zmm28, %zmm9
+ jl L_AES_XTS_encrypt_avx512_aes_enc_256_aes_enc_block_last
+ vaesenc %zmm28, %zmm0, %zmm0
+ vaesenc %zmm28, %zmm1, %zmm1
+ vaesenc %zmm28, %zmm2, %zmm2
+ vaesenc %zmm28, %zmm3, %zmm3
+ vaesenc %zmm29, %zmm0, %zmm0
+ vaesenc %zmm29, %zmm1, %zmm1
+ vaesenc %zmm29, %zmm2, %zmm2
+ vaesenc %zmm29, %zmm3, %zmm3
+ vmovdqa64 %zmm30, %zmm9
+L_AES_XTS_encrypt_avx512_aes_enc_256_aes_enc_block_last:
+ vaesenclast %zmm9, %zmm0, %zmm0
+ vaesenclast %zmm9, %zmm1, %zmm1
+ vaesenclast %zmm9, %zmm2, %zmm2
+ vaesenclast %zmm9, %zmm3, %zmm3
+ vpxorq %zmm4, %zmm0, %zmm0 /* second tweak XOR, applied to the cipher output */
+ vmovdqu64 %zmm0, (%rdx)
+ vpsrlq $48, %zmm4, %zmm9 /* advance these four tweaks by 16 blocks: multiply by x^16 with the same shift-and-reduce pattern */
+ vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10
+ vpslldq $8, %zmm9, %zmm9
+ vpsllq $16, %zmm4, %zmm4
+ vpternlogq $0x96, %zmm9, %zmm10, %zmm4
+ vpxorq %zmm5, %zmm1, %zmm1
+ vmovdqu64 %zmm1, 64(%rdx)
+ vpsrlq $48, %zmm5, %zmm9
+ vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10
+ vpslldq $8, %zmm9, %zmm9
+ vpsllq $16, %zmm5, %zmm5
+ vpternlogq $0x96, %zmm9, %zmm10, %zmm5
+ vpxorq %zmm6, %zmm2, %zmm2
+ vmovdqu64 %zmm2, 128(%rdx)
+ vpsrlq $48, %zmm6, %zmm9
+ vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10
+ vpslldq $8, %zmm9, %zmm9
+ vpsllq $16, %zmm6, %zmm6
+ vpternlogq $0x96, %zmm9, %zmm10, %zmm6
+ vpxorq %zmm7, %zmm3, %zmm3
+ vmovdqu64 %zmm3, 192(%rdx)
+ vpsrlq $48, %zmm7, %zmm9
+ vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10
+ vpslldq $8, %zmm9, %zmm9
+ vpsllq $16, %zmm7, %zmm7
+ vpternlogq $0x96, %zmm9, %zmm10, %zmm7
+ addl $0x100, %r13d
+ cmpl %r11d, %r13d
+ jl L_AES_XTS_encrypt_avx512_enc_256
+ vextracti32x4 $0x00, %zmm4, %xmm8 /* lane 0 of the advanced zmm4 is the tweak for the next unprocessed block */
+L_AES_XTS_encrypt_avx512_done_256:
+ movl %eax, %r11d
+ andl $0xffffff80, %r11d /* r11d = byte count rounded down to a multiple of 128 */
+ cmpl %r11d, %r13d
+ je L_AES_XTS_encrypt_avx512_done_128 /* no 128-byte chunk pending */
+ # 128 bytes of input
+ # aes_enc_128
+ leaq (%rdi,%r13,1), %rcx
+ leaq (%rsi,%r13,1), %rdx
+ vmovdqu64 (%rcx), %zmm0
+ vmovdqu64 64(%rcx), %zmm1
+ vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 /* rebuild the tweak vectors from xmm8, as before the 256-byte loop */
+ vpsrlvq %zmm15, %zmm5, %zmm6
+ vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7
+ vpslldq $8, %zmm6, %zmm6
+ vpsllvq %zmm14, %zmm5, %zmm4
+ vpternlogq $0x96, %zmm6, %zmm7, %zmm4 /* zmm4 = tweaks for blocks 0..3 of this chunk */
+ vpsrlq $60, %zmm4, %zmm9
+ vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10
+ vpslldq $8, %zmm9, %zmm9
+ vpsllq $4, %zmm4, %zmm5
+ vpternlogq $0x96, %zmm9, %zmm10, %zmm5 /* zmm5 = tweaks for blocks 4..7 of this chunk */
+ # aes_enc_block
+ vpternlogq $0x96, %zmm4, %zmm16, %zmm0
+ vpternlogq $0x96, %zmm5, %zmm16, %zmm1
+ vaesenc %zmm17, %zmm0, %zmm0
+ vaesenc %zmm17, %zmm1, %zmm1
+ vaesenc %zmm18, %zmm0, %zmm0
+ vaesenc %zmm18, %zmm1, %zmm1
+ vaesenc %zmm19, %zmm0, %zmm0
+ vaesenc %zmm19, %zmm1, %zmm1
+ vaesenc %zmm20, %zmm0, %zmm0
+ vaesenc %zmm20, %zmm1, %zmm1
+ vaesenc %zmm21, %zmm0, %zmm0
+ vaesenc %zmm21, %zmm1, %zmm1
+ vaesenc %zmm22, %zmm0, %zmm0
+ vaesenc %zmm22, %zmm1, %zmm1
+ vaesenc %zmm23, %zmm0, %zmm0
+ vaesenc %zmm23, %zmm1, %zmm1
+ vaesenc %zmm24, %zmm0, %zmm0
+ vaesenc %zmm24, %zmm1, %zmm1
+ vaesenc %zmm25, %zmm0, %zmm0
+ vaesenc %zmm25, %zmm1, %zmm1
+ cmpl $11, %r10d
+ vmovdqa64 %zmm26, %zmm9
+ jl L_AES_XTS_encrypt_avx512_aes_enc_128_aes_enc_block_last
+ vaesenc %zmm26, %zmm0, %zmm0
+ vaesenc %zmm26, %zmm1, %zmm1
+ vaesenc %zmm27, %zmm0, %zmm0
+ vaesenc %zmm27, %zmm1, %zmm1
+ cmpl $13, %r10d
+ vmovdqa64 %zmm28, %zmm9
+ jl L_AES_XTS_encrypt_avx512_aes_enc_128_aes_enc_block_last
+ vaesenc %zmm28, %zmm0, %zmm0
+ vaesenc %zmm28, %zmm1, %zmm1
+ vaesenc %zmm29, %zmm0, %zmm0
+ vaesenc %zmm29, %zmm1, %zmm1
+ vmovdqa64 %zmm30, %zmm9
+L_AES_XTS_encrypt_avx512_aes_enc_128_aes_enc_block_last:
+ vaesenclast %zmm9, %zmm0, %zmm0
+ vaesenclast %zmm9, %zmm1, %zmm1
+ vpxorq %zmm4, %zmm0, %zmm0
+ vmovdqu64 %zmm0, (%rdx)
+ vpxorq %zmm5, %zmm1, %zmm1
+ vmovdqu64 %zmm1, 64(%rdx)
+ vextracti32x4 $3, %zmm5, %xmm8 /* xmm8 = tweak of the last block just processed */
+ vpshufd $19, %xmm8, %xmm9 /* start of one-step doubling: bring dword 3 to position 0 and dword 1 to position 2 */
+ vpaddq %xmm8, %xmm8, %xmm8 /* shift each qword left by one bit */
+ vpsrad $31, %xmm9, %xmm9 /* all-ones in each dword whose top bit was set */
+ vpternlogd $0x78, %xmm12, %xmm9, %xmm8 /* 0x78 = A XOR (B AND C): fold in 0x87 and the low-to-high carry where flagged */
+ addl $0x80, %r13d
+L_AES_XTS_encrypt_avx512_done_128:
+ movl %eax, %r11d
+ andl $0xffffffc0, %r11d /* r11d = byte count rounded down to a multiple of 64 */
+ cmpl %r11d, %r13d
+ je L_AES_XTS_encrypt_avx512_done_64 /* no 64-byte chunk pending */
+ # 64 bytes of input
+ # aes_enc_64
+ leaq (%rdi,%r13,1), %rcx
+ leaq (%rsi,%r13,1), %rdx
+ vmovdqu64 (%rcx), %zmm0
+ vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5
+ vpsrlvq %zmm15, %zmm5, %zmm6
+ vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7
+ vpslldq $8, %zmm6, %zmm6
+ vpsllvq %zmm14, %zmm5, %zmm4
+ vpternlogq $0x96, %zmm6, %zmm7, %zmm4 /* zmm4 = tweaks for the four blocks of this chunk */
+ # aes_enc_block
+ vpternlogq $0x96, %zmm4, %zmm16, %zmm0
+ vaesenc %zmm17, %zmm0, %zmm0
+ vaesenc %zmm18, %zmm0, %zmm0
+ vaesenc %zmm19, %zmm0, %zmm0
+ vaesenc %zmm20, %zmm0, %zmm0
+ vaesenc %zmm21, %zmm0, %zmm0
+ vaesenc %zmm22, %zmm0, %zmm0
+ vaesenc %zmm23, %zmm0, %zmm0
+ vaesenc %zmm24, %zmm0, %zmm0
+ vaesenc %zmm25, %zmm0, %zmm0
+ cmpl $11, %r10d
+ vmovdqa64 %zmm26, %zmm9
+ jl L_AES_XTS_encrypt_avx512_aes_enc_64_aes_enc_block_last
+ vaesenc %zmm26, %zmm0, %zmm0
+ vaesenc %zmm27, %zmm0, %zmm0
+ cmpl $13, %r10d
+ vmovdqa64 %zmm28, %zmm9
+ jl L_AES_XTS_encrypt_avx512_aes_enc_64_aes_enc_block_last
+ vaesenc %zmm28, %zmm0, %zmm0
+ vaesenc %zmm29, %zmm0, %zmm0
+ vmovdqa64 %zmm30, %zmm9
+L_AES_XTS_encrypt_avx512_aes_enc_64_aes_enc_block_last:
+ vaesenclast %zmm9, %zmm0, %zmm0
+ vpxorq %zmm4, %zmm0, %zmm0
+ vmovdqu64 %zmm0, (%rdx)
+ vextracti32x4 $3, %zmm4, %xmm8 /* take the last tweak used, then double it once for the next block */
+ vpshufd $19, %xmm8, %xmm9
+ vpaddq %xmm8, %xmm8, %xmm8
+ vpsrad $31, %xmm9, %xmm9
+ vpternlogd $0x78, %xmm12, %xmm9, %xmm8
+ addl $0x40, %r13d
+L_AES_XTS_encrypt_avx512_done_64:
+ movl %eax, %r11d
+ andl $0xffffffe0, %r11d /* r11d = byte count rounded down to a multiple of 32 */
+ cmpl %r11d, %r13d
+ je L_AES_XTS_encrypt_avx512_done_32 /* no 32-byte chunk pending */
+ # 32 bytes of input
+ # aes_enc_32
+ leaq (%rdi,%r13,1), %rcx
+ leaq (%rsi,%r13,1), %rdx
+ vmovdqu64 (%rcx), %ymm0 /* two blocks */
+ vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 /* tweaks are still computed at full ZMM width; only the low two lanes are consumed as ymm4 */
+ vpsrlvq %zmm15, %zmm5, %zmm6
+ vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7
+ vpslldq $8, %zmm6, %zmm6
+ vpsllvq %zmm14, %zmm5, %zmm4
+ vpternlogq $0x96, %zmm6, %zmm7, %zmm4
+ # aes_enc_block
+ vpternlogq $0x96, %ymm4, %ymm16, %ymm0
+ vaesenc %ymm17, %ymm0, %ymm0
+ vaesenc %ymm18, %ymm0, %ymm0
+ vaesenc %ymm19, %ymm0, %ymm0
+ vaesenc %ymm20, %ymm0, %ymm0
+ vaesenc %ymm21, %ymm0, %ymm0
+ vaesenc %ymm22, %ymm0, %ymm0
+ vaesenc %ymm23, %ymm0, %ymm0
+ vaesenc %ymm24, %ymm0, %ymm0
+ vaesenc %ymm25, %ymm0, %ymm0
+ cmpl $11, %r10d
+ vmovdqa64 %ymm26, %ymm9
+ jl L_AES_XTS_encrypt_avx512_aes_enc_32_aes_enc_block_last
+ vaesenc %ymm26, %ymm0, %ymm0
+ vaesenc %ymm27, %ymm0, %ymm0
+ cmpl $13, %r10d
+ vmovdqa64 %ymm28, %ymm9
+ jl L_AES_XTS_encrypt_avx512_aes_enc_32_aes_enc_block_last
+ vaesenc %ymm28, %ymm0, %ymm0
+ vaesenc %ymm29, %ymm0, %ymm0
+ vmovdqa64 %ymm30, %ymm9
+L_AES_XTS_encrypt_avx512_aes_enc_32_aes_enc_block_last:
+ vaesenclast %ymm9, %ymm0, %ymm0
+ vpxorq %ymm4, %ymm0, %ymm0
+ vmovdqu64 %ymm0, (%rdx)
+ vextracti32x4 $2, %zmm4, %xmm8 /* lane 2 already holds T times x^2, the tweak that follows these two blocks */
+ addl $32, %r13d
+L_AES_XTS_encrypt_avx512_done_32:
+ cmpl %eax, %r13d
+ movl %eax, %r11d
+ je L_AES_XTS_encrypt_avx512_done_enc /* every byte has been consumed */
+ subl %r13d, %r11d /* r11d = bytes remaining */
+ cmpl $16, %r11d
+ movl %eax, %r11d
+ jl L_AES_XTS_encrypt_avx512_last_15 /* less than one block left: go to the partial-block path */
+ andl $0xfffffff0, %r11d /* r11d = byte count rounded down to a multiple of 16 */
+ # 16 bytes of input
+L_AES_XTS_encrypt_avx512_enc_16:
+ leaq (%rdi,%r13,1), %rcx
+ vmovdqu (%rcx), %xmm0
+ vpxor %xmm8, %xmm0, %xmm0 /* first tweak XOR */
+ # aes_enc_block
+ vpxor (%r8), %xmm0, %xmm0 /* round keys come from memory here: the ZMM copies are not loaded when the byte count is under 32 */
+ vmovdqu 16(%r8), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 32(%r8), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 48(%r8), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 64(%r8), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 80(%r8), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 96(%r8), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 112(%r8), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 128(%r8), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 144(%r8), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ cmpl $11, %r10d
+ vmovdqu 160(%r8), %xmm5
+ jl L_AES_XTS_encrypt_avx512_aes_enc_block_last
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 176(%r8), %xmm6
+ vaesenc %xmm6, %xmm0, %xmm0
+ cmpl $13, %r10d
+ vmovdqu 192(%r8), %xmm5
+ jl L_AES_XTS_encrypt_avx512_aes_enc_block_last
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 208(%r8), %xmm6
+ vaesenc %xmm6, %xmm0, %xmm0
+ vmovdqu 224(%r8), %xmm5
+L_AES_XTS_encrypt_avx512_aes_enc_block_last:
+ vaesenclast %xmm5, %xmm0, %xmm0
+ vpxor %xmm8, %xmm0, %xmm0 /* second tweak XOR */
+ leaq (%rsi,%r13,1), %rcx
+ vmovdqu %xmm0, (%rcx)
+ vpshufd $19, %xmm8, %xmm4 /* double the tweak once for the next block */
+ vpaddq %xmm8, %xmm8, %xmm8
+ vpsrad $31, %xmm4, %xmm4
+ vpternlogd $0x78, %xmm12, %xmm4, %xmm8
+ addl $16, %r13d
+ cmpl %r11d, %r13d
+ jl L_AES_XTS_encrypt_avx512_enc_16
+ cmpl %eax, %r13d
+ je L_AES_XTS_encrypt_avx512_done_enc /* byte count was a multiple of 16: nothing left */
+L_AES_XTS_encrypt_avx512_last_15: /* Partial final block. NOTE(review): with a byte count below 16 the load below reads before the output buffer - presumably callers reject that, confirm. */
+ subq $16, %r13
+ leaq (%rsi,%r13,1), %rcx /* rcx = the block most recently written to the output */
+ vmovdqu (%rcx), %xmm0
+ addq $16, %r13
+ vmovdqu %xmm0, (%rsp) /* stage that block in the stack scratch area */
+ xorq %rdx, %rdx /* rdx = index into the staged block */
+L_AES_XTS_encrypt_avx512_last_15_byte_loop:
+ movb (%rsp,%rdx,1), %r11b /* byte of the staged previous output block */
+ movb (%rdi,%r13,1), %cl /* trailing input byte */
+ movb %r11b, (%rsi,%r13,1) /* the previous-output byte becomes the short final output */
+ movb %cl, (%rsp,%rdx,1) /* the input byte takes its place in the staged block */
+ incl %r13d
+ incl %edx
+ cmpl %eax, %r13d
+ jl L_AES_XTS_encrypt_avx512_last_15_byte_loop
+ subq %rdx, %r13 /* rewind past the trailing bytes ... */
+ vmovdqu (%rsp), %xmm0
+ subq $16, %r13 /* ... and back to the start of the previous block */
+ vpxor %xmm8, %xmm0, %xmm0 /* encrypt the staged block with the current tweak */
+ # aes_enc_block
+ vpxor (%r8), %xmm0, %xmm0
+ vmovdqu 16(%r8), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 32(%r8), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 48(%r8), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 64(%r8), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 80(%r8), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 96(%r8), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 112(%r8), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 128(%r8), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 144(%r8), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ cmpl $11, %r10d
+ vmovdqu 160(%r8), %xmm5
+ jl L_AES_XTS_encrypt_avx512_last_15_aes_enc_block_last
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 176(%r8), %xmm6
+ vaesenc %xmm6, %xmm0, %xmm0
+ cmpl $13, %r10d
+ vmovdqu 192(%r8), %xmm5
+ jl L_AES_XTS_encrypt_avx512_last_15_aes_enc_block_last
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 208(%r8), %xmm6
+ vaesenc %xmm6, %xmm0, %xmm0
+ vmovdqu 224(%r8), %xmm5
+L_AES_XTS_encrypt_avx512_last_15_aes_enc_block_last:
+ vaesenclast %xmm5, %xmm0, %xmm0
+ vpxor %xmm8, %xmm0, %xmm0
+ leaq (%rsi,%r13,1), %rcx
+ vmovdqu %xmm0, (%rcx) /* overwrite the previous block position with the re-encrypted block */
+L_AES_XTS_encrypt_avx512_done_enc:
+ addq $0x40, %rsp /* release the scratch area and restore the saved registers */
+ popq %r13
+ popq %r12
+ repz retq
+#ifndef __APPLE__
+.size AES_XTS_encrypt_avx512,.-AES_XTS_encrypt_avx512
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl AES_XTS_encrypt_update_avx512 /* AES-XTS encryption of %edx bytes from (%rdi) to (%rsi), starting from the tweak stored at (%r8) and writing the tweak back on exit. */
+.type AES_XTS_encrypt_update_avx512,@function
+.align 16
+AES_XTS_encrypt_update_avx512: /* In: %rdi = input, %rsi = output, %rdx = byte count, %rcx = data round keys, %r8 = 16-byte tweak (read and written), %r9d = round count. */
+#else
+.section __TEXT,__text
+.globl _AES_XTS_encrypt_update_avx512
+.p2align 4
+_AES_XTS_encrypt_update_avx512:
+#endif /* __APPLE__ */
+ pushq %r12
+ movq %rdx, %rax /* rax = byte count; only %eax is consulted below */
+ movq %rcx, %r10 /* r10 = data round keys (frees %rcx for use as a cursor) */
+ subq $0x40, %rsp /* scratch area; (%rsp) later stages the partial final block */
+ vmovdqu L_avx512_aes_xts_gc_xts(%rip), %xmm12 /* xmm12 = mask for the one-step tweak doubling */
+ vbroadcasti32x4 L_avx512_aes_xts_poly(%rip), %zmm13 /* zmm13 = 0x87 in the low qword of every 128-bit lane */
+ vmovdqu64 L_avx512_aes_xts_shl(%rip), %zmm14 /* zmm14 = per-lane left-shift counts 0,1,2,3 */
+ vmovdqu64 L_avx512_aes_xts_shr(%rip), %zmm15 /* zmm15 = per-lane right-shift counts 64,63,62,61 */
+ vmovdqu (%r8), %xmm8 /* xmm8 = current tweak T, used as stored (no tweak encryption in this routine) */
+ xorl %r12d, %r12d /* r12 = byte offset of the next unprocessed data */
+ cmpl $32, %eax
+ jl L_AES_XTS_encrypt_update_avx512_done_128 /* under 32 bytes: no wide path will run, so skip loading round keys into ZMM registers */
+ vbroadcasti32x4 (%r10), %zmm16 /* zmm16.. = round keys from (%r10), each copied into all four 128-bit lanes */
+ vbroadcasti32x4 16(%r10), %zmm17
+ vbroadcasti32x4 32(%r10), %zmm18
+ vbroadcasti32x4 48(%r10), %zmm19
+ vbroadcasti32x4 64(%r10), %zmm20
+ vbroadcasti32x4 80(%r10), %zmm21
+ vbroadcasti32x4 96(%r10), %zmm22
+ vbroadcasti32x4 112(%r10), %zmm23
+ vbroadcasti32x4 128(%r10), %zmm24
+ vbroadcasti32x4 144(%r10), %zmm25
+ vbroadcasti32x4 160(%r10), %zmm26
+ cmpl $11, %r9d /* load only as many round keys as the round count requires */
+ jl L_AES_XTS_encrypt_update_avx512_key_cached
+ vbroadcasti32x4 176(%r10), %zmm27
+ vbroadcasti32x4 192(%r10), %zmm28
+ cmpl $13, %r9d
+ jl L_AES_XTS_encrypt_update_avx512_key_cached
+ vbroadcasti32x4 208(%r10), %zmm29
+ vbroadcasti32x4 224(%r10), %zmm30
+L_AES_XTS_encrypt_update_avx512_key_cached:
+ cmpl $0x100, %eax
+ movl %eax, %r11d
+ jl L_AES_XTS_encrypt_update_avx512_done_256 /* under 256 bytes: skip the 16-block loop */
+ andl $0xffffff00, %r11d /* r11d = byte count rounded down to a multiple of 256 */
+ vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 /* zmm5 = T copied into all four lanes */
+ vpsrlvq %zmm15, %zmm5, %zmm6 /* zmm6 = bits the per-lane left shift pushes out of each qword */
+ vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7 /* zmm7 = bits pushed out of the high qword, carry-less multiplied by 0x87 */
+ vpslldq $8, %zmm6, %zmm6 /* move bits pushed out of the low qword up into the high qword */
+ vpsllvq %zmm14, %zmm5, %zmm4 /* zmm4 = each qword of T shifted left by the lane index */
+ vpternlogq $0x96, %zmm6, %zmm7, %zmm4 /* 0x96 is a three-way XOR: lane n of zmm4 = T times x^n, the tweaks for blocks 0..3 */
+ vpsrlq $60, %zmm4, %zmm9 /* same shift-and-reduce pattern with a fixed shift of 4 */
+ vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10
+ vpslldq $8, %zmm9, %zmm9
+ vpsllq $4, %zmm4, %zmm5
+ vpternlogq $0x96, %zmm9, %zmm10, %zmm5 /* zmm5 = zmm4 times x^4, the tweaks for blocks 4..7 */
+ vpsrlq $60, %zmm5, %zmm9
+ vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10
+ vpslldq $8, %zmm9, %zmm9
+ vpsllq $4, %zmm5, %zmm6
+ vpternlogq $0x96, %zmm9, %zmm10, %zmm6 /* zmm6 = zmm5 times x^4, the tweaks for blocks 8..11 */
+ vpsrlq $60, %zmm6, %zmm9
+ vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10
+ vpslldq $8, %zmm9, %zmm9
+ vpsllq $4, %zmm6, %zmm7
+ vpternlogq $0x96, %zmm9, %zmm10, %zmm7 /* zmm7 = zmm6 times x^4, the tweaks for blocks 12..15 */
+L_AES_XTS_encrypt_update_avx512_enc_256:
+ # 256 bytes of input
+ # aes_enc_256
+ leaq (%rdi,%r12,1), %rcx /* rcx = input cursor */
+ leaq (%rsi,%r12,1), %rdx /* rdx = output cursor */
+ vmovdqu64 (%rcx), %zmm0
+ vmovdqu64 64(%rcx), %zmm1
+ vmovdqu64 128(%rcx), %zmm2
+ vmovdqu64 192(%rcx), %zmm3
+ # aes_enc_block
+ vpternlogq $0x96, %zmm4, %zmm16, %zmm0 /* input XOR round key 0 XOR tweak in a single operation */
+ vpternlogq $0x96, %zmm5, %zmm16, %zmm1
+ vpternlogq $0x96, %zmm6, %zmm16, %zmm2
+ vpternlogq $0x96, %zmm7, %zmm16, %zmm3
+ vaesenc %zmm17, %zmm0, %zmm0
+ vaesenc %zmm17, %zmm1, %zmm1
+ vaesenc %zmm17, %zmm2, %zmm2
+ vaesenc %zmm17, %zmm3, %zmm3
+ vaesenc %zmm18, %zmm0, %zmm0
+ vaesenc %zmm18, %zmm1, %zmm1
+ vaesenc %zmm18, %zmm2, %zmm2
+ vaesenc %zmm18, %zmm3, %zmm3
+ vaesenc %zmm19, %zmm0, %zmm0
+ vaesenc %zmm19, %zmm1, %zmm1
+ vaesenc %zmm19, %zmm2, %zmm2
+ vaesenc %zmm19, %zmm3, %zmm3
+ vaesenc %zmm20, %zmm0, %zmm0
+ vaesenc %zmm20, %zmm1, %zmm1
+ vaesenc %zmm20, %zmm2, %zmm2
+ vaesenc %zmm20, %zmm3, %zmm3
+ vaesenc %zmm21, %zmm0, %zmm0
+ vaesenc %zmm21, %zmm1, %zmm1
+ vaesenc %zmm21, %zmm2, %zmm2
+ vaesenc %zmm21, %zmm3, %zmm3
+ vaesenc %zmm22, %zmm0, %zmm0
+ vaesenc %zmm22, %zmm1, %zmm1
+ vaesenc %zmm22, %zmm2, %zmm2
+ vaesenc %zmm22, %zmm3, %zmm3
+ vaesenc %zmm23, %zmm0, %zmm0
+ vaesenc %zmm23, %zmm1, %zmm1
+ vaesenc %zmm23, %zmm2, %zmm2
+ vaesenc %zmm23, %zmm3, %zmm3
+ vaesenc %zmm24, %zmm0, %zmm0
+ vaesenc %zmm24, %zmm1, %zmm1
+ vaesenc %zmm24, %zmm2, %zmm2
+ vaesenc %zmm24, %zmm3, %zmm3
+ vaesenc %zmm25, %zmm0, %zmm0
+ vaesenc %zmm25, %zmm1, %zmm1
+ vaesenc %zmm25, %zmm2, %zmm2
+ vaesenc %zmm25, %zmm3, %zmm3
+ cmpl $11, %r9d
+ vmovdqa64 %zmm26, %zmm9 /* zmm9 = candidate final round key; the move leaves flags intact for the jl */
+ jl L_AES_XTS_encrypt_update_avx512_aes_enc_256_aes_enc_block_last
+ vaesenc %zmm26, %zmm0, %zmm0
+ vaesenc %zmm26, %zmm1, %zmm1
+ vaesenc %zmm26, %zmm2, %zmm2
+ vaesenc %zmm26, %zmm3, %zmm3
+ vaesenc %zmm27, %zmm0, %zmm0
+ vaesenc %zmm27, %zmm1, %zmm1
+ vaesenc %zmm27, %zmm2, %zmm2
+ vaesenc %zmm27, %zmm3, %zmm3
+ cmpl $13, %r9d
+ vmovdqa64 %zmm28, %zmm9
+ jl L_AES_XTS_encrypt_update_avx512_aes_enc_256_aes_enc_block_last
+ vaesenc %zmm28, %zmm0, %zmm0
+ vaesenc %zmm28, %zmm1, %zmm1
+ vaesenc %zmm28, %zmm2, %zmm2
+ vaesenc %zmm28, %zmm3, %zmm3
+ vaesenc %zmm29, %zmm0, %zmm0
+ vaesenc %zmm29, %zmm1, %zmm1
+ vaesenc %zmm29, %zmm2, %zmm2
+ vaesenc %zmm29, %zmm3, %zmm3
+ vmovdqa64 %zmm30, %zmm9
+L_AES_XTS_encrypt_update_avx512_aes_enc_256_aes_enc_block_last:
+ vaesenclast %zmm9, %zmm0, %zmm0
+ vaesenclast %zmm9, %zmm1, %zmm1
+ vaesenclast %zmm9, %zmm2, %zmm2
+ vaesenclast %zmm9, %zmm3, %zmm3
+ vpxorq %zmm4, %zmm0, %zmm0 /* second tweak XOR, applied to the cipher output */
+ vmovdqu64 %zmm0, (%rdx)
+ vpsrlq $48, %zmm4, %zmm9 /* advance these four tweaks by 16 blocks: multiply by x^16 with the same shift-and-reduce pattern */
+ vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10
+ vpslldq $8, %zmm9, %zmm9
+ vpsllq $16, %zmm4, %zmm4
+ vpternlogq $0x96, %zmm9, %zmm10, %zmm4
+ vpxorq %zmm5, %zmm1, %zmm1
+ vmovdqu64 %zmm1, 64(%rdx)
+ vpsrlq $48, %zmm5, %zmm9
+ vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10
+ vpslldq $8, %zmm9, %zmm9
+ vpsllq $16, %zmm5, %zmm5
+ vpternlogq $0x96, %zmm9, %zmm10, %zmm5
+ vpxorq %zmm6, %zmm2, %zmm2
+ vmovdqu64 %zmm2, 128(%rdx)
+ vpsrlq $48, %zmm6, %zmm9
+ vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10
+ vpslldq $8, %zmm9, %zmm9
+ vpsllq $16, %zmm6, %zmm6
+ vpternlogq $0x96, %zmm9, %zmm10, %zmm6
+ vpxorq %zmm7, %zmm3, %zmm3
+ vmovdqu64 %zmm3, 192(%rdx)
+ vpsrlq $48, %zmm7, %zmm9
+ vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10
+ vpslldq $8, %zmm9, %zmm9
+ vpsllq $16, %zmm7, %zmm7
+ vpternlogq $0x96, %zmm9, %zmm10, %zmm7
+ addl $0x100, %r12d
+ cmpl %r11d, %r12d
+ jl L_AES_XTS_encrypt_update_avx512_enc_256
+ vextracti32x4 $0x00, %zmm4, %xmm8 /* lane 0 of the advanced zmm4 is the tweak for the next unprocessed block */
+L_AES_XTS_encrypt_update_avx512_done_256:
+ movl %eax, %r11d
+ andl $0xffffff80, %r11d /* r11d = byte count rounded down to a multiple of 128 */
+ cmpl %r11d, %r12d
+ je L_AES_XTS_encrypt_update_avx512_done_128 /* no 128-byte chunk pending */
+ # 128 bytes of input
+ # aes_enc_128
+ leaq (%rdi,%r12,1), %rcx
+ leaq (%rsi,%r12,1), %rdx
+ vmovdqu64 (%rcx), %zmm0
+ vmovdqu64 64(%rcx), %zmm1
+ vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 /* rebuild the tweak vectors from xmm8, as before the 256-byte loop */
+ vpsrlvq %zmm15, %zmm5, %zmm6
+ vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7
+ vpslldq $8, %zmm6, %zmm6
+ vpsllvq %zmm14, %zmm5, %zmm4
+ vpternlogq $0x96, %zmm6, %zmm7, %zmm4 /* zmm4 = tweaks for blocks 0..3 of this chunk */
+ vpsrlq $60, %zmm4, %zmm9
+ vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10
+ vpslldq $8, %zmm9, %zmm9
+ vpsllq $4, %zmm4, %zmm5
+ vpternlogq $0x96, %zmm9, %zmm10, %zmm5 /* zmm5 = tweaks for blocks 4..7 of this chunk */
+ # aes_enc_block
+ vpternlogq $0x96, %zmm4, %zmm16, %zmm0
+ vpternlogq $0x96, %zmm5, %zmm16, %zmm1
+ vaesenc %zmm17, %zmm0, %zmm0
+ vaesenc %zmm17, %zmm1, %zmm1
+ vaesenc %zmm18, %zmm0, %zmm0
+ vaesenc %zmm18, %zmm1, %zmm1
+ vaesenc %zmm19, %zmm0, %zmm0
+ vaesenc %zmm19, %zmm1, %zmm1
+ vaesenc %zmm20, %zmm0, %zmm0
+ vaesenc %zmm20, %zmm1, %zmm1
+ vaesenc %zmm21, %zmm0, %zmm0
+ vaesenc %zmm21, %zmm1, %zmm1
+ vaesenc %zmm22, %zmm0, %zmm0
+ vaesenc %zmm22, %zmm1, %zmm1
+ vaesenc %zmm23, %zmm0, %zmm0
+ vaesenc %zmm23, %zmm1, %zmm1
+ vaesenc %zmm24, %zmm0, %zmm0
+ vaesenc %zmm24, %zmm1, %zmm1
+ vaesenc %zmm25, %zmm0, %zmm0
+ vaesenc %zmm25, %zmm1, %zmm1
+ cmpl $11, %r9d
+ vmovdqa64 %zmm26, %zmm9
+ jl L_AES_XTS_encrypt_update_avx512_aes_enc_128_aes_enc_block_last
+ vaesenc %zmm26, %zmm0, %zmm0
+ vaesenc %zmm26, %zmm1, %zmm1
+ vaesenc %zmm27, %zmm0, %zmm0
+ vaesenc %zmm27, %zmm1, %zmm1
+ cmpl $13, %r9d
+ vmovdqa64 %zmm28, %zmm9
+ jl L_AES_XTS_encrypt_update_avx512_aes_enc_128_aes_enc_block_last
+ vaesenc %zmm28, %zmm0, %zmm0
+ vaesenc %zmm28, %zmm1, %zmm1
+ vaesenc %zmm29, %zmm0, %zmm0
+ vaesenc %zmm29, %zmm1, %zmm1
+ vmovdqa64 %zmm30, %zmm9
+L_AES_XTS_encrypt_update_avx512_aes_enc_128_aes_enc_block_last:
+ vaesenclast %zmm9, %zmm0, %zmm0
+ vaesenclast %zmm9, %zmm1, %zmm1
+ vpxorq %zmm4, %zmm0, %zmm0
+ vmovdqu64 %zmm0, (%rdx)
+ vpxorq %zmm5, %zmm1, %zmm1
+ vmovdqu64 %zmm1, 64(%rdx)
+ vextracti32x4 $3, %zmm5, %xmm8 /* xmm8 = tweak of the last block just processed */
+ vpshufd $19, %xmm8, %xmm9 /* start of one-step doubling: bring dword 3 to position 0 and dword 1 to position 2 */
+ vpaddq %xmm8, %xmm8, %xmm8 /* shift each qword left by one bit */
+ vpsrad $31, %xmm9, %xmm9 /* all-ones in each dword whose top bit was set */
+ vpternlogd $0x78, %xmm12, %xmm9, %xmm8 /* 0x78 = A XOR (B AND C): fold in 0x87 and the low-to-high carry where flagged */
+ addl $0x80, %r12d
+L_AES_XTS_encrypt_update_avx512_done_128:
+ movl %eax, %r11d
+ andl $0xffffffc0, %r11d /* r11d = byte count rounded down to a multiple of 64 */
+ cmpl %r11d, %r12d
+ je L_AES_XTS_encrypt_update_avx512_done_64 /* no 64-byte chunk pending */
+ # 64 bytes of input
+ # aes_enc_64
+ leaq (%rdi,%r12,1), %rcx
+ leaq (%rsi,%r12,1), %rdx
+ vmovdqu64 (%rcx), %zmm0
+ vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5
+ vpsrlvq %zmm15, %zmm5, %zmm6
+ vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7
+ vpslldq $8, %zmm6, %zmm6
+ vpsllvq %zmm14, %zmm5, %zmm4
+ vpternlogq $0x96, %zmm6, %zmm7, %zmm4 /* zmm4 = tweaks for the four blocks of this chunk */
+ # aes_enc_block
+ vpternlogq $0x96, %zmm4, %zmm16, %zmm0
+ vaesenc %zmm17, %zmm0, %zmm0
+ vaesenc %zmm18, %zmm0, %zmm0
+ vaesenc %zmm19, %zmm0, %zmm0
+ vaesenc %zmm20, %zmm0, %zmm0
+ vaesenc %zmm21, %zmm0, %zmm0
+ vaesenc %zmm22, %zmm0, %zmm0
+ vaesenc %zmm23, %zmm0, %zmm0
+ vaesenc %zmm24, %zmm0, %zmm0
+ vaesenc %zmm25, %zmm0, %zmm0
+ cmpl $11, %r9d
+ vmovdqa64 %zmm26, %zmm9
+ jl L_AES_XTS_encrypt_update_avx512_aes_enc_64_aes_enc_block_last
+ vaesenc %zmm26, %zmm0, %zmm0
+ vaesenc %zmm27, %zmm0, %zmm0
+ cmpl $13, %r9d
+ vmovdqa64 %zmm28, %zmm9
+ jl L_AES_XTS_encrypt_update_avx512_aes_enc_64_aes_enc_block_last
+ vaesenc %zmm28, %zmm0, %zmm0
+ vaesenc %zmm29, %zmm0, %zmm0
+ vmovdqa64 %zmm30, %zmm9
+L_AES_XTS_encrypt_update_avx512_aes_enc_64_aes_enc_block_last:
+ vaesenclast %zmm9, %zmm0, %zmm0
+ vpxorq %zmm4, %zmm0, %zmm0
+ vmovdqu64 %zmm0, (%rdx)
+ vextracti32x4 $3, %zmm4, %xmm8 /* take the last tweak used, then double it once for the next block */
+ vpshufd $19, %xmm8, %xmm9
+ vpaddq %xmm8, %xmm8, %xmm8
+ vpsrad $31, %xmm9, %xmm9
+ vpternlogd $0x78, %xmm12, %xmm9, %xmm8
+ addl $0x40, %r12d
+L_AES_XTS_encrypt_update_avx512_done_64:
+ movl %eax, %r11d
+ andl $0xffffffe0, %r11d /* r11d = byte count rounded down to a multiple of 32 */
+ cmpl %r11d, %r12d
+ je L_AES_XTS_encrypt_update_avx512_done_32 /* no 32-byte chunk pending */
+ # 32 bytes of input
+ # aes_enc_32
+ leaq (%rdi,%r12,1), %rcx
+ leaq (%rsi,%r12,1), %rdx
+ vmovdqu64 (%rcx), %ymm0 /* two blocks */
+ vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 /* tweaks are still computed at full ZMM width; only the low two lanes are consumed as ymm4 */
+ vpsrlvq %zmm15, %zmm5, %zmm6
+ vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7
+ vpslldq $8, %zmm6, %zmm6
+ vpsllvq %zmm14, %zmm5, %zmm4
+ vpternlogq $0x96, %zmm6, %zmm7, %zmm4
+ # aes_enc_block
+ vpternlogq $0x96, %ymm4, %ymm16, %ymm0
+ vaesenc %ymm17, %ymm0, %ymm0
+ vaesenc %ymm18, %ymm0, %ymm0
+ vaesenc %ymm19, %ymm0, %ymm0
+ vaesenc %ymm20, %ymm0, %ymm0
+ vaesenc %ymm21, %ymm0, %ymm0
+ vaesenc %ymm22, %ymm0, %ymm0
+ vaesenc %ymm23, %ymm0, %ymm0
+ vaesenc %ymm24, %ymm0, %ymm0
+ vaesenc %ymm25, %ymm0, %ymm0
+ cmpl $11, %r9d
+ vmovdqa64 %ymm26, %ymm9
+ jl L_AES_XTS_encrypt_update_avx512_aes_enc_32_aes_enc_block_last
+ vaesenc %ymm26, %ymm0, %ymm0
+ vaesenc %ymm27, %ymm0, %ymm0
+ cmpl $13, %r9d
+ vmovdqa64 %ymm28, %ymm9
+ jl L_AES_XTS_encrypt_update_avx512_aes_enc_32_aes_enc_block_last
+ vaesenc %ymm28, %ymm0, %ymm0
+ vaesenc %ymm29, %ymm0, %ymm0
+ vmovdqa64 %ymm30, %ymm9
+L_AES_XTS_encrypt_update_avx512_aes_enc_32_aes_enc_block_last:
+ vaesenclast %ymm9, %ymm0, %ymm0
+ vpxorq %ymm4, %ymm0, %ymm0
+ vmovdqu64 %ymm0, (%rdx)
+ vextracti32x4 $2, %zmm4, %xmm8 /* lane 2 already holds T times x^2, the tweak that follows these two blocks */
+ addl $32, %r12d
+L_AES_XTS_encrypt_update_avx512_done_32:
+ cmpl %eax, %r12d
+ movl %eax, %r11d
+ je L_AES_XTS_encrypt_update_avx512_done_enc /* every byte has been consumed */
+ subl %r12d, %r11d /* r11d = bytes remaining */
+ cmpl $16, %r11d
+ movl %eax, %r11d
+ jl L_AES_XTS_encrypt_update_avx512_last_15 /* less than one block left: go to the partial-block path */
+ andl $0xfffffff0, %r11d /* r11d = byte count rounded down to a multiple of 16 */
+ # 16 bytes of input
+L_AES_XTS_encrypt_update_avx512_enc_16:
+ leaq (%rdi,%r12,1), %rcx
+ vmovdqu (%rcx), %xmm0
+ vpxor %xmm8, %xmm0, %xmm0 /* first tweak XOR */
+ # aes_enc_block
+ vpxor (%r10), %xmm0, %xmm0 /* round keys come from memory here: the ZMM copies are not loaded when the byte count is under 32 */
+ vmovdqu 16(%r10), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 32(%r10), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 48(%r10), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 64(%r10), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 80(%r10), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 96(%r10), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 112(%r10), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 128(%r10), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 144(%r10), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ cmpl $11, %r9d
+ vmovdqu 160(%r10), %xmm5
+ jl L_AES_XTS_encrypt_update_avx512_aes_enc_block_last
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 176(%r10), %xmm6
+ vaesenc %xmm6, %xmm0, %xmm0
+ cmpl $13, %r9d
+ vmovdqu 192(%r10), %xmm5
+ jl L_AES_XTS_encrypt_update_avx512_aes_enc_block_last
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 208(%r10), %xmm6
+ vaesenc %xmm6, %xmm0, %xmm0
+ vmovdqu 224(%r10), %xmm5
+L_AES_XTS_encrypt_update_avx512_aes_enc_block_last:
+ vaesenclast %xmm5, %xmm0, %xmm0
+ vpxor %xmm8, %xmm0, %xmm0 /* second tweak XOR */
+ leaq (%rsi,%r12,1), %rcx
+ vmovdqu %xmm0, (%rcx)
+ vpshufd $19, %xmm8, %xmm4 /* double the tweak once for the next block */
+ vpaddq %xmm8, %xmm8, %xmm8
+ vpsrad $31, %xmm4, %xmm4
+ vpternlogd $0x78, %xmm12, %xmm4, %xmm8
+ addl $16, %r12d
+ cmpl %r11d, %r12d
+ jl L_AES_XTS_encrypt_update_avx512_enc_16
+ cmpl %eax, %r12d
+ je L_AES_XTS_encrypt_update_avx512_done_enc /* byte count was a multiple of 16: nothing left */
+L_AES_XTS_encrypt_update_avx512_last_15: /* Partial final block. NOTE(review): with a byte count below 16 the load below reads before the output buffer - presumably callers reject that, confirm. */
+ subq $16, %r12
+ leaq (%rsi,%r12,1), %rcx /* rcx = the block most recently written to the output */
+ vmovdqu (%rcx), %xmm0
+ addq $16, %r12
+ vmovdqu %xmm0, (%rsp) /* stage that block in the stack scratch area */
+ xorq %rdx, %rdx /* rdx = index into the staged block */
+L_AES_XTS_encrypt_update_avx512_last_15_byte_loop:
+ movb (%rsp,%rdx,1), %r11b /* byte of the staged previous output block */
+ movb (%rdi,%r12,1), %cl /* trailing input byte */
+ movb %r11b, (%rsi,%r12,1) /* the previous-output byte becomes the short final output */
+ movb %cl, (%rsp,%rdx,1) /* the input byte takes its place in the staged block */
+ incl %r12d
+ incl %edx
+ cmpl %eax, %r12d
+ jl L_AES_XTS_encrypt_update_avx512_last_15_byte_loop
+ subq %rdx, %r12 /* rewind past the trailing bytes ... */
+ vmovdqu (%rsp), %xmm0
+ subq $16, %r12 /* ... and back to the start of the previous block */
+ vpxor %xmm8, %xmm0, %xmm0 /* encrypt the staged block with the current tweak */
+ # aes_enc_block
+ vpxor (%r10), %xmm0, %xmm0
+ vmovdqu 16(%r10), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 32(%r10), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 48(%r10), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 64(%r10), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 80(%r10), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 96(%r10), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 112(%r10), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 128(%r10), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 144(%r10), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ cmpl $11, %r9d
+ vmovdqu 160(%r10), %xmm5
+ jl L_AES_XTS_encrypt_update_avx512_last_15_aes_enc_block_last
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 176(%r10), %xmm6
+ vaesenc %xmm6, %xmm0, %xmm0
+ cmpl $13, %r9d
+ vmovdqu 192(%r10), %xmm5
+ jl L_AES_XTS_encrypt_update_avx512_last_15_aes_enc_block_last
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 208(%r10), %xmm6
+ vaesenc %xmm6, %xmm0, %xmm0
+ vmovdqu 224(%r10), %xmm5
+L_AES_XTS_encrypt_update_avx512_last_15_aes_enc_block_last:
+ vaesenclast %xmm5, %xmm0, %xmm0
+ vpxor %xmm8, %xmm0, %xmm0
+ leaq (%rsi,%r12,1), %rcx
+ vmovdqu %xmm0, (%rcx) /* overwrite the previous block position with the re-encrypted block */
+L_AES_XTS_encrypt_update_avx512_done_enc:
+ vmovdqu %xmm8, (%r8) /* write the current tweak back through the tweak pointer */
+ addq $0x40, %rsp /* release the scratch area and restore the saved register */
+ popq %r12
+ repz retq
+#ifndef __APPLE__
+.size AES_XTS_encrypt_update_avx512,.-AES_XTS_encrypt_update_avx512
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl AES_XTS_decrypt_avx512
+.type AES_XTS_decrypt_avx512,@function
+.align 16
+AES_XTS_decrypt_avx512:
+#else
+.section __TEXT,__text
+.globl _AES_XTS_decrypt_avx512
+.p2align 4
+_AES_XTS_decrypt_avx512:
+#endif /* __APPLE__ */
+ pushq %r12
+ pushq %r13
+ movq %rdx, %rax # rax = byte count; eax is compared against the running offset below
+ movq %rcx, %r12 # r12 = address of the 16-byte starting tweak block
+ movl 24(%rsp), %r10d # r10d = round count, read from the stack slot 24 bytes up (two pushes plus return address)
+ subq $0x40, %rsp # 64 bytes of scratch; the tail path uses (%rsp) as a 16-byte swap buffer
+ vmovdqu L_avx512_aes_xts_gc_xts(%rip), %xmm12
+ vbroadcasti32x4 L_avx512_aes_xts_poly(%rip), %zmm13
+ vmovdqu64 L_avx512_aes_xts_shl(%rip), %zmm14
+ vmovdqu64 L_avx512_aes_xts_shr(%rip), %zmm15
+ vmovdqu (%r12), %xmm8
+ # aes_enc_block: encrypt the tweak in xmm8 with the round keys at (%r9); r10d below 11 or below 13 ends the rounds early
+ vpxor (%r9), %xmm8, %xmm8
+ vmovdqu 16(%r9), %xmm5
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 32(%r9), %xmm5
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 48(%r9), %xmm5
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 64(%r9), %xmm5
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 80(%r9), %xmm5
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 96(%r9), %xmm5
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 112(%r9), %xmm5
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 128(%r9), %xmm5
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 144(%r9), %xmm5
+ vaesenc %xmm5, %xmm8, %xmm8
+ cmpl $11, %r10d
+ vmovdqu 160(%r9), %xmm5
+ jl L_AES_XTS_decrypt_avx512_tweak_aes_enc_block_last
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 176(%r9), %xmm6
+ vaesenc %xmm6, %xmm8, %xmm8
+ cmpl $13, %r10d
+ vmovdqu 192(%r9), %xmm5
+ jl L_AES_XTS_decrypt_avx512_tweak_aes_enc_block_last
+ vaesenc %xmm5, %xmm8, %xmm8
+ vmovdqu 208(%r9), %xmm6
+ vaesenc %xmm6, %xmm8, %xmm8
+ vmovdqu 224(%r9), %xmm5
+L_AES_XTS_decrypt_avx512_tweak_aes_enc_block_last:
+ vaesenclast %xmm5, %xmm8, %xmm8
+ xorl %r13d, %r13d # r13d = byte offset into the input and output buffers
+ movl %eax, %r11d
+ andl $0xfffffff0, %r11d
+ cmpl %eax, %r11d
+ je L_AES_XTS_decrypt_avx512_mul16_256 # length is a multiple of 16: no partial tail to set aside
+ subl $16, %r11d # partial tail present: hold the last full block back for the last_31 path
+ cmpl $16, %r11d
+ jl L_AES_XTS_decrypt_avx512_last_31_start
+L_AES_XTS_decrypt_avx512_mul16_256:
+ cmpl $32, %r11d
+ jl L_AES_XTS_decrypt_avx512_done_128 # under 32 bytes of bulk work: skip loading keys into zmm16-zmm30
+ vbroadcasti32x4 (%r8), %zmm16 # each 16-byte round key at (%r8) is copied to all four 128-bit lanes
+ vbroadcasti32x4 16(%r8), %zmm17
+ vbroadcasti32x4 32(%r8), %zmm18
+ vbroadcasti32x4 48(%r8), %zmm19
+ vbroadcasti32x4 64(%r8), %zmm20
+ vbroadcasti32x4 80(%r8), %zmm21
+ vbroadcasti32x4 96(%r8), %zmm22
+ vbroadcasti32x4 112(%r8), %zmm23
+ vbroadcasti32x4 128(%r8), %zmm24
+ vbroadcasti32x4 144(%r8), %zmm25
+ vbroadcasti32x4 160(%r8), %zmm26
+ cmpl $11, %r10d
+ jl L_AES_XTS_decrypt_avx512_key_cached
+ vbroadcasti32x4 176(%r8), %zmm27
+ vbroadcasti32x4 192(%r8), %zmm28
+ cmpl $13, %r10d
+ jl L_AES_XTS_decrypt_avx512_key_cached
+ vbroadcasti32x4 208(%r8), %zmm29
+ vbroadcasti32x4 224(%r8), %zmm30
+L_AES_XTS_decrypt_avx512_key_cached:
+ cmpl $0x100, %r11d
+ jl L_AES_XTS_decrypt_avx512_done_256
+ andl $0xffffff00, %r11d # r11d = loop limit rounded down to whole 256-byte chunks
+ vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 # copy the current tweak to every 128-bit lane
+ vpsrlvq %zmm15, %zmm5, %zmm6
+ vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7
+ vpslldq $8, %zmm6, %zmm6
+ vpsllvq %zmm14, %zmm5, %zmm4
+ vpternlogq $0x96, %zmm6, %zmm7, %zmm4 # zmm4 = per-lane tweaks; NOTE(review): shift tables in zmm14/zmm15 are defined outside this chunk
+ vpsrlq $60, %zmm4, %zmm9
+ vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10
+ vpslldq $8, %zmm9, %zmm9
+ vpsllq $4, %zmm4, %zmm5
+ vpternlogq $0x96, %zmm9, %zmm10, %zmm5 # zmm5 = zmm4 times x^4 (shift by 4 with carry and polynomial fold)
+ vpsrlq $60, %zmm5, %zmm9
+ vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10
+ vpslldq $8, %zmm9, %zmm9
+ vpsllq $4, %zmm5, %zmm6
+ vpternlogq $0x96, %zmm9, %zmm10, %zmm6 # zmm6 = zmm5 times x^4
+ vpsrlq $60, %zmm6, %zmm9
+ vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10
+ vpslldq $8, %zmm9, %zmm9
+ vpsllq $4, %zmm6, %zmm7
+ vpternlogq $0x96, %zmm9, %zmm10, %zmm7 # zmm7 = zmm6 times x^4
+L_AES_XTS_decrypt_avx512_dec_256:
+ # 256 bytes of input: sixteen blocks per iteration at byte offset r13
+ # aes_dec_256: zmm0-zmm3 carry the data, zmm4-zmm7 the matching tweaks
+ leaq (%rdi,%r13,1), %rcx
+ leaq (%rsi,%r13,1), %rdx
+ vmovdqu64 (%rcx), %zmm0
+ vmovdqu64 64(%rcx), %zmm1
+ vmovdqu64 128(%rcx), %zmm2
+ vmovdqu64 192(%rcx), %zmm3
+ # aes_dec_block: 0x96 is a three-way XOR, so tweak and round key 0 are folded into the data in one step
+ vpternlogq $0x96, %zmm4, %zmm16, %zmm0
+ vpternlogq $0x96, %zmm5, %zmm16, %zmm1
+ vpternlogq $0x96, %zmm6, %zmm16, %zmm2
+ vpternlogq $0x96, %zmm7, %zmm16, %zmm3
+ vaesdec %zmm17, %zmm0, %zmm0
+ vaesdec %zmm17, %zmm1, %zmm1
+ vaesdec %zmm17, %zmm2, %zmm2
+ vaesdec %zmm17, %zmm3, %zmm3
+ vaesdec %zmm18, %zmm0, %zmm0
+ vaesdec %zmm18, %zmm1, %zmm1
+ vaesdec %zmm18, %zmm2, %zmm2
+ vaesdec %zmm18, %zmm3, %zmm3
+ vaesdec %zmm19, %zmm0, %zmm0
+ vaesdec %zmm19, %zmm1, %zmm1
+ vaesdec %zmm19, %zmm2, %zmm2
+ vaesdec %zmm19, %zmm3, %zmm3
+ vaesdec %zmm20, %zmm0, %zmm0
+ vaesdec %zmm20, %zmm1, %zmm1
+ vaesdec %zmm20, %zmm2, %zmm2
+ vaesdec %zmm20, %zmm3, %zmm3
+ vaesdec %zmm21, %zmm0, %zmm0
+ vaesdec %zmm21, %zmm1, %zmm1
+ vaesdec %zmm21, %zmm2, %zmm2
+ vaesdec %zmm21, %zmm3, %zmm3
+ vaesdec %zmm22, %zmm0, %zmm0
+ vaesdec %zmm22, %zmm1, %zmm1
+ vaesdec %zmm22, %zmm2, %zmm2
+ vaesdec %zmm22, %zmm3, %zmm3
+ vaesdec %zmm23, %zmm0, %zmm0
+ vaesdec %zmm23, %zmm1, %zmm1
+ vaesdec %zmm23, %zmm2, %zmm2
+ vaesdec %zmm23, %zmm3, %zmm3
+ vaesdec %zmm24, %zmm0, %zmm0
+ vaesdec %zmm24, %zmm1, %zmm1
+ vaesdec %zmm24, %zmm2, %zmm2
+ vaesdec %zmm24, %zmm3, %zmm3
+ vaesdec %zmm25, %zmm0, %zmm0
+ vaesdec %zmm25, %zmm1, %zmm1
+ vaesdec %zmm25, %zmm2, %zmm2
+ vaesdec %zmm25, %zmm3, %zmm3
+ cmpl $11, %r10d
+ vmovdqa64 %zmm26, %zmm9 # zmm9 always ends up holding the final round key
+ jl L_AES_XTS_decrypt_avx512_aes_dec_256_aes_dec_block_last
+ vaesdec %zmm26, %zmm0, %zmm0
+ vaesdec %zmm26, %zmm1, %zmm1
+ vaesdec %zmm26, %zmm2, %zmm2
+ vaesdec %zmm26, %zmm3, %zmm3
+ vaesdec %zmm27, %zmm0, %zmm0
+ vaesdec %zmm27, %zmm1, %zmm1
+ vaesdec %zmm27, %zmm2, %zmm2
+ vaesdec %zmm27, %zmm3, %zmm3
+ cmpl $13, %r10d
+ vmovdqa64 %zmm28, %zmm9
+ jl L_AES_XTS_decrypt_avx512_aes_dec_256_aes_dec_block_last
+ vaesdec %zmm28, %zmm0, %zmm0
+ vaesdec %zmm28, %zmm1, %zmm1
+ vaesdec %zmm28, %zmm2, %zmm2
+ vaesdec %zmm28, %zmm3, %zmm3
+ vaesdec %zmm29, %zmm0, %zmm0
+ vaesdec %zmm29, %zmm1, %zmm1
+ vaesdec %zmm29, %zmm2, %zmm2
+ vaesdec %zmm29, %zmm3, %zmm3
+ vmovdqa64 %zmm30, %zmm9
+L_AES_XTS_decrypt_avx512_aes_dec_256_aes_dec_block_last:
+ vaesdeclast %zmm9, %zmm0, %zmm0
+ vaesdeclast %zmm9, %zmm1, %zmm1
+ vaesdeclast %zmm9, %zmm2, %zmm2
+ vaesdeclast %zmm9, %zmm3, %zmm3
+ vpxorq %zmm4, %zmm0, %zmm0 # second tweak XOR, then store; the tweak update is interleaved with the stores
+ vmovdqu64 %zmm0, (%rdx)
+ vpsrlq $48, %zmm4, %zmm9 # advance zmm4 by x^16: shift left 16, carry low qword into high, fold high overflow via zmm13
+ vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10
+ vpslldq $8, %zmm9, %zmm9
+ vpsllq $16, %zmm4, %zmm4
+ vpternlogq $0x96, %zmm9, %zmm10, %zmm4
+ vpxorq %zmm5, %zmm1, %zmm1
+ vmovdqu64 %zmm1, 64(%rdx)
+ vpsrlq $48, %zmm5, %zmm9
+ vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10
+ vpslldq $8, %zmm9, %zmm9
+ vpsllq $16, %zmm5, %zmm5
+ vpternlogq $0x96, %zmm9, %zmm10, %zmm5
+ vpxorq %zmm6, %zmm2, %zmm2
+ vmovdqu64 %zmm2, 128(%rdx)
+ vpsrlq $48, %zmm6, %zmm9
+ vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10
+ vpslldq $8, %zmm9, %zmm9
+ vpsllq $16, %zmm6, %zmm6
+ vpternlogq $0x96, %zmm9, %zmm10, %zmm6
+ vpxorq %zmm7, %zmm3, %zmm3
+ vmovdqu64 %zmm3, 192(%rdx)
+ vpsrlq $48, %zmm7, %zmm9
+ vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10
+ vpslldq $8, %zmm9, %zmm9
+ vpsllq $16, %zmm7, %zmm7
+ vpternlogq $0x96, %zmm9, %zmm10, %zmm7
+ addl $0x100, %r13d
+ cmpl %r11d, %r13d
+ jl L_AES_XTS_decrypt_avx512_dec_256
+ vextracti32x4 $0x00, %zmm4, %xmm8 # lane 0 of the advanced zmm4 is the tweak for the next unprocessed block
+L_AES_XTS_decrypt_avx512_done_256:
+ cmpl %eax, %r13d
+ movl %eax, %r11d
+ je L_AES_XTS_decrypt_avx512_done_dec
+ andl $0xfffffff0, %r11d
+ cmpl %eax, %r11d
+ je L_AES_XTS_decrypt_avx512_mul16_128
+ subl $16, %r11d # partial tail present: exclude the last full block from the bulk limit
+ subl %r13d, %r11d
+ cmpl $16, %r11d
+ jl L_AES_XTS_decrypt_avx512_last_31_start
+ addl %r13d, %r11d
+L_AES_XTS_decrypt_avx512_mul16_128:
+ andl $0xffffff80, %r11d
+ cmpl %r11d, %r13d
+ je L_AES_XTS_decrypt_avx512_done_128
+ # 128 bytes of input: eight blocks in zmm0-zmm1, handled at most once
+ # aes_dec_128: tweaks rebuilt from xmm8 into zmm4, then zmm5 = zmm4 times x^4
+ leaq (%rdi,%r13,1), %rcx
+ leaq (%rsi,%r13,1), %rdx
+ vmovdqu64 (%rcx), %zmm0
+ vmovdqu64 64(%rcx), %zmm1
+ vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5
+ vpsrlvq %zmm15, %zmm5, %zmm6
+ vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7
+ vpslldq $8, %zmm6, %zmm6
+ vpsllvq %zmm14, %zmm5, %zmm4
+ vpternlogq $0x96, %zmm6, %zmm7, %zmm4
+ vpsrlq $60, %zmm4, %zmm9
+ vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10
+ vpslldq $8, %zmm9, %zmm9
+ vpsllq $4, %zmm4, %zmm5
+ vpternlogq $0x96, %zmm9, %zmm10, %zmm5
+ # aes_dec_block: XOR in tweak and round key 0, then run the cached round keys
+ vpternlogq $0x96, %zmm4, %zmm16, %zmm0
+ vpternlogq $0x96, %zmm5, %zmm16, %zmm1
+ vaesdec %zmm17, %zmm0, %zmm0
+ vaesdec %zmm17, %zmm1, %zmm1
+ vaesdec %zmm18, %zmm0, %zmm0
+ vaesdec %zmm18, %zmm1, %zmm1
+ vaesdec %zmm19, %zmm0, %zmm0
+ vaesdec %zmm19, %zmm1, %zmm1
+ vaesdec %zmm20, %zmm0, %zmm0
+ vaesdec %zmm20, %zmm1, %zmm1
+ vaesdec %zmm21, %zmm0, %zmm0
+ vaesdec %zmm21, %zmm1, %zmm1
+ vaesdec %zmm22, %zmm0, %zmm0
+ vaesdec %zmm22, %zmm1, %zmm1
+ vaesdec %zmm23, %zmm0, %zmm0
+ vaesdec %zmm23, %zmm1, %zmm1
+ vaesdec %zmm24, %zmm0, %zmm0
+ vaesdec %zmm24, %zmm1, %zmm1
+ vaesdec %zmm25, %zmm0, %zmm0
+ vaesdec %zmm25, %zmm1, %zmm1
+ cmpl $11, %r10d
+ vmovdqa64 %zmm26, %zmm9
+ jl L_AES_XTS_decrypt_avx512_aes_dec_128_aes_dec_block_last
+ vaesdec %zmm26, %zmm0, %zmm0
+ vaesdec %zmm26, %zmm1, %zmm1
+ vaesdec %zmm27, %zmm0, %zmm0
+ vaesdec %zmm27, %zmm1, %zmm1
+ cmpl $13, %r10d
+ vmovdqa64 %zmm28, %zmm9
+ jl L_AES_XTS_decrypt_avx512_aes_dec_128_aes_dec_block_last
+ vaesdec %zmm28, %zmm0, %zmm0
+ vaesdec %zmm28, %zmm1, %zmm1
+ vaesdec %zmm29, %zmm0, %zmm0
+ vaesdec %zmm29, %zmm1, %zmm1
+ vmovdqa64 %zmm30, %zmm9
+L_AES_XTS_decrypt_avx512_aes_dec_128_aes_dec_block_last:
+ vaesdeclast %zmm9, %zmm0, %zmm0
+ vaesdeclast %zmm9, %zmm1, %zmm1
+ vpxorq %zmm4, %zmm0, %zmm0
+ vmovdqu64 %zmm0, (%rdx)
+ vpxorq %zmm5, %zmm1, %zmm1
+ vmovdqu64 %zmm1, 64(%rdx)
+ vextracti32x4 $3, %zmm5, %xmm8 # take the last tweak used; the next four lines double it
+ vpshufd $19, %xmm8, %xmm9
+ vpaddq %xmm8, %xmm8, %xmm8
+ vpsrad $31, %xmm9, %xmm9
+ vpternlogd $0x78, %xmm12, %xmm9, %xmm8 # 0x78 computes xmm8 ^ (xmm9 & xmm12): carry and reduction after the doubling
+ addl $0x80, %r13d
+L_AES_XTS_decrypt_avx512_done_128:
+ cmpl %eax, %r13d
+ movl %eax, %r11d
+ je L_AES_XTS_decrypt_avx512_done_dec
+ andl $0xfffffff0, %r11d
+ cmpl %eax, %r11d
+ je L_AES_XTS_decrypt_avx512_mul16_64
+ subl $16, %r11d
+ subl %r13d, %r11d
+ cmpl $16, %r11d
+ jl L_AES_XTS_decrypt_avx512_last_31_start
+ addl %r13d, %r11d
+L_AES_XTS_decrypt_avx512_mul16_64:
+ andl $0xffffffc0, %r11d
+ cmpl %r11d, %r13d
+ je L_AES_XTS_decrypt_avx512_done_64
+ # 64 bytes of input: four blocks in zmm0, handled at most once
+ # aes_dec_64: per-lane tweaks rebuilt from xmm8 into zmm4
+ leaq (%rdi,%r13,1), %rcx
+ leaq (%rsi,%r13,1), %rdx
+ vmovdqu64 (%rcx), %zmm0
+ vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5
+ vpsrlvq %zmm15, %zmm5, %zmm6
+ vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7
+ vpslldq $8, %zmm6, %zmm6
+ vpsllvq %zmm14, %zmm5, %zmm4
+ vpternlogq $0x96, %zmm6, %zmm7, %zmm4
+ # aes_dec_block: same round sequence on a single zmm register
+ vpternlogq $0x96, %zmm4, %zmm16, %zmm0
+ vaesdec %zmm17, %zmm0, %zmm0
+ vaesdec %zmm18, %zmm0, %zmm0
+ vaesdec %zmm19, %zmm0, %zmm0
+ vaesdec %zmm20, %zmm0, %zmm0
+ vaesdec %zmm21, %zmm0, %zmm0
+ vaesdec %zmm22, %zmm0, %zmm0
+ vaesdec %zmm23, %zmm0, %zmm0
+ vaesdec %zmm24, %zmm0, %zmm0
+ vaesdec %zmm25, %zmm0, %zmm0
+ cmpl $11, %r10d
+ vmovdqa64 %zmm26, %zmm9
+ jl L_AES_XTS_decrypt_avx512_aes_dec_64_aes_dec_block_last
+ vaesdec %zmm26, %zmm0, %zmm0
+ vaesdec %zmm27, %zmm0, %zmm0
+ cmpl $13, %r10d
+ vmovdqa64 %zmm28, %zmm9
+ jl L_AES_XTS_decrypt_avx512_aes_dec_64_aes_dec_block_last
+ vaesdec %zmm28, %zmm0, %zmm0
+ vaesdec %zmm29, %zmm0, %zmm0
+ vmovdqa64 %zmm30, %zmm9
+L_AES_XTS_decrypt_avx512_aes_dec_64_aes_dec_block_last:
+ vaesdeclast %zmm9, %zmm0, %zmm0
+ vpxorq %zmm4, %zmm0, %zmm0
+ vmovdqu64 %zmm0, (%rdx)
+ vextracti32x4 $3, %zmm4, %xmm8 # take the last tweak used; the next four lines double it
+ vpshufd $19, %xmm8, %xmm9
+ vpaddq %xmm8, %xmm8, %xmm8
+ vpsrad $31, %xmm9, %xmm9
+ vpternlogd $0x78, %xmm12, %xmm9, %xmm8
+ addl $0x40, %r13d
+L_AES_XTS_decrypt_avx512_done_64:
+ cmpl %eax, %r13d
+ movl %eax, %r11d
+ je L_AES_XTS_decrypt_avx512_done_dec
+ andl $0xfffffff0, %r11d
+ cmpl %eax, %r11d
+ je L_AES_XTS_decrypt_avx512_mul16_32
+ subl $16, %r11d
+ subl %r13d, %r11d
+ cmpl $16, %r11d
+ jl L_AES_XTS_decrypt_avx512_last_31_start
+ addl %r13d, %r11d
+L_AES_XTS_decrypt_avx512_mul16_32:
+ andl $0xffffffe0, %r11d
+ cmpl %r11d, %r13d
+ je L_AES_XTS_decrypt_avx512_done_32
+ # 32 bytes of input: two blocks in ymm0, handled at most once
+ # aes_dec_32: the tweak vector is still computed at zmm width; ymm4 covers these two blocks
+ leaq (%rdi,%r13,1), %rcx
+ leaq (%rsi,%r13,1), %rdx
+ vmovdqu64 (%rcx), %ymm0
+ vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5
+ vpsrlvq %zmm15, %zmm5, %zmm6
+ vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7
+ vpslldq $8, %zmm6, %zmm6
+ vpsllvq %zmm14, %zmm5, %zmm4
+ vpternlogq $0x96, %zmm6, %zmm7, %zmm4
+ # aes_dec_block: ymm-width rounds using the low halves of the cached keys
+ vpternlogq $0x96, %ymm4, %ymm16, %ymm0
+ vaesdec %ymm17, %ymm0, %ymm0
+ vaesdec %ymm18, %ymm0, %ymm0
+ vaesdec %ymm19, %ymm0, %ymm0
+ vaesdec %ymm20, %ymm0, %ymm0
+ vaesdec %ymm21, %ymm0, %ymm0
+ vaesdec %ymm22, %ymm0, %ymm0
+ vaesdec %ymm23, %ymm0, %ymm0
+ vaesdec %ymm24, %ymm0, %ymm0
+ vaesdec %ymm25, %ymm0, %ymm0
+ cmpl $11, %r10d
+ vmovdqa64 %ymm26, %ymm9
+ jl L_AES_XTS_decrypt_avx512_aes_dec_32_aes_dec_block_last
+ vaesdec %ymm26, %ymm0, %ymm0
+ vaesdec %ymm27, %ymm0, %ymm0
+ cmpl $13, %r10d
+ vmovdqa64 %ymm28, %ymm9
+ jl L_AES_XTS_decrypt_avx512_aes_dec_32_aes_dec_block_last
+ vaesdec %ymm28, %ymm0, %ymm0
+ vaesdec %ymm29, %ymm0, %ymm0
+ vmovdqa64 %ymm30, %ymm9
+L_AES_XTS_decrypt_avx512_aes_dec_32_aes_dec_block_last:
+ vaesdeclast %ymm9, %ymm0, %ymm0
+ vpxorq %ymm4, %ymm0, %ymm0
+ vmovdqu64 %ymm0, (%rdx)
+ vextracti32x4 $2, %zmm4, %xmm8 # lane 2 of zmm4 is taken as the next tweak directly, with no doubling step
+ addl $32, %r13d
+L_AES_XTS_decrypt_avx512_done_32:
+ cmpl %eax, %r13d
+ movl %eax, %r11d
+ je L_AES_XTS_decrypt_avx512_done_dec
+ andl $0xfffffff0, %r11d
+ cmpl %eax, %r11d
+ je L_AES_XTS_decrypt_avx512_mul16
+ subl $16, %r11d
+ subl %r13d, %r11d
+ cmpl $16, %r11d
+ jl L_AES_XTS_decrypt_avx512_last_31_start
+ addl %r13d, %r11d
+L_AES_XTS_decrypt_avx512_mul16:
+L_AES_XTS_decrypt_avx512_dec_16:
+ # 16 bytes of input: one block per pass until r13d reaches r11d, round keys read straight from (%r8)
+ leaq (%rdi,%r13,1), %rcx
+ vmovdqu (%rcx), %xmm0
+ vpxor %xmm8, %xmm0, %xmm0
+ # aes_dec_block: single-block decrypt; r10d picks the final round key as in the wide paths
+ vpxor (%r8), %xmm0, %xmm0
+ vmovdqu 16(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 32(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 48(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 64(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 80(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 96(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 112(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 128(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 144(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ cmpl $11, %r10d
+ vmovdqu 160(%r8), %xmm5
+ jl L_AES_XTS_decrypt_avx512_aes_dec_block_last
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 176(%r8), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ cmpl $13, %r10d
+ vmovdqu 192(%r8), %xmm5
+ jl L_AES_XTS_decrypt_avx512_aes_dec_block_last
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 208(%r8), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ vmovdqu 224(%r8), %xmm5
+L_AES_XTS_decrypt_avx512_aes_dec_block_last:
+ vaesdeclast %xmm5, %xmm0, %xmm0
+ vpxor %xmm8, %xmm0, %xmm0
+ leaq (%rsi,%r13,1), %rcx
+ vmovdqu %xmm0, (%rcx)
+ vpshufd $19, %xmm8, %xmm4
+ vpaddq %xmm8, %xmm8, %xmm8 # double the tweak in place for the next block
+ vpsrad $31, %xmm4, %xmm4
+ vpternlogd $0x78, %xmm12, %xmm4, %xmm8
+ addl $16, %r13d
+ cmpl %r11d, %r13d
+ jl L_AES_XTS_decrypt_avx512_dec_16
+ cmpl %eax, %r13d
+ je L_AES_XTS_decrypt_avx512_done_dec
+L_AES_XTS_decrypt_avx512_last_31_start:
+ vpshufd $19, %xmm8, %xmm4
+ vpaddq %xmm8, %xmm8, %xmm7 # xmm7 = doubled tweak; xmm8 keeps the current one for the final block
+ vpsrad $31, %xmm4, %xmm4
+ vpternlogd $0x78, %xmm12, %xmm4, %xmm7
+ leaq (%rdi,%r13,1), %rcx
+ vmovdqu (%rcx), %xmm0
+ vpxor %xmm7, %xmm0, %xmm0
+ # aes_dec_block: decrypt the last full block under xmm7, the later of the two tweaks
+ vpxor (%r8), %xmm0, %xmm0
+ vmovdqu 16(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 32(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 48(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 64(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 80(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 96(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 112(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 128(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 144(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ cmpl $11, %r10d
+ vmovdqu 160(%r8), %xmm5
+ jl L_AES_XTS_decrypt_avx512_last_31_aes_dec_block_last
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 176(%r8), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ cmpl $13, %r10d
+ vmovdqu 192(%r8), %xmm5
+ jl L_AES_XTS_decrypt_avx512_last_31_aes_dec_block_last
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 208(%r8), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ vmovdqu 224(%r8), %xmm5
+L_AES_XTS_decrypt_avx512_last_31_aes_dec_block_last:
+ vaesdeclast %xmm5, %xmm0, %xmm0
+ vpxor %xmm7, %xmm0, %xmm0
+ vmovdqu %xmm0, (%rsp) # park the decrypted block in the stack scratch for the byte swap
+ addq $16, %r13
+ xorq %rdx, %rdx
+L_AES_XTS_decrypt_avx512_last_31_byte_loop:
+ movb (%rsp,%rdx,1), %r11b # swap: the scratch byte goes to the output tail ...
+ movb (%rdi,%r13,1), %cl
+ movb %r11b, (%rsi,%r13,1)
+ movb %cl, (%rsp,%rdx,1) # ... and the trailing input byte takes its place in the scratch
+ incl %r13d
+ incl %edx
+ cmpl %eax, %r13d
+ jl L_AES_XTS_decrypt_avx512_last_31_byte_loop
+ subq %rdx, %r13
+ vmovdqu (%rsp), %xmm0
+ vpxor %xmm8, %xmm0, %xmm0
+ # aes_dec_block: decrypt the reassembled block under xmm8, the earlier of the two tweaks
+ vpxor (%r8), %xmm0, %xmm0
+ vmovdqu 16(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 32(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 48(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 64(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 80(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 96(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 112(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 128(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 144(%r8), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ cmpl $11, %r10d
+ vmovdqu 160(%r8), %xmm5
+ jl L_AES_XTS_decrypt_avx512_last_31_2_aes_dec_block_last
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 176(%r8), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ cmpl $13, %r10d
+ vmovdqu 192(%r8), %xmm5
+ jl L_AES_XTS_decrypt_avx512_last_31_2_aes_dec_block_last
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 208(%r8), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ vmovdqu 224(%r8), %xmm5
+L_AES_XTS_decrypt_avx512_last_31_2_aes_dec_block_last:
+ vaesdeclast %xmm5, %xmm0, %xmm0
+ vpxor %xmm8, %xmm0, %xmm0
+ subq $16, %r13 # step back to the offset of the last full block before storing
+ leaq (%rsi,%r13,1), %rcx
+ vmovdqu %xmm0, (%rcx)
+L_AES_XTS_decrypt_avx512_done_dec:
+ addq $0x40, %rsp
+ popq %r13
+ popq %r12
+ repz retq
+#ifndef __APPLE__
+.size AES_XTS_decrypt_avx512,.-AES_XTS_decrypt_avx512
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl AES_XTS_decrypt_update_avx512
+.type AES_XTS_decrypt_update_avx512,@function
+.align 16
+AES_XTS_decrypt_update_avx512:
+#else
+.section __TEXT,__text
+.globl _AES_XTS_decrypt_update_avx512
+.p2align 4
+_AES_XTS_decrypt_update_avx512:
+#endif /* __APPLE__ */
+ pushq %r12
+ movq %rdx, %rax # rax = byte count; eax is compared against the running offset below
+ movq %rcx, %r10 # r10 = base of the round keys used by every vaesdec below
+ subq $0x40, %rsp # 64 bytes of scratch; the tail path uses (%rsp) as a 16-byte swap buffer
+ vmovdqu L_avx512_aes_xts_gc_xts(%rip), %xmm12
+ vbroadcasti32x4 L_avx512_aes_xts_poly(%rip), %zmm13
+ vmovdqu64 L_avx512_aes_xts_shl(%rip), %zmm14
+ vmovdqu64 L_avx512_aes_xts_shr(%rip), %zmm15
+ vmovdqu (%r8), %xmm8 # xmm8 = tweak read from (%r8) and used as is; no tweak encryption in this routine
+ xorl %r12d, %r12d # r12d = byte offset into the input and output buffers
+ movl %eax, %r11d
+ andl $0xfffffff0, %r11d
+ cmpl %eax, %r11d
+ je L_AES_XTS_decrypt_update_avx512_mul16_256 # length is a multiple of 16: no partial tail to set aside
+ subl $16, %r11d # partial tail present: hold the last full block back for the last_31 path
+ cmpl $16, %r11d
+ jl L_AES_XTS_decrypt_update_avx512_last_31_start
+L_AES_XTS_decrypt_update_avx512_mul16_256:
+ cmpl $32, %r11d
+ jl L_AES_XTS_decrypt_update_avx512_done_128 # under 32 bytes of bulk work: skip loading keys into zmm16-zmm30
+ vbroadcasti32x4 (%r10), %zmm16 # each 16-byte round key at (%r10) is copied to all four 128-bit lanes
+ vbroadcasti32x4 16(%r10), %zmm17
+ vbroadcasti32x4 32(%r10), %zmm18
+ vbroadcasti32x4 48(%r10), %zmm19
+ vbroadcasti32x4 64(%r10), %zmm20
+ vbroadcasti32x4 80(%r10), %zmm21
+ vbroadcasti32x4 96(%r10), %zmm22
+ vbroadcasti32x4 112(%r10), %zmm23
+ vbroadcasti32x4 128(%r10), %zmm24
+ vbroadcasti32x4 144(%r10), %zmm25
+ vbroadcasti32x4 160(%r10), %zmm26
+ cmpl $11, %r9d # r9d = round count; below 11 or below 13 fewer keys are needed
+ jl L_AES_XTS_decrypt_update_avx512_key_cached
+ vbroadcasti32x4 176(%r10), %zmm27
+ vbroadcasti32x4 192(%r10), %zmm28
+ cmpl $13, %r9d
+ jl L_AES_XTS_decrypt_update_avx512_key_cached
+ vbroadcasti32x4 208(%r10), %zmm29
+ vbroadcasti32x4 224(%r10), %zmm30
+L_AES_XTS_decrypt_update_avx512_key_cached:
+ cmpl $0x100, %r11d
+ jl L_AES_XTS_decrypt_update_avx512_done_256
+ andl $0xffffff00, %r11d # r11d = loop limit rounded down to whole 256-byte chunks
+ vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 # copy the current tweak to every 128-bit lane
+ vpsrlvq %zmm15, %zmm5, %zmm6
+ vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7
+ vpslldq $8, %zmm6, %zmm6
+ vpsllvq %zmm14, %zmm5, %zmm4
+ vpternlogq $0x96, %zmm6, %zmm7, %zmm4 # zmm4 = per-lane tweaks; NOTE(review): shift tables in zmm14/zmm15 are defined outside this chunk
+ vpsrlq $60, %zmm4, %zmm9
+ vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10
+ vpslldq $8, %zmm9, %zmm9
+ vpsllq $4, %zmm4, %zmm5
+ vpternlogq $0x96, %zmm9, %zmm10, %zmm5 # zmm5 = zmm4 times x^4 (shift by 4 with carry and polynomial fold)
+ vpsrlq $60, %zmm5, %zmm9
+ vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10
+ vpslldq $8, %zmm9, %zmm9
+ vpsllq $4, %zmm5, %zmm6
+ vpternlogq $0x96, %zmm9, %zmm10, %zmm6 # zmm6 = zmm5 times x^4
+ vpsrlq $60, %zmm6, %zmm9
+ vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10
+ vpslldq $8, %zmm9, %zmm9
+ vpsllq $4, %zmm6, %zmm7
+ vpternlogq $0x96, %zmm9, %zmm10, %zmm7 # zmm7 = zmm6 times x^4
+L_AES_XTS_decrypt_update_avx512_dec_256:
+ # 256 bytes of input: sixteen blocks per iteration at byte offset r12
+ # aes_dec_256: zmm0-zmm3 carry the data, zmm4-zmm7 the matching tweaks
+ leaq (%rdi,%r12,1), %rcx
+ leaq (%rsi,%r12,1), %rdx
+ vmovdqu64 (%rcx), %zmm0
+ vmovdqu64 64(%rcx), %zmm1
+ vmovdqu64 128(%rcx), %zmm2
+ vmovdqu64 192(%rcx), %zmm3
+ # aes_dec_block: 0x96 is a three-way XOR, so tweak and round key 0 are folded into the data in one step
+ vpternlogq $0x96, %zmm4, %zmm16, %zmm0
+ vpternlogq $0x96, %zmm5, %zmm16, %zmm1
+ vpternlogq $0x96, %zmm6, %zmm16, %zmm2
+ vpternlogq $0x96, %zmm7, %zmm16, %zmm3
+ vaesdec %zmm17, %zmm0, %zmm0
+ vaesdec %zmm17, %zmm1, %zmm1
+ vaesdec %zmm17, %zmm2, %zmm2
+ vaesdec %zmm17, %zmm3, %zmm3
+ vaesdec %zmm18, %zmm0, %zmm0
+ vaesdec %zmm18, %zmm1, %zmm1
+ vaesdec %zmm18, %zmm2, %zmm2
+ vaesdec %zmm18, %zmm3, %zmm3
+ vaesdec %zmm19, %zmm0, %zmm0
+ vaesdec %zmm19, %zmm1, %zmm1
+ vaesdec %zmm19, %zmm2, %zmm2
+ vaesdec %zmm19, %zmm3, %zmm3
+ vaesdec %zmm20, %zmm0, %zmm0
+ vaesdec %zmm20, %zmm1, %zmm1
+ vaesdec %zmm20, %zmm2, %zmm2
+ vaesdec %zmm20, %zmm3, %zmm3
+ vaesdec %zmm21, %zmm0, %zmm0
+ vaesdec %zmm21, %zmm1, %zmm1
+ vaesdec %zmm21, %zmm2, %zmm2
+ vaesdec %zmm21, %zmm3, %zmm3
+ vaesdec %zmm22, %zmm0, %zmm0
+ vaesdec %zmm22, %zmm1, %zmm1
+ vaesdec %zmm22, %zmm2, %zmm2
+ vaesdec %zmm22, %zmm3, %zmm3
+ vaesdec %zmm23, %zmm0, %zmm0
+ vaesdec %zmm23, %zmm1, %zmm1
+ vaesdec %zmm23, %zmm2, %zmm2
+ vaesdec %zmm23, %zmm3, %zmm3
+ vaesdec %zmm24, %zmm0, %zmm0
+ vaesdec %zmm24, %zmm1, %zmm1
+ vaesdec %zmm24, %zmm2, %zmm2
+ vaesdec %zmm24, %zmm3, %zmm3
+ vaesdec %zmm25, %zmm0, %zmm0
+ vaesdec %zmm25, %zmm1, %zmm1
+ vaesdec %zmm25, %zmm2, %zmm2
+ vaesdec %zmm25, %zmm3, %zmm3
+ cmpl $11, %r9d
+ vmovdqa64 %zmm26, %zmm9 # zmm9 always ends up holding the final round key
+ jl L_AES_XTS_decrypt_update_avx512_aes_dec_256_aes_dec_block_last
+ vaesdec %zmm26, %zmm0, %zmm0
+ vaesdec %zmm26, %zmm1, %zmm1
+ vaesdec %zmm26, %zmm2, %zmm2
+ vaesdec %zmm26, %zmm3, %zmm3
+ vaesdec %zmm27, %zmm0, %zmm0
+ vaesdec %zmm27, %zmm1, %zmm1
+ vaesdec %zmm27, %zmm2, %zmm2
+ vaesdec %zmm27, %zmm3, %zmm3
+ cmpl $13, %r9d
+ vmovdqa64 %zmm28, %zmm9
+ jl L_AES_XTS_decrypt_update_avx512_aes_dec_256_aes_dec_block_last
+ vaesdec %zmm28, %zmm0, %zmm0
+ vaesdec %zmm28, %zmm1, %zmm1
+ vaesdec %zmm28, %zmm2, %zmm2
+ vaesdec %zmm28, %zmm3, %zmm3
+ vaesdec %zmm29, %zmm0, %zmm0
+ vaesdec %zmm29, %zmm1, %zmm1
+ vaesdec %zmm29, %zmm2, %zmm2
+ vaesdec %zmm29, %zmm3, %zmm3
+ vmovdqa64 %zmm30, %zmm9
+L_AES_XTS_decrypt_update_avx512_aes_dec_256_aes_dec_block_last:
+ vaesdeclast %zmm9, %zmm0, %zmm0
+ vaesdeclast %zmm9, %zmm1, %zmm1
+ vaesdeclast %zmm9, %zmm2, %zmm2
+ vaesdeclast %zmm9, %zmm3, %zmm3
+ vpxorq %zmm4, %zmm0, %zmm0 # second tweak XOR, then store; the tweak update is interleaved with the stores
+ vmovdqu64 %zmm0, (%rdx)
+ vpsrlq $48, %zmm4, %zmm9 # advance zmm4 by x^16: shift left 16, carry low qword into high, fold high overflow via zmm13
+ vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10
+ vpslldq $8, %zmm9, %zmm9
+ vpsllq $16, %zmm4, %zmm4
+ vpternlogq $0x96, %zmm9, %zmm10, %zmm4
+ vpxorq %zmm5, %zmm1, %zmm1
+ vmovdqu64 %zmm1, 64(%rdx)
+ vpsrlq $48, %zmm5, %zmm9
+ vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10
+ vpslldq $8, %zmm9, %zmm9
+ vpsllq $16, %zmm5, %zmm5
+ vpternlogq $0x96, %zmm9, %zmm10, %zmm5
+ vpxorq %zmm6, %zmm2, %zmm2
+ vmovdqu64 %zmm2, 128(%rdx)
+ vpsrlq $48, %zmm6, %zmm9
+ vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10
+ vpslldq $8, %zmm9, %zmm9
+ vpsllq $16, %zmm6, %zmm6
+ vpternlogq $0x96, %zmm9, %zmm10, %zmm6
+ vpxorq %zmm7, %zmm3, %zmm3
+ vmovdqu64 %zmm3, 192(%rdx)
+ vpsrlq $48, %zmm7, %zmm9
+ vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10
+ vpslldq $8, %zmm9, %zmm9
+ vpsllq $16, %zmm7, %zmm7
+ vpternlogq $0x96, %zmm9, %zmm10, %zmm7
+ addl $0x100, %r12d
+ cmpl %r11d, %r12d
+ jl L_AES_XTS_decrypt_update_avx512_dec_256
+ vextracti32x4 $0x00, %zmm4, %xmm8 # lane 0 of the advanced zmm4 is the tweak for the next unprocessed block
+L_AES_XTS_decrypt_update_avx512_done_256:
+ cmpl %eax, %r12d
+ movl %eax, %r11d
+ je L_AES_XTS_decrypt_update_avx512_done_dec
+ andl $0xfffffff0, %r11d
+ cmpl %eax, %r11d
+ je L_AES_XTS_decrypt_update_avx512_mul16_128
+ subl $16, %r11d # partial tail present: exclude the last full block from the bulk limit
+ subl %r12d, %r11d
+ cmpl $16, %r11d
+ jl L_AES_XTS_decrypt_update_avx512_last_31_start
+ addl %r12d, %r11d
+L_AES_XTS_decrypt_update_avx512_mul16_128:
+ andl $0xffffff80, %r11d
+ cmpl %r11d, %r12d
+ je L_AES_XTS_decrypt_update_avx512_done_128
+ # 128 bytes of input: eight blocks in zmm0-zmm1, handled at most once
+ # aes_dec_128: tweaks rebuilt from xmm8 into zmm4, then zmm5 = zmm4 times x^4
+ leaq (%rdi,%r12,1), %rcx
+ leaq (%rsi,%r12,1), %rdx
+ vmovdqu64 (%rcx), %zmm0
+ vmovdqu64 64(%rcx), %zmm1
+ vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5
+ vpsrlvq %zmm15, %zmm5, %zmm6
+ vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7
+ vpslldq $8, %zmm6, %zmm6
+ vpsllvq %zmm14, %zmm5, %zmm4
+ vpternlogq $0x96, %zmm6, %zmm7, %zmm4
+ vpsrlq $60, %zmm4, %zmm9
+ vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10
+ vpslldq $8, %zmm9, %zmm9
+ vpsllq $4, %zmm4, %zmm5
+ vpternlogq $0x96, %zmm9, %zmm10, %zmm5
+ # aes_dec_block: XOR in tweak and round key 0, then run the cached round keys
+ vpternlogq $0x96, %zmm4, %zmm16, %zmm0
+ vpternlogq $0x96, %zmm5, %zmm16, %zmm1
+ vaesdec %zmm17, %zmm0, %zmm0
+ vaesdec %zmm17, %zmm1, %zmm1
+ vaesdec %zmm18, %zmm0, %zmm0
+ vaesdec %zmm18, %zmm1, %zmm1
+ vaesdec %zmm19, %zmm0, %zmm0
+ vaesdec %zmm19, %zmm1, %zmm1
+ vaesdec %zmm20, %zmm0, %zmm0
+ vaesdec %zmm20, %zmm1, %zmm1
+ vaesdec %zmm21, %zmm0, %zmm0
+ vaesdec %zmm21, %zmm1, %zmm1
+ vaesdec %zmm22, %zmm0, %zmm0
+ vaesdec %zmm22, %zmm1, %zmm1
+ vaesdec %zmm23, %zmm0, %zmm0
+ vaesdec %zmm23, %zmm1, %zmm1
+ vaesdec %zmm24, %zmm0, %zmm0
+ vaesdec %zmm24, %zmm1, %zmm1
+ vaesdec %zmm25, %zmm0, %zmm0
+ vaesdec %zmm25, %zmm1, %zmm1
+ cmpl $11, %r9d
+ vmovdqa64 %zmm26, %zmm9
+ jl L_AES_XTS_decrypt_update_avx512_aes_dec_128_aes_dec_block_last
+ vaesdec %zmm26, %zmm0, %zmm0
+ vaesdec %zmm26, %zmm1, %zmm1
+ vaesdec %zmm27, %zmm0, %zmm0
+ vaesdec %zmm27, %zmm1, %zmm1
+ cmpl $13, %r9d
+ vmovdqa64 %zmm28, %zmm9
+ jl L_AES_XTS_decrypt_update_avx512_aes_dec_128_aes_dec_block_last
+ vaesdec %zmm28, %zmm0, %zmm0
+ vaesdec %zmm28, %zmm1, %zmm1
+ vaesdec %zmm29, %zmm0, %zmm0
+ vaesdec %zmm29, %zmm1, %zmm1
+ vmovdqa64 %zmm30, %zmm9
+L_AES_XTS_decrypt_update_avx512_aes_dec_128_aes_dec_block_last:
+ vaesdeclast %zmm9, %zmm0, %zmm0
+ vaesdeclast %zmm9, %zmm1, %zmm1
+ vpxorq %zmm4, %zmm0, %zmm0
+ vmovdqu64 %zmm0, (%rdx)
+ vpxorq %zmm5, %zmm1, %zmm1
+ vmovdqu64 %zmm1, 64(%rdx)
+ vextracti32x4 $3, %zmm5, %xmm8 # take the last tweak used; the next four lines double it
+ vpshufd $19, %xmm8, %xmm9
+ vpaddq %xmm8, %xmm8, %xmm8
+ vpsrad $31, %xmm9, %xmm9
+ vpternlogd $0x78, %xmm12, %xmm9, %xmm8 # 0x78 computes xmm8 ^ (xmm9 & xmm12): carry and reduction after the doubling
+ addl $0x80, %r12d
+L_AES_XTS_decrypt_update_avx512_done_128:
+ cmpl %eax, %r12d
+ movl %eax, %r11d
+ je L_AES_XTS_decrypt_update_avx512_done_dec
+ andl $0xfffffff0, %r11d
+ cmpl %eax, %r11d
+ je L_AES_XTS_decrypt_update_avx512_mul16_64
+ subl $16, %r11d
+ subl %r12d, %r11d
+ cmpl $16, %r11d
+ jl L_AES_XTS_decrypt_update_avx512_last_31_start
+ addl %r12d, %r11d
+L_AES_XTS_decrypt_update_avx512_mul16_64:
+ andl $0xffffffc0, %r11d
+ cmpl %r11d, %r12d
+ je L_AES_XTS_decrypt_update_avx512_done_64
+ # 64 bytes of input: four blocks in zmm0, handled at most once
+ # aes_dec_64: per-lane tweaks rebuilt from xmm8 into zmm4
+ leaq (%rdi,%r12,1), %rcx
+ leaq (%rsi,%r12,1), %rdx
+ vmovdqu64 (%rcx), %zmm0
+ vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5
+ vpsrlvq %zmm15, %zmm5, %zmm6
+ vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7
+ vpslldq $8, %zmm6, %zmm6
+ vpsllvq %zmm14, %zmm5, %zmm4
+ vpternlogq $0x96, %zmm6, %zmm7, %zmm4
+ # aes_dec_block: same round sequence on a single zmm register
+ vpternlogq $0x96, %zmm4, %zmm16, %zmm0
+ vaesdec %zmm17, %zmm0, %zmm0
+ vaesdec %zmm18, %zmm0, %zmm0
+ vaesdec %zmm19, %zmm0, %zmm0
+ vaesdec %zmm20, %zmm0, %zmm0
+ vaesdec %zmm21, %zmm0, %zmm0
+ vaesdec %zmm22, %zmm0, %zmm0
+ vaesdec %zmm23, %zmm0, %zmm0
+ vaesdec %zmm24, %zmm0, %zmm0
+ vaesdec %zmm25, %zmm0, %zmm0
+ cmpl $11, %r9d
+ vmovdqa64 %zmm26, %zmm9
+ jl L_AES_XTS_decrypt_update_avx512_aes_dec_64_aes_dec_block_last
+ vaesdec %zmm26, %zmm0, %zmm0
+ vaesdec %zmm27, %zmm0, %zmm0
+ cmpl $13, %r9d
+ vmovdqa64 %zmm28, %zmm9
+ jl L_AES_XTS_decrypt_update_avx512_aes_dec_64_aes_dec_block_last
+ vaesdec %zmm28, %zmm0, %zmm0
+ vaesdec %zmm29, %zmm0, %zmm0
+ vmovdqa64 %zmm30, %zmm9
+L_AES_XTS_decrypt_update_avx512_aes_dec_64_aes_dec_block_last:
+ vaesdeclast %zmm9, %zmm0, %zmm0
+ vpxorq %zmm4, %zmm0, %zmm0
+ vmovdqu64 %zmm0, (%rdx)
+ vextracti32x4 $3, %zmm4, %xmm8 # take the last tweak used; the next four lines double it
+ vpshufd $19, %xmm8, %xmm9
+ vpaddq %xmm8, %xmm8, %xmm8
+ vpsrad $31, %xmm9, %xmm9
+ vpternlogd $0x78, %xmm12, %xmm9, %xmm8
+ addl $0x40, %r12d
+L_AES_XTS_decrypt_update_avx512_done_64:
+ cmpl %eax, %r12d
+ movl %eax, %r11d
+ je L_AES_XTS_decrypt_update_avx512_done_dec
+ andl $0xfffffff0, %r11d
+ cmpl %eax, %r11d
+ je L_AES_XTS_decrypt_update_avx512_mul16_32
+ subl $16, %r11d
+ subl %r12d, %r11d
+ cmpl $16, %r11d
+ jl L_AES_XTS_decrypt_update_avx512_last_31_start
+ addl %r12d, %r11d
+L_AES_XTS_decrypt_update_avx512_mul16_32:
+ andl $0xffffffe0, %r11d
+ cmpl %r11d, %r12d
+ je L_AES_XTS_decrypt_update_avx512_done_32
+ # 32 bytes of input: two blocks in ymm0, handled at most once
+ # aes_dec_32: the tweak vector is still computed at zmm width; ymm4 covers these two blocks
+ leaq (%rdi,%r12,1), %rcx
+ leaq (%rsi,%r12,1), %rdx
+ vmovdqu64 (%rcx), %ymm0
+ vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5
+ vpsrlvq %zmm15, %zmm5, %zmm6
+ vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7
+ vpslldq $8, %zmm6, %zmm6
+ vpsllvq %zmm14, %zmm5, %zmm4
+ vpternlogq $0x96, %zmm6, %zmm7, %zmm4
+ # aes_dec_block: ymm-width rounds using the low halves of the cached keys
+ vpternlogq $0x96, %ymm4, %ymm16, %ymm0
+ vaesdec %ymm17, %ymm0, %ymm0
+ vaesdec %ymm18, %ymm0, %ymm0
+ vaesdec %ymm19, %ymm0, %ymm0
+ vaesdec %ymm20, %ymm0, %ymm0
+ vaesdec %ymm21, %ymm0, %ymm0
+ vaesdec %ymm22, %ymm0, %ymm0
+ vaesdec %ymm23, %ymm0, %ymm0
+ vaesdec %ymm24, %ymm0, %ymm0
+ vaesdec %ymm25, %ymm0, %ymm0
+ cmpl $11, %r9d
+ vmovdqa64 %ymm26, %ymm9
+ jl L_AES_XTS_decrypt_update_avx512_aes_dec_32_aes_dec_block_last
+ vaesdec %ymm26, %ymm0, %ymm0
+ vaesdec %ymm27, %ymm0, %ymm0
+ cmpl $13, %r9d
+ vmovdqa64 %ymm28, %ymm9
+ jl L_AES_XTS_decrypt_update_avx512_aes_dec_32_aes_dec_block_last
+ vaesdec %ymm28, %ymm0, %ymm0
+ vaesdec %ymm29, %ymm0, %ymm0
+ vmovdqa64 %ymm30, %ymm9
+L_AES_XTS_decrypt_update_avx512_aes_dec_32_aes_dec_block_last:
+ vaesdeclast %ymm9, %ymm0, %ymm0
+ vpxorq %ymm4, %ymm0, %ymm0
+ vmovdqu64 %ymm0, (%rdx)
+ vextracti32x4 $2, %zmm4, %xmm8 # lane 2 of zmm4 is taken as the next tweak directly, with no doubling step
+ addl $32, %r12d
+L_AES_XTS_decrypt_update_avx512_done_32:
+ cmpl %eax, %r12d
+ movl %eax, %r11d
+ je L_AES_XTS_decrypt_update_avx512_done_dec
+ andl $0xfffffff0, %r11d
+ cmpl %eax, %r11d
+ je L_AES_XTS_decrypt_update_avx512_mul16
+ subl $16, %r11d
+ subl %r12d, %r11d
+ cmpl $16, %r11d
+ jl L_AES_XTS_decrypt_update_avx512_last_31_start
+ addl %r12d, %r11d
+L_AES_XTS_decrypt_update_avx512_mul16:
+L_AES_XTS_decrypt_update_avx512_dec_16:
+ # 16 bytes of input: one block per pass until r12d reaches r11d, round keys read straight from (%r10)
+ leaq (%rdi,%r12,1), %rcx
+ vmovdqu (%rcx), %xmm0
+ vpxor %xmm8, %xmm0, %xmm0
+ # aes_dec_block: single-block decrypt; r9d picks the final round key as in the wide paths
+ vpxor (%r10), %xmm0, %xmm0
+ vmovdqu 16(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 32(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 48(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 64(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 80(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 96(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 112(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 128(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 144(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ cmpl $11, %r9d
+ vmovdqu 160(%r10), %xmm5
+ jl L_AES_XTS_decrypt_update_avx512_aes_dec_block_last
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 176(%r10), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ cmpl $13, %r9d
+ vmovdqu 192(%r10), %xmm5
+ jl L_AES_XTS_decrypt_update_avx512_aes_dec_block_last
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 208(%r10), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ vmovdqu 224(%r10), %xmm5
+L_AES_XTS_decrypt_update_avx512_aes_dec_block_last:
+ vaesdeclast %xmm5, %xmm0, %xmm0
+ vpxor %xmm8, %xmm0, %xmm0
+ leaq (%rsi,%r12,1), %rcx
+ vmovdqu %xmm0, (%rcx)
+ vpshufd $19, %xmm8, %xmm4
+ vpaddq %xmm8, %xmm8, %xmm8 # double the tweak in place for the next block
+ vpsrad $31, %xmm4, %xmm4
+ vpternlogd $0x78, %xmm12, %xmm4, %xmm8
+ addl $16, %r12d
+ cmpl %r11d, %r12d
+ jl L_AES_XTS_decrypt_update_avx512_dec_16
+ cmpl %eax, %r12d
+ je L_AES_XTS_decrypt_update_avx512_done_dec
+L_AES_XTS_decrypt_update_avx512_last_31_start:
+ vpshufd $19, %xmm8, %xmm4
+ vpaddq %xmm8, %xmm8, %xmm7 # xmm7 = doubled tweak; xmm8 keeps the current one for the final block
+ vpsrad $31, %xmm4, %xmm4
+ vpternlogd $0x78, %xmm12, %xmm4, %xmm7
+ leaq (%rdi,%r12,1), %rcx
+ vmovdqu (%rcx), %xmm0
+ vpxor %xmm7, %xmm0, %xmm0
+ # aes_dec_block: decrypt the last full block under xmm7, the later of the two tweaks
+ vpxor (%r10), %xmm0, %xmm0
+ vmovdqu 16(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 32(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 48(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 64(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 80(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 96(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 112(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 128(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 144(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ cmpl $11, %r9d
+ vmovdqu 160(%r10), %xmm5
+ jl L_AES_XTS_decrypt_update_avx512_last_31_aes_dec_block_last
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 176(%r10), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ cmpl $13, %r9d
+ vmovdqu 192(%r10), %xmm5
+ jl L_AES_XTS_decrypt_update_avx512_last_31_aes_dec_block_last
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 208(%r10), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ vmovdqu 224(%r10), %xmm5
+L_AES_XTS_decrypt_update_avx512_last_31_aes_dec_block_last:
+ vaesdeclast %xmm5, %xmm0, %xmm0
+ vpxor %xmm7, %xmm0, %xmm0
+ vmovdqu %xmm0, (%rsp) # park the decrypted block in the stack scratch for the byte swap
+ addq $16, %r12
+ xorq %rdx, %rdx
+L_AES_XTS_decrypt_update_avx512_last_31_byte_loop:
+ movb (%rsp,%rdx,1), %r11b # swap: the scratch byte goes to the output tail ...
+ movb (%rdi,%r12,1), %cl
+ movb %r11b, (%rsi,%r12,1)
+ movb %cl, (%rsp,%rdx,1) # ... and the trailing input byte takes its place in the scratch
+ incl %r12d
+ incl %edx
+ cmpl %eax, %r12d
+ jl L_AES_XTS_decrypt_update_avx512_last_31_byte_loop
+ subq %rdx, %r12
+ vmovdqu (%rsp), %xmm0
+ vpxor %xmm8, %xmm0, %xmm0
+ # aes_dec_block: decrypt the reassembled block under xmm8, the earlier of the two tweaks
+ vpxor (%r10), %xmm0, %xmm0
+ vmovdqu 16(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 32(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 48(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 64(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 80(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 96(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 112(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 128(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 144(%r10), %xmm5
+ vaesdec %xmm5, %xmm0, %xmm0
+ cmpl $11, %r9d
+ vmovdqu 160(%r10), %xmm5
+ jl L_AES_XTS_decrypt_update_avx512_last_31_2_aes_dec_block_last
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 176(%r10), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ cmpl $13, %r9d
+ vmovdqu 192(%r10), %xmm5
+ jl L_AES_XTS_decrypt_update_avx512_last_31_2_aes_dec_block_last
+ vaesdec %xmm5, %xmm0, %xmm0
+ vmovdqu 208(%r10), %xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ vmovdqu 224(%r10), %xmm5
+L_AES_XTS_decrypt_update_avx512_last_31_2_aes_dec_block_last:
+ vaesdeclast %xmm5, %xmm0, %xmm0
+ vpxor %xmm8, %xmm0, %xmm0
+ subq $16, %r12 # step back to the offset of the last full block before storing
+ leaq (%rsi,%r12,1), %rcx
+ vmovdqu %xmm0, (%rcx)
+L_AES_XTS_decrypt_update_avx512_done_dec:
+ vmovdqu %xmm8, (%r8) # write the tweak held in xmm8 back through (%r8) for the caller
+ addq $0x40, %rsp
+ popq %r12
+ repz retq
+#ifndef __APPLE__
+.size AES_XTS_decrypt_update_avx512,.-AES_XTS_decrypt_update_avx512
+#endif /* __APPLE__ */
+#endif /* HAVE_INTEL_AVX512 */
#endif /* WOLFSSL_X86_64_BUILD */
#endif /* WOLFSSL_AES_XTS */
diff --git a/wolfcrypt/src/aes_xts_asm.asm b/wolfcrypt/src/aes_xts_asm.asm
index b0e5cebf316..a904ffa4ce7 100644
--- a/wolfcrypt/src/aes_xts_asm.asm
+++ b/wolfcrypt/src/aes_xts_asm.asm
@@ -2831,4 +2831,4472 @@ L_AES_XTS_decrypt_update_avx1_done_dec:
AES_XTS_decrypt_update_avx1 ENDP
_TEXT ENDS
ENDIF
+IFDEF HAVE_INTEL_VAES
+_TEXT SEGMENT READONLY PARA
+AES_XTS_init_vaes PROC ; Encrypts the 16-byte block at [rcx] in place with the round keys at [rdx]; r8d selects how many rounds are run
+ vmovdqu xmm0, OWORD PTR [rcx] ; xmm0 = block to encrypt (the tweak, going by the label names below)
+ ; aes_enc_block
+ vpxor xmm0, xmm0, [rdx] ; XOR in the round key at offset 0
+ vmovdqu xmm2, OWORD PTR [rdx+16]
+ vaesenc xmm0, xmm0, xmm2
+ vmovdqu xmm2, OWORD PTR [rdx+32]
+ vaesenc xmm0, xmm0, xmm2
+ vmovdqu xmm2, OWORD PTR [rdx+48]
+ vaesenc xmm0, xmm0, xmm2
+ vmovdqu xmm2, OWORD PTR [rdx+64]
+ vaesenc xmm0, xmm0, xmm2
+ vmovdqu xmm2, OWORD PTR [rdx+80]
+ vaesenc xmm0, xmm0, xmm2
+ vmovdqu xmm2, OWORD PTR [rdx+96]
+ vaesenc xmm0, xmm0, xmm2
+ vmovdqu xmm2, OWORD PTR [rdx+112]
+ vaesenc xmm0, xmm0, xmm2
+ vmovdqu xmm2, OWORD PTR [rdx+128]
+ vaesenc xmm0, xmm0, xmm2
+ vmovdqu xmm2, OWORD PTR [rdx+144]
+ vaesenc xmm0, xmm0, xmm2 ; ninth full round
+ cmp r8d, 11 ; flags are consumed by the jl two lines down; the load in between does not touch them
+ vmovdqu xmm2, OWORD PTR [rdx+160] ; last round key when r8d < 11, otherwise an ordinary round key
+ jl L_AES_XTS_init_vaes_tweak_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm2
+ vmovdqu xmm3, OWORD PTR [rdx+176]
+ vaesenc xmm0, xmm0, xmm3
+ cmp r8d, 13 ; same pattern: compare now, branch after the load
+ vmovdqu xmm2, OWORD PTR [rdx+192] ; last round key when r8d < 13, otherwise an ordinary round key
+ jl L_AES_XTS_init_vaes_tweak_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm2
+ vmovdqu xmm3, OWORD PTR [rdx+208]
+ vaesenc xmm0, xmm0, xmm3
+ vmovdqu xmm2, OWORD PTR [rdx+224] ; last round key for the longest schedule
+L_AES_XTS_init_vaes_tweak_aes_enc_block_last:
+ vaesenclast xmm0, xmm0, xmm2 ; final round with whichever key was left in xmm2
+ vmovdqu OWORD PTR [rcx], xmm0 ; overwrite the input block with the result
+ ret
+AES_XTS_init_vaes ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16 ; 128-bit constant follows
+L_vaes_aes_xts_gc_xts DWORD \
+ 00000087h, 00000000h, 00000001h, 00000000h ; dwords {87h, 0, 1, 0}; loaded into xmm12 and ANDed with a sign mask in the one-block tweak update
+ptr_L_vaes_aes_xts_gc_xts QWORD L_vaes_aes_xts_gc_xts ; holds the ADDRESS of the constant above, not its value
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16 ; 128-bit constant follows
+L_vaes_aes_xts_poly DWORD \
+ 00000087h, 00000000h, 00000000h, 00000000h ; low qword 87h, high qword 0; presumably the XTS reduction constant for the vpclmulqdq tweak update - confirm against its users
+ptr_L_vaes_aes_xts_poly QWORD L_vaes_aes_xts_poly ; holds the ADDRESS of the constant above, not its value
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16 ; 256-bit constant follows
+L_vaes_aes_xts_shl DWORD \
+ 00000000h, 00000000h, 00000000h, 00000000h,
+ 00000001h, 00000000h, 00000001h, 00000000h ; as qwords {0, 0, 1, 1}: vpsllvq counts (via ymm14) - low lane unshifted, high lane shifted left by 1
+ptr_L_vaes_aes_xts_shl QWORD L_vaes_aes_xts_shl ; holds the ADDRESS of the constant above, not its value
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16 ; 256-bit constant follows
+L_vaes_aes_xts_shr DWORD \
+ 00000040h, 00000000h, 00000040h, 00000000h,
+ 0000003fh, 00000000h, 0000003fh, 00000000h ; as qwords {64, 64, 63, 63}: vpsrlvq counts (via ymm15) - low lane cleared, high lane keeps each qword's top bit
+ptr_L_vaes_aes_xts_shr QWORD L_vaes_aes_xts_shr ; holds the ADDRESS of the constant above, not its value
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+AES_XTS_encrypt_vaes PROC ; AES-XTS encrypt. rcx = in, rdx = out, r8 = size in bytes, r9 = 16-byte tweak input; stack: data round keys, tweak round keys, round count
+ push rdi ; rdi, rsi, r12, r13 are non-volatile in the Microsoft x64 convention
+ push rsi
+ push r12
+ push r13
+ mov rdi, rcx ; rdi = input pointer
+ mov rsi, rdx ; rsi = output pointer
+ mov rax, r8 ; eax = byte count
+ mov r12, r9 ; r12 = pointer to the tweak input block
+ mov r8, QWORD PTR [rsp+72] ; 5th argument: round keys applied to the data blocks
+ mov r9, QWORD PTR [rsp+80] ; 6th argument: round keys used to encrypt the tweak
+ mov r10d, DWORD PTR [rsp+88] ; 7th argument: round count (compared against 11 and 13 below)
+ sub rsp, 224 ; [rsp, rsp+64) scratch, [rsp+64, rsp+224) save area for xmm6-xmm15
+ vmovdqu OWORD PTR [rsp+64], xmm6
+ vmovdqu OWORD PTR [rsp+80], xmm7
+ vmovdqu OWORD PTR [rsp+96], xmm8
+ vmovdqu OWORD PTR [rsp+112], xmm9
+ vmovdqu OWORD PTR [rsp+128], xmm10
+ vmovdqu OWORD PTR [rsp+144], xmm11
+ vmovdqu OWORD PTR [rsp+160], xmm12
+ vmovdqu OWORD PTR [rsp+176], xmm13
+ vmovdqu OWORD PTR [rsp+192], xmm14
+ vmovdqu OWORD PTR [rsp+208], xmm15
+ vmovdqu xmm12, OWORD PTR L_vaes_aes_xts_gc_xts ; xmm12 = mask constant for the one-block tweak doubling
+ vbroadcasti128 ymm13, OWORD PTR L_vaes_aes_xts_poly ; broadcast the 128-bit constant itself (low qword 87h); the ptr_ symbol only stores its address
+ vmovdqu ymm14, YMMWORD PTR L_vaes_aes_xts_shl ; per-qword left-shift counts {0, 0, 1, 1}
+ vmovdqu ymm15, YMMWORD PTR L_vaes_aes_xts_shr ; per-qword right-shift counts {64, 64, 63, 63}
+ vmovdqu xmm8, OWORD PTR [r12] ; xmm8 = tweak input, encrypted below with the keys at r9
+ ; aes_enc_block
+ vpxor xmm8, xmm8, [r9]
+ vmovdqu xmm5, OWORD PTR [r9+16]
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+32]
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+48]
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+64]
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+80]
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+96]
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+112]
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+128]
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+144]
+ vaesenc xmm8, xmm8, xmm5
+ cmp r10d, 11 ; flags survive the following load and feed the jl
+ vmovdqu xmm5, OWORD PTR [r9+160]
+ jl L_AES_XTS_encrypt_vaes_tweak_aes_enc_block_last
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm6, OWORD PTR [r9+176]
+ vaesenc xmm8, xmm8, xmm6
+ cmp r10d, 13
+ vmovdqu xmm5, OWORD PTR [r9+192]
+ jl L_AES_XTS_encrypt_vaes_tweak_aes_enc_block_last
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm6, OWORD PTR [r9+208]
+ vaesenc xmm8, xmm8, xmm6
+ vmovdqu xmm5, OWORD PTR [r9+224]
+L_AES_XTS_encrypt_vaes_tweak_aes_enc_block_last:
+ vaesenclast xmm8, xmm8, xmm5 ; xmm8 = encrypted tweak T for the first block
+ xor r13d, r13d ; r13 = byte offset of the next unprocessed block
+ cmp eax, 32
+ jl L_AES_XTS_encrypt_vaes_done_128
+ cmp eax, 128
+ mov r11d, eax
+ jl L_AES_XTS_encrypt_vaes_done_128
+ and r11d, 4294967168 ; r11d = size rounded down to a multiple of 128 (mask 0FFFFFF80h)
+ vperm2i128 ymm5, ymm8, ymm8, 0 ; both 128-bit lanes = T
+ vpsrlvq ymm6, ymm5, ymm15 ; low lane -> 0, high lane -> top bit of each qword of T
+ vpclmulqdq ymm7, ymm6, ymm13, 1 ; high-qword carry times the low qword of ymm13 (reduction term)
+ vpslldq ymm6, ymm6, 8 ; low-qword carry moves up into the high qword
+ vpsllvq ymm4, ymm5, ymm14 ; low lane unshifted, high lane shifted left by 1
+ vpxor ymm4, ymm4, ymm7
+ vpxor ymm4, ymm4, ymm6 ; ymm4 = tweaks for blocks 0 and 1
+ vpsrlq ymm9, ymm4, 62 ; bits shifted out by the 2-bit left shift below
+ vpclmulqdq ymm10, ymm9, ymm13, 1
+ vpslldq ymm9, ymm9, 8
+ vpsllq ymm5, ymm4, 2
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm5, ymm5, ymm9 ; ymm5 = tweaks for blocks 2 and 3
+ vpsrlq ymm9, ymm5, 62
+ vpclmulqdq ymm10, ymm9, ymm13, 1
+ vpslldq ymm9, ymm9, 8
+ vpsllq ymm6, ymm5, 2
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm6, ymm6, ymm9 ; ymm6 = tweaks for blocks 4 and 5
+ vpsrlq ymm9, ymm6, 62
+ vpclmulqdq ymm10, ymm9, ymm13, 1
+ vpslldq ymm9, ymm9, 8
+ vpsllq ymm7, ymm6, 2
+ vpxor ymm7, ymm7, ymm10
+ vpxor ymm7, ymm7, ymm9 ; ymm7 = tweaks for blocks 6 and 7
+L_AES_XTS_encrypt_vaes_enc_128:
+ ; 128 bytes of input
+ ; aes_enc_128
+ lea rcx, QWORD PTR [rdi+r13]
+ lea rdx, QWORD PTR [rsi+r13]
+ vmovdqu ymm0, YMMWORD PTR [rcx]
+ vmovdqu ymm1, YMMWORD PTR [rcx+32]
+ vmovdqu ymm2, YMMWORD PTR [rcx+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+96]
+ ; aes_enc_block
+ vbroadcasti128 ymm9, [r8] ; round key 0 in both lanes
+ vpxor ymm0, ymm0, ymm4 ; XOR each block pair with its tweaks, then with round key 0
+ vpxor ymm0, ymm0, ymm9
+ vpxor ymm1, ymm1, ymm5
+ vpxor ymm1, ymm1, ymm9
+ vpxor ymm2, ymm2, ymm6
+ vpxor ymm2, ymm2, ymm9
+ vpxor ymm3, ymm3, ymm7
+ vpxor ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r8+16]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vaesenc ymm2, ymm2, ymm9
+ vaesenc ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r8+32]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vaesenc ymm2, ymm2, ymm9
+ vaesenc ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r8+48]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vaesenc ymm2, ymm2, ymm9
+ vaesenc ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r8+64]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vaesenc ymm2, ymm2, ymm9
+ vaesenc ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r8+80]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vaesenc ymm2, ymm2, ymm9
+ vaesenc ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r8+96]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vaesenc ymm2, ymm2, ymm9
+ vaesenc ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r8+112]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vaesenc ymm2, ymm2, ymm9
+ vaesenc ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r8+128]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vaesenc ymm2, ymm2, ymm9
+ vaesenc ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r8+144]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vaesenc ymm2, ymm2, ymm9
+ vaesenc ymm3, ymm3, ymm9
+ cmp r10d, 11
+ vbroadcasti128 ymm9, [r8+160]
+ jl L_AES_XTS_encrypt_vaes_aes_enc_128_aes_enc_block_last
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vaesenc ymm2, ymm2, ymm9
+ vaesenc ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r8+176]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vaesenc ymm2, ymm2, ymm9
+ vaesenc ymm3, ymm3, ymm9
+ cmp r10d, 13
+ vbroadcasti128 ymm9, [r8+192]
+ jl L_AES_XTS_encrypt_vaes_aes_enc_128_aes_enc_block_last
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vaesenc ymm2, ymm2, ymm9
+ vaesenc ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r8+208]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vaesenc ymm2, ymm2, ymm9
+ vaesenc ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r8+224]
+L_AES_XTS_encrypt_vaes_aes_enc_128_aes_enc_block_last:
+ vaesenclast ymm0, ymm0, ymm9
+ vaesenclast ymm1, ymm1, ymm9
+ vaesenclast ymm2, ymm2, ymm9
+ vaesenclast ymm3, ymm3, ymm9
+ vpxor ymm0, ymm0, ymm4 ; second tweak XOR, then store blocks 0-1
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ vpsrlq ymm9, ymm4, 56 ; advance ymm4 by 8 blocks: 8-bit left shift with reduction
+ vpclmulqdq ymm10, ymm9, ymm13, 1
+ vpslldq ymm9, ymm9, 8
+ vpsllq ymm4, ymm4, 8
+ vpxor ymm4, ymm4, ymm10
+ vpxor ymm4, ymm4, ymm9
+ vpxor ymm1, ymm1, ymm5
+ vmovdqu YMMWORD PTR [rdx+32], ymm1
+ vpsrlq ymm9, ymm5, 56
+ vpclmulqdq ymm10, ymm9, ymm13, 1
+ vpslldq ymm9, ymm9, 8
+ vpsllq ymm5, ymm5, 8
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm2, ymm2, ymm6
+ vmovdqu YMMWORD PTR [rdx+64], ymm2
+ vpsrlq ymm9, ymm6, 56
+ vpclmulqdq ymm10, ymm9, ymm13, 1
+ vpslldq ymm9, ymm9, 8
+ vpsllq ymm6, ymm6, 8
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm6, ymm6, ymm9
+ vpxor ymm3, ymm3, ymm7
+ vmovdqu YMMWORD PTR [rdx+96], ymm3
+ vpsrlq ymm9, ymm7, 56
+ vpclmulqdq ymm10, ymm9, ymm13, 1
+ vpslldq ymm9, ymm9, 8
+ vpsllq ymm7, ymm7, 8
+ vpxor ymm7, ymm7, ymm10
+ vpxor ymm7, ymm7, ymm9
+ add r13d, 128
+ cmp r13d, r11d
+ jl L_AES_XTS_encrypt_vaes_enc_128
+ vextracti128 xmm8, ymm4, 0 ; xmm8 = tweak for the first block not yet processed
+L_AES_XTS_encrypt_vaes_done_128:
+ mov r11d, eax
+ and r11d, 4294967232 ; size rounded down to a multiple of 64 (mask 0FFFFFFC0h)
+ cmp r13d, r11d
+ je L_AES_XTS_encrypt_vaes_done_64 ; no whole 64-byte chunk left
+ ; 64 bytes of input
+ ; aes_enc_64
+ lea rcx, QWORD PTR [rdi+r13]
+ lea rdx, QWORD PTR [rsi+r13]
+ vmovdqu ymm0, YMMWORD PTR [rcx]
+ vmovdqu ymm1, YMMWORD PTR [rcx+32]
+ vperm2i128 ymm5, ymm8, ymm8, 0 ; rebuild the tweak pairs from xmm8 as in the 128-byte setup
+ vpsrlvq ymm6, ymm5, ymm15
+ vpclmulqdq ymm7, ymm6, ymm13, 1
+ vpslldq ymm6, ymm6, 8
+ vpsllvq ymm4, ymm5, ymm14
+ vpxor ymm4, ymm4, ymm7
+ vpxor ymm4, ymm4, ymm6 ; ymm4 = tweaks for blocks 0 and 1 of this chunk
+ vpsrlq ymm9, ymm4, 62
+ vpclmulqdq ymm10, ymm9, ymm13, 1
+ vpslldq ymm9, ymm9, 8
+ vpsllq ymm5, ymm4, 2
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm5, ymm5, ymm9 ; ymm5 = tweaks for blocks 2 and 3 of this chunk
+ ; aes_enc_block
+ vbroadcasti128 ymm9, [r8]
+ vpxor ymm0, ymm0, ymm4
+ vpxor ymm0, ymm0, ymm9
+ vpxor ymm1, ymm1, ymm5
+ vpxor ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r8+16]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r8+32]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r8+48]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r8+64]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r8+80]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r8+96]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r8+112]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r8+128]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r8+144]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ cmp r10d, 11
+ vbroadcasti128 ymm9, [r8+160]
+ jl L_AES_XTS_encrypt_vaes_aes_enc_64_aes_enc_block_last
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r8+176]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ cmp r10d, 13
+ vbroadcasti128 ymm9, [r8+192]
+ jl L_AES_XTS_encrypt_vaes_aes_enc_64_aes_enc_block_last
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r8+208]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r8+224]
+L_AES_XTS_encrypt_vaes_aes_enc_64_aes_enc_block_last:
+ vaesenclast ymm0, ymm0, ymm9
+ vaesenclast ymm1, ymm1, ymm9
+ vpxor ymm0, ymm0, ymm4
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ vpxor ymm1, ymm1, ymm5
+ vmovdqu YMMWORD PTR [rdx+32], ymm1
+ vextracti128 xmm8, ymm5, 1 ; tweak of the last block just processed
+ vpshufd xmm9, xmm8, 19 ; step it once: arrange the dwords whose sign bits are the carries
+ vpaddq xmm8, xmm8, xmm8 ; double each qword
+ vpsrad xmm9, xmm9, 31 ; sign bits -> all-ones / all-zero masks
+ vpand xmm9, xmm9, xmm12 ; keep 87h (reduction) and 1 (carry into the high qword)
+ vpxor xmm8, xmm8, xmm9 ; xmm8 = tweak for the next block
+ add r13d, 64
+L_AES_XTS_encrypt_vaes_done_64:
+ mov r11d, eax
+ and r11d, 4294967264 ; size rounded down to a multiple of 32 (mask 0FFFFFFE0h)
+ cmp r13d, r11d
+ je L_AES_XTS_encrypt_vaes_done_32 ; no whole 32-byte chunk left
+ ; 32 bytes of input
+ ; aes_enc_32
+ lea rcx, QWORD PTR [rdi+r13]
+ lea rdx, QWORD PTR [rsi+r13]
+ vmovdqu ymm0, YMMWORD PTR [rcx]
+ vperm2i128 ymm5, ymm8, ymm8, 0
+ vpsrlvq ymm6, ymm5, ymm15
+ vpclmulqdq ymm7, ymm6, ymm13, 1
+ vpslldq ymm6, ymm6, 8
+ vpsllvq ymm4, ymm5, ymm14
+ vpxor ymm4, ymm4, ymm7
+ vpxor ymm4, ymm4, ymm6 ; ymm4 = tweaks for the two blocks of this chunk
+ ; aes_enc_block
+ vbroadcasti128 ymm9, [r8]
+ vpxor ymm0, ymm0, ymm4
+ vpxor ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r8+16]
+ vaesenc ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r8+32]
+ vaesenc ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r8+48]
+ vaesenc ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r8+64]
+ vaesenc ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r8+80]
+ vaesenc ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r8+96]
+ vaesenc ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r8+112]
+ vaesenc ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r8+128]
+ vaesenc ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r8+144]
+ vaesenc ymm0, ymm0, ymm9
+ cmp r10d, 11
+ vbroadcasti128 ymm9, [r8+160]
+ jl L_AES_XTS_encrypt_vaes_aes_enc_32_aes_enc_block_last
+ vaesenc ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r8+176]
+ vaesenc ymm0, ymm0, ymm9
+ cmp r10d, 13
+ vbroadcasti128 ymm9, [r8+192]
+ jl L_AES_XTS_encrypt_vaes_aes_enc_32_aes_enc_block_last
+ vaesenc ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r8+208]
+ vaesenc ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r8+224]
+L_AES_XTS_encrypt_vaes_aes_enc_32_aes_enc_block_last:
+ vaesenclast ymm0, ymm0, ymm9
+ vpxor ymm0, ymm0, ymm4
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ vextracti128 xmm8, ymm4, 1 ; tweak of the last block just processed, stepped once below
+ vpshufd xmm9, xmm8, 19
+ vpaddq xmm8, xmm8, xmm8
+ vpsrad xmm9, xmm9, 31
+ vpand xmm9, xmm9, xmm12
+ vpxor xmm8, xmm8, xmm9
+ add r13d, 32
+L_AES_XTS_encrypt_vaes_done_32:
+ cmp r13d, eax
+ mov r11d, eax
+ je L_AES_XTS_encrypt_vaes_done_enc ; everything consumed
+ sub r11d, r13d ; bytes remaining
+ cmp r11d, 16
+ mov r11d, eax
+ jl L_AES_XTS_encrypt_vaes_last_15 ; only a partial block remains
+ and r11d, 4294967280 ; size rounded down to a multiple of 16 (mask 0FFFFFFF0h)
+ ; 16 bytes of input
+L_AES_XTS_encrypt_vaes_enc_16:
+ lea rcx, QWORD PTR [rdi+r13]
+ vmovdqu xmm0, OWORD PTR [rcx]
+ vpxor xmm0, xmm0, xmm8
+ ; aes_enc_block
+ vpxor xmm0, xmm0, [r8]
+ vmovdqu xmm5, OWORD PTR [r8+16]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+32]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+48]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+64]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+80]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+96]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+112]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+128]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+144]
+ vaesenc xmm0, xmm0, xmm5
+ cmp r10d, 11
+ vmovdqu xmm5, OWORD PTR [r8+160]
+ jl L_AES_XTS_encrypt_vaes_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r8+176]
+ vaesenc xmm0, xmm0, xmm6
+ cmp r10d, 13
+ vmovdqu xmm5, OWORD PTR [r8+192]
+ jl L_AES_XTS_encrypt_vaes_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r8+208]
+ vaesenc xmm0, xmm0, xmm6
+ vmovdqu xmm5, OWORD PTR [r8+224]
+L_AES_XTS_encrypt_vaes_aes_enc_block_last:
+ vaesenclast xmm0, xmm0, xmm5
+ vpxor xmm0, xmm0, xmm8
+ lea rcx, QWORD PTR [rsi+r13]
+ vmovdqu OWORD PTR [rcx], xmm0
+ vpshufd xmm4, xmm8, 19 ; step the tweak once for the next block
+ vpaddq xmm8, xmm8, xmm8
+ vpsrad xmm4, xmm4, 31
+ vpand xmm4, xmm4, xmm12
+ vpxor xmm8, xmm8, xmm4
+ add r13d, 16
+ cmp r13d, r11d
+ jl L_AES_XTS_encrypt_vaes_enc_16
+ cmp r13d, eax
+ je L_AES_XTS_encrypt_vaes_done_enc
+L_AES_XTS_encrypt_vaes_last_15:
+ sub r13, 16 ; step back to the previous output block; reads out[r13-16], so at least one full block must already be written
+ lea rcx, QWORD PTR [rsi+r13]
+ vmovdqu xmm0, OWORD PTR [rcx]
+ add r13, 16
+ vmovdqu OWORD PTR [rsp], xmm0 ; scratch copy of the previous output block
+ xor rdx, rdx ; rdx = index into the scratch block
+L_AES_XTS_encrypt_vaes_last_15_byte_loop:
+ mov r11b, BYTE PTR [rsp+rdx] ; byte of the previous output block ...
+ mov cl, BYTE PTR [rdi+r13] ; ... and the matching trailing input byte
+ mov BYTE PTR [rsi+r13], r11b ; previous-output byte becomes the trailing output byte
+ mov BYTE PTR [rsp+rdx], cl ; trailing input byte replaces it in the scratch block
+ inc r13d
+ inc edx
+ cmp r13d, eax
+ jl L_AES_XTS_encrypt_vaes_last_15_byte_loop
+ sub r13, rdx ; rewind past the trailing bytes ...
+ vmovdqu xmm0, OWORD PTR [rsp]
+ sub r13, 16 ; ... and back onto the previous block's offset
+ vpxor xmm0, xmm0, xmm8
+ ; aes_enc_block
+ vpxor xmm0, xmm0, [r8]
+ vmovdqu xmm5, OWORD PTR [r8+16]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+32]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+48]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+64]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+80]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+96]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+112]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+128]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+144]
+ vaesenc xmm0, xmm0, xmm5
+ cmp r10d, 11
+ vmovdqu xmm5, OWORD PTR [r8+160]
+ jl L_AES_XTS_encrypt_vaes_last_15_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r8+176]
+ vaesenc xmm0, xmm0, xmm6
+ cmp r10d, 13
+ vmovdqu xmm5, OWORD PTR [r8+192]
+ jl L_AES_XTS_encrypt_vaes_last_15_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r8+208]
+ vaesenc xmm0, xmm0, xmm6
+ vmovdqu xmm5, OWORD PTR [r8+224]
+L_AES_XTS_encrypt_vaes_last_15_aes_enc_block_last:
+ vaesenclast xmm0, xmm0, xmm5
+ vpxor xmm0, xmm0, xmm8
+ lea rcx, QWORD PTR [rsi+r13]
+ vmovdqu OWORD PTR [rcx], xmm0 ; overwrite the previous output block with the re-encrypted block
+L_AES_XTS_encrypt_vaes_done_enc:
+ vmovdqu xmm6, OWORD PTR [rsp+64] ; restore xmm6-xmm15 saved in the prologue
+ vmovdqu xmm7, OWORD PTR [rsp+80]
+ vmovdqu xmm8, OWORD PTR [rsp+96]
+ vmovdqu xmm9, OWORD PTR [rsp+112]
+ vmovdqu xmm10, OWORD PTR [rsp+128]
+ vmovdqu xmm11, OWORD PTR [rsp+144]
+ vmovdqu xmm12, OWORD PTR [rsp+160]
+ vmovdqu xmm13, OWORD PTR [rsp+176]
+ vmovdqu xmm14, OWORD PTR [rsp+192]
+ vmovdqu xmm15, OWORD PTR [rsp+208]
+ add rsp, 224
+ pop r13
+ pop r12
+ pop rsi
+ pop rdi
+ ret
+AES_XTS_encrypt_vaes ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_XTS_encrypt_update_vaes PROC ; AES-XTS encrypt with a caller-held tweak. rcx = in, rdx = out, r8 = size in bytes, r9 = data round keys; stack: tweak block (read and written back), round count
+ push rdi ; rdi, rsi, r12 are non-volatile in the Microsoft x64 convention
+ push rsi
+ push r12
+ mov rdi, rcx ; rdi = input pointer
+ mov rsi, rdx ; rsi = output pointer
+ mov rax, r8 ; eax = byte count
+ mov r10, r9 ; r10 = round keys applied to the data blocks
+ mov r8, QWORD PTR [rsp+64] ; 5th argument: pointer to the current tweak (used as-is, not encrypted here)
+ mov r9d, DWORD PTR [rsp+72] ; 6th argument: round count (compared against 11 and 13 below)
+ sub rsp, 224 ; [rsp, rsp+64) scratch, [rsp+64, rsp+224) save area for xmm6-xmm15
+ vmovdqu OWORD PTR [rsp+64], xmm6
+ vmovdqu OWORD PTR [rsp+80], xmm7
+ vmovdqu OWORD PTR [rsp+96], xmm8
+ vmovdqu OWORD PTR [rsp+112], xmm9
+ vmovdqu OWORD PTR [rsp+128], xmm10
+ vmovdqu OWORD PTR [rsp+144], xmm11
+ vmovdqu OWORD PTR [rsp+160], xmm12
+ vmovdqu OWORD PTR [rsp+176], xmm13
+ vmovdqu OWORD PTR [rsp+192], xmm14
+ vmovdqu OWORD PTR [rsp+208], xmm15
+ vmovdqu xmm12, OWORD PTR L_vaes_aes_xts_gc_xts ; xmm12 = mask constant for the one-block tweak doubling
+ vbroadcasti128 ymm13, OWORD PTR L_vaes_aes_xts_poly ; broadcast the 128-bit constant itself (low qword 87h); the ptr_ symbol only stores its address
+ vmovdqu ymm14, YMMWORD PTR L_vaes_aes_xts_shl ; per-qword left-shift counts {0, 0, 1, 1}
+ vmovdqu ymm15, YMMWORD PTR L_vaes_aes_xts_shr ; per-qword right-shift counts {64, 64, 63, 63}
+ vmovdqu xmm8, OWORD PTR [r8] ; xmm8 = tweak T for the first block
+ xor r12d, r12d ; r12 = byte offset of the next unprocessed block
+ cmp eax, 32
+ jl L_AES_XTS_encrypt_update_vaes_done_128
+ cmp eax, 128
+ mov r11d, eax
+ jl L_AES_XTS_encrypt_update_vaes_done_128
+ and r11d, 4294967168 ; r11d = size rounded down to a multiple of 128 (mask 0FFFFFF80h)
+ vperm2i128 ymm5, ymm8, ymm8, 0 ; both 128-bit lanes = T
+ vpsrlvq ymm6, ymm5, ymm15 ; low lane -> 0, high lane -> top bit of each qword of T
+ vpclmulqdq ymm7, ymm6, ymm13, 1 ; high-qword carry times the low qword of ymm13 (reduction term)
+ vpslldq ymm6, ymm6, 8 ; low-qword carry moves up into the high qword
+ vpsllvq ymm4, ymm5, ymm14 ; low lane unshifted, high lane shifted left by 1
+ vpxor ymm4, ymm4, ymm7
+ vpxor ymm4, ymm4, ymm6 ; ymm4 = tweaks for blocks 0 and 1
+ vpsrlq ymm9, ymm4, 62 ; bits shifted out by the 2-bit left shift below
+ vpclmulqdq ymm10, ymm9, ymm13, 1
+ vpslldq ymm9, ymm9, 8
+ vpsllq ymm5, ymm4, 2
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm5, ymm5, ymm9 ; ymm5 = tweaks for blocks 2 and 3
+ vpsrlq ymm9, ymm5, 62
+ vpclmulqdq ymm10, ymm9, ymm13, 1
+ vpslldq ymm9, ymm9, 8
+ vpsllq ymm6, ymm5, 2
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm6, ymm6, ymm9 ; ymm6 = tweaks for blocks 4 and 5
+ vpsrlq ymm9, ymm6, 62
+ vpclmulqdq ymm10, ymm9, ymm13, 1
+ vpslldq ymm9, ymm9, 8
+ vpsllq ymm7, ymm6, 2
+ vpxor ymm7, ymm7, ymm10
+ vpxor ymm7, ymm7, ymm9 ; ymm7 = tweaks for blocks 6 and 7
+L_AES_XTS_encrypt_update_vaes_enc_128:
+ ; 128 bytes of input
+ ; aes_enc_128
+ lea rcx, QWORD PTR [rdi+r12]
+ lea rdx, QWORD PTR [rsi+r12]
+ vmovdqu ymm0, YMMWORD PTR [rcx]
+ vmovdqu ymm1, YMMWORD PTR [rcx+32]
+ vmovdqu ymm2, YMMWORD PTR [rcx+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+96]
+ ; aes_enc_block
+ vbroadcasti128 ymm9, [r10] ; round key 0 in both lanes
+ vpxor ymm0, ymm0, ymm4 ; XOR each block pair with its tweaks, then with round key 0
+ vpxor ymm0, ymm0, ymm9
+ vpxor ymm1, ymm1, ymm5
+ vpxor ymm1, ymm1, ymm9
+ vpxor ymm2, ymm2, ymm6
+ vpxor ymm2, ymm2, ymm9
+ vpxor ymm3, ymm3, ymm7
+ vpxor ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r10+16]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vaesenc ymm2, ymm2, ymm9
+ vaesenc ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r10+32]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vaesenc ymm2, ymm2, ymm9
+ vaesenc ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r10+48]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vaesenc ymm2, ymm2, ymm9
+ vaesenc ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r10+64]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vaesenc ymm2, ymm2, ymm9
+ vaesenc ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r10+80]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vaesenc ymm2, ymm2, ymm9
+ vaesenc ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r10+96]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vaesenc ymm2, ymm2, ymm9
+ vaesenc ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r10+112]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vaesenc ymm2, ymm2, ymm9
+ vaesenc ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r10+128]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vaesenc ymm2, ymm2, ymm9
+ vaesenc ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r10+144]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vaesenc ymm2, ymm2, ymm9
+ vaesenc ymm3, ymm3, ymm9
+ cmp r9d, 11 ; flags survive the following load and feed the jl
+ vbroadcasti128 ymm9, [r10+160]
+ jl L_AES_XTS_encrypt_update_vaes_aes_enc_128_aes_enc_block_last
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vaesenc ymm2, ymm2, ymm9
+ vaesenc ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r10+176]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vaesenc ymm2, ymm2, ymm9
+ vaesenc ymm3, ymm3, ymm9
+ cmp r9d, 13
+ vbroadcasti128 ymm9, [r10+192]
+ jl L_AES_XTS_encrypt_update_vaes_aes_enc_128_aes_enc_block_last
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vaesenc ymm2, ymm2, ymm9
+ vaesenc ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r10+208]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vaesenc ymm2, ymm2, ymm9
+ vaesenc ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r10+224]
+L_AES_XTS_encrypt_update_vaes_aes_enc_128_aes_enc_block_last:
+ vaesenclast ymm0, ymm0, ymm9
+ vaesenclast ymm1, ymm1, ymm9
+ vaesenclast ymm2, ymm2, ymm9
+ vaesenclast ymm3, ymm3, ymm9
+ vpxor ymm0, ymm0, ymm4 ; second tweak XOR, then store blocks 0-1
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ vpsrlq ymm9, ymm4, 56 ; advance ymm4 by 8 blocks: 8-bit left shift with reduction
+ vpclmulqdq ymm10, ymm9, ymm13, 1
+ vpslldq ymm9, ymm9, 8
+ vpsllq ymm4, ymm4, 8
+ vpxor ymm4, ymm4, ymm10
+ vpxor ymm4, ymm4, ymm9
+ vpxor ymm1, ymm1, ymm5
+ vmovdqu YMMWORD PTR [rdx+32], ymm1
+ vpsrlq ymm9, ymm5, 56
+ vpclmulqdq ymm10, ymm9, ymm13, 1
+ vpslldq ymm9, ymm9, 8
+ vpsllq ymm5, ymm5, 8
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm2, ymm2, ymm6
+ vmovdqu YMMWORD PTR [rdx+64], ymm2
+ vpsrlq ymm9, ymm6, 56
+ vpclmulqdq ymm10, ymm9, ymm13, 1
+ vpslldq ymm9, ymm9, 8
+ vpsllq ymm6, ymm6, 8
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm6, ymm6, ymm9
+ vpxor ymm3, ymm3, ymm7
+ vmovdqu YMMWORD PTR [rdx+96], ymm3
+ vpsrlq ymm9, ymm7, 56
+ vpclmulqdq ymm10, ymm9, ymm13, 1
+ vpslldq ymm9, ymm9, 8
+ vpsllq ymm7, ymm7, 8
+ vpxor ymm7, ymm7, ymm10
+ vpxor ymm7, ymm7, ymm9
+ add r12d, 128
+ cmp r12d, r11d
+ jl L_AES_XTS_encrypt_update_vaes_enc_128
+ vextracti128 xmm8, ymm4, 0 ; xmm8 = tweak for the first block not yet processed
+L_AES_XTS_encrypt_update_vaes_done_128:
+ mov r11d, eax
+ and r11d, 4294967232 ; size rounded down to a multiple of 64 (mask 0FFFFFFC0h)
+ cmp r12d, r11d
+ je L_AES_XTS_encrypt_update_vaes_done_64 ; no whole 64-byte chunk left
+ ; 64 bytes of input
+ ; aes_enc_64
+ lea rcx, QWORD PTR [rdi+r12]
+ lea rdx, QWORD PTR [rsi+r12]
+ vmovdqu ymm0, YMMWORD PTR [rcx]
+ vmovdqu ymm1, YMMWORD PTR [rcx+32]
+ vperm2i128 ymm5, ymm8, ymm8, 0 ; rebuild the tweak pairs from xmm8 as in the 128-byte setup
+ vpsrlvq ymm6, ymm5, ymm15
+ vpclmulqdq ymm7, ymm6, ymm13, 1
+ vpslldq ymm6, ymm6, 8
+ vpsllvq ymm4, ymm5, ymm14
+ vpxor ymm4, ymm4, ymm7
+ vpxor ymm4, ymm4, ymm6 ; ymm4 = tweaks for blocks 0 and 1 of this chunk
+ vpsrlq ymm9, ymm4, 62
+ vpclmulqdq ymm10, ymm9, ymm13, 1
+ vpslldq ymm9, ymm9, 8
+ vpsllq ymm5, ymm4, 2
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm5, ymm5, ymm9 ; ymm5 = tweaks for blocks 2 and 3 of this chunk
+ ; aes_enc_block
+ vbroadcasti128 ymm9, [r10]
+ vpxor ymm0, ymm0, ymm4
+ vpxor ymm0, ymm0, ymm9
+ vpxor ymm1, ymm1, ymm5
+ vpxor ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r10+16]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r10+32]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r10+48]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r10+64]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r10+80]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r10+96]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r10+112]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r10+128]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r10+144]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ cmp r9d, 11
+ vbroadcasti128 ymm9, [r10+160]
+ jl L_AES_XTS_encrypt_update_vaes_aes_enc_64_aes_enc_block_last
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r10+176]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ cmp r9d, 13
+ vbroadcasti128 ymm9, [r10+192]
+ jl L_AES_XTS_encrypt_update_vaes_aes_enc_64_aes_enc_block_last
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r10+208]
+ vaesenc ymm0, ymm0, ymm9
+ vaesenc ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r10+224]
+L_AES_XTS_encrypt_update_vaes_aes_enc_64_aes_enc_block_last:
+ vaesenclast ymm0, ymm0, ymm9
+ vaesenclast ymm1, ymm1, ymm9
+ vpxor ymm0, ymm0, ymm4
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ vpxor ymm1, ymm1, ymm5
+ vmovdqu YMMWORD PTR [rdx+32], ymm1
+ vextracti128 xmm8, ymm5, 1 ; tweak of the last block just processed
+ vpshufd xmm9, xmm8, 19 ; step it once: arrange the dwords whose sign bits are the carries
+ vpaddq xmm8, xmm8, xmm8 ; double each qword
+ vpsrad xmm9, xmm9, 31 ; sign bits -> all-ones / all-zero masks
+ vpand xmm9, xmm9, xmm12 ; keep 87h (reduction) and 1 (carry into the high qword)
+ vpxor xmm8, xmm8, xmm9 ; xmm8 = tweak for the next block
+ add r12d, 64
+L_AES_XTS_encrypt_update_vaes_done_64:
+ mov r11d, eax
+ and r11d, 4294967264 ; size rounded down to a multiple of 32 (mask 0FFFFFFE0h)
+ cmp r12d, r11d
+ je L_AES_XTS_encrypt_update_vaes_done_32 ; no whole 32-byte chunk left
+ ; 32 bytes of input
+ ; aes_enc_32
+ lea rcx, QWORD PTR [rdi+r12]
+ lea rdx, QWORD PTR [rsi+r12]
+ vmovdqu ymm0, YMMWORD PTR [rcx]
+ vperm2i128 ymm5, ymm8, ymm8, 0
+ vpsrlvq ymm6, ymm5, ymm15
+ vpclmulqdq ymm7, ymm6, ymm13, 1
+ vpslldq ymm6, ymm6, 8
+ vpsllvq ymm4, ymm5, ymm14
+ vpxor ymm4, ymm4, ymm7
+ vpxor ymm4, ymm4, ymm6 ; ymm4 = tweaks for the two blocks of this chunk
+ ; aes_enc_block
+ vbroadcasti128 ymm9, [r10]
+ vpxor ymm0, ymm0, ymm4
+ vpxor ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r10+16]
+ vaesenc ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r10+32]
+ vaesenc ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r10+48]
+ vaesenc ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r10+64]
+ vaesenc ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r10+80]
+ vaesenc ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r10+96]
+ vaesenc ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r10+112]
+ vaesenc ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r10+128]
+ vaesenc ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r10+144]
+ vaesenc ymm0, ymm0, ymm9
+ cmp r9d, 11
+ vbroadcasti128 ymm9, [r10+160]
+ jl L_AES_XTS_encrypt_update_vaes_aes_enc_32_aes_enc_block_last
+ vaesenc ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r10+176]
+ vaesenc ymm0, ymm0, ymm9
+ cmp r9d, 13
+ vbroadcasti128 ymm9, [r10+192]
+ jl L_AES_XTS_encrypt_update_vaes_aes_enc_32_aes_enc_block_last
+ vaesenc ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r10+208]
+ vaesenc ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r10+224]
+L_AES_XTS_encrypt_update_vaes_aes_enc_32_aes_enc_block_last:
+ vaesenclast ymm0, ymm0, ymm9
+ vpxor ymm0, ymm0, ymm4
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ vextracti128 xmm8, ymm4, 1 ; tweak of the last block just processed, stepped once below
+ vpshufd xmm9, xmm8, 19
+ vpaddq xmm8, xmm8, xmm8
+ vpsrad xmm9, xmm9, 31
+ vpand xmm9, xmm9, xmm12
+ vpxor xmm8, xmm8, xmm9
+ add r12d, 32
+L_AES_XTS_encrypt_update_vaes_done_32:
+ cmp r12d, eax
+ mov r11d, eax
+ je L_AES_XTS_encrypt_update_vaes_done_enc ; everything consumed
+ sub r11d, r12d ; bytes remaining
+ cmp r11d, 16
+ mov r11d, eax
+ jl L_AES_XTS_encrypt_update_vaes_last_15 ; only a partial block remains
+ and r11d, 4294967280 ; size rounded down to a multiple of 16 (mask 0FFFFFFF0h)
+ ; 16 bytes of input
+L_AES_XTS_encrypt_update_vaes_enc_16:
+ lea rcx, QWORD PTR [rdi+r12]
+ vmovdqu xmm0, OWORD PTR [rcx]
+ vpxor xmm0, xmm0, xmm8
+ ; aes_enc_block
+ vpxor xmm0, xmm0, [r10]
+ vmovdqu xmm5, OWORD PTR [r10+16]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+32]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+48]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+64]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+80]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+96]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+112]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+128]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+144]
+ vaesenc xmm0, xmm0, xmm5
+ cmp r9d, 11
+ vmovdqu xmm5, OWORD PTR [r10+160]
+ jl L_AES_XTS_encrypt_update_vaes_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r10+176]
+ vaesenc xmm0, xmm0, xmm6
+ cmp r9d, 13
+ vmovdqu xmm5, OWORD PTR [r10+192]
+ jl L_AES_XTS_encrypt_update_vaes_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r10+208]
+ vaesenc xmm0, xmm0, xmm6
+ vmovdqu xmm5, OWORD PTR [r10+224]
+L_AES_XTS_encrypt_update_vaes_aes_enc_block_last:
+ vaesenclast xmm0, xmm0, xmm5
+ vpxor xmm0, xmm0, xmm8
+ lea rcx, QWORD PTR [rsi+r12]
+ vmovdqu OWORD PTR [rcx], xmm0
+ vpshufd xmm4, xmm8, 19 ; step the tweak once for the next block
+ vpaddq xmm8, xmm8, xmm8
+ vpsrad xmm4, xmm4, 31
+ vpand xmm4, xmm4, xmm12
+ vpxor xmm8, xmm8, xmm4
+ add r12d, 16
+ cmp r12d, r11d
+ jl L_AES_XTS_encrypt_update_vaes_enc_16
+ cmp r12d, eax
+ je L_AES_XTS_encrypt_update_vaes_done_enc
+L_AES_XTS_encrypt_update_vaes_last_15:
+ sub r12, 16 ; step back to the previous output block; reads out[r12-16], so at least one full block must already be written
+ lea rcx, QWORD PTR [rsi+r12]
+ vmovdqu xmm0, OWORD PTR [rcx]
+ add r12, 16
+ vmovdqu OWORD PTR [rsp], xmm0 ; scratch copy of the previous output block
+ xor rdx, rdx ; rdx = index into the scratch block
+L_AES_XTS_encrypt_update_vaes_last_15_byte_loop:
+ mov r11b, BYTE PTR [rsp+rdx] ; byte of the previous output block ...
+ mov cl, BYTE PTR [rdi+r12] ; ... and the matching trailing input byte
+ mov BYTE PTR [rsi+r12], r11b ; previous-output byte becomes the trailing output byte
+ mov BYTE PTR [rsp+rdx], cl ; trailing input byte replaces it in the scratch block
+ inc r12d
+ inc edx
+ cmp r12d, eax
+ jl L_AES_XTS_encrypt_update_vaes_last_15_byte_loop
+ sub r12, rdx ; rewind past the trailing bytes ...
+ vmovdqu xmm0, OWORD PTR [rsp]
+ sub r12, 16 ; ... and back onto the previous block's offset
+ vpxor xmm0, xmm0, xmm8
+ ; aes_enc_block
+ vpxor xmm0, xmm0, [r10]
+ vmovdqu xmm5, OWORD PTR [r10+16]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+32]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+48]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+64]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+80]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+96]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+112]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+128]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+144]
+ vaesenc xmm0, xmm0, xmm5
+ cmp r9d, 11
+ vmovdqu xmm5, OWORD PTR [r10+160]
+ jl L_AES_XTS_encrypt_update_vaes_last_15_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r10+176]
+ vaesenc xmm0, xmm0, xmm6
+ cmp r9d, 13
+ vmovdqu xmm5, OWORD PTR [r10+192]
+ jl L_AES_XTS_encrypt_update_vaes_last_15_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r10+208]
+ vaesenc xmm0, xmm0, xmm6
+ vmovdqu xmm5, OWORD PTR [r10+224]
+L_AES_XTS_encrypt_update_vaes_last_15_aes_enc_block_last:
+ vaesenclast xmm0, xmm0, xmm5
+ vpxor xmm0, xmm0, xmm8
+ lea rcx, QWORD PTR [rsi+r12]
+ vmovdqu OWORD PTR [rcx], xmm0 ; overwrite the previous output block with the re-encrypted block
+L_AES_XTS_encrypt_update_vaes_done_enc:
+ vmovdqu OWORD PTR [r8], xmm8 ; write the current tweak back through the caller's pointer
+ vmovdqu xmm6, OWORD PTR [rsp+64] ; restore xmm6-xmm15 saved in the prologue
+ vmovdqu xmm7, OWORD PTR [rsp+80]
+ vmovdqu xmm8, OWORD PTR [rsp+96]
+ vmovdqu xmm9, OWORD PTR [rsp+112]
+ vmovdqu xmm10, OWORD PTR [rsp+128]
+ vmovdqu xmm11, OWORD PTR [rsp+144]
+ vmovdqu xmm12, OWORD PTR [rsp+160]
+ vmovdqu xmm13, OWORD PTR [rsp+176]
+ vmovdqu xmm14, OWORD PTR [rsp+192]
+ vmovdqu xmm15, OWORD PTR [rsp+208]
+ add rsp, 224
+ pop r12
+ pop rsi
+ pop rdi
+ ret
+AES_XTS_encrypt_update_vaes ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_XTS_decrypt_vaes PROC
+ ; AES-XTS decryption of a buffer using 256-bit VAES / VPCLMULQDQ.
+ ; The initial tweak is AES-encrypted here before any data is processed.
+ ; Arguments, as consumed by the prologue below (register/stack layout
+ ; matches the Microsoft x64 convention):
+ ;   rcx      -> input buffer              (kept in rdi)
+ ;   rdx      -> output buffer             (kept in rsi)
+ ;   r8d      byte count                   (kept in eax, 32-bit)
+ ;   r9       -> 16-byte initial tweak     (kept in r12, read only)
+ ;   5th arg  -> round keys for vaesdec    (kept in r8)
+ ;   6th arg  -> round keys for vaesenc of the tweak (kept in r9)
+ ;   7th arg  round count                  (kept in r10d)
+ ; r13d is the running byte offset into both buffers; r11d is the
+ ; current loop limit. xmm8 always holds the tweak for the next block.
+ push rdi
+ push rsi
+ push r12
+ push r13
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rax, r8
+ mov r12, r9
+ ; Stack arguments: 4 pushes + return address + 32-byte home area = 72.
+ mov r8, QWORD PTR [rsp+72]
+ mov r9, QWORD PTR [rsp+80]
+ mov r10d, DWORD PTR [rsp+88]
+ ; 224 bytes: [rsp, rsp+64) is scratch (used by the partial-block tail),
+ ; [rsp+64, rsp+224) holds the caller's xmm6-xmm15.
+ sub rsp, 224
+ vmovdqu OWORD PTR [rsp+64], xmm6
+ vmovdqu OWORD PTR [rsp+80], xmm7
+ vmovdqu OWORD PTR [rsp+96], xmm8
+ vmovdqu OWORD PTR [rsp+112], xmm9
+ vmovdqu OWORD PTR [rsp+128], xmm10
+ vmovdqu OWORD PTR [rsp+144], xmm11
+ vmovdqu OWORD PTR [rsp+160], xmm12
+ vmovdqu OWORD PTR [rsp+176], xmm13
+ vmovdqu OWORD PTR [rsp+192], xmm14
+ vmovdqu OWORD PTR [rsp+208], xmm15
+ ; Constants: xmm12 = mask for the one-step tweak update, ymm13 = value
+ ; multiplied against shifted-out bits, ymm14/ymm15 = per-lane shift counts.
+ vmovdqu xmm12, OWORD PTR L_vaes_aes_xts_gc_xts
+ vbroadcasti128 ymm13, ptr_L_vaes_aes_xts_poly
+ vmovdqu ymm14, YMMWORD PTR L_vaes_aes_xts_shl
+ vmovdqu ymm15, YMMWORD PTR L_vaes_aes_xts_shr
+ ; Load the initial tweak and encrypt it with the key schedule at r9.
+ vmovdqu xmm8, OWORD PTR [r12]
+ ; aes_enc_block
+ vpxor xmm8, xmm8, [r9]
+ vmovdqu xmm5, OWORD PTR [r9+16]
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+32]
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+48]
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+64]
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+80]
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+96]
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+112]
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+128]
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+144]
+ vaesenc xmm8, xmm8, xmm5
+ ; Round count selects the tail: < 11 finishes after 10 rounds, < 13 after
+ ; 12, otherwise 14. The load between cmp and jl does not modify flags, so
+ ; xmm5 already holds the final round key when the branch is taken.
+ cmp r10d, 11
+ vmovdqu xmm5, OWORD PTR [r9+160]
+ jl L_AES_XTS_decrypt_vaes_tweak_aes_enc_block_last
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm6, OWORD PTR [r9+176]
+ vaesenc xmm8, xmm8, xmm6
+ cmp r10d, 13
+ vmovdqu xmm5, OWORD PTR [r9+192]
+ jl L_AES_XTS_decrypt_vaes_tweak_aes_enc_block_last
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm6, OWORD PTR [r9+208]
+ vaesenc xmm8, xmm8, xmm6
+ vmovdqu xmm5, OWORD PTR [r9+224]
+L_AES_XTS_decrypt_vaes_tweak_aes_enc_block_last:
+ vaesenclast xmm8, xmm8, xmm5
+ ; r13d = 0 bytes done. r11d = length rounded down to a multiple of 16.
+ ; When the length is not a multiple of 16, one more full block is held
+ ; back so that it can be handled together with the partial tail.
+ xor r13d, r13d
+ mov r11d, eax
+ and r11d, 4294967280
+ cmp r11d, eax
+ je L_AES_XTS_decrypt_vaes_mul16_128
+ sub r11d, 16
+ cmp r11d, 16
+ jl L_AES_XTS_decrypt_vaes_last_31_start
+L_AES_XTS_decrypt_vaes_mul16_128:
+ cmp r11d, 32
+ jl L_AES_XTS_decrypt_vaes_done_128
+ cmp r11d, 128
+ jl L_AES_XTS_decrypt_vaes_done_128
+ ; Limit for the 128-byte loop: r11d rounded down to a multiple of 128.
+ and r11d, 4294967168
+ ; Build the tweaks for eight consecutive blocks in ymm4..ymm7 (two
+ ; blocks per register). ymm4 comes from the broadcast tweak via the
+ ; per-lane variable shifts; each following register is the previous one
+ ; shifted left by 2 bits, with the bits shifted out of each 64-bit half
+ ; carried up (vpslldq) or multiplied by ymm13 and folded in (vpclmulqdq).
+ vperm2i128 ymm5, ymm8, ymm8, 0
+ vpsrlvq ymm6, ymm5, ymm15
+ vpclmulqdq ymm7, ymm6, ymm13, 1
+ vpslldq ymm6, ymm6, 8
+ vpsllvq ymm4, ymm5, ymm14
+ vpxor ymm4, ymm4, ymm7
+ vpxor ymm4, ymm4, ymm6
+ vpsrlq ymm9, ymm4, 62
+ vpclmulqdq ymm10, ymm9, ymm13, 1
+ vpslldq ymm9, ymm9, 8
+ vpsllq ymm5, ymm4, 2
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm5, ymm5, ymm9
+ vpsrlq ymm9, ymm5, 62
+ vpclmulqdq ymm10, ymm9, ymm13, 1
+ vpslldq ymm9, ymm9, 8
+ vpsllq ymm6, ymm5, 2
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm6, ymm6, ymm9
+ vpsrlq ymm9, ymm6, 62
+ vpclmulqdq ymm10, ymm9, ymm13, 1
+ vpslldq ymm9, ymm9, 8
+ vpsllq ymm7, ymm6, 2
+ vpxor ymm7, ymm7, ymm10
+ vpxor ymm7, ymm7, ymm9
+L_AES_XTS_decrypt_vaes_dec_128:
+ ; 128 bytes of input
+ ; aes_dec_128
+ lea rcx, QWORD PTR [rdi+r13]
+ lea rdx, QWORD PTR [rsi+r13]
+ vmovdqu ymm0, YMMWORD PTR [rcx]
+ vmovdqu ymm1, YMMWORD PTR [rcx+32]
+ vmovdqu ymm2, YMMWORD PTR [rcx+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+96]
+ ; aes_dec_block
+ ; XOR each pair of blocks with its tweaks and with round key 0, which is
+ ; broadcast to both 128-bit lanes of ymm9.
+ vbroadcasti128 ymm9, [r8]
+ vpxor ymm0, ymm0, ymm4
+ vpxor ymm0, ymm0, ymm9
+ vpxor ymm1, ymm1, ymm5
+ vpxor ymm1, ymm1, ymm9
+ vpxor ymm2, ymm2, ymm6
+ vpxor ymm2, ymm2, ymm9
+ vpxor ymm3, ymm3, ymm7
+ vpxor ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r8+16]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r8+32]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r8+48]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r8+64]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r8+80]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r8+96]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r8+112]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r8+128]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r8+144]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ ; Same round-count dispatch as above; ymm9 holds the final round key at
+ ; the branch target.
+ cmp r10d, 11
+ vbroadcasti128 ymm9, [r8+160]
+ jl L_AES_XTS_decrypt_vaes_aes_dec_128_aes_dec_block_last
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r8+176]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ cmp r10d, 13
+ vbroadcasti128 ymm9, [r8+192]
+ jl L_AES_XTS_decrypt_vaes_aes_dec_128_aes_dec_block_last
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r8+208]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r8+224]
+L_AES_XTS_decrypt_vaes_aes_dec_128_aes_dec_block_last:
+ vaesdeclast ymm0, ymm0, ymm9
+ vaesdeclast ymm1, ymm1, ymm9
+ vaesdeclast ymm2, ymm2, ymm9
+ vaesdeclast ymm3, ymm3, ymm9
+ ; For each register: XOR the tweak back in, store 32 bytes of output,
+ ; then shift that tweak register left by 8 bits (same carry/fold scheme
+ ; as above) so it is ready for the block eight positions later.
+ vpxor ymm0, ymm0, ymm4
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ vpsrlq ymm9, ymm4, 56
+ vpclmulqdq ymm10, ymm9, ymm13, 1
+ vpslldq ymm9, ymm9, 8
+ vpsllq ymm4, ymm4, 8
+ vpxor ymm4, ymm4, ymm10
+ vpxor ymm4, ymm4, ymm9
+ vpxor ymm1, ymm1, ymm5
+ vmovdqu YMMWORD PTR [rdx+32], ymm1
+ vpsrlq ymm9, ymm5, 56
+ vpclmulqdq ymm10, ymm9, ymm13, 1
+ vpslldq ymm9, ymm9, 8
+ vpsllq ymm5, ymm5, 8
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm2, ymm2, ymm6
+ vmovdqu YMMWORD PTR [rdx+64], ymm2
+ vpsrlq ymm9, ymm6, 56
+ vpclmulqdq ymm10, ymm9, ymm13, 1
+ vpslldq ymm9, ymm9, 8
+ vpsllq ymm6, ymm6, 8
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm6, ymm6, ymm9
+ vpxor ymm3, ymm3, ymm7
+ vmovdqu YMMWORD PTR [rdx+96], ymm3
+ vpsrlq ymm9, ymm7, 56
+ vpclmulqdq ymm10, ymm9, ymm13, 1
+ vpslldq ymm9, ymm9, 8
+ vpsllq ymm7, ymm7, 8
+ vpxor ymm7, ymm7, ymm10
+ vpxor ymm7, ymm7, ymm9
+ add r13d, 128
+ cmp r13d, r11d
+ jl L_AES_XTS_decrypt_vaes_dec_128
+ ; The low lane of ymm4 is now the tweak for the next unprocessed block.
+ vextracti128 xmm8, ymm4, 0
+L_AES_XTS_decrypt_vaes_done_128:
+ ; Finished if every byte is done. Otherwise recompute the limit as at the
+ ; start (holding one block back when the length is not a multiple of 16)
+ ; and go to the partial-block tail if less than one other full block
+ ; remains.
+ cmp r13d, eax
+ mov r11d, eax
+ je L_AES_XTS_decrypt_vaes_done_dec
+ and r11d, 4294967280
+ cmp r11d, eax
+ je L_AES_XTS_decrypt_vaes_mul16_64
+ sub r11d, 16
+ sub r11d, r13d
+ cmp r11d, 16
+ jl L_AES_XTS_decrypt_vaes_last_31_start
+ add r11d, r13d
+L_AES_XTS_decrypt_vaes_mul16_64:
+ ; Round the limit down to a multiple of 64; skip if no 64-byte chunk fits.
+ and r11d, 4294967232
+ cmp r13d, r11d
+ je L_AES_XTS_decrypt_vaes_done_64
+ ; 64 bytes of input
+ ; aes_dec_64
+ lea rcx, QWORD PTR [rdi+r13]
+ lea rdx, QWORD PTR [rsi+r13]
+ vmovdqu ymm0, YMMWORD PTR [rcx]
+ vmovdqu ymm1, YMMWORD PTR [rcx+32]
+ ; Tweaks for four blocks: ymm4 from xmm8, ymm5 = ymm4 shifted by 2 bits.
+ vperm2i128 ymm5, ymm8, ymm8, 0
+ vpsrlvq ymm6, ymm5, ymm15
+ vpclmulqdq ymm7, ymm6, ymm13, 1
+ vpslldq ymm6, ymm6, 8
+ vpsllvq ymm4, ymm5, ymm14
+ vpxor ymm4, ymm4, ymm7
+ vpxor ymm4, ymm4, ymm6
+ vpsrlq ymm9, ymm4, 62
+ vpclmulqdq ymm10, ymm9, ymm13, 1
+ vpslldq ymm9, ymm9, 8
+ vpsllq ymm5, ymm4, 2
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm5, ymm5, ymm9
+ ; aes_dec_block
+ vbroadcasti128 ymm9, [r8]
+ vpxor ymm0, ymm0, ymm4
+ vpxor ymm0, ymm0, ymm9
+ vpxor ymm1, ymm1, ymm5
+ vpxor ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r8+16]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r8+32]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r8+48]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r8+64]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r8+80]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r8+96]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r8+112]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r8+128]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r8+144]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ cmp r10d, 11
+ vbroadcasti128 ymm9, [r8+160]
+ jl L_AES_XTS_decrypt_vaes_aes_dec_64_aes_dec_block_last
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r8+176]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ cmp r10d, 13
+ vbroadcasti128 ymm9, [r8+192]
+ jl L_AES_XTS_decrypt_vaes_aes_dec_64_aes_dec_block_last
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r8+208]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r8+224]
+L_AES_XTS_decrypt_vaes_aes_dec_64_aes_dec_block_last:
+ vaesdeclast ymm0, ymm0, ymm9
+ vaesdeclast ymm1, ymm1, ymm9
+ vpxor ymm0, ymm0, ymm4
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ vpxor ymm1, ymm1, ymm5
+ vmovdqu YMMWORD PTR [rdx+32], ymm1
+ ; Next tweak = tweak of the last block just processed (high lane of
+ ; ymm5) advanced one step: vpaddq doubles each 64-bit half, and the
+ ; sign bits selected by vpshufd/vpsrad, masked with xmm12, supply the
+ ; carry between halves and the fold-back of the top bit.
+ vextracti128 xmm8, ymm5, 1
+ vpshufd xmm9, xmm8, 19
+ vpaddq xmm8, xmm8, xmm8
+ vpsrad xmm9, xmm9, 31
+ vpand xmm9, xmm9, xmm12
+ vpxor xmm8, xmm8, xmm9
+ add r13d, 64
+L_AES_XTS_decrypt_vaes_done_64:
+ ; Same completion / remaining-length check as after the 128-byte loop.
+ cmp r13d, eax
+ mov r11d, eax
+ je L_AES_XTS_decrypt_vaes_done_dec
+ and r11d, 4294967280
+ cmp r11d, eax
+ je L_AES_XTS_decrypt_vaes_mul16_32
+ sub r11d, 16
+ sub r11d, r13d
+ cmp r11d, 16
+ jl L_AES_XTS_decrypt_vaes_last_31_start
+ add r11d, r13d
+L_AES_XTS_decrypt_vaes_mul16_32:
+ ; Round the limit down to a multiple of 32; skip if no 32-byte chunk fits.
+ and r11d, 4294967264
+ cmp r13d, r11d
+ je L_AES_XTS_decrypt_vaes_done_32
+ ; 32 bytes of input
+ ; aes_dec_32
+ lea rcx, QWORD PTR [rdi+r13]
+ lea rdx, QWORD PTR [rsi+r13]
+ vmovdqu ymm0, YMMWORD PTR [rcx]
+ ; Tweaks for two blocks in ymm4, derived from xmm8.
+ vperm2i128 ymm5, ymm8, ymm8, 0
+ vpsrlvq ymm6, ymm5, ymm15
+ vpclmulqdq ymm7, ymm6, ymm13, 1
+ vpslldq ymm6, ymm6, 8
+ vpsllvq ymm4, ymm5, ymm14
+ vpxor ymm4, ymm4, ymm7
+ vpxor ymm4, ymm4, ymm6
+ ; aes_dec_block
+ vbroadcasti128 ymm9, [r8]
+ vpxor ymm0, ymm0, ymm4
+ vpxor ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r8+16]
+ vaesdec ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r8+32]
+ vaesdec ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r8+48]
+ vaesdec ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r8+64]
+ vaesdec ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r8+80]
+ vaesdec ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r8+96]
+ vaesdec ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r8+112]
+ vaesdec ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r8+128]
+ vaesdec ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r8+144]
+ vaesdec ymm0, ymm0, ymm9
+ cmp r10d, 11
+ vbroadcasti128 ymm9, [r8+160]
+ jl L_AES_XTS_decrypt_vaes_aes_dec_32_aes_dec_block_last
+ vaesdec ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r8+176]
+ vaesdec ymm0, ymm0, ymm9
+ cmp r10d, 13
+ vbroadcasti128 ymm9, [r8+192]
+ jl L_AES_XTS_decrypt_vaes_aes_dec_32_aes_dec_block_last
+ vaesdec ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r8+208]
+ vaesdec ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r8+224]
+L_AES_XTS_decrypt_vaes_aes_dec_32_aes_dec_block_last:
+ vaesdeclast ymm0, ymm0, ymm9
+ vpxor ymm0, ymm0, ymm4
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ ; Next tweak = high lane of ymm4 advanced one step (as in the 64-byte path).
+ vextracti128 xmm8, ymm4, 1
+ vpshufd xmm9, xmm8, 19
+ vpaddq xmm8, xmm8, xmm8
+ vpsrad xmm9, xmm9, 31
+ vpand xmm9, xmm9, xmm12
+ vpxor xmm8, xmm8, xmm9
+ add r13d, 32
+L_AES_XTS_decrypt_vaes_done_32:
+ ; Same completion / remaining-length check; r11d ends up as the limit
+ ; for the single-block loop below.
+ cmp r13d, eax
+ mov r11d, eax
+ je L_AES_XTS_decrypt_vaes_done_dec
+ and r11d, 4294967280
+ cmp r11d, eax
+ je L_AES_XTS_decrypt_vaes_mul16
+ sub r11d, 16
+ sub r11d, r13d
+ cmp r11d, 16
+ jl L_AES_XTS_decrypt_vaes_last_31_start
+ add r11d, r13d
+L_AES_XTS_decrypt_vaes_mul16:
+L_AES_XTS_decrypt_vaes_dec_16:
+ ; 16 bytes of input
+ lea rcx, QWORD PTR [rdi+r13]
+ vmovdqu xmm0, OWORD PTR [rcx]
+ vpxor xmm0, xmm0, xmm8
+ ; aes_dec_block
+ vpxor xmm0, xmm0, [r8]
+ vmovdqu xmm5, OWORD PTR [r8+16]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+32]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+48]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+64]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+80]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+96]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+112]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+128]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+144]
+ vaesdec xmm0, xmm0, xmm5
+ cmp r10d, 11
+ vmovdqu xmm5, OWORD PTR [r8+160]
+ jl L_AES_XTS_decrypt_vaes_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r8+176]
+ vaesdec xmm0, xmm0, xmm6
+ cmp r10d, 13
+ vmovdqu xmm5, OWORD PTR [r8+192]
+ jl L_AES_XTS_decrypt_vaes_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r8+208]
+ vaesdec xmm0, xmm0, xmm6
+ vmovdqu xmm5, OWORD PTR [r8+224]
+L_AES_XTS_decrypt_vaes_aes_dec_block_last:
+ vaesdeclast xmm0, xmm0, xmm5
+ vpxor xmm0, xmm0, xmm8
+ lea rcx, QWORD PTR [rsi+r13]
+ vmovdqu OWORD PTR [rcx], xmm0
+ ; Advance the tweak one step for the next block.
+ vpshufd xmm4, xmm8, 19
+ vpaddq xmm8, xmm8, xmm8
+ vpsrad xmm4, xmm4, 31
+ vpand xmm4, xmm4, xmm12
+ vpxor xmm8, xmm8, xmm4
+ add r13d, 16
+ cmp r13d, r11d
+ jl L_AES_XTS_decrypt_vaes_dec_16
+ cmp r13d, eax
+ je L_AES_XTS_decrypt_vaes_done_dec
+L_AES_XTS_decrypt_vaes_last_31_start:
+ ; Tail for lengths that are not a multiple of 16: one full block plus a
+ ; partial block remain. The full block is decrypted with the tweak one
+ ; step ahead (xmm7); xmm8 is kept for the block reassembled below.
+ vpshufd xmm4, xmm8, 19
+ vpaddq xmm7, xmm8, xmm8
+ vpsrad xmm4, xmm4, 31
+ vpand xmm4, xmm4, xmm12
+ vpxor xmm7, xmm7, xmm4
+ lea rcx, QWORD PTR [rdi+r13]
+ vmovdqu xmm0, OWORD PTR [rcx]
+ vpxor xmm0, xmm0, xmm7
+ ; aes_dec_block
+ vpxor xmm0, xmm0, [r8]
+ vmovdqu xmm5, OWORD PTR [r8+16]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+32]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+48]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+64]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+80]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+96]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+112]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+128]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+144]
+ vaesdec xmm0, xmm0, xmm5
+ cmp r10d, 11
+ vmovdqu xmm5, OWORD PTR [r8+160]
+ jl L_AES_XTS_decrypt_vaes_last_31_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r8+176]
+ vaesdec xmm0, xmm0, xmm6
+ cmp r10d, 13
+ vmovdqu xmm5, OWORD PTR [r8+192]
+ jl L_AES_XTS_decrypt_vaes_last_31_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r8+208]
+ vaesdec xmm0, xmm0, xmm6
+ vmovdqu xmm5, OWORD PTR [r8+224]
+L_AES_XTS_decrypt_vaes_last_31_aes_dec_block_last:
+ vaesdeclast xmm0, xmm0, xmm5
+ vpxor xmm0, xmm0, xmm7
+ ; Park the decrypted block in the scratch area, then move r13 to the
+ ; start of the partial block; rdx counts the partial block's bytes.
+ vmovdqu OWORD PTR [rsp], xmm0
+ add r13, 16
+ xor rdx, rdx
+L_AES_XTS_decrypt_vaes_last_31_byte_loop:
+ ; Byte-wise swap: the scratch byte becomes the output byte of the
+ ; partial block, and the partial block's input byte replaces it in
+ ; the scratch area.
+ mov r11b, BYTE PTR [rsp+rdx]
+ mov cl, BYTE PTR [rdi+r13]
+ mov BYTE PTR [rsi+r13], r11b
+ mov BYTE PTR [rsp+rdx], cl
+ inc r13d
+ inc edx
+ cmp r13d, eax
+ jl L_AES_XTS_decrypt_vaes_last_31_byte_loop
+ ; r13 back to the start of the partial block; decrypt the reassembled
+ ; 16 bytes from the scratch area with the earlier tweak (xmm8).
+ sub r13, rdx
+ vmovdqu xmm0, OWORD PTR [rsp]
+ vpxor xmm0, xmm0, xmm8
+ ; aes_dec_block
+ vpxor xmm0, xmm0, [r8]
+ vmovdqu xmm5, OWORD PTR [r8+16]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+32]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+48]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+64]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+80]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+96]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+112]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+128]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+144]
+ vaesdec xmm0, xmm0, xmm5
+ cmp r10d, 11
+ vmovdqu xmm5, OWORD PTR [r8+160]
+ jl L_AES_XTS_decrypt_vaes_last_31_2_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r8+176]
+ vaesdec xmm0, xmm0, xmm6
+ cmp r10d, 13
+ vmovdqu xmm5, OWORD PTR [r8+192]
+ jl L_AES_XTS_decrypt_vaes_last_31_2_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r8+208]
+ vaesdec xmm0, xmm0, xmm6
+ vmovdqu xmm5, OWORD PTR [r8+224]
+L_AES_XTS_decrypt_vaes_last_31_2_aes_dec_block_last:
+ vaesdeclast xmm0, xmm0, xmm5
+ vpxor xmm0, xmm0, xmm8
+ ; Store the result over the last full block's output position.
+ sub r13, 16
+ lea rcx, QWORD PTR [rsi+r13]
+ vmovdqu OWORD PTR [rcx], xmm0
+L_AES_XTS_decrypt_vaes_done_dec:
+ ; Epilogue: restore xmm6-xmm15 and the pushed registers.
+ vmovdqu xmm6, OWORD PTR [rsp+64]
+ vmovdqu xmm7, OWORD PTR [rsp+80]
+ vmovdqu xmm8, OWORD PTR [rsp+96]
+ vmovdqu xmm9, OWORD PTR [rsp+112]
+ vmovdqu xmm10, OWORD PTR [rsp+128]
+ vmovdqu xmm11, OWORD PTR [rsp+144]
+ vmovdqu xmm12, OWORD PTR [rsp+160]
+ vmovdqu xmm13, OWORD PTR [rsp+176]
+ vmovdqu xmm14, OWORD PTR [rsp+192]
+ vmovdqu xmm15, OWORD PTR [rsp+208]
+ add rsp, 224
+ pop r13
+ pop r12
+ pop rsi
+ pop rdi
+ ret
+AES_XTS_decrypt_vaes ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_XTS_decrypt_update_vaes PROC
+ ; AES-XTS decryption of a buffer using 256-bit VAES / VPCLMULQDQ, for
+ ; callers that carry the tweak between calls: the tweak is loaded as-is
+ ; (no encryption here) and the current tweak is written back on exit.
+ ; Arguments, as consumed by the prologue below (register/stack layout
+ ; matches the Microsoft x64 convention):
+ ;   rcx      -> input buffer              (kept in rdi)
+ ;   rdx      -> output buffer             (kept in rsi)
+ ;   r8d      byte count                   (kept in eax, 32-bit)
+ ;   r9       -> round keys for vaesdec    (kept in r10)
+ ;   5th arg  -> 16-byte tweak, read and written (kept in r8)
+ ;   6th arg  round count                  (kept in r9d)
+ ; r12d is the running byte offset into both buffers; r11d is the
+ ; current loop limit. xmm8 always holds the tweak for the next block.
+ push rdi
+ push rsi
+ push r12
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rax, r8
+ mov r10, r9
+ ; Stack arguments: 3 pushes + return address + 32-byte home area = 64.
+ mov r8, QWORD PTR [rsp+64]
+ mov r9d, DWORD PTR [rsp+72]
+ ; 224 bytes: [rsp, rsp+64) is scratch (used by the partial-block tail),
+ ; [rsp+64, rsp+224) holds the caller's xmm6-xmm15.
+ sub rsp, 224
+ vmovdqu OWORD PTR [rsp+64], xmm6
+ vmovdqu OWORD PTR [rsp+80], xmm7
+ vmovdqu OWORD PTR [rsp+96], xmm8
+ vmovdqu OWORD PTR [rsp+112], xmm9
+ vmovdqu OWORD PTR [rsp+128], xmm10
+ vmovdqu OWORD PTR [rsp+144], xmm11
+ vmovdqu OWORD PTR [rsp+160], xmm12
+ vmovdqu OWORD PTR [rsp+176], xmm13
+ vmovdqu OWORD PTR [rsp+192], xmm14
+ vmovdqu OWORD PTR [rsp+208], xmm15
+ ; Constants: xmm12 = mask for the one-step tweak update, ymm13 = value
+ ; multiplied against shifted-out bits, ymm14/ymm15 = per-lane shift counts.
+ vmovdqu xmm12, OWORD PTR L_vaes_aes_xts_gc_xts
+ vbroadcasti128 ymm13, ptr_L_vaes_aes_xts_poly
+ vmovdqu ymm14, YMMWORD PTR L_vaes_aes_xts_shl
+ vmovdqu ymm15, YMMWORD PTR L_vaes_aes_xts_shr
+ ; Current tweak comes straight from the caller's buffer.
+ vmovdqu xmm8, OWORD PTR [r8]
+ ; r12d = 0 bytes done. r11d = length rounded down to a multiple of 16.
+ ; When the length is not a multiple of 16, one more full block is held
+ ; back so that it can be handled together with the partial tail.
+ xor r12d, r12d
+ mov r11d, eax
+ and r11d, 4294967280
+ cmp r11d, eax
+ je L_AES_XTS_decrypt_update_vaes_mul16_128
+ sub r11d, 16
+ cmp r11d, 16
+ jl L_AES_XTS_decrypt_update_vaes_last_31_start
+L_AES_XTS_decrypt_update_vaes_mul16_128:
+ cmp r11d, 32
+ jl L_AES_XTS_decrypt_update_vaes_done_128
+ cmp r11d, 128
+ jl L_AES_XTS_decrypt_update_vaes_done_128
+ ; Limit for the 128-byte loop: r11d rounded down to a multiple of 128.
+ and r11d, 4294967168
+ ; Build the tweaks for eight consecutive blocks in ymm4..ymm7 (two
+ ; blocks per register). ymm4 comes from the broadcast tweak via the
+ ; per-lane variable shifts; each following register is the previous one
+ ; shifted left by 2 bits, with the bits shifted out of each 64-bit half
+ ; carried up (vpslldq) or multiplied by ymm13 and folded in (vpclmulqdq).
+ vperm2i128 ymm5, ymm8, ymm8, 0
+ vpsrlvq ymm6, ymm5, ymm15
+ vpclmulqdq ymm7, ymm6, ymm13, 1
+ vpslldq ymm6, ymm6, 8
+ vpsllvq ymm4, ymm5, ymm14
+ vpxor ymm4, ymm4, ymm7
+ vpxor ymm4, ymm4, ymm6
+ vpsrlq ymm9, ymm4, 62
+ vpclmulqdq ymm10, ymm9, ymm13, 1
+ vpslldq ymm9, ymm9, 8
+ vpsllq ymm5, ymm4, 2
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm5, ymm5, ymm9
+ vpsrlq ymm9, ymm5, 62
+ vpclmulqdq ymm10, ymm9, ymm13, 1
+ vpslldq ymm9, ymm9, 8
+ vpsllq ymm6, ymm5, 2
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm6, ymm6, ymm9
+ vpsrlq ymm9, ymm6, 62
+ vpclmulqdq ymm10, ymm9, ymm13, 1
+ vpslldq ymm9, ymm9, 8
+ vpsllq ymm7, ymm6, 2
+ vpxor ymm7, ymm7, ymm10
+ vpxor ymm7, ymm7, ymm9
+L_AES_XTS_decrypt_update_vaes_dec_128:
+ ; 128 bytes of input
+ ; aes_dec_128
+ lea rcx, QWORD PTR [rdi+r12]
+ lea rdx, QWORD PTR [rsi+r12]
+ vmovdqu ymm0, YMMWORD PTR [rcx]
+ vmovdqu ymm1, YMMWORD PTR [rcx+32]
+ vmovdqu ymm2, YMMWORD PTR [rcx+64]
+ vmovdqu ymm3, YMMWORD PTR [rcx+96]
+ ; aes_dec_block
+ ; XOR each pair of blocks with its tweaks and with round key 0, which is
+ ; broadcast to both 128-bit lanes of ymm9.
+ vbroadcasti128 ymm9, [r10]
+ vpxor ymm0, ymm0, ymm4
+ vpxor ymm0, ymm0, ymm9
+ vpxor ymm1, ymm1, ymm5
+ vpxor ymm1, ymm1, ymm9
+ vpxor ymm2, ymm2, ymm6
+ vpxor ymm2, ymm2, ymm9
+ vpxor ymm3, ymm3, ymm7
+ vpxor ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r10+16]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r10+32]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r10+48]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r10+64]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r10+80]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r10+96]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r10+112]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r10+128]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r10+144]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ ; Round count selects the tail: < 11 finishes after 10 rounds, < 13 after
+ ; 12, otherwise 14. The load between cmp and jl does not modify flags, so
+ ; ymm9 already holds the final round key when the branch is taken.
+ cmp r9d, 11
+ vbroadcasti128 ymm9, [r10+160]
+ jl L_AES_XTS_decrypt_update_vaes_aes_dec_128_aes_dec_block_last
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r10+176]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ cmp r9d, 13
+ vbroadcasti128 ymm9, [r10+192]
+ jl L_AES_XTS_decrypt_update_vaes_aes_dec_128_aes_dec_block_last
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r10+208]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vaesdec ymm2, ymm2, ymm9
+ vaesdec ymm3, ymm3, ymm9
+ vbroadcasti128 ymm9, [r10+224]
+L_AES_XTS_decrypt_update_vaes_aes_dec_128_aes_dec_block_last:
+ vaesdeclast ymm0, ymm0, ymm9
+ vaesdeclast ymm1, ymm1, ymm9
+ vaesdeclast ymm2, ymm2, ymm9
+ vaesdeclast ymm3, ymm3, ymm9
+ ; For each register: XOR the tweak back in, store 32 bytes of output,
+ ; then shift that tweak register left by 8 bits (same carry/fold scheme
+ ; as above) so it is ready for the block eight positions later.
+ vpxor ymm0, ymm0, ymm4
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ vpsrlq ymm9, ymm4, 56
+ vpclmulqdq ymm10, ymm9, ymm13, 1
+ vpslldq ymm9, ymm9, 8
+ vpsllq ymm4, ymm4, 8
+ vpxor ymm4, ymm4, ymm10
+ vpxor ymm4, ymm4, ymm9
+ vpxor ymm1, ymm1, ymm5
+ vmovdqu YMMWORD PTR [rdx+32], ymm1
+ vpsrlq ymm9, ymm5, 56
+ vpclmulqdq ymm10, ymm9, ymm13, 1
+ vpslldq ymm9, ymm9, 8
+ vpsllq ymm5, ymm5, 8
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm2, ymm2, ymm6
+ vmovdqu YMMWORD PTR [rdx+64], ymm2
+ vpsrlq ymm9, ymm6, 56
+ vpclmulqdq ymm10, ymm9, ymm13, 1
+ vpslldq ymm9, ymm9, 8
+ vpsllq ymm6, ymm6, 8
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm6, ymm6, ymm9
+ vpxor ymm3, ymm3, ymm7
+ vmovdqu YMMWORD PTR [rdx+96], ymm3
+ vpsrlq ymm9, ymm7, 56
+ vpclmulqdq ymm10, ymm9, ymm13, 1
+ vpslldq ymm9, ymm9, 8
+ vpsllq ymm7, ymm7, 8
+ vpxor ymm7, ymm7, ymm10
+ vpxor ymm7, ymm7, ymm9
+ add r12d, 128
+ cmp r12d, r11d
+ jl L_AES_XTS_decrypt_update_vaes_dec_128
+ ; The low lane of ymm4 is now the tweak for the next unprocessed block.
+ vextracti128 xmm8, ymm4, 0
+L_AES_XTS_decrypt_update_vaes_done_128:
+ ; Finished if every byte is done. Otherwise recompute the limit as at the
+ ; start (holding one block back when the length is not a multiple of 16)
+ ; and go to the partial-block tail if less than one other full block
+ ; remains.
+ cmp r12d, eax
+ mov r11d, eax
+ je L_AES_XTS_decrypt_update_vaes_done_dec
+ and r11d, 4294967280
+ cmp r11d, eax
+ je L_AES_XTS_decrypt_update_vaes_mul16_64
+ sub r11d, 16
+ sub r11d, r12d
+ cmp r11d, 16
+ jl L_AES_XTS_decrypt_update_vaes_last_31_start
+ add r11d, r12d
+L_AES_XTS_decrypt_update_vaes_mul16_64:
+ ; Round the limit down to a multiple of 64; skip if no 64-byte chunk fits.
+ and r11d, 4294967232
+ cmp r12d, r11d
+ je L_AES_XTS_decrypt_update_vaes_done_64
+ ; 64 bytes of input
+ ; aes_dec_64
+ lea rcx, QWORD PTR [rdi+r12]
+ lea rdx, QWORD PTR [rsi+r12]
+ vmovdqu ymm0, YMMWORD PTR [rcx]
+ vmovdqu ymm1, YMMWORD PTR [rcx+32]
+ ; Tweaks for four blocks: ymm4 from xmm8, ymm5 = ymm4 shifted by 2 bits.
+ vperm2i128 ymm5, ymm8, ymm8, 0
+ vpsrlvq ymm6, ymm5, ymm15
+ vpclmulqdq ymm7, ymm6, ymm13, 1
+ vpslldq ymm6, ymm6, 8
+ vpsllvq ymm4, ymm5, ymm14
+ vpxor ymm4, ymm4, ymm7
+ vpxor ymm4, ymm4, ymm6
+ vpsrlq ymm9, ymm4, 62
+ vpclmulqdq ymm10, ymm9, ymm13, 1
+ vpslldq ymm9, ymm9, 8
+ vpsllq ymm5, ymm4, 2
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm5, ymm5, ymm9
+ ; aes_dec_block
+ vbroadcasti128 ymm9, [r10]
+ vpxor ymm0, ymm0, ymm4
+ vpxor ymm0, ymm0, ymm9
+ vpxor ymm1, ymm1, ymm5
+ vpxor ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r10+16]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r10+32]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r10+48]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r10+64]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r10+80]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r10+96]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r10+112]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r10+128]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r10+144]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ cmp r9d, 11
+ vbroadcasti128 ymm9, [r10+160]
+ jl L_AES_XTS_decrypt_update_vaes_aes_dec_64_aes_dec_block_last
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r10+176]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ cmp r9d, 13
+ vbroadcasti128 ymm9, [r10+192]
+ jl L_AES_XTS_decrypt_update_vaes_aes_dec_64_aes_dec_block_last
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r10+208]
+ vaesdec ymm0, ymm0, ymm9
+ vaesdec ymm1, ymm1, ymm9
+ vbroadcasti128 ymm9, [r10+224]
+L_AES_XTS_decrypt_update_vaes_aes_dec_64_aes_dec_block_last:
+ vaesdeclast ymm0, ymm0, ymm9
+ vaesdeclast ymm1, ymm1, ymm9
+ vpxor ymm0, ymm0, ymm4
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ vpxor ymm1, ymm1, ymm5
+ vmovdqu YMMWORD PTR [rdx+32], ymm1
+ ; Next tweak = tweak of the last block just processed (high lane of
+ ; ymm5) advanced one step: vpaddq doubles each 64-bit half, and the
+ ; sign bits selected by vpshufd/vpsrad, masked with xmm12, supply the
+ ; carry between halves and the fold-back of the top bit.
+ vextracti128 xmm8, ymm5, 1
+ vpshufd xmm9, xmm8, 19
+ vpaddq xmm8, xmm8, xmm8
+ vpsrad xmm9, xmm9, 31
+ vpand xmm9, xmm9, xmm12
+ vpxor xmm8, xmm8, xmm9
+ add r12d, 64
+L_AES_XTS_decrypt_update_vaes_done_64:
+ ; Same completion / remaining-length check as after the 128-byte loop.
+ cmp r12d, eax
+ mov r11d, eax
+ je L_AES_XTS_decrypt_update_vaes_done_dec
+ and r11d, 4294967280
+ cmp r11d, eax
+ je L_AES_XTS_decrypt_update_vaes_mul16_32
+ sub r11d, 16
+ sub r11d, r12d
+ cmp r11d, 16
+ jl L_AES_XTS_decrypt_update_vaes_last_31_start
+ add r11d, r12d
+L_AES_XTS_decrypt_update_vaes_mul16_32:
+ ; Round the limit down to a multiple of 32; skip if no 32-byte chunk fits.
+ and r11d, 4294967264
+ cmp r12d, r11d
+ je L_AES_XTS_decrypt_update_vaes_done_32
+ ; 32 bytes of input
+ ; aes_dec_32
+ lea rcx, QWORD PTR [rdi+r12]
+ lea rdx, QWORD PTR [rsi+r12]
+ vmovdqu ymm0, YMMWORD PTR [rcx]
+ ; Tweaks for two blocks in ymm4, derived from xmm8.
+ vperm2i128 ymm5, ymm8, ymm8, 0
+ vpsrlvq ymm6, ymm5, ymm15
+ vpclmulqdq ymm7, ymm6, ymm13, 1
+ vpslldq ymm6, ymm6, 8
+ vpsllvq ymm4, ymm5, ymm14
+ vpxor ymm4, ymm4, ymm7
+ vpxor ymm4, ymm4, ymm6
+ ; aes_dec_block
+ vbroadcasti128 ymm9, [r10]
+ vpxor ymm0, ymm0, ymm4
+ vpxor ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r10+16]
+ vaesdec ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r10+32]
+ vaesdec ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r10+48]
+ vaesdec ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r10+64]
+ vaesdec ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r10+80]
+ vaesdec ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r10+96]
+ vaesdec ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r10+112]
+ vaesdec ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r10+128]
+ vaesdec ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r10+144]
+ vaesdec ymm0, ymm0, ymm9
+ cmp r9d, 11
+ vbroadcasti128 ymm9, [r10+160]
+ jl L_AES_XTS_decrypt_update_vaes_aes_dec_32_aes_dec_block_last
+ vaesdec ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r10+176]
+ vaesdec ymm0, ymm0, ymm9
+ cmp r9d, 13
+ vbroadcasti128 ymm9, [r10+192]
+ jl L_AES_XTS_decrypt_update_vaes_aes_dec_32_aes_dec_block_last
+ vaesdec ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r10+208]
+ vaesdec ymm0, ymm0, ymm9
+ vbroadcasti128 ymm9, [r10+224]
+L_AES_XTS_decrypt_update_vaes_aes_dec_32_aes_dec_block_last:
+ vaesdeclast ymm0, ymm0, ymm9
+ vpxor ymm0, ymm0, ymm4
+ vmovdqu YMMWORD PTR [rdx], ymm0
+ ; Next tweak = high lane of ymm4 advanced one step (as in the 64-byte path).
+ vextracti128 xmm8, ymm4, 1
+ vpshufd xmm9, xmm8, 19
+ vpaddq xmm8, xmm8, xmm8
+ vpsrad xmm9, xmm9, 31
+ vpand xmm9, xmm9, xmm12
+ vpxor xmm8, xmm8, xmm9
+ add r12d, 32
+L_AES_XTS_decrypt_update_vaes_done_32:
+ ; Same completion / remaining-length check; r11d ends up as the limit
+ ; for the single-block loop below.
+ cmp r12d, eax
+ mov r11d, eax
+ je L_AES_XTS_decrypt_update_vaes_done_dec
+ and r11d, 4294967280
+ cmp r11d, eax
+ je L_AES_XTS_decrypt_update_vaes_mul16
+ sub r11d, 16
+ sub r11d, r12d
+ cmp r11d, 16
+ jl L_AES_XTS_decrypt_update_vaes_last_31_start
+ add r11d, r12d
+L_AES_XTS_decrypt_update_vaes_mul16:
+L_AES_XTS_decrypt_update_vaes_dec_16:
+ ; 16 bytes of input
+ lea rcx, QWORD PTR [rdi+r12]
+ vmovdqu xmm0, OWORD PTR [rcx]
+ vpxor xmm0, xmm0, xmm8
+ ; aes_dec_block
+ vpxor xmm0, xmm0, [r10]
+ vmovdqu xmm5, OWORD PTR [r10+16]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+32]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+48]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+64]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+80]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+96]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+112]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+128]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+144]
+ vaesdec xmm0, xmm0, xmm5
+ cmp r9d, 11
+ vmovdqu xmm5, OWORD PTR [r10+160]
+ jl L_AES_XTS_decrypt_update_vaes_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r10+176]
+ vaesdec xmm0, xmm0, xmm6
+ cmp r9d, 13
+ vmovdqu xmm5, OWORD PTR [r10+192]
+ jl L_AES_XTS_decrypt_update_vaes_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r10+208]
+ vaesdec xmm0, xmm0, xmm6
+ vmovdqu xmm5, OWORD PTR [r10+224]
+L_AES_XTS_decrypt_update_vaes_aes_dec_block_last:
+ vaesdeclast xmm0, xmm0, xmm5
+ vpxor xmm0, xmm0, xmm8
+ lea rcx, QWORD PTR [rsi+r12]
+ vmovdqu OWORD PTR [rcx], xmm0
+ ; Advance the tweak one step for the next block.
+ vpshufd xmm4, xmm8, 19
+ vpaddq xmm8, xmm8, xmm8
+ vpsrad xmm4, xmm4, 31
+ vpand xmm4, xmm4, xmm12
+ vpxor xmm8, xmm8, xmm4
+ add r12d, 16
+ cmp r12d, r11d
+ jl L_AES_XTS_decrypt_update_vaes_dec_16
+ cmp r12d, eax
+ je L_AES_XTS_decrypt_update_vaes_done_dec
+L_AES_XTS_decrypt_update_vaes_last_31_start:
+ ; Tail for lengths that are not a multiple of 16: one full block plus a
+ ; partial block remain. The full block is decrypted with the tweak one
+ ; step ahead (xmm7); xmm8 is kept for the block reassembled below.
+ vpshufd xmm4, xmm8, 19
+ vpaddq xmm7, xmm8, xmm8
+ vpsrad xmm4, xmm4, 31
+ vpand xmm4, xmm4, xmm12
+ vpxor xmm7, xmm7, xmm4
+ lea rcx, QWORD PTR [rdi+r12]
+ vmovdqu xmm0, OWORD PTR [rcx]
+ vpxor xmm0, xmm0, xmm7
+ ; aes_dec_block
+ vpxor xmm0, xmm0, [r10]
+ vmovdqu xmm5, OWORD PTR [r10+16]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+32]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+48]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+64]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+80]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+96]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+112]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+128]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+144]
+ vaesdec xmm0, xmm0, xmm5
+ cmp r9d, 11
+ vmovdqu xmm5, OWORD PTR [r10+160]
+ jl L_AES_XTS_decrypt_update_vaes_last_31_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r10+176]
+ vaesdec xmm0, xmm0, xmm6
+ cmp r9d, 13
+ vmovdqu xmm5, OWORD PTR [r10+192]
+ jl L_AES_XTS_decrypt_update_vaes_last_31_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r10+208]
+ vaesdec xmm0, xmm0, xmm6
+ vmovdqu xmm5, OWORD PTR [r10+224]
+L_AES_XTS_decrypt_update_vaes_last_31_aes_dec_block_last:
+ vaesdeclast xmm0, xmm0, xmm5
+ vpxor xmm0, xmm0, xmm7
+ ; Park the decrypted block in the scratch area, then move r12 to the
+ ; start of the partial block; rdx counts the partial block's bytes.
+ vmovdqu OWORD PTR [rsp], xmm0
+ add r12, 16
+ xor rdx, rdx
+L_AES_XTS_decrypt_update_vaes_last_31_byte_loop:
+ ; Byte-wise swap: the scratch byte becomes the output byte of the
+ ; partial block, and the partial block's input byte replaces it in
+ ; the scratch area.
+ mov r11b, BYTE PTR [rsp+rdx]
+ mov cl, BYTE PTR [rdi+r12]
+ mov BYTE PTR [rsi+r12], r11b
+ mov BYTE PTR [rsp+rdx], cl
+ inc r12d
+ inc edx
+ cmp r12d, eax
+ jl L_AES_XTS_decrypt_update_vaes_last_31_byte_loop
+ ; r12 back to the start of the partial block; decrypt the reassembled
+ ; 16 bytes from the scratch area with the earlier tweak (xmm8).
+ sub r12, rdx
+ vmovdqu xmm0, OWORD PTR [rsp]
+ vpxor xmm0, xmm0, xmm8
+ ; aes_dec_block
+ vpxor xmm0, xmm0, [r10]
+ vmovdqu xmm5, OWORD PTR [r10+16]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+32]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+48]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+64]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+80]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+96]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+112]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+128]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+144]
+ vaesdec xmm0, xmm0, xmm5
+ cmp r9d, 11
+ vmovdqu xmm5, OWORD PTR [r10+160]
+ jl L_AES_XTS_decrypt_update_vaes_last_31_2_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r10+176]
+ vaesdec xmm0, xmm0, xmm6
+ cmp r9d, 13
+ vmovdqu xmm5, OWORD PTR [r10+192]
+ jl L_AES_XTS_decrypt_update_vaes_last_31_2_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r10+208]
+ vaesdec xmm0, xmm0, xmm6
+ vmovdqu xmm5, OWORD PTR [r10+224]
+L_AES_XTS_decrypt_update_vaes_last_31_2_aes_dec_block_last:
+ vaesdeclast xmm0, xmm0, xmm5
+ vpxor xmm0, xmm0, xmm8
+ ; Store the result over the last full block's output position.
+ sub r12, 16
+ lea rcx, QWORD PTR [rsi+r12]
+ vmovdqu OWORD PTR [rcx], xmm0
+L_AES_XTS_decrypt_update_vaes_done_dec:
+ ; Write the current tweak (xmm8) back to the caller's buffer, then
+ ; restore xmm6-xmm15 and the pushed registers.
+ vmovdqu OWORD PTR [r8], xmm8
+ vmovdqu xmm6, OWORD PTR [rsp+64]
+ vmovdqu xmm7, OWORD PTR [rsp+80]
+ vmovdqu xmm8, OWORD PTR [rsp+96]
+ vmovdqu xmm9, OWORD PTR [rsp+112]
+ vmovdqu xmm10, OWORD PTR [rsp+128]
+ vmovdqu xmm11, OWORD PTR [rsp+144]
+ vmovdqu xmm12, OWORD PTR [rsp+160]
+ vmovdqu xmm13, OWORD PTR [rsp+176]
+ vmovdqu xmm14, OWORD PTR [rsp+192]
+ vmovdqu xmm15, OWORD PTR [rsp+208]
+ add rsp, 224
+ pop r12
+ pop rsi
+ pop rdi
+ ret
+AES_XTS_decrypt_update_vaes ENDP
+_TEXT ENDS
+ENDIF
+IFDEF HAVE_INTEL_AVX512
+_TEXT SEGMENT READONLY PARA
+AES_XTS_init_avx512 PROC
+ ; AES-encrypts the 16-byte block at [rcx] in place.
+ ;   rcx  -> 16-byte block, read then overwritten with the result
+ ;   rdx  -> AES round keys (16 bytes each, used with vaesenc)
+ ;   r8d  round count; < 11 gives 10 rounds, < 13 gives 12, otherwise 14
+ ; Only xmm0, xmm2 and xmm3 are modified, so nothing is saved or restored.
+ ; NOTE(review): by its name this prepares the XTS tweak for the AVX-512
+ ; routines; the callers are not visible here - confirm.
+ vmovdqu xmm0, OWORD PTR [rcx]
+ ; aes_enc_block
+ vpxor xmm0, xmm0, [rdx]
+ vmovdqu xmm2, OWORD PTR [rdx+16]
+ vaesenc xmm0, xmm0, xmm2
+ vmovdqu xmm2, OWORD PTR [rdx+32]
+ vaesenc xmm0, xmm0, xmm2
+ vmovdqu xmm2, OWORD PTR [rdx+48]
+ vaesenc xmm0, xmm0, xmm2
+ vmovdqu xmm2, OWORD PTR [rdx+64]
+ vaesenc xmm0, xmm0, xmm2
+ vmovdqu xmm2, OWORD PTR [rdx+80]
+ vaesenc xmm0, xmm0, xmm2
+ vmovdqu xmm2, OWORD PTR [rdx+96]
+ vaesenc xmm0, xmm0, xmm2
+ vmovdqu xmm2, OWORD PTR [rdx+112]
+ vaesenc xmm0, xmm0, xmm2
+ vmovdqu xmm2, OWORD PTR [rdx+128]
+ vaesenc xmm0, xmm0, xmm2
+ vmovdqu xmm2, OWORD PTR [rdx+144]
+ vaesenc xmm0, xmm0, xmm2
+ ; The load between cmp and jl does not modify flags, so xmm2 already
+ ; holds the final round key whenever the branch is taken.
+ cmp r8d, 11
+ vmovdqu xmm2, OWORD PTR [rdx+160]
+ jl L_AES_XTS_init_avx512_tweak_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm2
+ vmovdqu xmm3, OWORD PTR [rdx+176]
+ vaesenc xmm0, xmm0, xmm3
+ cmp r8d, 13
+ vmovdqu xmm2, OWORD PTR [rdx+192]
+ jl L_AES_XTS_init_avx512_tweak_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm2
+ vmovdqu xmm3, OWORD PTR [rdx+208]
+ vaesenc xmm0, xmm0, xmm3
+ vmovdqu xmm2, OWORD PTR [rdx+224]
+L_AES_XTS_init_avx512_tweak_aes_enc_block_last:
+ vaesenclast xmm0, xmm0, xmm2
+ vmovdqu OWORD PTR [rcx], xmm0
+ ret
+AES_XTS_init_avx512 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+; Mask used when one 128-bit tweak is doubled with vpaddq/vpternlogd:
+; dword 0 (87h) is XORed in when the tweak's top bit was set, dword 2 (1)
+; carries the top bit of the low qword into the high qword.
+L_avx512_aes_xts_gc_xts DWORD \
+ 00000087h, 00000000h, 00000001h, 00000000h
+; QWORD holding the address of L_avx512_aes_xts_gc_xts.
+ptr_L_avx512_aes_xts_gc_xts QWORD L_avx512_aes_xts_gc_xts
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+; 87h in the low qword: the vpclmulqdq multiplier that folds the bits shifted
+; out of the top of a tweak back into its low end.
+L_avx512_aes_xts_poly DWORD \
+ 00000087h, 00000000h, 00000000h, 00000000h
+; QWORD holding the address of L_avx512_aes_xts_poly.
+ptr_L_avx512_aes_xts_poly QWORD L_avx512_aes_xts_poly
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+; Per-qword left-shift counts for vpsllvq: 0, 1, 2 and 3 for the four 128-bit
+; lanes of a zmm register (both qwords of a lane share one count).
+L_avx512_aes_xts_shl DWORD \
+ 00000000h, 00000000h, 00000000h, 00000000h,
+ 00000001h, 00000000h, 00000001h, 00000000h,
+ 00000002h, 00000000h, 00000002h, 00000000h,
+ 00000003h, 00000000h, 00000003h, 00000000h
+; QWORD holding the address of L_avx512_aes_xts_shl.
+ptr_L_avx512_aes_xts_shl QWORD L_avx512_aes_xts_shl
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+; Per-qword right-shift counts for vpsrlvq: 64, 63, 62 and 61, i.e. 64 minus
+; the matching entry of L_avx512_aes_xts_shl, isolating the bits each lane's
+; left shift pushes out.
+L_avx512_aes_xts_shr DWORD \
+ 00000040h, 00000000h, 00000040h, 00000000h,
+ 0000003fh, 00000000h, 0000003fh, 00000000h,
+ 0000003eh, 00000000h, 0000003eh, 00000000h,
+ 0000003dh, 00000000h, 0000003dh, 00000000h
+; QWORD holding the address of L_avx512_aes_xts_shr.
+ptr_L_avx512_aes_xts_shr QWORD L_avx512_aes_xts_shr
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+; AES_XTS_encrypt_avx512
+; AES-XTS encryption of a buffer using AVX-512 / VAES (Win64 calling convention).
+;   rcx      = input pointer                       (kept in rdi)
+;   rdx      = output pointer                      (kept in rsi)
+;   r8d      = byte count                          (kept in eax)
+;   r9       = pointer to the 16-byte tweak input  (kept in r12)
+;   [rsp+40] = pointer to the data round keys      (5th argument, kept in r8)
+;   [rsp+48] = pointer to the tweak round keys     (6th argument, kept in r9)
+;   [rsp+56] = round count                         (7th argument, kept in r10d)
+; (stack offsets are as seen by the caller's frame on entry, before the pushes)
+; The tweak input is first encrypted with the tweak round keys.  The data is
+; then processed in 256-, 128-, 64-, 32- and 16-byte steps, and a trailing
+; partial block is handled with ciphertext stealing, which rewrites the
+; previous 16-byte output block.  The byte count is expected to be at least
+; 16: the stealing path reads the output block 16 bytes before the current
+; offset.
+; rdi, rsi, r12, r13 and xmm6-xmm15 are saved and restored.
+AES_XTS_encrypt_avx512 PROC
+ push rdi
+ push rsi
+ push r12
+ push r13
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rax, r8
+ mov r12, r9
+ ; Four pushes + return address + 32-byte home space put argument 5 at rsp+72.
+ mov r8, QWORD PTR [rsp+72]
+ mov r9, QWORD PTR [rsp+80]
+ mov r10d, DWORD PTR [rsp+88]
+ ; Frame: [rsp+0..15] scratch block for ciphertext stealing,
+ ;        [rsp+64..223] saved xmm6-xmm15.
+ sub rsp, 224
+ vmovdqu OWORD PTR [rsp+64], xmm6
+ vmovdqu OWORD PTR [rsp+80], xmm7
+ vmovdqu OWORD PTR [rsp+96], xmm8
+ vmovdqu OWORD PTR [rsp+112], xmm9
+ vmovdqu OWORD PTR [rsp+128], xmm10
+ vmovdqu OWORD PTR [rsp+144], xmm11
+ vmovdqu OWORD PTR [rsp+160], xmm12
+ vmovdqu OWORD PTR [rsp+176], xmm13
+ vmovdqu OWORD PTR [rsp+192], xmm14
+ vmovdqu OWORD PTR [rsp+208], xmm15
+ vmovdqu xmm12, OWORD PTR L_avx512_aes_xts_gc_xts
+ ; Load the constant tables themselves.  The ptr_L_* symbols are QWORDs that
+ ; hold the tables' addresses, so using them as the memory operand would read
+ ; the pointer bytes instead of the reduction constant and shift counts.
+ vbroadcasti32x4 zmm13, OWORD PTR L_avx512_aes_xts_poly
+ vmovdqu64 zmm14, ZMMWORD PTR L_avx512_aes_xts_shl
+ vmovdqu64 zmm15, ZMMWORD PTR L_avx512_aes_xts_shr
+ ; xmm8 = AES encryption of the tweak input under the tweak round keys.
+ vmovdqu xmm8, OWORD PTR [r12]
+ ; aes_enc_block
+ vpxor xmm8, xmm8, [r9]
+ vmovdqu xmm5, OWORD PTR [r9+16]
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+32]
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+48]
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+64]
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+80]
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+96]
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+112]
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+128]
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+144]
+ vaesenc xmm8, xmm8, xmm5
+ ; Round count below 11: round key 10 is the last; below 13: round key 12;
+ ; otherwise round key 14.  xmm5 holds the final round key at the label.
+ cmp r10d, 11
+ vmovdqu xmm5, OWORD PTR [r9+160]
+ jl L_AES_XTS_encrypt_avx512_tweak_aes_enc_block_last
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm6, OWORD PTR [r9+176]
+ vaesenc xmm8, xmm8, xmm6
+ cmp r10d, 13
+ vmovdqu xmm5, OWORD PTR [r9+192]
+ jl L_AES_XTS_encrypt_avx512_tweak_aes_enc_block_last
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm6, OWORD PTR [r9+208]
+ vaesenc xmm8, xmm8, xmm6
+ vmovdqu xmm5, OWORD PTR [r9+224]
+L_AES_XTS_encrypt_avx512_tweak_aes_enc_block_last:
+ vaesenclast xmm8, xmm8, xmm5
+ ; r13 = byte offset of the next block to process.
+ xor r13d, r13d
+ ; Fewer than 32 bytes: the wide paths are never taken, skip key caching.
+ cmp eax, 32
+ jl L_AES_XTS_encrypt_avx512_done_128
+ ; Cache the data round keys, each broadcast to all four 128-bit lanes, in
+ ; zmm16 upwards (only as many as the round count needs).
+ vbroadcasti32x4 zmm16, [r8]
+ vbroadcasti32x4 zmm17, [r8+16]
+ vbroadcasti32x4 zmm18, [r8+32]
+ vbroadcasti32x4 zmm19, [r8+48]
+ vbroadcasti32x4 zmm20, [r8+64]
+ vbroadcasti32x4 zmm21, [r8+80]
+ vbroadcasti32x4 zmm22, [r8+96]
+ vbroadcasti32x4 zmm23, [r8+112]
+ vbroadcasti32x4 zmm24, [r8+128]
+ vbroadcasti32x4 zmm25, [r8+144]
+ vbroadcasti32x4 zmm26, [r8+160]
+ cmp r10d, 11
+ jl L_AES_XTS_encrypt_avx512_key_cached
+ vbroadcasti32x4 zmm27, [r8+176]
+ vbroadcasti32x4 zmm28, [r8+192]
+ cmp r10d, 13
+ jl L_AES_XTS_encrypt_avx512_key_cached
+ vbroadcasti32x4 zmm29, [r8+208]
+ vbroadcasti32x4 zmm30, [r8+224]
+L_AES_XTS_encrypt_avx512_key_cached:
+ cmp eax, 256
+ mov r11d, eax
+ jl L_AES_XTS_encrypt_avx512_done_256
+ ; r11d = byte count rounded down to a multiple of 256.
+ and r11d, 4294967040
+ ; zmm4 = tweak shifted left by 0..3 bits in its four lanes, with the bits
+ ; shifted out of each qword folded back in (vpclmulqdq by 87h for the high
+ ; qword, vpslldq carry for the low qword), combined with a three-way XOR.
+ vshufi64x2 zmm5, zmm8, zmm8, 0
+ vpsrlvq zmm6, zmm5, zmm15
+ vpclmulqdq zmm7, zmm6, zmm13, 1
+ vpslldq zmm6, zmm6, 8
+ vpsllvq zmm4, zmm5, zmm14
+ vpternlogq zmm4, zmm7, zmm6, 150
+ ; zmm5, zmm6, zmm7 = the previous register advanced by 4 more bit positions
+ ; each, giving 16 consecutive tweaks in zmm4..zmm7.
+ vpsrlq zmm9, zmm4, 60
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm5, zmm4, 4
+ vpternlogq zmm5, zmm10, zmm9, 150
+ vpsrlq zmm9, zmm5, 60
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm6, zmm5, 4
+ vpternlogq zmm6, zmm10, zmm9, 150
+ vpsrlq zmm9, zmm6, 60
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm7, zmm6, 4
+ vpternlogq zmm7, zmm10, zmm9, 150
+L_AES_XTS_encrypt_avx512_enc_256:
+ ; 256 bytes of input
+ ; aes_enc_256
+ lea rcx, QWORD PTR [rdi+r13]
+ lea rdx, QWORD PTR [rsi+r13]
+ vmovdqu64 zmm0, [rcx]
+ vmovdqu64 zmm1, [rcx+64]
+ vmovdqu64 zmm2, [rcx+128]
+ vmovdqu64 zmm3, [rcx+192]
+ ; aes_enc_block
+ ; Input XOR round key 0 XOR tweak in a single three-way XOR.
+ vpternlogq zmm0, zmm16, zmm4, 150
+ vpternlogq zmm1, zmm16, zmm5, 150
+ vpternlogq zmm2, zmm16, zmm6, 150
+ vpternlogq zmm3, zmm16, zmm7, 150
+ vaesenc zmm0, zmm0, zmm17
+ vaesenc zmm1, zmm1, zmm17
+ vaesenc zmm2, zmm2, zmm17
+ vaesenc zmm3, zmm3, zmm17
+ vaesenc zmm0, zmm0, zmm18
+ vaesenc zmm1, zmm1, zmm18
+ vaesenc zmm2, zmm2, zmm18
+ vaesenc zmm3, zmm3, zmm18
+ vaesenc zmm0, zmm0, zmm19
+ vaesenc zmm1, zmm1, zmm19
+ vaesenc zmm2, zmm2, zmm19
+ vaesenc zmm3, zmm3, zmm19
+ vaesenc zmm0, zmm0, zmm20
+ vaesenc zmm1, zmm1, zmm20
+ vaesenc zmm2, zmm2, zmm20
+ vaesenc zmm3, zmm3, zmm20
+ vaesenc zmm0, zmm0, zmm21
+ vaesenc zmm1, zmm1, zmm21
+ vaesenc zmm2, zmm2, zmm21
+ vaesenc zmm3, zmm3, zmm21
+ vaesenc zmm0, zmm0, zmm22
+ vaesenc zmm1, zmm1, zmm22
+ vaesenc zmm2, zmm2, zmm22
+ vaesenc zmm3, zmm3, zmm22
+ vaesenc zmm0, zmm0, zmm23
+ vaesenc zmm1, zmm1, zmm23
+ vaesenc zmm2, zmm2, zmm23
+ vaesenc zmm3, zmm3, zmm23
+ vaesenc zmm0, zmm0, zmm24
+ vaesenc zmm1, zmm1, zmm24
+ vaesenc zmm2, zmm2, zmm24
+ vaesenc zmm3, zmm3, zmm24
+ vaesenc zmm0, zmm0, zmm25
+ vaesenc zmm1, zmm1, zmm25
+ vaesenc zmm2, zmm2, zmm25
+ vaesenc zmm3, zmm3, zmm25
+ ; zmm9 receives whichever cached key is the final round key.
+ cmp r10d, 11
+ vmovdqa64 zmm9, zmm26
+ jl L_AES_XTS_encrypt_avx512_aes_enc_256_aes_enc_block_last
+ vaesenc zmm0, zmm0, zmm26
+ vaesenc zmm1, zmm1, zmm26
+ vaesenc zmm2, zmm2, zmm26
+ vaesenc zmm3, zmm3, zmm26
+ vaesenc zmm0, zmm0, zmm27
+ vaesenc zmm1, zmm1, zmm27
+ vaesenc zmm2, zmm2, zmm27
+ vaesenc zmm3, zmm3, zmm27
+ cmp r10d, 13
+ vmovdqa64 zmm9, zmm28
+ jl L_AES_XTS_encrypt_avx512_aes_enc_256_aes_enc_block_last
+ vaesenc zmm0, zmm0, zmm28
+ vaesenc zmm1, zmm1, zmm28
+ vaesenc zmm2, zmm2, zmm28
+ vaesenc zmm3, zmm3, zmm28
+ vaesenc zmm0, zmm0, zmm29
+ vaesenc zmm1, zmm1, zmm29
+ vaesenc zmm2, zmm2, zmm29
+ vaesenc zmm3, zmm3, zmm29
+ vmovdqa64 zmm9, zmm30
+L_AES_XTS_encrypt_avx512_aes_enc_256_aes_enc_block_last:
+ vaesenclast zmm0, zmm0, zmm9
+ vaesenclast zmm1, zmm1, zmm9
+ vaesenclast zmm2, zmm2, zmm9
+ vaesenclast zmm3, zmm3, zmm9
+ ; XOR each result with its tweaks, store it, then advance that tweak register
+ ; by 16 bit positions (16 blocks) for the next iteration.
+ vpxorq zmm0, zmm0, zmm4
+ vmovdqu64 [rdx], zmm0
+ vpsrlq zmm9, zmm4, 48
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm4, zmm4, 16
+ vpternlogq zmm4, zmm10, zmm9, 150
+ vpxorq zmm1, zmm1, zmm5
+ vmovdqu64 [rdx+64], zmm1
+ vpsrlq zmm9, zmm5, 48
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm5, zmm5, 16
+ vpternlogq zmm5, zmm10, zmm9, 150
+ vpxorq zmm2, zmm2, zmm6
+ vmovdqu64 [rdx+128], zmm2
+ vpsrlq zmm9, zmm6, 48
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm6, zmm6, 16
+ vpternlogq zmm6, zmm10, zmm9, 150
+ vpxorq zmm3, zmm3, zmm7
+ vmovdqu64 [rdx+192], zmm3
+ vpsrlq zmm9, zmm7, 48
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm7, zmm7, 16
+ vpternlogq zmm7, zmm10, zmm9, 150
+ add r13d, 256
+ cmp r13d, r11d
+ jl L_AES_XTS_encrypt_avx512_enc_256
+ ; Lane 0 of zmm4 is the tweak for the next unprocessed block.
+ vextracti32x4 xmm8, zmm4, 0
+L_AES_XTS_encrypt_avx512_done_256:
+ ; Take the 128-byte step only if the count rounded down to 128 is beyond r13.
+ mov r11d, eax
+ and r11d, 4294967168
+ cmp r13d, r11d
+ je L_AES_XTS_encrypt_avx512_done_128
+ ; 128 bytes of input
+ ; aes_enc_128
+ lea rcx, QWORD PTR [rdi+r13]
+ lea rdx, QWORD PTR [rsi+r13]
+ vmovdqu64 zmm0, [rcx]
+ vmovdqu64 zmm1, [rcx+64]
+ ; zmm4 / zmm5 = the eight tweaks for these blocks, derived from xmm8.
+ vshufi64x2 zmm5, zmm8, zmm8, 0
+ vpsrlvq zmm6, zmm5, zmm15
+ vpclmulqdq zmm7, zmm6, zmm13, 1
+ vpslldq zmm6, zmm6, 8
+ vpsllvq zmm4, zmm5, zmm14
+ vpternlogq zmm4, zmm7, zmm6, 150
+ vpsrlq zmm9, zmm4, 60
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm5, zmm4, 4
+ vpternlogq zmm5, zmm10, zmm9, 150
+ ; aes_enc_block
+ vpternlogq zmm0, zmm16, zmm4, 150
+ vpternlogq zmm1, zmm16, zmm5, 150
+ vaesenc zmm0, zmm0, zmm17
+ vaesenc zmm1, zmm1, zmm17
+ vaesenc zmm0, zmm0, zmm18
+ vaesenc zmm1, zmm1, zmm18
+ vaesenc zmm0, zmm0, zmm19
+ vaesenc zmm1, zmm1, zmm19
+ vaesenc zmm0, zmm0, zmm20
+ vaesenc zmm1, zmm1, zmm20
+ vaesenc zmm0, zmm0, zmm21
+ vaesenc zmm1, zmm1, zmm21
+ vaesenc zmm0, zmm0, zmm22
+ vaesenc zmm1, zmm1, zmm22
+ vaesenc zmm0, zmm0, zmm23
+ vaesenc zmm1, zmm1, zmm23
+ vaesenc zmm0, zmm0, zmm24
+ vaesenc zmm1, zmm1, zmm24
+ vaesenc zmm0, zmm0, zmm25
+ vaesenc zmm1, zmm1, zmm25
+ cmp r10d, 11
+ vmovdqa64 zmm9, zmm26
+ jl L_AES_XTS_encrypt_avx512_aes_enc_128_aes_enc_block_last
+ vaesenc zmm0, zmm0, zmm26
+ vaesenc zmm1, zmm1, zmm26
+ vaesenc zmm0, zmm0, zmm27
+ vaesenc zmm1, zmm1, zmm27
+ cmp r10d, 13
+ vmovdqa64 zmm9, zmm28
+ jl L_AES_XTS_encrypt_avx512_aes_enc_128_aes_enc_block_last
+ vaesenc zmm0, zmm0, zmm28
+ vaesenc zmm1, zmm1, zmm28
+ vaesenc zmm0, zmm0, zmm29
+ vaesenc zmm1, zmm1, zmm29
+ vmovdqa64 zmm9, zmm30
+L_AES_XTS_encrypt_avx512_aes_enc_128_aes_enc_block_last:
+ vaesenclast zmm0, zmm0, zmm9
+ vaesenclast zmm1, zmm1, zmm9
+ vpxorq zmm0, zmm0, zmm4
+ vmovdqu64 [rdx], zmm0
+ vpxorq zmm1, zmm1, zmm5
+ vmovdqu64 [rdx+64], zmm1
+ ; Next tweak = last tweak used (lane 3 of zmm5) doubled once: shift each
+ ; qword left by one and XOR in the xmm12 constant under the sign masks.
+ vextracti32x4 xmm8, zmm5, 3
+ vpshufd xmm9, xmm8, 19
+ vpaddq xmm8, xmm8, xmm8
+ vpsrad xmm9, xmm9, 31
+ vpternlogd xmm8, xmm9, xmm12, 120
+ add r13d, 128
+L_AES_XTS_encrypt_avx512_done_128:
+ ; Take the 64-byte step only if the count rounded down to 64 is beyond r13.
+ mov r11d, eax
+ and r11d, 4294967232
+ cmp r13d, r11d
+ je L_AES_XTS_encrypt_avx512_done_64
+ ; 64 bytes of input
+ ; aes_enc_64
+ lea rcx, QWORD PTR [rdi+r13]
+ lea rdx, QWORD PTR [rsi+r13]
+ vmovdqu64 zmm0, [rcx]
+ vshufi64x2 zmm5, zmm8, zmm8, 0
+ vpsrlvq zmm6, zmm5, zmm15
+ vpclmulqdq zmm7, zmm6, zmm13, 1
+ vpslldq zmm6, zmm6, 8
+ vpsllvq zmm4, zmm5, zmm14
+ vpternlogq zmm4, zmm7, zmm6, 150
+ ; aes_enc_block
+ vpternlogq zmm0, zmm16, zmm4, 150
+ vaesenc zmm0, zmm0, zmm17
+ vaesenc zmm0, zmm0, zmm18
+ vaesenc zmm0, zmm0, zmm19
+ vaesenc zmm0, zmm0, zmm20
+ vaesenc zmm0, zmm0, zmm21
+ vaesenc zmm0, zmm0, zmm22
+ vaesenc zmm0, zmm0, zmm23
+ vaesenc zmm0, zmm0, zmm24
+ vaesenc zmm0, zmm0, zmm25
+ cmp r10d, 11
+ vmovdqa64 zmm9, zmm26
+ jl L_AES_XTS_encrypt_avx512_aes_enc_64_aes_enc_block_last
+ vaesenc zmm0, zmm0, zmm26
+ vaesenc zmm0, zmm0, zmm27
+ cmp r10d, 13
+ vmovdqa64 zmm9, zmm28
+ jl L_AES_XTS_encrypt_avx512_aes_enc_64_aes_enc_block_last
+ vaesenc zmm0, zmm0, zmm28
+ vaesenc zmm0, zmm0, zmm29
+ vmovdqa64 zmm9, zmm30
+L_AES_XTS_encrypt_avx512_aes_enc_64_aes_enc_block_last:
+ vaesenclast zmm0, zmm0, zmm9
+ vpxorq zmm0, zmm0, zmm4
+ vmovdqu64 [rdx], zmm0
+ ; Next tweak = lane 3 of zmm4 doubled once.
+ vextracti32x4 xmm8, zmm4, 3
+ vpshufd xmm9, xmm8, 19
+ vpaddq xmm8, xmm8, xmm8
+ vpsrad xmm9, xmm9, 31
+ vpternlogd xmm8, xmm9, xmm12, 120
+ add r13d, 64
+L_AES_XTS_encrypt_avx512_done_64:
+ ; Take the 32-byte step only if the count rounded down to 32 is beyond r13.
+ mov r11d, eax
+ and r11d, 4294967264
+ cmp r13d, r11d
+ je L_AES_XTS_encrypt_avx512_done_32
+ ; 32 bytes of input
+ ; aes_enc_32
+ lea rcx, QWORD PTR [rdi+r13]
+ lea rdx, QWORD PTR [rsi+r13]
+ vmovdqu64 ymm0, [rcx]
+ vshufi64x2 zmm5, zmm8, zmm8, 0
+ vpsrlvq zmm6, zmm5, zmm15
+ vpclmulqdq zmm7, zmm6, zmm13, 1
+ vpslldq zmm6, zmm6, 8
+ vpsllvq zmm4, zmm5, zmm14
+ vpternlogq zmm4, zmm7, zmm6, 150
+ ; aes_enc_block
+ ; Only the low two lanes (ymm) are encrypted here.
+ vpternlogq ymm0, ymm16, ymm4, 150
+ vaesenc ymm0, ymm0, ymm17
+ vaesenc ymm0, ymm0, ymm18
+ vaesenc ymm0, ymm0, ymm19
+ vaesenc ymm0, ymm0, ymm20
+ vaesenc ymm0, ymm0, ymm21
+ vaesenc ymm0, ymm0, ymm22
+ vaesenc ymm0, ymm0, ymm23
+ vaesenc ymm0, ymm0, ymm24
+ vaesenc ymm0, ymm0, ymm25
+ cmp r10d, 11
+ vmovdqa64 ymm9, ymm26
+ jl L_AES_XTS_encrypt_avx512_aes_enc_32_aes_enc_block_last
+ vaesenc ymm0, ymm0, ymm26
+ vaesenc ymm0, ymm0, ymm27
+ cmp r10d, 13
+ vmovdqa64 ymm9, ymm28
+ jl L_AES_XTS_encrypt_avx512_aes_enc_32_aes_enc_block_last
+ vaesenc ymm0, ymm0, ymm28
+ vaesenc ymm0, ymm0, ymm29
+ vmovdqa64 ymm9, ymm30
+L_AES_XTS_encrypt_avx512_aes_enc_32_aes_enc_block_last:
+ vaesenclast ymm0, ymm0, ymm9
+ vpxorq ymm0, ymm0, ymm4
+ vmovdqu64 [rdx], ymm0
+ ; Lane 2 of zmm4 already holds the tweak following the two just used.
+ vextracti32x4 xmm8, zmm4, 2
+ add r13d, 32
+L_AES_XTS_encrypt_avx512_done_32:
+ cmp r13d, eax
+ mov r11d, eax
+ je L_AES_XTS_encrypt_avx512_done_enc
+ ; Fewer than 16 bytes left: go straight to ciphertext stealing.
+ sub r11d, r13d
+ cmp r11d, 16
+ mov r11d, eax
+ jl L_AES_XTS_encrypt_avx512_last_15
+ ; r11d = byte count rounded down to a multiple of 16.
+ and r11d, 4294967280
+ ; 16 bytes of input
+L_AES_XTS_encrypt_avx512_enc_16:
+ lea rcx, QWORD PTR [rdi+r13]
+ vmovdqu xmm0, OWORD PTR [rcx]
+ vpxor xmm0, xmm0, xmm8
+ ; aes_enc_block
+ vpxor xmm0, xmm0, [r8]
+ vmovdqu xmm5, OWORD PTR [r8+16]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+32]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+48]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+64]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+80]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+96]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+112]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+128]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+144]
+ vaesenc xmm0, xmm0, xmm5
+ cmp r10d, 11
+ vmovdqu xmm5, OWORD PTR [r8+160]
+ jl L_AES_XTS_encrypt_avx512_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r8+176]
+ vaesenc xmm0, xmm0, xmm6
+ cmp r10d, 13
+ vmovdqu xmm5, OWORD PTR [r8+192]
+ jl L_AES_XTS_encrypt_avx512_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r8+208]
+ vaesenc xmm0, xmm0, xmm6
+ vmovdqu xmm5, OWORD PTR [r8+224]
+L_AES_XTS_encrypt_avx512_aes_enc_block_last:
+ vaesenclast xmm0, xmm0, xmm5
+ vpxor xmm0, xmm0, xmm8
+ lea rcx, QWORD PTR [rsi+r13]
+ vmovdqu OWORD PTR [rcx], xmm0
+ ; Double the tweak for the following block.
+ vpshufd xmm4, xmm8, 19
+ vpaddq xmm8, xmm8, xmm8
+ vpsrad xmm4, xmm4, 31
+ vpternlogd xmm8, xmm4, xmm12, 120
+ add r13d, 16
+ cmp r13d, r11d
+ jl L_AES_XTS_encrypt_avx512_enc_16
+ cmp r13d, eax
+ je L_AES_XTS_encrypt_avx512_done_enc
+L_AES_XTS_encrypt_avx512_last_15:
+ ; Ciphertext stealing: copy the previous output block to the stack scratch.
+ sub r13, 16
+ lea rcx, QWORD PTR [rsi+r13]
+ vmovdqu xmm0, OWORD PTR [rcx]
+ add r13, 16
+ vmovdqu OWORD PTR [rsp], xmm0
+ xor rdx, rdx
+L_AES_XTS_encrypt_avx512_last_15_byte_loop:
+ ; For each trailing byte: emit the scratch byte as output and replace it in
+ ; the scratch block with the corresponding input byte.
+ mov r11b, BYTE PTR [rsp+rdx]
+ mov cl, BYTE PTR [rdi+r13]
+ mov BYTE PTR [rsi+r13], r11b
+ mov BYTE PTR [rsp+rdx], cl
+ inc r13d
+ inc edx
+ cmp r13d, eax
+ jl L_AES_XTS_encrypt_avx512_last_15_byte_loop
+ ; Rewind r13 to the previous full block, which receives the re-encrypted
+ ; scratch block.
+ sub r13, rdx
+ vmovdqu xmm0, OWORD PTR [rsp]
+ sub r13, 16
+ vpxor xmm0, xmm0, xmm8
+ ; aes_enc_block
+ vpxor xmm0, xmm0, [r8]
+ vmovdqu xmm5, OWORD PTR [r8+16]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+32]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+48]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+64]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+80]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+96]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+112]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+128]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+144]
+ vaesenc xmm0, xmm0, xmm5
+ cmp r10d, 11
+ vmovdqu xmm5, OWORD PTR [r8+160]
+ jl L_AES_XTS_encrypt_avx512_last_15_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r8+176]
+ vaesenc xmm0, xmm0, xmm6
+ cmp r10d, 13
+ vmovdqu xmm5, OWORD PTR [r8+192]
+ jl L_AES_XTS_encrypt_avx512_last_15_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r8+208]
+ vaesenc xmm0, xmm0, xmm6
+ vmovdqu xmm5, OWORD PTR [r8+224]
+L_AES_XTS_encrypt_avx512_last_15_aes_enc_block_last:
+ vaesenclast xmm0, xmm0, xmm5
+ vpxor xmm0, xmm0, xmm8
+ lea rcx, QWORD PTR [rsi+r13]
+ vmovdqu OWORD PTR [rcx], xmm0
+L_AES_XTS_encrypt_avx512_done_enc:
+ ; Restore the callee-saved xmm registers and unwind the frame.
+ vmovdqu xmm6, OWORD PTR [rsp+64]
+ vmovdqu xmm7, OWORD PTR [rsp+80]
+ vmovdqu xmm8, OWORD PTR [rsp+96]
+ vmovdqu xmm9, OWORD PTR [rsp+112]
+ vmovdqu xmm10, OWORD PTR [rsp+128]
+ vmovdqu xmm11, OWORD PTR [rsp+144]
+ vmovdqu xmm12, OWORD PTR [rsp+160]
+ vmovdqu xmm13, OWORD PTR [rsp+176]
+ vmovdqu xmm14, OWORD PTR [rsp+192]
+ vmovdqu xmm15, OWORD PTR [rsp+208]
+ add rsp, 224
+ pop r13
+ pop r12
+ pop rsi
+ pop rdi
+ ret
+AES_XTS_encrypt_avx512 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+; AES_XTS_encrypt_update_avx512
+; AES-XTS encryption of a buffer using AVX-512 / VAES, continuing from a tweak
+; value supplied by the caller (Win64 calling convention).
+;   rcx      = input pointer                      (kept in rdi)
+;   rdx      = output pointer                     (kept in rsi)
+;   r8d      = byte count                         (kept in eax)
+;   r9       = pointer to the data round keys     (kept in r10)
+;   [rsp+40] = pointer to the 16-byte tweak       (5th argument, kept in r8)
+;   [rsp+48] = round count                        (6th argument, kept in r9d)
+; (stack offsets are as seen by the caller's frame on entry, before the pushes)
+; The tweak is loaded from [r8] as-is (it is not encrypted here) and the value
+; left in xmm8 at the end is written back to [r8].  Data is processed in
+; 256-, 128-, 64-, 32- and 16-byte steps, and a trailing partial block is
+; handled with ciphertext stealing, which rewrites the previous 16-byte output
+; block.  The byte count is expected to be at least 16: the stealing path
+; reads the output block 16 bytes before the current offset.
+; rdi, rsi, r12 and xmm6-xmm15 are saved and restored.
+AES_XTS_encrypt_update_avx512 PROC
+ push rdi
+ push rsi
+ push r12
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rax, r8
+ mov r10, r9
+ ; Three pushes + return address + 32-byte home space put argument 5 at rsp+64.
+ mov r8, QWORD PTR [rsp+64]
+ mov r9d, DWORD PTR [rsp+72]
+ ; Frame: [rsp+0..15] scratch block for ciphertext stealing,
+ ;        [rsp+64..223] saved xmm6-xmm15.
+ sub rsp, 224
+ vmovdqu OWORD PTR [rsp+64], xmm6
+ vmovdqu OWORD PTR [rsp+80], xmm7
+ vmovdqu OWORD PTR [rsp+96], xmm8
+ vmovdqu OWORD PTR [rsp+112], xmm9
+ vmovdqu OWORD PTR [rsp+128], xmm10
+ vmovdqu OWORD PTR [rsp+144], xmm11
+ vmovdqu OWORD PTR [rsp+160], xmm12
+ vmovdqu OWORD PTR [rsp+176], xmm13
+ vmovdqu OWORD PTR [rsp+192], xmm14
+ vmovdqu OWORD PTR [rsp+208], xmm15
+ vmovdqu xmm12, OWORD PTR L_avx512_aes_xts_gc_xts
+ ; Load the constant tables themselves.  The ptr_L_* symbols are QWORDs that
+ ; hold the tables' addresses, so using them as the memory operand would read
+ ; the pointer bytes instead of the reduction constant and shift counts.
+ vbroadcasti32x4 zmm13, OWORD PTR L_avx512_aes_xts_poly
+ vmovdqu64 zmm14, ZMMWORD PTR L_avx512_aes_xts_shl
+ vmovdqu64 zmm15, ZMMWORD PTR L_avx512_aes_xts_shr
+ ; xmm8 = current tweak, taken unchanged from the caller.
+ vmovdqu xmm8, OWORD PTR [r8]
+ ; r12 = byte offset of the next block to process.
+ xor r12d, r12d
+ ; Fewer than 32 bytes: the wide paths are never taken, skip key caching.
+ cmp eax, 32
+ jl L_AES_XTS_encrypt_update_avx512_done_128
+ ; Cache the data round keys, each broadcast to all four 128-bit lanes, in
+ ; zmm16 upwards (only as many as the round count needs).
+ vbroadcasti32x4 zmm16, [r10]
+ vbroadcasti32x4 zmm17, [r10+16]
+ vbroadcasti32x4 zmm18, [r10+32]
+ vbroadcasti32x4 zmm19, [r10+48]
+ vbroadcasti32x4 zmm20, [r10+64]
+ vbroadcasti32x4 zmm21, [r10+80]
+ vbroadcasti32x4 zmm22, [r10+96]
+ vbroadcasti32x4 zmm23, [r10+112]
+ vbroadcasti32x4 zmm24, [r10+128]
+ vbroadcasti32x4 zmm25, [r10+144]
+ vbroadcasti32x4 zmm26, [r10+160]
+ cmp r9d, 11
+ jl L_AES_XTS_encrypt_update_avx512_key_cached
+ vbroadcasti32x4 zmm27, [r10+176]
+ vbroadcasti32x4 zmm28, [r10+192]
+ cmp r9d, 13
+ jl L_AES_XTS_encrypt_update_avx512_key_cached
+ vbroadcasti32x4 zmm29, [r10+208]
+ vbroadcasti32x4 zmm30, [r10+224]
+L_AES_XTS_encrypt_update_avx512_key_cached:
+ cmp eax, 256
+ mov r11d, eax
+ jl L_AES_XTS_encrypt_update_avx512_done_256
+ ; r11d = byte count rounded down to a multiple of 256.
+ and r11d, 4294967040
+ ; zmm4 = tweak shifted left by 0..3 bits in its four lanes, with the bits
+ ; shifted out of each qword folded back in (vpclmulqdq by 87h for the high
+ ; qword, vpslldq carry for the low qword), combined with a three-way XOR.
+ vshufi64x2 zmm5, zmm8, zmm8, 0
+ vpsrlvq zmm6, zmm5, zmm15
+ vpclmulqdq zmm7, zmm6, zmm13, 1
+ vpslldq zmm6, zmm6, 8
+ vpsllvq zmm4, zmm5, zmm14
+ vpternlogq zmm4, zmm7, zmm6, 150
+ ; zmm5, zmm6, zmm7 = the previous register advanced by 4 more bit positions
+ ; each, giving 16 consecutive tweaks in zmm4..zmm7.
+ vpsrlq zmm9, zmm4, 60
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm5, zmm4, 4
+ vpternlogq zmm5, zmm10, zmm9, 150
+ vpsrlq zmm9, zmm5, 60
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm6, zmm5, 4
+ vpternlogq zmm6, zmm10, zmm9, 150
+ vpsrlq zmm9, zmm6, 60
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm7, zmm6, 4
+ vpternlogq zmm7, zmm10, zmm9, 150
+L_AES_XTS_encrypt_update_avx512_enc_256:
+ ; 256 bytes of input
+ ; aes_enc_256
+ lea rcx, QWORD PTR [rdi+r12]
+ lea rdx, QWORD PTR [rsi+r12]
+ vmovdqu64 zmm0, [rcx]
+ vmovdqu64 zmm1, [rcx+64]
+ vmovdqu64 zmm2, [rcx+128]
+ vmovdqu64 zmm3, [rcx+192]
+ ; aes_enc_block
+ ; Input XOR round key 0 XOR tweak in a single three-way XOR.
+ vpternlogq zmm0, zmm16, zmm4, 150
+ vpternlogq zmm1, zmm16, zmm5, 150
+ vpternlogq zmm2, zmm16, zmm6, 150
+ vpternlogq zmm3, zmm16, zmm7, 150
+ vaesenc zmm0, zmm0, zmm17
+ vaesenc zmm1, zmm1, zmm17
+ vaesenc zmm2, zmm2, zmm17
+ vaesenc zmm3, zmm3, zmm17
+ vaesenc zmm0, zmm0, zmm18
+ vaesenc zmm1, zmm1, zmm18
+ vaesenc zmm2, zmm2, zmm18
+ vaesenc zmm3, zmm3, zmm18
+ vaesenc zmm0, zmm0, zmm19
+ vaesenc zmm1, zmm1, zmm19
+ vaesenc zmm2, zmm2, zmm19
+ vaesenc zmm3, zmm3, zmm19
+ vaesenc zmm0, zmm0, zmm20
+ vaesenc zmm1, zmm1, zmm20
+ vaesenc zmm2, zmm2, zmm20
+ vaesenc zmm3, zmm3, zmm20
+ vaesenc zmm0, zmm0, zmm21
+ vaesenc zmm1, zmm1, zmm21
+ vaesenc zmm2, zmm2, zmm21
+ vaesenc zmm3, zmm3, zmm21
+ vaesenc zmm0, zmm0, zmm22
+ vaesenc zmm1, zmm1, zmm22
+ vaesenc zmm2, zmm2, zmm22
+ vaesenc zmm3, zmm3, zmm22
+ vaesenc zmm0, zmm0, zmm23
+ vaesenc zmm1, zmm1, zmm23
+ vaesenc zmm2, zmm2, zmm23
+ vaesenc zmm3, zmm3, zmm23
+ vaesenc zmm0, zmm0, zmm24
+ vaesenc zmm1, zmm1, zmm24
+ vaesenc zmm2, zmm2, zmm24
+ vaesenc zmm3, zmm3, zmm24
+ vaesenc zmm0, zmm0, zmm25
+ vaesenc zmm1, zmm1, zmm25
+ vaesenc zmm2, zmm2, zmm25
+ vaesenc zmm3, zmm3, zmm25
+ ; Round count below 11: round key 10 is the last; below 13: round key 12;
+ ; otherwise round key 14.  zmm9 receives the final round key.
+ cmp r9d, 11
+ vmovdqa64 zmm9, zmm26
+ jl L_AES_XTS_encrypt_update_avx512_aes_enc_256_aes_enc_block_last
+ vaesenc zmm0, zmm0, zmm26
+ vaesenc zmm1, zmm1, zmm26
+ vaesenc zmm2, zmm2, zmm26
+ vaesenc zmm3, zmm3, zmm26
+ vaesenc zmm0, zmm0, zmm27
+ vaesenc zmm1, zmm1, zmm27
+ vaesenc zmm2, zmm2, zmm27
+ vaesenc zmm3, zmm3, zmm27
+ cmp r9d, 13
+ vmovdqa64 zmm9, zmm28
+ jl L_AES_XTS_encrypt_update_avx512_aes_enc_256_aes_enc_block_last
+ vaesenc zmm0, zmm0, zmm28
+ vaesenc zmm1, zmm1, zmm28
+ vaesenc zmm2, zmm2, zmm28
+ vaesenc zmm3, zmm3, zmm28
+ vaesenc zmm0, zmm0, zmm29
+ vaesenc zmm1, zmm1, zmm29
+ vaesenc zmm2, zmm2, zmm29
+ vaesenc zmm3, zmm3, zmm29
+ vmovdqa64 zmm9, zmm30
+L_AES_XTS_encrypt_update_avx512_aes_enc_256_aes_enc_block_last:
+ vaesenclast zmm0, zmm0, zmm9
+ vaesenclast zmm1, zmm1, zmm9
+ vaesenclast zmm2, zmm2, zmm9
+ vaesenclast zmm3, zmm3, zmm9
+ ; XOR each result with its tweaks, store it, then advance that tweak register
+ ; by 16 bit positions (16 blocks) for the next iteration.
+ vpxorq zmm0, zmm0, zmm4
+ vmovdqu64 [rdx], zmm0
+ vpsrlq zmm9, zmm4, 48
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm4, zmm4, 16
+ vpternlogq zmm4, zmm10, zmm9, 150
+ vpxorq zmm1, zmm1, zmm5
+ vmovdqu64 [rdx+64], zmm1
+ vpsrlq zmm9, zmm5, 48
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm5, zmm5, 16
+ vpternlogq zmm5, zmm10, zmm9, 150
+ vpxorq zmm2, zmm2, zmm6
+ vmovdqu64 [rdx+128], zmm2
+ vpsrlq zmm9, zmm6, 48
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm6, zmm6, 16
+ vpternlogq zmm6, zmm10, zmm9, 150
+ vpxorq zmm3, zmm3, zmm7
+ vmovdqu64 [rdx+192], zmm3
+ vpsrlq zmm9, zmm7, 48
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm7, zmm7, 16
+ vpternlogq zmm7, zmm10, zmm9, 150
+ add r12d, 256
+ cmp r12d, r11d
+ jl L_AES_XTS_encrypt_update_avx512_enc_256
+ ; Lane 0 of zmm4 is the tweak for the next unprocessed block.
+ vextracti32x4 xmm8, zmm4, 0
+L_AES_XTS_encrypt_update_avx512_done_256:
+ ; Take the 128-byte step only if the count rounded down to 128 is beyond r12.
+ mov r11d, eax
+ and r11d, 4294967168
+ cmp r12d, r11d
+ je L_AES_XTS_encrypt_update_avx512_done_128
+ ; 128 bytes of input
+ ; aes_enc_128
+ lea rcx, QWORD PTR [rdi+r12]
+ lea rdx, QWORD PTR [rsi+r12]
+ vmovdqu64 zmm0, [rcx]
+ vmovdqu64 zmm1, [rcx+64]
+ ; zmm4 / zmm5 = the eight tweaks for these blocks, derived from xmm8.
+ vshufi64x2 zmm5, zmm8, zmm8, 0
+ vpsrlvq zmm6, zmm5, zmm15
+ vpclmulqdq zmm7, zmm6, zmm13, 1
+ vpslldq zmm6, zmm6, 8
+ vpsllvq zmm4, zmm5, zmm14
+ vpternlogq zmm4, zmm7, zmm6, 150
+ vpsrlq zmm9, zmm4, 60
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm5, zmm4, 4
+ vpternlogq zmm5, zmm10, zmm9, 150
+ ; aes_enc_block
+ vpternlogq zmm0, zmm16, zmm4, 150
+ vpternlogq zmm1, zmm16, zmm5, 150
+ vaesenc zmm0, zmm0, zmm17
+ vaesenc zmm1, zmm1, zmm17
+ vaesenc zmm0, zmm0, zmm18
+ vaesenc zmm1, zmm1, zmm18
+ vaesenc zmm0, zmm0, zmm19
+ vaesenc zmm1, zmm1, zmm19
+ vaesenc zmm0, zmm0, zmm20
+ vaesenc zmm1, zmm1, zmm20
+ vaesenc zmm0, zmm0, zmm21
+ vaesenc zmm1, zmm1, zmm21
+ vaesenc zmm0, zmm0, zmm22
+ vaesenc zmm1, zmm1, zmm22
+ vaesenc zmm0, zmm0, zmm23
+ vaesenc zmm1, zmm1, zmm23
+ vaesenc zmm0, zmm0, zmm24
+ vaesenc zmm1, zmm1, zmm24
+ vaesenc zmm0, zmm0, zmm25
+ vaesenc zmm1, zmm1, zmm25
+ cmp r9d, 11
+ vmovdqa64 zmm9, zmm26
+ jl L_AES_XTS_encrypt_update_avx512_aes_enc_128_aes_enc_block_last
+ vaesenc zmm0, zmm0, zmm26
+ vaesenc zmm1, zmm1, zmm26
+ vaesenc zmm0, zmm0, zmm27
+ vaesenc zmm1, zmm1, zmm27
+ cmp r9d, 13
+ vmovdqa64 zmm9, zmm28
+ jl L_AES_XTS_encrypt_update_avx512_aes_enc_128_aes_enc_block_last
+ vaesenc zmm0, zmm0, zmm28
+ vaesenc zmm1, zmm1, zmm28
+ vaesenc zmm0, zmm0, zmm29
+ vaesenc zmm1, zmm1, zmm29
+ vmovdqa64 zmm9, zmm30
+L_AES_XTS_encrypt_update_avx512_aes_enc_128_aes_enc_block_last:
+ vaesenclast zmm0, zmm0, zmm9
+ vaesenclast zmm1, zmm1, zmm9
+ vpxorq zmm0, zmm0, zmm4
+ vmovdqu64 [rdx], zmm0
+ vpxorq zmm1, zmm1, zmm5
+ vmovdqu64 [rdx+64], zmm1
+ ; Next tweak = last tweak used (lane 3 of zmm5) doubled once: shift each
+ ; qword left by one and XOR in the xmm12 constant under the sign masks.
+ vextracti32x4 xmm8, zmm5, 3
+ vpshufd xmm9, xmm8, 19
+ vpaddq xmm8, xmm8, xmm8
+ vpsrad xmm9, xmm9, 31
+ vpternlogd xmm8, xmm9, xmm12, 120
+ add r12d, 128
+L_AES_XTS_encrypt_update_avx512_done_128:
+ ; Take the 64-byte step only if the count rounded down to 64 is beyond r12.
+ mov r11d, eax
+ and r11d, 4294967232
+ cmp r12d, r11d
+ je L_AES_XTS_encrypt_update_avx512_done_64
+ ; 64 bytes of input
+ ; aes_enc_64
+ lea rcx, QWORD PTR [rdi+r12]
+ lea rdx, QWORD PTR [rsi+r12]
+ vmovdqu64 zmm0, [rcx]
+ vshufi64x2 zmm5, zmm8, zmm8, 0
+ vpsrlvq zmm6, zmm5, zmm15
+ vpclmulqdq zmm7, zmm6, zmm13, 1
+ vpslldq zmm6, zmm6, 8
+ vpsllvq zmm4, zmm5, zmm14
+ vpternlogq zmm4, zmm7, zmm6, 150
+ ; aes_enc_block
+ vpternlogq zmm0, zmm16, zmm4, 150
+ vaesenc zmm0, zmm0, zmm17
+ vaesenc zmm0, zmm0, zmm18
+ vaesenc zmm0, zmm0, zmm19
+ vaesenc zmm0, zmm0, zmm20
+ vaesenc zmm0, zmm0, zmm21
+ vaesenc zmm0, zmm0, zmm22
+ vaesenc zmm0, zmm0, zmm23
+ vaesenc zmm0, zmm0, zmm24
+ vaesenc zmm0, zmm0, zmm25
+ cmp r9d, 11
+ vmovdqa64 zmm9, zmm26
+ jl L_AES_XTS_encrypt_update_avx512_aes_enc_64_aes_enc_block_last
+ vaesenc zmm0, zmm0, zmm26
+ vaesenc zmm0, zmm0, zmm27
+ cmp r9d, 13
+ vmovdqa64 zmm9, zmm28
+ jl L_AES_XTS_encrypt_update_avx512_aes_enc_64_aes_enc_block_last
+ vaesenc zmm0, zmm0, zmm28
+ vaesenc zmm0, zmm0, zmm29
+ vmovdqa64 zmm9, zmm30
+L_AES_XTS_encrypt_update_avx512_aes_enc_64_aes_enc_block_last:
+ vaesenclast zmm0, zmm0, zmm9
+ vpxorq zmm0, zmm0, zmm4
+ vmovdqu64 [rdx], zmm0
+ ; Next tweak = lane 3 of zmm4 doubled once.
+ vextracti32x4 xmm8, zmm4, 3
+ vpshufd xmm9, xmm8, 19
+ vpaddq xmm8, xmm8, xmm8
+ vpsrad xmm9, xmm9, 31
+ vpternlogd xmm8, xmm9, xmm12, 120
+ add r12d, 64
+L_AES_XTS_encrypt_update_avx512_done_64:
+ ; Take the 32-byte step only if the count rounded down to 32 is beyond r12.
+ mov r11d, eax
+ and r11d, 4294967264
+ cmp r12d, r11d
+ je L_AES_XTS_encrypt_update_avx512_done_32
+ ; 32 bytes of input
+ ; aes_enc_32
+ lea rcx, QWORD PTR [rdi+r12]
+ lea rdx, QWORD PTR [rsi+r12]
+ vmovdqu64 ymm0, [rcx]
+ vshufi64x2 zmm5, zmm8, zmm8, 0
+ vpsrlvq zmm6, zmm5, zmm15
+ vpclmulqdq zmm7, zmm6, zmm13, 1
+ vpslldq zmm6, zmm6, 8
+ vpsllvq zmm4, zmm5, zmm14
+ vpternlogq zmm4, zmm7, zmm6, 150
+ ; aes_enc_block
+ ; Only the low two lanes (ymm) are encrypted here.
+ vpternlogq ymm0, ymm16, ymm4, 150
+ vaesenc ymm0, ymm0, ymm17
+ vaesenc ymm0, ymm0, ymm18
+ vaesenc ymm0, ymm0, ymm19
+ vaesenc ymm0, ymm0, ymm20
+ vaesenc ymm0, ymm0, ymm21
+ vaesenc ymm0, ymm0, ymm22
+ vaesenc ymm0, ymm0, ymm23
+ vaesenc ymm0, ymm0, ymm24
+ vaesenc ymm0, ymm0, ymm25
+ cmp r9d, 11
+ vmovdqa64 ymm9, ymm26
+ jl L_AES_XTS_encrypt_update_avx512_aes_enc_32_aes_enc_block_last
+ vaesenc ymm0, ymm0, ymm26
+ vaesenc ymm0, ymm0, ymm27
+ cmp r9d, 13
+ vmovdqa64 ymm9, ymm28
+ jl L_AES_XTS_encrypt_update_avx512_aes_enc_32_aes_enc_block_last
+ vaesenc ymm0, ymm0, ymm28
+ vaesenc ymm0, ymm0, ymm29
+ vmovdqa64 ymm9, ymm30
+L_AES_XTS_encrypt_update_avx512_aes_enc_32_aes_enc_block_last:
+ vaesenclast ymm0, ymm0, ymm9
+ vpxorq ymm0, ymm0, ymm4
+ vmovdqu64 [rdx], ymm0
+ ; Lane 2 of zmm4 already holds the tweak following the two just used.
+ vextracti32x4 xmm8, zmm4, 2
+ add r12d, 32
+L_AES_XTS_encrypt_update_avx512_done_32:
+ cmp r12d, eax
+ mov r11d, eax
+ je L_AES_XTS_encrypt_update_avx512_done_enc
+ ; Fewer than 16 bytes left: go straight to ciphertext stealing.
+ sub r11d, r12d
+ cmp r11d, 16
+ mov r11d, eax
+ jl L_AES_XTS_encrypt_update_avx512_last_15
+ ; r11d = byte count rounded down to a multiple of 16.
+ and r11d, 4294967280
+ ; 16 bytes of input
+L_AES_XTS_encrypt_update_avx512_enc_16:
+ lea rcx, QWORD PTR [rdi+r12]
+ vmovdqu xmm0, OWORD PTR [rcx]
+ vpxor xmm0, xmm0, xmm8
+ ; aes_enc_block
+ vpxor xmm0, xmm0, [r10]
+ vmovdqu xmm5, OWORD PTR [r10+16]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+32]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+48]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+64]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+80]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+96]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+112]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+128]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+144]
+ vaesenc xmm0, xmm0, xmm5
+ cmp r9d, 11
+ vmovdqu xmm5, OWORD PTR [r10+160]
+ jl L_AES_XTS_encrypt_update_avx512_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r10+176]
+ vaesenc xmm0, xmm0, xmm6
+ cmp r9d, 13
+ vmovdqu xmm5, OWORD PTR [r10+192]
+ jl L_AES_XTS_encrypt_update_avx512_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r10+208]
+ vaesenc xmm0, xmm0, xmm6
+ vmovdqu xmm5, OWORD PTR [r10+224]
+L_AES_XTS_encrypt_update_avx512_aes_enc_block_last:
+ vaesenclast xmm0, xmm0, xmm5
+ vpxor xmm0, xmm0, xmm8
+ lea rcx, QWORD PTR [rsi+r12]
+ vmovdqu OWORD PTR [rcx], xmm0
+ ; Double the tweak for the following block.
+ vpshufd xmm4, xmm8, 19
+ vpaddq xmm8, xmm8, xmm8
+ vpsrad xmm4, xmm4, 31
+ vpternlogd xmm8, xmm4, xmm12, 120
+ add r12d, 16
+ cmp r12d, r11d
+ jl L_AES_XTS_encrypt_update_avx512_enc_16
+ cmp r12d, eax
+ je L_AES_XTS_encrypt_update_avx512_done_enc
+L_AES_XTS_encrypt_update_avx512_last_15:
+ ; Ciphertext stealing: copy the previous output block to the stack scratch.
+ sub r12, 16
+ lea rcx, QWORD PTR [rsi+r12]
+ vmovdqu xmm0, OWORD PTR [rcx]
+ add r12, 16
+ vmovdqu OWORD PTR [rsp], xmm0
+ xor rdx, rdx
+L_AES_XTS_encrypt_update_avx512_last_15_byte_loop:
+ ; For each trailing byte: emit the scratch byte as output and replace it in
+ ; the scratch block with the corresponding input byte.
+ mov r11b, BYTE PTR [rsp+rdx]
+ mov cl, BYTE PTR [rdi+r12]
+ mov BYTE PTR [rsi+r12], r11b
+ mov BYTE PTR [rsp+rdx], cl
+ inc r12d
+ inc edx
+ cmp r12d, eax
+ jl L_AES_XTS_encrypt_update_avx512_last_15_byte_loop
+ ; Rewind r12 to the previous full block, which receives the re-encrypted
+ ; scratch block.
+ sub r12, rdx
+ vmovdqu xmm0, OWORD PTR [rsp]
+ sub r12, 16
+ vpxor xmm0, xmm0, xmm8
+ ; aes_enc_block
+ vpxor xmm0, xmm0, [r10]
+ vmovdqu xmm5, OWORD PTR [r10+16]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+32]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+48]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+64]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+80]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+96]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+112]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+128]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+144]
+ vaesenc xmm0, xmm0, xmm5
+ cmp r9d, 11
+ vmovdqu xmm5, OWORD PTR [r10+160]
+ jl L_AES_XTS_encrypt_update_avx512_last_15_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r10+176]
+ vaesenc xmm0, xmm0, xmm6
+ cmp r9d, 13
+ vmovdqu xmm5, OWORD PTR [r10+192]
+ jl L_AES_XTS_encrypt_update_avx512_last_15_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r10+208]
+ vaesenc xmm0, xmm0, xmm6
+ vmovdqu xmm5, OWORD PTR [r10+224]
+L_AES_XTS_encrypt_update_avx512_last_15_aes_enc_block_last:
+ vaesenclast xmm0, xmm0, xmm5
+ vpxor xmm0, xmm0, xmm8
+ lea rcx, QWORD PTR [rsi+r12]
+ vmovdqu OWORD PTR [rcx], xmm0
+L_AES_XTS_encrypt_update_avx512_done_enc:
+ ; Write the running tweak back for the caller, then restore the callee-saved
+ ; xmm registers and unwind the frame.
+ vmovdqu OWORD PTR [r8], xmm8
+ vmovdqu xmm6, OWORD PTR [rsp+64]
+ vmovdqu xmm7, OWORD PTR [rsp+80]
+ vmovdqu xmm8, OWORD PTR [rsp+96]
+ vmovdqu xmm9, OWORD PTR [rsp+112]
+ vmovdqu xmm10, OWORD PTR [rsp+128]
+ vmovdqu xmm11, OWORD PTR [rsp+144]
+ vmovdqu xmm12, OWORD PTR [rsp+160]
+ vmovdqu xmm13, OWORD PTR [rsp+176]
+ vmovdqu xmm14, OWORD PTR [rsp+192]
+ vmovdqu xmm15, OWORD PTR [rsp+208]
+ add rsp, 224
+ pop r12
+ pop rsi
+ pop rdi
+ ret
+AES_XTS_encrypt_update_avx512 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_XTS_decrypt_avx512 PROC ; XTS decrypt of eax bytes with 512-bit VAES; the starting tweak is the vaesenc encryption of the 16 bytes at arg 4
+ push rdi ; rdi, rsi, r12, r13 are saved here and restored before ret
+ push rsi
+ push r12
+ push r13
+ mov rdi, rcx ; arg 1: source buffer (every block load uses rdi+r13)
+ mov rsi, rdx ; arg 2: destination buffer (every block store uses rsi+r13)
+ mov rax, r8 ; arg 3: byte count; only eax is read below
+ mov r12, r9 ; arg 4: pointer to the 16-byte tweak input
+ mov r8, QWORD PTR [rsp+72] ; stack arg: round keys applied to the data with vaesdec
+ mov r9, QWORD PTR [rsp+80] ; stack arg: round keys applied to the tweak with vaesenc
+ mov r10d, DWORD PTR [rsp+88] ; stack arg: round count, compared against 11 and 13
+ sub rsp, 224 ; [rsp..rsp+15] is scratch for the tail; [rsp+64..rsp+223] holds xmm6-xmm15
+ vmovdqu OWORD PTR [rsp+64], xmm6
+ vmovdqu OWORD PTR [rsp+80], xmm7
+ vmovdqu OWORD PTR [rsp+96], xmm8
+ vmovdqu OWORD PTR [rsp+112], xmm9
+ vmovdqu OWORD PTR [rsp+128], xmm10
+ vmovdqu OWORD PTR [rsp+144], xmm11
+ vmovdqu OWORD PTR [rsp+160], xmm12
+ vmovdqu OWORD PTR [rsp+176], xmm13
+ vmovdqu OWORD PTR [rsp+192], xmm14
+ vmovdqu OWORD PTR [rsp+208], xmm15
+ vmovdqu xmm12, OWORD PTR L_avx512_aes_xts_gc_xts ; constant ANDed into the single-block tweak update (defined outside this chunk)
+ vbroadcasti32x4 zmm13, ptr_L_avx512_aes_xts_poly ; carry-less multiplier used by the vector tweak updates
+ vmovdqu64 zmm14, ptr_L_avx512_aes_xts_shl ; per-qword left shift counts for vpsllvq
+ vmovdqu64 zmm15, ptr_L_avx512_aes_xts_shr ; per-qword right shift counts for vpsrlvq
+ vmovdqu xmm8, OWORD PTR [r12] ; xmm8 = tweak input, encrypted in place next
+ ; aes_enc_block
+ vpxor xmm8, xmm8, [r9] ; round 0: XOR with the first tweak round key
+ vmovdqu xmm5, OWORD PTR [r9+16]
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+32]
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+48]
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+64]
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+80]
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+96]
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+112]
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+128]
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+144]
+ vaesenc xmm8, xmm8, xmm5
+ cmp r10d, 11 ; fewer than 11 rounds: the key at +160 is the final round key
+ vmovdqu xmm5, OWORD PTR [r9+160]
+ jl L_AES_XTS_decrypt_avx512_tweak_aes_enc_block_last
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm6, OWORD PTR [r9+176]
+ vaesenc xmm8, xmm8, xmm6
+ cmp r10d, 13 ; fewer than 13 rounds: the key at +192 is the final round key
+ vmovdqu xmm5, OWORD PTR [r9+192]
+ jl L_AES_XTS_decrypt_avx512_tweak_aes_enc_block_last
+ vaesenc xmm8, xmm8, xmm5
+ vmovdqu xmm6, OWORD PTR [r9+208]
+ vaesenc xmm8, xmm8, xmm6
+ vmovdqu xmm5, OWORD PTR [r9+224]
+L_AES_XTS_decrypt_avx512_tweak_aes_enc_block_last:
+ vaesenclast xmm8, xmm8, xmm5 ; xmm8 = encrypted tweak used for the first data block
+ xor r13d, r13d ; r13 = byte offset processed so far
+ mov r11d, eax
+ and r11d, 4294967280 ; r11d = size rounded down to a multiple of 16
+ cmp r11d, eax
+ je L_AES_XTS_decrypt_avx512_mul16_256 ; size is a whole number of blocks
+ sub r11d, 16 ; partial tail present: hold one full block back for the tail path
+ cmp r11d, 16
+ jl L_AES_XTS_decrypt_avx512_last_31_start ; nothing but the held-back block and the partial bytes
+L_AES_XTS_decrypt_avx512_mul16_256:
+ cmp r11d, 32
+ jl L_AES_XTS_decrypt_avx512_done_128 ; under 32 bytes of whole blocks: skip broadcasting the round keys
+ vbroadcasti32x4 zmm16, [r8] ; zmm16..zmm30 = data round keys replicated into every 128-bit lane
+ vbroadcasti32x4 zmm17, [r8+16]
+ vbroadcasti32x4 zmm18, [r8+32]
+ vbroadcasti32x4 zmm19, [r8+48]
+ vbroadcasti32x4 zmm20, [r8+64]
+ vbroadcasti32x4 zmm21, [r8+80]
+ vbroadcasti32x4 zmm22, [r8+96]
+ vbroadcasti32x4 zmm23, [r8+112]
+ vbroadcasti32x4 zmm24, [r8+128]
+ vbroadcasti32x4 zmm25, [r8+144]
+ vbroadcasti32x4 zmm26, [r8+160]
+ cmp r10d, 11 ; keys past +160 are loaded only when the round count needs them
+ jl L_AES_XTS_decrypt_avx512_key_cached
+ vbroadcasti32x4 zmm27, [r8+176]
+ vbroadcasti32x4 zmm28, [r8+192]
+ cmp r10d, 13
+ jl L_AES_XTS_decrypt_avx512_key_cached
+ vbroadcasti32x4 zmm29, [r8+208]
+ vbroadcasti32x4 zmm30, [r8+224]
+L_AES_XTS_decrypt_avx512_key_cached:
+ cmp r11d, 256
+ jl L_AES_XTS_decrypt_avx512_done_256
+ and r11d, 4294967040 ; r11d = bytes covered by whole 256-byte iterations
+ vshufi64x2 zmm5, zmm8, zmm8, 0 ; copy the current tweak into all four 128-bit lanes
+ vpsrlvq zmm6, zmm5, zmm15
+ vpclmulqdq zmm7, zmm6, zmm13, 1
+ vpslldq zmm6, zmm6, 8
+ vpsllvq zmm4, zmm5, zmm14
+ vpternlogq zmm4, zmm7, zmm6, 150 ; zmm4 = tweaks for blocks 0-3 (per-lane step comes from the shl/shr tables, not visible here)
+ vpsrlq zmm9, zmm4, 60
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm5, zmm4, 4
+ vpternlogq zmm5, zmm10, zmm9, 150 ; zmm5 = zmm4 shifted left 4 bits with the carried-out bits folded back: blocks 4-7
+ vpsrlq zmm9, zmm5, 60
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm6, zmm5, 4
+ vpternlogq zmm6, zmm10, zmm9, 150 ; zmm6 = same step applied to zmm5: blocks 8-11
+ vpsrlq zmm9, zmm6, 60
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm7, zmm6, 4
+ vpternlogq zmm7, zmm10, zmm9, 150 ; zmm7 = same step applied to zmm6: blocks 12-15
+L_AES_XTS_decrypt_avx512_dec_256:
+ ; 256 bytes of input
+ ; aes_dec_256
+ lea rcx, QWORD PTR [rdi+r13]
+ lea rdx, QWORD PTR [rsi+r13]
+ vmovdqu64 zmm0, [rcx] ; sixteen blocks, four per register
+ vmovdqu64 zmm1, [rcx+64]
+ vmovdqu64 zmm2, [rcx+128]
+ vmovdqu64 zmm3, [rcx+192]
+ ; aes_dec_block
+ vpternlogq zmm0, zmm16, zmm4, 150 ; data ^ round key 0 ^ tweak (150 = three-way XOR)
+ vpternlogq zmm1, zmm16, zmm5, 150
+ vpternlogq zmm2, zmm16, zmm6, 150
+ vpternlogq zmm3, zmm16, zmm7, 150
+ vaesdec zmm0, zmm0, zmm17
+ vaesdec zmm1, zmm1, zmm17
+ vaesdec zmm2, zmm2, zmm17
+ vaesdec zmm3, zmm3, zmm17
+ vaesdec zmm0, zmm0, zmm18
+ vaesdec zmm1, zmm1, zmm18
+ vaesdec zmm2, zmm2, zmm18
+ vaesdec zmm3, zmm3, zmm18
+ vaesdec zmm0, zmm0, zmm19
+ vaesdec zmm1, zmm1, zmm19
+ vaesdec zmm2, zmm2, zmm19
+ vaesdec zmm3, zmm3, zmm19
+ vaesdec zmm0, zmm0, zmm20
+ vaesdec zmm1, zmm1, zmm20
+ vaesdec zmm2, zmm2, zmm20
+ vaesdec zmm3, zmm3, zmm20
+ vaesdec zmm0, zmm0, zmm21
+ vaesdec zmm1, zmm1, zmm21
+ vaesdec zmm2, zmm2, zmm21
+ vaesdec zmm3, zmm3, zmm21
+ vaesdec zmm0, zmm0, zmm22
+ vaesdec zmm1, zmm1, zmm22
+ vaesdec zmm2, zmm2, zmm22
+ vaesdec zmm3, zmm3, zmm22
+ vaesdec zmm0, zmm0, zmm23
+ vaesdec zmm1, zmm1, zmm23
+ vaesdec zmm2, zmm2, zmm23
+ vaesdec zmm3, zmm3, zmm23
+ vaesdec zmm0, zmm0, zmm24
+ vaesdec zmm1, zmm1, zmm24
+ vaesdec zmm2, zmm2, zmm24
+ vaesdec zmm3, zmm3, zmm24
+ vaesdec zmm0, zmm0, zmm25
+ vaesdec zmm1, zmm1, zmm25
+ vaesdec zmm2, zmm2, zmm25
+ vaesdec zmm3, zmm3, zmm25
+ cmp r10d, 11 ; flags survive the vmovdqa64 below; zmm9 carries whichever key is final
+ vmovdqa64 zmm9, zmm26
+ jl L_AES_XTS_decrypt_avx512_aes_dec_256_aes_dec_block_last
+ vaesdec zmm0, zmm0, zmm26
+ vaesdec zmm1, zmm1, zmm26
+ vaesdec zmm2, zmm2, zmm26
+ vaesdec zmm3, zmm3, zmm26
+ vaesdec zmm0, zmm0, zmm27
+ vaesdec zmm1, zmm1, zmm27
+ vaesdec zmm2, zmm2, zmm27
+ vaesdec zmm3, zmm3, zmm27
+ cmp r10d, 13
+ vmovdqa64 zmm9, zmm28
+ jl L_AES_XTS_decrypt_avx512_aes_dec_256_aes_dec_block_last
+ vaesdec zmm0, zmm0, zmm28
+ vaesdec zmm1, zmm1, zmm28
+ vaesdec zmm2, zmm2, zmm28
+ vaesdec zmm3, zmm3, zmm28
+ vaesdec zmm0, zmm0, zmm29
+ vaesdec zmm1, zmm1, zmm29
+ vaesdec zmm2, zmm2, zmm29
+ vaesdec zmm3, zmm3, zmm29
+ vmovdqa64 zmm9, zmm30
+L_AES_XTS_decrypt_avx512_aes_dec_256_aes_dec_block_last:
+ vaesdeclast zmm0, zmm0, zmm9
+ vaesdeclast zmm1, zmm1, zmm9
+ vaesdeclast zmm2, zmm2, zmm9
+ vaesdeclast zmm3, zmm3, zmm9
+ vpxorq zmm0, zmm0, zmm4 ; XOR the tweak back in after the cipher
+ vmovdqu64 [rdx], zmm0
+ vpsrlq zmm9, zmm4, 48
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm4, zmm4, 16
+ vpternlogq zmm4, zmm10, zmm9, 150 ; zmm4 shifted left 16 bits with carry folded back: tweaks for the next iteration
+ vpxorq zmm1, zmm1, zmm5
+ vmovdqu64 [rdx+64], zmm1
+ vpsrlq zmm9, zmm5, 48
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm5, zmm5, 16
+ vpternlogq zmm5, zmm10, zmm9, 150
+ vpxorq zmm2, zmm2, zmm6
+ vmovdqu64 [rdx+128], zmm2
+ vpsrlq zmm9, zmm6, 48
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm6, zmm6, 16
+ vpternlogq zmm6, zmm10, zmm9, 150
+ vpxorq zmm3, zmm3, zmm7
+ vmovdqu64 [rdx+192], zmm3
+ vpsrlq zmm9, zmm7, 48
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm7, zmm7, 16
+ vpternlogq zmm7, zmm10, zmm9, 150
+ add r13d, 256
+ cmp r13d, r11d
+ jl L_AES_XTS_decrypt_avx512_dec_256
+ vextracti32x4 xmm8, zmm4, 0 ; lane 0 of the already-advanced zmm4 is the next unused tweak
+L_AES_XTS_decrypt_avx512_done_256:
+ cmp r13d, eax ; flags survive the mov below and feed the je
+ mov r11d, eax
+ je L_AES_XTS_decrypt_avx512_done_dec ; everything processed
+ and r11d, 4294967280
+ cmp r11d, eax
+ je L_AES_XTS_decrypt_avx512_mul16_128
+ sub r11d, 16 ; partial tail present: hold one full block back again
+ sub r11d, r13d
+ cmp r11d, 16
+ jl L_AES_XTS_decrypt_avx512_last_31_start
+ add r11d, r13d ; back to an absolute end offset
+L_AES_XTS_decrypt_avx512_mul16_128:
+ and r11d, 4294967168 ; round the limit down to a multiple of 128
+ cmp r13d, r11d
+ je L_AES_XTS_decrypt_avx512_done_128 ; no 128-byte chunk left
+ ; 128 bytes of input
+ ; aes_dec_128
+ lea rcx, QWORD PTR [rdi+r13]
+ lea rdx, QWORD PTR [rsi+r13]
+ vmovdqu64 zmm0, [rcx]
+ vmovdqu64 zmm1, [rcx+64]
+ vshufi64x2 zmm5, zmm8, zmm8, 0 ; rebuild vector tweaks from the scalar tweak in xmm8
+ vpsrlvq zmm6, zmm5, zmm15
+ vpclmulqdq zmm7, zmm6, zmm13, 1
+ vpslldq zmm6, zmm6, 8
+ vpsllvq zmm4, zmm5, zmm14
+ vpternlogq zmm4, zmm7, zmm6, 150 ; zmm4 = tweaks for blocks 0-3
+ vpsrlq zmm9, zmm4, 60
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm5, zmm4, 4
+ vpternlogq zmm5, zmm10, zmm9, 150 ; zmm5 = tweaks for blocks 4-7
+ ; aes_dec_block
+ vpternlogq zmm0, zmm16, zmm4, 150
+ vpternlogq zmm1, zmm16, zmm5, 150
+ vaesdec zmm0, zmm0, zmm17
+ vaesdec zmm1, zmm1, zmm17
+ vaesdec zmm0, zmm0, zmm18
+ vaesdec zmm1, zmm1, zmm18
+ vaesdec zmm0, zmm0, zmm19
+ vaesdec zmm1, zmm1, zmm19
+ vaesdec zmm0, zmm0, zmm20
+ vaesdec zmm1, zmm1, zmm20
+ vaesdec zmm0, zmm0, zmm21
+ vaesdec zmm1, zmm1, zmm21
+ vaesdec zmm0, zmm0, zmm22
+ vaesdec zmm1, zmm1, zmm22
+ vaesdec zmm0, zmm0, zmm23
+ vaesdec zmm1, zmm1, zmm23
+ vaesdec zmm0, zmm0, zmm24
+ vaesdec zmm1, zmm1, zmm24
+ vaesdec zmm0, zmm0, zmm25
+ vaesdec zmm1, zmm1, zmm25
+ cmp r10d, 11
+ vmovdqa64 zmm9, zmm26
+ jl L_AES_XTS_decrypt_avx512_aes_dec_128_aes_dec_block_last
+ vaesdec zmm0, zmm0, zmm26
+ vaesdec zmm1, zmm1, zmm26
+ vaesdec zmm0, zmm0, zmm27
+ vaesdec zmm1, zmm1, zmm27
+ cmp r10d, 13
+ vmovdqa64 zmm9, zmm28
+ jl L_AES_XTS_decrypt_avx512_aes_dec_128_aes_dec_block_last
+ vaesdec zmm0, zmm0, zmm28
+ vaesdec zmm1, zmm1, zmm28
+ vaesdec zmm0, zmm0, zmm29
+ vaesdec zmm1, zmm1, zmm29
+ vmovdqa64 zmm9, zmm30
+L_AES_XTS_decrypt_avx512_aes_dec_128_aes_dec_block_last:
+ vaesdeclast zmm0, zmm0, zmm9
+ vaesdeclast zmm1, zmm1, zmm9
+ vpxorq zmm0, zmm0, zmm4
+ vmovdqu64 [rdx], zmm0
+ vpxorq zmm1, zmm1, zmm5
+ vmovdqu64 [rdx+64], zmm1
+ vextracti32x4 xmm8, zmm5, 3 ; tweak of the last block just processed
+ vpshufd xmm9, xmm8, 19
+ vpaddq xmm8, xmm8, xmm8 ; shift each qword left by one bit
+ vpsrad xmm9, xmm9, 31 ; turn the shuffled top bits into all-ones/all-zero masks
+ vpternlogd xmm8, xmm9, xmm12, 120 ; 120 = A ^ (B & C): fold masked constant in, giving the next block's tweak
+ add r13d, 128
+L_AES_XTS_decrypt_avx512_done_128:
+ cmp r13d, eax
+ mov r11d, eax
+ je L_AES_XTS_decrypt_avx512_done_dec
+ and r11d, 4294967280
+ cmp r11d, eax
+ je L_AES_XTS_decrypt_avx512_mul16_64
+ sub r11d, 16 ; same tail hold-back logic as after the 256-byte loop
+ sub r11d, r13d
+ cmp r11d, 16
+ jl L_AES_XTS_decrypt_avx512_last_31_start
+ add r11d, r13d
+L_AES_XTS_decrypt_avx512_mul16_64:
+ and r11d, 4294967232 ; round the limit down to a multiple of 64
+ cmp r13d, r11d
+ je L_AES_XTS_decrypt_avx512_done_64 ; no 64-byte chunk left
+ ; 64 bytes of input
+ ; aes_dec_64
+ lea rcx, QWORD PTR [rdi+r13]
+ lea rdx, QWORD PTR [rsi+r13]
+ vmovdqu64 zmm0, [rcx]
+ vshufi64x2 zmm5, zmm8, zmm8, 0
+ vpsrlvq zmm6, zmm5, zmm15
+ vpclmulqdq zmm7, zmm6, zmm13, 1
+ vpslldq zmm6, zmm6, 8
+ vpsllvq zmm4, zmm5, zmm14
+ vpternlogq zmm4, zmm7, zmm6, 150 ; zmm4 = tweaks for these four blocks
+ ; aes_dec_block
+ vpternlogq zmm0, zmm16, zmm4, 150
+ vaesdec zmm0, zmm0, zmm17
+ vaesdec zmm0, zmm0, zmm18
+ vaesdec zmm0, zmm0, zmm19
+ vaesdec zmm0, zmm0, zmm20
+ vaesdec zmm0, zmm0, zmm21
+ vaesdec zmm0, zmm0, zmm22
+ vaesdec zmm0, zmm0, zmm23
+ vaesdec zmm0, zmm0, zmm24
+ vaesdec zmm0, zmm0, zmm25
+ cmp r10d, 11
+ vmovdqa64 zmm9, zmm26
+ jl L_AES_XTS_decrypt_avx512_aes_dec_64_aes_dec_block_last
+ vaesdec zmm0, zmm0, zmm26
+ vaesdec zmm0, zmm0, zmm27
+ cmp r10d, 13
+ vmovdqa64 zmm9, zmm28
+ jl L_AES_XTS_decrypt_avx512_aes_dec_64_aes_dec_block_last
+ vaesdec zmm0, zmm0, zmm28
+ vaesdec zmm0, zmm0, zmm29
+ vmovdqa64 zmm9, zmm30
+L_AES_XTS_decrypt_avx512_aes_dec_64_aes_dec_block_last:
+ vaesdeclast zmm0, zmm0, zmm9
+ vpxorq zmm0, zmm0, zmm4
+ vmovdqu64 [rdx], zmm0
+ vextracti32x4 xmm8, zmm4, 3 ; tweak of the last block just processed
+ vpshufd xmm9, xmm8, 19
+ vpaddq xmm8, xmm8, xmm8
+ vpsrad xmm9, xmm9, 31
+ vpternlogd xmm8, xmm9, xmm12, 120 ; advance to the next block's tweak
+ add r13d, 64
+L_AES_XTS_decrypt_avx512_done_64:
+ cmp r13d, eax
+ mov r11d, eax
+ je L_AES_XTS_decrypt_avx512_done_dec
+ and r11d, 4294967280
+ cmp r11d, eax
+ je L_AES_XTS_decrypt_avx512_mul16_32
+ sub r11d, 16
+ sub r11d, r13d
+ cmp r11d, 16
+ jl L_AES_XTS_decrypt_avx512_last_31_start
+ add r11d, r13d
+L_AES_XTS_decrypt_avx512_mul16_32:
+ and r11d, 4294967264 ; round the limit down to a multiple of 32
+ cmp r13d, r11d
+ je L_AES_XTS_decrypt_avx512_done_32 ; no 32-byte chunk left
+ ; 32 bytes of input
+ ; aes_dec_32
+ lea rcx, QWORD PTR [rdi+r13]
+ lea rdx, QWORD PTR [rsi+r13]
+ vmovdqu64 ymm0, [rcx]
+ vshufi64x2 zmm5, zmm8, zmm8, 0
+ vpsrlvq zmm6, zmm5, zmm15
+ vpclmulqdq zmm7, zmm6, zmm13, 1
+ vpslldq zmm6, zmm6, 8
+ vpsllvq zmm4, zmm5, zmm14
+ vpternlogq zmm4, zmm7, zmm6, 150 ; four tweaks are computed; the low two are used for the data
+ ; aes_dec_block
+ vpternlogq ymm0, ymm16, ymm4, 150
+ vaesdec ymm0, ymm0, ymm17
+ vaesdec ymm0, ymm0, ymm18
+ vaesdec ymm0, ymm0, ymm19
+ vaesdec ymm0, ymm0, ymm20
+ vaesdec ymm0, ymm0, ymm21
+ vaesdec ymm0, ymm0, ymm22
+ vaesdec ymm0, ymm0, ymm23
+ vaesdec ymm0, ymm0, ymm24
+ vaesdec ymm0, ymm0, ymm25
+ cmp r10d, 11
+ vmovdqa64 ymm9, ymm26
+ jl L_AES_XTS_decrypt_avx512_aes_dec_32_aes_dec_block_last
+ vaesdec ymm0, ymm0, ymm26
+ vaesdec ymm0, ymm0, ymm27
+ cmp r10d, 13
+ vmovdqa64 ymm9, ymm28
+ jl L_AES_XTS_decrypt_avx512_aes_dec_32_aes_dec_block_last
+ vaesdec ymm0, ymm0, ymm28
+ vaesdec ymm0, ymm0, ymm29
+ vmovdqa64 ymm9, ymm30
+L_AES_XTS_decrypt_avx512_aes_dec_32_aes_dec_block_last:
+ vaesdeclast ymm0, ymm0, ymm9
+ vpxorq ymm0, ymm0, ymm4
+ vmovdqu64 [rdx], ymm0
+ vextracti32x4 xmm8, zmm4, 2 ; third lane was computed but unused: it is the next block's tweak
+ add r13d, 32
+L_AES_XTS_decrypt_avx512_done_32:
+ cmp r13d, eax
+ mov r11d, eax
+ je L_AES_XTS_decrypt_avx512_done_dec
+ and r11d, 4294967280
+ cmp r11d, eax
+ je L_AES_XTS_decrypt_avx512_mul16
+ sub r11d, 16
+ sub r11d, r13d
+ cmp r11d, 16
+ jl L_AES_XTS_decrypt_avx512_last_31_start
+ add r11d, r13d
+L_AES_XTS_decrypt_avx512_mul16:
+L_AES_XTS_decrypt_avx512_dec_16: ; one block per iteration, keys read from memory at r8 rather than zmm16+
+ ; 16 bytes of input
+ lea rcx, QWORD PTR [rdi+r13]
+ vmovdqu xmm0, OWORD PTR [rcx]
+ vpxor xmm0, xmm0, xmm8 ; XOR tweak in before the cipher
+ ; aes_dec_block
+ vpxor xmm0, xmm0, [r8]
+ vmovdqu xmm5, OWORD PTR [r8+16]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+32]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+48]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+64]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+80]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+96]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+112]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+128]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+144]
+ vaesdec xmm0, xmm0, xmm5
+ cmp r10d, 11
+ vmovdqu xmm5, OWORD PTR [r8+160]
+ jl L_AES_XTS_decrypt_avx512_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r8+176]
+ vaesdec xmm0, xmm0, xmm6
+ cmp r10d, 13
+ vmovdqu xmm5, OWORD PTR [r8+192]
+ jl L_AES_XTS_decrypt_avx512_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r8+208]
+ vaesdec xmm0, xmm0, xmm6
+ vmovdqu xmm5, OWORD PTR [r8+224]
+L_AES_XTS_decrypt_avx512_aes_dec_block_last:
+ vaesdeclast xmm0, xmm0, xmm5
+ vpxor xmm0, xmm0, xmm8 ; XOR tweak back in after the cipher
+ lea rcx, QWORD PTR [rsi+r13]
+ vmovdqu OWORD PTR [rcx], xmm0
+ vpshufd xmm4, xmm8, 19
+ vpaddq xmm8, xmm8, xmm8
+ vpsrad xmm4, xmm4, 31
+ vpternlogd xmm8, xmm4, xmm12, 120 ; advance xmm8 to the next block's tweak
+ add r13d, 16
+ cmp r13d, r11d
+ jl L_AES_XTS_decrypt_avx512_dec_16
+ cmp r13d, eax
+ je L_AES_XTS_decrypt_avx512_done_dec ; no partial tail
+L_AES_XTS_decrypt_avx512_last_31_start: ; tail: the held-back full block plus the trailing partial bytes
+ vpshufd xmm4, xmm8, 19
+ vpaddq xmm7, xmm8, xmm8
+ vpsrad xmm4, xmm4, 31
+ vpternlogd xmm7, xmm4, xmm12, 120 ; xmm7 = tweak after xmm8; xmm8 itself is kept for the final block
+ lea rcx, QWORD PTR [rdi+r13]
+ vmovdqu xmm0, OWORD PTR [rcx]
+ vpxor xmm0, xmm0, xmm7 ; the full block is decrypted with the later tweak (xmm7)
+ ; aes_dec_block
+ vpxor xmm0, xmm0, [r8]
+ vmovdqu xmm5, OWORD PTR [r8+16]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+32]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+48]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+64]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+80]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+96]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+112]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+128]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+144]
+ vaesdec xmm0, xmm0, xmm5
+ cmp r10d, 11
+ vmovdqu xmm5, OWORD PTR [r8+160]
+ jl L_AES_XTS_decrypt_avx512_last_31_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r8+176]
+ vaesdec xmm0, xmm0, xmm6
+ cmp r10d, 13
+ vmovdqu xmm5, OWORD PTR [r8+192]
+ jl L_AES_XTS_decrypt_avx512_last_31_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r8+208]
+ vaesdec xmm0, xmm0, xmm6
+ vmovdqu xmm5, OWORD PTR [r8+224]
+L_AES_XTS_decrypt_avx512_last_31_aes_dec_block_last:
+ vaesdeclast xmm0, xmm0, xmm5
+ vpxor xmm0, xmm0, xmm7
+ vmovdqu OWORD PTR [rsp], xmm0 ; park the decrypted block in stack scratch for byte-wise splicing
+ add r13, 16 ; r13 now indexes the trailing partial bytes
+ xor rdx, rdx ; rdx = index into the scratch block
+L_AES_XTS_decrypt_avx512_last_31_byte_loop:
+ mov r11b, BYTE PTR [rsp+rdx] ; byte of the block just decrypted
+ mov cl, BYTE PTR [rdi+r13] ; trailing input byte
+ mov BYTE PTR [rsi+r13], r11b ; decrypted byte becomes trailing output
+ mov BYTE PTR [rsp+rdx], cl ; input byte takes its place in the scratch block
+ inc r13d
+ inc edx
+ cmp r13d, eax
+ jl L_AES_XTS_decrypt_avx512_last_31_byte_loop
+ sub r13, rdx ; rewind past the trailing bytes
+ vmovdqu xmm0, OWORD PTR [rsp] ; spliced block: trailing input bytes followed by the rest of the decrypted block
+ vpxor xmm0, xmm0, xmm8 ; the spliced block uses the earlier tweak (xmm8)
+ ; aes_dec_block
+ vpxor xmm0, xmm0, [r8]
+ vmovdqu xmm5, OWORD PTR [r8+16]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+32]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+48]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+64]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+80]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+96]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+112]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+128]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r8+144]
+ vaesdec xmm0, xmm0, xmm5
+ cmp r10d, 11
+ vmovdqu xmm5, OWORD PTR [r8+160]
+ jl L_AES_XTS_decrypt_avx512_last_31_2_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r8+176]
+ vaesdec xmm0, xmm0, xmm6
+ cmp r10d, 13
+ vmovdqu xmm5, OWORD PTR [r8+192]
+ jl L_AES_XTS_decrypt_avx512_last_31_2_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r8+208]
+ vaesdec xmm0, xmm0, xmm6
+ vmovdqu xmm5, OWORD PTR [r8+224]
+L_AES_XTS_decrypt_avx512_last_31_2_aes_dec_block_last:
+ vaesdeclast xmm0, xmm0, xmm5
+ vpxor xmm0, xmm0, xmm8
+ sub r13, 16 ; back to the start of the held-back full block
+ lea rcx, QWORD PTR [rsi+r13]
+ vmovdqu OWORD PTR [rcx], xmm0 ; write it as the last full output block
+L_AES_XTS_decrypt_avx512_done_dec:
+ vmovdqu xmm6, OWORD PTR [rsp+64] ; restore xmm6-xmm15 from the slots filled in the prologue
+ vmovdqu xmm7, OWORD PTR [rsp+80]
+ vmovdqu xmm8, OWORD PTR [rsp+96]
+ vmovdqu xmm9, OWORD PTR [rsp+112]
+ vmovdqu xmm10, OWORD PTR [rsp+128]
+ vmovdqu xmm11, OWORD PTR [rsp+144]
+ vmovdqu xmm12, OWORD PTR [rsp+160]
+ vmovdqu xmm13, OWORD PTR [rsp+176]
+ vmovdqu xmm14, OWORD PTR [rsp+192]
+ vmovdqu xmm15, OWORD PTR [rsp+208]
+ add rsp, 224
+ pop r13
+ pop r12
+ pop rsi
+ pop rdi
+ ret
+AES_XTS_decrypt_avx512 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_XTS_decrypt_update_avx512 PROC ; XTS decrypt of eax bytes with 512-bit VAES; the tweak is read from and written back to a caller buffer
+ push rdi ; rdi, rsi, r12 are saved here and restored before ret
+ push rsi
+ push r12
+ mov rdi, rcx ; arg 1: source buffer (every block load uses rdi+r12)
+ mov rsi, rdx ; arg 2: destination buffer (every block store uses rsi+r12)
+ mov rax, r8 ; arg 3: byte count; only eax is read below
+ mov r10, r9 ; arg 4: round keys applied to the data with vaesdec
+ mov r8, QWORD PTR [rsp+64] ; stack arg: pointer to the 16-byte tweak (loaded below, stored at done_dec)
+ mov r9d, DWORD PTR [rsp+72] ; stack arg: round count, compared against 11 and 13
+ sub rsp, 224 ; [rsp..rsp+15] is scratch for the tail; [rsp+64..rsp+223] holds xmm6-xmm15
+ vmovdqu OWORD PTR [rsp+64], xmm6
+ vmovdqu OWORD PTR [rsp+80], xmm7
+ vmovdqu OWORD PTR [rsp+96], xmm8
+ vmovdqu OWORD PTR [rsp+112], xmm9
+ vmovdqu OWORD PTR [rsp+128], xmm10
+ vmovdqu OWORD PTR [rsp+144], xmm11
+ vmovdqu OWORD PTR [rsp+160], xmm12
+ vmovdqu OWORD PTR [rsp+176], xmm13
+ vmovdqu OWORD PTR [rsp+192], xmm14
+ vmovdqu OWORD PTR [rsp+208], xmm15
+ vmovdqu xmm12, OWORD PTR L_avx512_aes_xts_gc_xts ; constant ANDed into the single-block tweak update (defined outside this chunk)
+ vbroadcasti32x4 zmm13, ptr_L_avx512_aes_xts_poly ; carry-less multiplier used by the vector tweak updates
+ vmovdqu64 zmm14, ptr_L_avx512_aes_xts_shl ; per-qword left shift counts for vpsllvq
+ vmovdqu64 zmm15, ptr_L_avx512_aes_xts_shr ; per-qword right shift counts for vpsrlvq
+ vmovdqu xmm8, OWORD PTR [r8] ; xmm8 = current tweak, used as-is (no encryption step in this routine)
+ xor r12d, r12d ; r12 = byte offset processed so far
+ mov r11d, eax
+ and r11d, 4294967280 ; r11d = size rounded down to a multiple of 16
+ cmp r11d, eax
+ je L_AES_XTS_decrypt_update_avx512_mul16_256 ; size is a whole number of blocks
+ sub r11d, 16 ; partial tail present: hold one full block back for the tail path
+ cmp r11d, 16
+ jl L_AES_XTS_decrypt_update_avx512_last_31_start ; nothing but the held-back block and the partial bytes
+L_AES_XTS_decrypt_update_avx512_mul16_256:
+ cmp r11d, 32
+ jl L_AES_XTS_decrypt_update_avx512_done_128 ; under 32 bytes of whole blocks: skip broadcasting the round keys
+ vbroadcasti32x4 zmm16, [r10] ; zmm16..zmm30 = data round keys replicated into every 128-bit lane
+ vbroadcasti32x4 zmm17, [r10+16]
+ vbroadcasti32x4 zmm18, [r10+32]
+ vbroadcasti32x4 zmm19, [r10+48]
+ vbroadcasti32x4 zmm20, [r10+64]
+ vbroadcasti32x4 zmm21, [r10+80]
+ vbroadcasti32x4 zmm22, [r10+96]
+ vbroadcasti32x4 zmm23, [r10+112]
+ vbroadcasti32x4 zmm24, [r10+128]
+ vbroadcasti32x4 zmm25, [r10+144]
+ vbroadcasti32x4 zmm26, [r10+160]
+ cmp r9d, 11 ; keys past +160 are loaded only when the round count needs them
+ jl L_AES_XTS_decrypt_update_avx512_key_cached
+ vbroadcasti32x4 zmm27, [r10+176]
+ vbroadcasti32x4 zmm28, [r10+192]
+ cmp r9d, 13
+ jl L_AES_XTS_decrypt_update_avx512_key_cached
+ vbroadcasti32x4 zmm29, [r10+208]
+ vbroadcasti32x4 zmm30, [r10+224]
+L_AES_XTS_decrypt_update_avx512_key_cached:
+ cmp r11d, 256
+ jl L_AES_XTS_decrypt_update_avx512_done_256
+ and r11d, 4294967040 ; r11d = bytes covered by whole 256-byte iterations
+ vshufi64x2 zmm5, zmm8, zmm8, 0 ; copy the current tweak into all four 128-bit lanes
+ vpsrlvq zmm6, zmm5, zmm15
+ vpclmulqdq zmm7, zmm6, zmm13, 1
+ vpslldq zmm6, zmm6, 8
+ vpsllvq zmm4, zmm5, zmm14
+ vpternlogq zmm4, zmm7, zmm6, 150 ; zmm4 = tweaks for blocks 0-3 (per-lane step comes from the shl/shr tables, not visible here)
+ vpsrlq zmm9, zmm4, 60
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm5, zmm4, 4
+ vpternlogq zmm5, zmm10, zmm9, 150 ; zmm5 = zmm4 shifted left 4 bits with the carried-out bits folded back: blocks 4-7
+ vpsrlq zmm9, zmm5, 60
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm6, zmm5, 4
+ vpternlogq zmm6, zmm10, zmm9, 150 ; zmm6 = same step applied to zmm5: blocks 8-11
+ vpsrlq zmm9, zmm6, 60
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm7, zmm6, 4
+ vpternlogq zmm7, zmm10, zmm9, 150 ; zmm7 = same step applied to zmm6: blocks 12-15
+L_AES_XTS_decrypt_update_avx512_dec_256:
+ ; 256 bytes of input
+ ; aes_dec_256
+ lea rcx, QWORD PTR [rdi+r12]
+ lea rdx, QWORD PTR [rsi+r12]
+ vmovdqu64 zmm0, [rcx] ; sixteen blocks, four per register
+ vmovdqu64 zmm1, [rcx+64]
+ vmovdqu64 zmm2, [rcx+128]
+ vmovdqu64 zmm3, [rcx+192]
+ ; aes_dec_block
+ vpternlogq zmm0, zmm16, zmm4, 150 ; data ^ round key 0 ^ tweak (150 = three-way XOR)
+ vpternlogq zmm1, zmm16, zmm5, 150
+ vpternlogq zmm2, zmm16, zmm6, 150
+ vpternlogq zmm3, zmm16, zmm7, 150
+ vaesdec zmm0, zmm0, zmm17
+ vaesdec zmm1, zmm1, zmm17
+ vaesdec zmm2, zmm2, zmm17
+ vaesdec zmm3, zmm3, zmm17
+ vaesdec zmm0, zmm0, zmm18
+ vaesdec zmm1, zmm1, zmm18
+ vaesdec zmm2, zmm2, zmm18
+ vaesdec zmm3, zmm3, zmm18
+ vaesdec zmm0, zmm0, zmm19
+ vaesdec zmm1, zmm1, zmm19
+ vaesdec zmm2, zmm2, zmm19
+ vaesdec zmm3, zmm3, zmm19
+ vaesdec zmm0, zmm0, zmm20
+ vaesdec zmm1, zmm1, zmm20
+ vaesdec zmm2, zmm2, zmm20
+ vaesdec zmm3, zmm3, zmm20
+ vaesdec zmm0, zmm0, zmm21
+ vaesdec zmm1, zmm1, zmm21
+ vaesdec zmm2, zmm2, zmm21
+ vaesdec zmm3, zmm3, zmm21
+ vaesdec zmm0, zmm0, zmm22
+ vaesdec zmm1, zmm1, zmm22
+ vaesdec zmm2, zmm2, zmm22
+ vaesdec zmm3, zmm3, zmm22
+ vaesdec zmm0, zmm0, zmm23
+ vaesdec zmm1, zmm1, zmm23
+ vaesdec zmm2, zmm2, zmm23
+ vaesdec zmm3, zmm3, zmm23
+ vaesdec zmm0, zmm0, zmm24
+ vaesdec zmm1, zmm1, zmm24
+ vaesdec zmm2, zmm2, zmm24
+ vaesdec zmm3, zmm3, zmm24
+ vaesdec zmm0, zmm0, zmm25
+ vaesdec zmm1, zmm1, zmm25
+ vaesdec zmm2, zmm2, zmm25
+ vaesdec zmm3, zmm3, zmm25
+ cmp r9d, 11 ; flags survive the vmovdqa64 below; zmm9 carries whichever key is final
+ vmovdqa64 zmm9, zmm26
+ jl L_AES_XTS_decrypt_update_avx512_aes_dec_256_aes_dec_block_last
+ vaesdec zmm0, zmm0, zmm26
+ vaesdec zmm1, zmm1, zmm26
+ vaesdec zmm2, zmm2, zmm26
+ vaesdec zmm3, zmm3, zmm26
+ vaesdec zmm0, zmm0, zmm27
+ vaesdec zmm1, zmm1, zmm27
+ vaesdec zmm2, zmm2, zmm27
+ vaesdec zmm3, zmm3, zmm27
+ cmp r9d, 13
+ vmovdqa64 zmm9, zmm28
+ jl L_AES_XTS_decrypt_update_avx512_aes_dec_256_aes_dec_block_last
+ vaesdec zmm0, zmm0, zmm28
+ vaesdec zmm1, zmm1, zmm28
+ vaesdec zmm2, zmm2, zmm28
+ vaesdec zmm3, zmm3, zmm28
+ vaesdec zmm0, zmm0, zmm29
+ vaesdec zmm1, zmm1, zmm29
+ vaesdec zmm2, zmm2, zmm29
+ vaesdec zmm3, zmm3, zmm29
+ vmovdqa64 zmm9, zmm30
+L_AES_XTS_decrypt_update_avx512_aes_dec_256_aes_dec_block_last:
+ vaesdeclast zmm0, zmm0, zmm9
+ vaesdeclast zmm1, zmm1, zmm9
+ vaesdeclast zmm2, zmm2, zmm9
+ vaesdeclast zmm3, zmm3, zmm9
+ vpxorq zmm0, zmm0, zmm4 ; XOR the tweak back in after the cipher
+ vmovdqu64 [rdx], zmm0
+ vpsrlq zmm9, zmm4, 48
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm4, zmm4, 16
+ vpternlogq zmm4, zmm10, zmm9, 150 ; zmm4 shifted left 16 bits with carry folded back: tweaks for the next iteration
+ vpxorq zmm1, zmm1, zmm5
+ vmovdqu64 [rdx+64], zmm1
+ vpsrlq zmm9, zmm5, 48
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm5, zmm5, 16
+ vpternlogq zmm5, zmm10, zmm9, 150
+ vpxorq zmm2, zmm2, zmm6
+ vmovdqu64 [rdx+128], zmm2
+ vpsrlq zmm9, zmm6, 48
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm6, zmm6, 16
+ vpternlogq zmm6, zmm10, zmm9, 150
+ vpxorq zmm3, zmm3, zmm7
+ vmovdqu64 [rdx+192], zmm3
+ vpsrlq zmm9, zmm7, 48
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm7, zmm7, 16
+ vpternlogq zmm7, zmm10, zmm9, 150
+ add r12d, 256
+ cmp r12d, r11d
+ jl L_AES_XTS_decrypt_update_avx512_dec_256
+ vextracti32x4 xmm8, zmm4, 0 ; lane 0 of the already-advanced zmm4 is the next unused tweak
+L_AES_XTS_decrypt_update_avx512_done_256:
+ cmp r12d, eax ; flags survive the mov below and feed the je
+ mov r11d, eax
+ je L_AES_XTS_decrypt_update_avx512_done_dec ; everything processed
+ and r11d, 4294967280
+ cmp r11d, eax
+ je L_AES_XTS_decrypt_update_avx512_mul16_128
+ sub r11d, 16 ; partial tail present: hold one full block back again
+ sub r11d, r12d
+ cmp r11d, 16
+ jl L_AES_XTS_decrypt_update_avx512_last_31_start
+ add r11d, r12d ; back to an absolute end offset
+L_AES_XTS_decrypt_update_avx512_mul16_128:
+ and r11d, 4294967168 ; round the limit down to a multiple of 128
+ cmp r12d, r11d
+ je L_AES_XTS_decrypt_update_avx512_done_128 ; no 128-byte chunk left
+ ; 128 bytes of input
+ ; aes_dec_128
+ lea rcx, QWORD PTR [rdi+r12]
+ lea rdx, QWORD PTR [rsi+r12]
+ vmovdqu64 zmm0, [rcx]
+ vmovdqu64 zmm1, [rcx+64]
+ vshufi64x2 zmm5, zmm8, zmm8, 0 ; rebuild vector tweaks from the scalar tweak in xmm8
+ vpsrlvq zmm6, zmm5, zmm15
+ vpclmulqdq zmm7, zmm6, zmm13, 1
+ vpslldq zmm6, zmm6, 8
+ vpsllvq zmm4, zmm5, zmm14
+ vpternlogq zmm4, zmm7, zmm6, 150 ; zmm4 = tweaks for blocks 0-3
+ vpsrlq zmm9, zmm4, 60
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm5, zmm4, 4
+ vpternlogq zmm5, zmm10, zmm9, 150 ; zmm5 = tweaks for blocks 4-7
+ ; aes_dec_block
+ vpternlogq zmm0, zmm16, zmm4, 150
+ vpternlogq zmm1, zmm16, zmm5, 150
+ vaesdec zmm0, zmm0, zmm17
+ vaesdec zmm1, zmm1, zmm17
+ vaesdec zmm0, zmm0, zmm18
+ vaesdec zmm1, zmm1, zmm18
+ vaesdec zmm0, zmm0, zmm19
+ vaesdec zmm1, zmm1, zmm19
+ vaesdec zmm0, zmm0, zmm20
+ vaesdec zmm1, zmm1, zmm20
+ vaesdec zmm0, zmm0, zmm21
+ vaesdec zmm1, zmm1, zmm21
+ vaesdec zmm0, zmm0, zmm22
+ vaesdec zmm1, zmm1, zmm22
+ vaesdec zmm0, zmm0, zmm23
+ vaesdec zmm1, zmm1, zmm23
+ vaesdec zmm0, zmm0, zmm24
+ vaesdec zmm1, zmm1, zmm24
+ vaesdec zmm0, zmm0, zmm25
+ vaesdec zmm1, zmm1, zmm25
+ cmp r9d, 11
+ vmovdqa64 zmm9, zmm26
+ jl L_AES_XTS_decrypt_update_avx512_aes_dec_128_aes_dec_block_last
+ vaesdec zmm0, zmm0, zmm26
+ vaesdec zmm1, zmm1, zmm26
+ vaesdec zmm0, zmm0, zmm27
+ vaesdec zmm1, zmm1, zmm27
+ cmp r9d, 13
+ vmovdqa64 zmm9, zmm28
+ jl L_AES_XTS_decrypt_update_avx512_aes_dec_128_aes_dec_block_last
+ vaesdec zmm0, zmm0, zmm28
+ vaesdec zmm1, zmm1, zmm28
+ vaesdec zmm0, zmm0, zmm29
+ vaesdec zmm1, zmm1, zmm29
+ vmovdqa64 zmm9, zmm30
+L_AES_XTS_decrypt_update_avx512_aes_dec_128_aes_dec_block_last:
+ vaesdeclast zmm0, zmm0, zmm9
+ vaesdeclast zmm1, zmm1, zmm9
+ vpxorq zmm0, zmm0, zmm4
+ vmovdqu64 [rdx], zmm0
+ vpxorq zmm1, zmm1, zmm5
+ vmovdqu64 [rdx+64], zmm1
+ vextracti32x4 xmm8, zmm5, 3 ; tweak of the last block just processed
+ vpshufd xmm9, xmm8, 19
+ vpaddq xmm8, xmm8, xmm8 ; shift each qword left by one bit
+ vpsrad xmm9, xmm9, 31 ; turn the shuffled top bits into all-ones/all-zero masks
+ vpternlogd xmm8, xmm9, xmm12, 120 ; 120 = A ^ (B & C): fold masked constant in, giving the next block's tweak
+ add r12d, 128
+L_AES_XTS_decrypt_update_avx512_done_128:
+ cmp r12d, eax
+ mov r11d, eax
+ je L_AES_XTS_decrypt_update_avx512_done_dec
+ and r11d, 4294967280
+ cmp r11d, eax
+ je L_AES_XTS_decrypt_update_avx512_mul16_64
+ sub r11d, 16 ; same tail hold-back logic as after the 256-byte loop
+ sub r11d, r12d
+ cmp r11d, 16
+ jl L_AES_XTS_decrypt_update_avx512_last_31_start
+ add r11d, r12d
+L_AES_XTS_decrypt_update_avx512_mul16_64:
+ and r11d, 4294967232 ; round the limit down to a multiple of 64
+ cmp r12d, r11d
+ je L_AES_XTS_decrypt_update_avx512_done_64 ; no 64-byte chunk left
+ ; 64 bytes of input
+ ; aes_dec_64
+ lea rcx, QWORD PTR [rdi+r12]
+ lea rdx, QWORD PTR [rsi+r12]
+ vmovdqu64 zmm0, [rcx]
+ vshufi64x2 zmm5, zmm8, zmm8, 0
+ vpsrlvq zmm6, zmm5, zmm15
+ vpclmulqdq zmm7, zmm6, zmm13, 1
+ vpslldq zmm6, zmm6, 8
+ vpsllvq zmm4, zmm5, zmm14
+ vpternlogq zmm4, zmm7, zmm6, 150 ; zmm4 = tweaks for these four blocks
+ ; aes_dec_block
+ vpternlogq zmm0, zmm16, zmm4, 150
+ vaesdec zmm0, zmm0, zmm17
+ vaesdec zmm0, zmm0, zmm18
+ vaesdec zmm0, zmm0, zmm19
+ vaesdec zmm0, zmm0, zmm20
+ vaesdec zmm0, zmm0, zmm21
+ vaesdec zmm0, zmm0, zmm22
+ vaesdec zmm0, zmm0, zmm23
+ vaesdec zmm0, zmm0, zmm24
+ vaesdec zmm0, zmm0, zmm25
+ cmp r9d, 11
+ vmovdqa64 zmm9, zmm26
+ jl L_AES_XTS_decrypt_update_avx512_aes_dec_64_aes_dec_block_last
+ vaesdec zmm0, zmm0, zmm26
+ vaesdec zmm0, zmm0, zmm27
+ cmp r9d, 13
+ vmovdqa64 zmm9, zmm28
+ jl L_AES_XTS_decrypt_update_avx512_aes_dec_64_aes_dec_block_last
+ vaesdec zmm0, zmm0, zmm28
+ vaesdec zmm0, zmm0, zmm29
+ vmovdqa64 zmm9, zmm30
+L_AES_XTS_decrypt_update_avx512_aes_dec_64_aes_dec_block_last:
+ vaesdeclast zmm0, zmm0, zmm9
+ vpxorq zmm0, zmm0, zmm4
+ vmovdqu64 [rdx], zmm0
+ vextracti32x4 xmm8, zmm4, 3 ; tweak of the last block just processed
+ vpshufd xmm9, xmm8, 19
+ vpaddq xmm8, xmm8, xmm8
+ vpsrad xmm9, xmm9, 31
+ vpternlogd xmm8, xmm9, xmm12, 120 ; advance to the next block's tweak
+ add r12d, 64
+L_AES_XTS_decrypt_update_avx512_done_64:
+ cmp r12d, eax
+ mov r11d, eax
+ je L_AES_XTS_decrypt_update_avx512_done_dec
+ and r11d, 4294967280
+ cmp r11d, eax
+ je L_AES_XTS_decrypt_update_avx512_mul16_32
+ sub r11d, 16
+ sub r11d, r12d
+ cmp r11d, 16
+ jl L_AES_XTS_decrypt_update_avx512_last_31_start
+ add r11d, r12d
+L_AES_XTS_decrypt_update_avx512_mul16_32:
+ and r11d, 4294967264 ; round the limit down to a multiple of 32
+ cmp r12d, r11d
+ je L_AES_XTS_decrypt_update_avx512_done_32 ; no 32-byte chunk left
+ ; 32 bytes of input
+ ; aes_dec_32
+ lea rcx, QWORD PTR [rdi+r12]
+ lea rdx, QWORD PTR [rsi+r12]
+ vmovdqu64 ymm0, [rcx]
+ vshufi64x2 zmm5, zmm8, zmm8, 0
+ vpsrlvq zmm6, zmm5, zmm15
+ vpclmulqdq zmm7, zmm6, zmm13, 1
+ vpslldq zmm6, zmm6, 8
+ vpsllvq zmm4, zmm5, zmm14
+ vpternlogq zmm4, zmm7, zmm6, 150 ; four tweaks are computed; the low two are used for the data
+ ; aes_dec_block
+ vpternlogq ymm0, ymm16, ymm4, 150
+ vaesdec ymm0, ymm0, ymm17
+ vaesdec ymm0, ymm0, ymm18
+ vaesdec ymm0, ymm0, ymm19
+ vaesdec ymm0, ymm0, ymm20
+ vaesdec ymm0, ymm0, ymm21
+ vaesdec ymm0, ymm0, ymm22
+ vaesdec ymm0, ymm0, ymm23
+ vaesdec ymm0, ymm0, ymm24
+ vaesdec ymm0, ymm0, ymm25
+ cmp r9d, 11
+ vmovdqa64 ymm9, ymm26
+ jl L_AES_XTS_decrypt_update_avx512_aes_dec_32_aes_dec_block_last
+ vaesdec ymm0, ymm0, ymm26
+ vaesdec ymm0, ymm0, ymm27
+ cmp r9d, 13
+ vmovdqa64 ymm9, ymm28
+ jl L_AES_XTS_decrypt_update_avx512_aes_dec_32_aes_dec_block_last
+ vaesdec ymm0, ymm0, ymm28
+ vaesdec ymm0, ymm0, ymm29
+ vmovdqa64 ymm9, ymm30
+L_AES_XTS_decrypt_update_avx512_aes_dec_32_aes_dec_block_last:
+ vaesdeclast ymm0, ymm0, ymm9
+ vpxorq ymm0, ymm0, ymm4
+ vmovdqu64 [rdx], ymm0
+ vextracti32x4 xmm8, zmm4, 2 ; third lane was computed but unused: it is the next block's tweak
+ add r12d, 32
+L_AES_XTS_decrypt_update_avx512_done_32:
+ cmp r12d, eax
+ mov r11d, eax
+ je L_AES_XTS_decrypt_update_avx512_done_dec
+ and r11d, 4294967280
+ cmp r11d, eax
+ je L_AES_XTS_decrypt_update_avx512_mul16
+ sub r11d, 16
+ sub r11d, r12d
+ cmp r11d, 16
+ jl L_AES_XTS_decrypt_update_avx512_last_31_start
+ add r11d, r12d
+L_AES_XTS_decrypt_update_avx512_mul16:
+L_AES_XTS_decrypt_update_avx512_dec_16: ; one block per iteration, keys read from memory at r10 rather than zmm16+
+ ; 16 bytes of input
+ lea rcx, QWORD PTR [rdi+r12]
+ vmovdqu xmm0, OWORD PTR [rcx]
+ vpxor xmm0, xmm0, xmm8 ; XOR tweak in before the cipher
+ ; aes_dec_block
+ vpxor xmm0, xmm0, [r10]
+ vmovdqu xmm5, OWORD PTR [r10+16]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+32]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+48]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+64]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+80]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+96]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+112]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+128]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+144]
+ vaesdec xmm0, xmm0, xmm5
+ cmp r9d, 11
+ vmovdqu xmm5, OWORD PTR [r10+160]
+ jl L_AES_XTS_decrypt_update_avx512_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r10+176]
+ vaesdec xmm0, xmm0, xmm6
+ cmp r9d, 13
+ vmovdqu xmm5, OWORD PTR [r10+192]
+ jl L_AES_XTS_decrypt_update_avx512_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r10+208]
+ vaesdec xmm0, xmm0, xmm6
+ vmovdqu xmm5, OWORD PTR [r10+224]
+L_AES_XTS_decrypt_update_avx512_aes_dec_block_last:
+ vaesdeclast xmm0, xmm0, xmm5
+ vpxor xmm0, xmm0, xmm8 ; XOR tweak back in after the cipher
+ lea rcx, QWORD PTR [rsi+r12]
+ vmovdqu OWORD PTR [rcx], xmm0
+ vpshufd xmm4, xmm8, 19
+ vpaddq xmm8, xmm8, xmm8
+ vpsrad xmm4, xmm4, 31
+ vpternlogd xmm8, xmm4, xmm12, 120 ; advance xmm8 to the next block's tweak
+ add r12d, 16
+ cmp r12d, r11d
+ jl L_AES_XTS_decrypt_update_avx512_dec_16
+ cmp r12d, eax
+ je L_AES_XTS_decrypt_update_avx512_done_dec ; no partial tail
+L_AES_XTS_decrypt_update_avx512_last_31_start: ; tail: the held-back full block plus the trailing partial bytes
+ vpshufd xmm4, xmm8, 19
+ vpaddq xmm7, xmm8, xmm8
+ vpsrad xmm4, xmm4, 31
+ vpternlogd xmm7, xmm4, xmm12, 120 ; xmm7 = tweak after xmm8; xmm8 itself is kept for the final block
+ lea rcx, QWORD PTR [rdi+r12]
+ vmovdqu xmm0, OWORD PTR [rcx]
+ vpxor xmm0, xmm0, xmm7 ; the full block is decrypted with the later tweak (xmm7)
+ ; aes_dec_block
+ vpxor xmm0, xmm0, [r10]
+ vmovdqu xmm5, OWORD PTR [r10+16]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+32]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+48]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+64]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+80]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+96]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+112]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+128]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+144]
+ vaesdec xmm0, xmm0, xmm5
+ cmp r9d, 11
+ vmovdqu xmm5, OWORD PTR [r10+160]
+ jl L_AES_XTS_decrypt_update_avx512_last_31_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r10+176]
+ vaesdec xmm0, xmm0, xmm6
+ cmp r9d, 13
+ vmovdqu xmm5, OWORD PTR [r10+192]
+ jl L_AES_XTS_decrypt_update_avx512_last_31_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r10+208]
+ vaesdec xmm0, xmm0, xmm6
+ vmovdqu xmm5, OWORD PTR [r10+224]
+L_AES_XTS_decrypt_update_avx512_last_31_aes_dec_block_last:
+ vaesdeclast xmm0, xmm0, xmm5
+ vpxor xmm0, xmm0, xmm7
+ vmovdqu OWORD PTR [rsp], xmm0 ; park the decrypted block in stack scratch for byte-wise splicing
+ add r12, 16 ; r12 now indexes the trailing partial bytes
+ xor rdx, rdx ; rdx = index into the scratch block
+L_AES_XTS_decrypt_update_avx512_last_31_byte_loop:
+ mov r11b, BYTE PTR [rsp+rdx] ; byte of the block just decrypted
+ mov cl, BYTE PTR [rdi+r12] ; trailing input byte
+ mov BYTE PTR [rsi+r12], r11b ; decrypted byte becomes trailing output
+ mov BYTE PTR [rsp+rdx], cl ; input byte takes its place in the scratch block
+ inc r12d
+ inc edx
+ cmp r12d, eax
+ jl L_AES_XTS_decrypt_update_avx512_last_31_byte_loop
+ sub r12, rdx ; rewind past the trailing bytes
+ vmovdqu xmm0, OWORD PTR [rsp] ; spliced block: trailing input bytes followed by the rest of the decrypted block
+ vpxor xmm0, xmm0, xmm8 ; the spliced block uses the earlier tweak (xmm8)
+ ; aes_dec_block
+ vpxor xmm0, xmm0, [r10]
+ vmovdqu xmm5, OWORD PTR [r10+16]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+32]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+48]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+64]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+80]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+96]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+112]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+128]
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+144]
+ vaesdec xmm0, xmm0, xmm5
+ cmp r9d, 11
+ vmovdqu xmm5, OWORD PTR [r10+160]
+ jl L_AES_XTS_decrypt_update_avx512_last_31_2_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r10+176]
+ vaesdec xmm0, xmm0, xmm6
+ cmp r9d, 13
+ vmovdqu xmm5, OWORD PTR [r10+192]
+ jl L_AES_XTS_decrypt_update_avx512_last_31_2_aes_dec_block_last
+ vaesdec xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r10+208]
+ vaesdec xmm0, xmm0, xmm6
+ vmovdqu xmm5, OWORD PTR [r10+224]
+L_AES_XTS_decrypt_update_avx512_last_31_2_aes_dec_block_last:
+ vaesdeclast xmm0, xmm0, xmm5
+ vpxor xmm0, xmm0, xmm8
+ sub r12, 16 ; back to the start of the held-back full block
+ lea rcx, QWORD PTR [rsi+r12]
+ vmovdqu OWORD PTR [rcx], xmm0 ; write it as the last full output block
+L_AES_XTS_decrypt_update_avx512_done_dec:
+ vmovdqu OWORD PTR [r8], xmm8 ; hand the current tweak back through the caller's buffer
+ vmovdqu xmm6, OWORD PTR [rsp+64] ; restore xmm6-xmm15 from the slots filled in the prologue
+ vmovdqu xmm7, OWORD PTR [rsp+80]
+ vmovdqu xmm8, OWORD PTR [rsp+96]
+ vmovdqu xmm9, OWORD PTR [rsp+112]
+ vmovdqu xmm10, OWORD PTR [rsp+128]
+ vmovdqu xmm11, OWORD PTR [rsp+144]
+ vmovdqu xmm12, OWORD PTR [rsp+160]
+ vmovdqu xmm13, OWORD PTR [rsp+176]
+ vmovdqu xmm14, OWORD PTR [rsp+192]
+ vmovdqu xmm15, OWORD PTR [rsp+208]
+ add rsp, 224
+ pop r12
+ pop rsi
+ pop rdi
+ ret
+AES_XTS_decrypt_update_avx512 ENDP
+_TEXT ENDS
+ENDIF
END
diff --git a/wolfcrypt/src/chacha_asm.S b/wolfcrypt/src/chacha_asm.S
index 6109e22f603..ba8768bd9c2 100644
--- a/wolfcrypt/src/chacha_asm.S
+++ b/wolfcrypt/src/chacha_asm.S
@@ -46,6 +46,16 @@
#define HAVE_INTEL_AVX2
#endif /* HAVE_INTEL_AVX2 */
#endif /* NO_AVX2_SUPPORT */
+#ifndef NO_VAES_SUPPORT
+#ifndef HAVE_INTEL_VAES
+#define HAVE_INTEL_VAES
+#endif /* HAVE_INTEL_VAES */
+#endif /* NO_VAES_SUPPORT */
+#ifndef NO_AVX512_SUPPORT
+#ifndef HAVE_INTEL_AVX512
+#define HAVE_INTEL_AVX512
+#endif /* HAVE_INTEL_AVX512 */
+#endif /* NO_AVX512_SUPPORT */
#ifdef WOLFSSL_X86_64_BUILD
#ifndef __APPLE__
diff --git a/wolfcrypt/src/cpuid.c b/wolfcrypt/src/cpuid.c
index 2c3670234a6..8963abb49a8 100644
--- a/wolfcrypt/src/cpuid.c
+++ b/wolfcrypt/src/cpuid.c
@@ -130,6 +130,8 @@
if (cpuid_flag(1, 0, ECX, 22)) { new_cpuid_flags |= CPUID_MOVBE ; }
if (cpuid_flag(7, 0, EBX, 3)) { new_cpuid_flags |= CPUID_BMI1 ; }
if (cpuid_flag(7, 0, EBX, 29)) { new_cpuid_flags |= CPUID_SHA ; }
+ if (cpuid_flag(7, 0, ECX, 9)) { new_cpuid_flags |= CPUID_VAES ; }
+ if (cpuid_flag(7, 0, EBX, 16)) { new_cpuid_flags |= CPUID_AVX512; }
(void)wolfSSL_Atomic_Uint_CompareExchange
(&cpuid_flags, &old_cpuid_flags, new_cpuid_flags);
}
diff --git a/wolfcrypt/src/fe_x25519_asm.S b/wolfcrypt/src/fe_x25519_asm.S
index 3f0e0dd6a89..7e976fa1f28 100644
--- a/wolfcrypt/src/fe_x25519_asm.S
+++ b/wolfcrypt/src/fe_x25519_asm.S
@@ -46,6 +46,16 @@
#define HAVE_INTEL_AVX2
#endif /* HAVE_INTEL_AVX2 */
#endif /* NO_AVX2_SUPPORT */
+#ifndef NO_VAES_SUPPORT
+#ifndef HAVE_INTEL_VAES
+#define HAVE_INTEL_VAES
+#endif /* HAVE_INTEL_VAES */
+#endif /* NO_VAES_SUPPORT */
+#ifndef NO_AVX512_SUPPORT
+#ifndef HAVE_INTEL_AVX512
+#define HAVE_INTEL_AVX512
+#endif /* HAVE_INTEL_AVX512 */
+#endif /* NO_AVX512_SUPPORT */
#ifndef __APPLE__
.text
diff --git a/wolfcrypt/src/include.am b/wolfcrypt/src/include.am
index 18d7a339cd5..908c43984cd 100644
--- a/wolfcrypt/src/include.am
+++ b/wolfcrypt/src/include.am
@@ -16,6 +16,7 @@ EXTRA_DIST += wolfcrypt/src/evp.c
EXTRA_DIST += wolfcrypt/src/evp_pk.c
EXTRA_DIST += wolfcrypt/src/asm.c
EXTRA_DIST += wolfcrypt/src/aes_asm.asm
+EXTRA_DIST += wolfcrypt/src/aes_x86_64_asm.asm
EXTRA_DIST += wolfcrypt/src/aes_gcm_asm.asm
EXTRA_DIST += wolfcrypt/src/aes_xts_asm.asm
EXTRA_DIST += wolfcrypt/src/chacha_asm.asm
diff --git a/wolfcrypt/src/poly1305_asm.S b/wolfcrypt/src/poly1305_asm.S
index 7f73e87b67e..f55cce5a079 100644
--- a/wolfcrypt/src/poly1305_asm.S
+++ b/wolfcrypt/src/poly1305_asm.S
@@ -46,6 +46,16 @@
#define HAVE_INTEL_AVX2
#endif /* HAVE_INTEL_AVX2 */
#endif /* NO_AVX2_SUPPORT */
+#ifndef NO_VAES_SUPPORT
+#ifndef HAVE_INTEL_VAES
+#define HAVE_INTEL_VAES
+#endif /* HAVE_INTEL_VAES */
+#endif /* NO_VAES_SUPPORT */
+#ifndef NO_AVX512_SUPPORT
+#ifndef HAVE_INTEL_AVX512
+#define HAVE_INTEL_AVX512
+#endif /* HAVE_INTEL_AVX512 */
+#endif /* NO_AVX512_SUPPORT */
#ifdef WOLFSSL_X86_64_BUILD
#ifdef HAVE_INTEL_AVX1
diff --git a/wolfcrypt/src/sha256_asm.S b/wolfcrypt/src/sha256_asm.S
index a407b7de1f5..d91a82aff94 100644
--- a/wolfcrypt/src/sha256_asm.S
+++ b/wolfcrypt/src/sha256_asm.S
@@ -46,6 +46,16 @@
#define HAVE_INTEL_AVX2
#endif /* HAVE_INTEL_AVX2 */
#endif /* NO_AVX2_SUPPORT */
+#ifndef NO_VAES_SUPPORT
+#ifndef HAVE_INTEL_VAES
+#define HAVE_INTEL_VAES
+#endif /* HAVE_INTEL_VAES */
+#endif /* NO_VAES_SUPPORT */
+#ifndef NO_AVX512_SUPPORT
+#ifndef HAVE_INTEL_AVX512
+#define HAVE_INTEL_AVX512
+#endif /* HAVE_INTEL_AVX512 */
+#endif /* NO_AVX512_SUPPORT */
#ifdef WOLFSSL_X86_64_BUILD
#ifndef __APPLE__
diff --git a/wolfcrypt/src/sha3_asm.S b/wolfcrypt/src/sha3_asm.S
index 6abc9d851b1..810a1c67433 100644
--- a/wolfcrypt/src/sha3_asm.S
+++ b/wolfcrypt/src/sha3_asm.S
@@ -46,6 +46,16 @@
#define HAVE_INTEL_AVX2
#endif /* HAVE_INTEL_AVX2 */
#endif /* NO_AVX2_SUPPORT */
+#ifndef NO_VAES_SUPPORT
+#ifndef HAVE_INTEL_VAES
+#define HAVE_INTEL_VAES
+#endif /* HAVE_INTEL_VAES */
+#endif /* NO_VAES_SUPPORT */
+#ifndef NO_AVX512_SUPPORT
+#ifndef HAVE_INTEL_AVX512
+#define HAVE_INTEL_AVX512
+#endif /* HAVE_INTEL_AVX512 */
+#endif /* NO_AVX512_SUPPORT */
#ifndef __APPLE__
.data
diff --git a/wolfcrypt/src/sha512_asm.S b/wolfcrypt/src/sha512_asm.S
index d0ca1dd4fd4..b3c377deabc 100644
--- a/wolfcrypt/src/sha512_asm.S
+++ b/wolfcrypt/src/sha512_asm.S
@@ -46,6 +46,16 @@
#define HAVE_INTEL_AVX2
#endif /* HAVE_INTEL_AVX2 */
#endif /* NO_AVX2_SUPPORT */
+#ifndef NO_VAES_SUPPORT
+#ifndef HAVE_INTEL_VAES
+#define HAVE_INTEL_VAES
+#endif /* HAVE_INTEL_VAES */
+#endif /* NO_VAES_SUPPORT */
+#ifndef NO_AVX512_SUPPORT
+#ifndef HAVE_INTEL_AVX512
+#define HAVE_INTEL_AVX512
+#endif /* HAVE_INTEL_AVX512 */
+#endif /* NO_AVX512_SUPPORT */
#ifdef HAVE_INTEL_AVX1
#ifndef __APPLE__
diff --git a/wolfcrypt/src/wc_mldsa_asm.S b/wolfcrypt/src/wc_mldsa_asm.S
index 717986e4a5c..e1e77a93783 100644
--- a/wolfcrypt/src/wc_mldsa_asm.S
+++ b/wolfcrypt/src/wc_mldsa_asm.S
@@ -46,6 +46,16 @@
#define HAVE_INTEL_AVX2
#endif /* HAVE_INTEL_AVX2 */
#endif /* NO_AVX2_SUPPORT */
+#ifndef NO_VAES_SUPPORT
+#ifndef HAVE_INTEL_VAES
+#define HAVE_INTEL_VAES
+#endif /* HAVE_INTEL_VAES */
+#endif /* NO_VAES_SUPPORT */
+#ifndef NO_AVX512_SUPPORT
+#ifndef HAVE_INTEL_AVX512
+#define HAVE_INTEL_AVX512
+#endif /* HAVE_INTEL_AVX512 */
+#endif /* NO_AVX512_SUPPORT */
#ifdef WOLFSSL_HAVE_MLDSA
#ifdef HAVE_INTEL_AVX2
diff --git a/wolfcrypt/src/wc_mlkem_asm.S b/wolfcrypt/src/wc_mlkem_asm.S
index 9b80cf8d432..b399218dfdd 100644
--- a/wolfcrypt/src/wc_mlkem_asm.S
+++ b/wolfcrypt/src/wc_mlkem_asm.S
@@ -46,6 +46,16 @@
#define HAVE_INTEL_AVX2
#endif /* HAVE_INTEL_AVX2 */
#endif /* NO_AVX2_SUPPORT */
+#ifndef NO_VAES_SUPPORT
+#ifndef HAVE_INTEL_VAES
+#define HAVE_INTEL_VAES
+#endif /* HAVE_INTEL_VAES */
+#endif /* NO_VAES_SUPPORT */
+#ifndef NO_AVX512_SUPPORT
+#ifndef HAVE_INTEL_AVX512
+#define HAVE_INTEL_AVX512
+#endif /* HAVE_INTEL_AVX512 */
+#endif /* NO_AVX512_SUPPORT */
#ifdef WOLFSSL_HAVE_MLKEM
#ifdef HAVE_INTEL_AVX2
diff --git a/wolfssl-VS2022.vcxproj b/wolfssl-VS2022.vcxproj
index a4b8d39b196..81d32758e91 100644
--- a/wolfssl-VS2022.vcxproj
+++ b/wolfssl-VS2022.vcxproj
@@ -1,577 +1,591 @@
-
-
-
-
- Debug
- Win32
-
-
- Debug
- x64
-
-
- Debug
- ARM64
-
-
- DLL Debug
- Win32
-
-
- DLL Debug
- x64
-
-
- DLL Debug
- ARM64
-
-
- DLL Release
- Win32
-
-
- DLL Release
- x64
-
-
- DLL Release
- ARM64
-
-
- Release
- Win32
-
-
- Release
- x64
-
-
- Release
- ARM64
-
-
-
- {12226DBE-7278-4DFA-A119-5A0294CF0B33}
- wolfssl
- Win32Proj
- wolfssl
-
-
-
- StaticLibrary
- v143
- Unicode
- true
-
-
- DynamicLibrary
- v143
- Unicode
- true
-
-
- StaticLibrary
- v143
- Unicode
- true
-
-
- DynamicLibrary
- v143
- Unicode
- true
-
-
- StaticLibrary
- v143
- Unicode
- true
-
-
- DynamicLibrary
- v143
- Unicode
- true
-
-
- StaticLibrary
- v143
- Unicode
-
-
- DynamicLibrary
- v143
- Unicode
-
-
- StaticLibrary
- v143
- Unicode
-
-
- DynamicLibrary
- v143
- Unicode
-
-
- StaticLibrary
- v143
- Unicode
-
-
- DynamicLibrary
- v143
- Unicode
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- $(SolutionDir)$(Configuration)\$(Platform)\
- $(Configuration)\$(Platform)\$(ProjectName)_obj\
-
-
- $(SolutionDir)$(Configuration)\$(Platform)\
- $(Configuration)\$(Platform)\$(ProjectName)_obj\
-
-
- $(SolutionDir)$(Configuration)\$(Platform)\
- $(Configuration)\$(Platform)\$(ProjectName)_obj\
-
-
- $(SolutionDir)$(Configuration)\$(Platform)\
- $(Configuration)\$(Platform)\$(ProjectName)_obj\
-
-
- $(SolutionDir)$(Configuration)\$(Platform)\
- $(Configuration)\$(Platform)\$(ProjectName)_obj\
-
-
- $(SolutionDir)$(Configuration)\$(Platform)\
- $(Configuration)\$(Platform)\$(ProjectName)_obj\
-
-
- $(SolutionDir)$(Configuration)\$(Platform)\
- $(Configuration)\$(Platform)\$(ProjectName)_obj\
-
-
- $(SolutionDir)$(Configuration)\$(Platform)\
- $(Configuration)\$(Platform)\$(ProjectName)_obj\
-
-
- $(SolutionDir)$(Configuration)\$(Platform)\
- $(Configuration)\$(Platform)\$(ProjectName)_obj\
-
-
- $(SolutionDir)$(Configuration)\$(Platform)\
- $(Configuration)\$(Platform)\$(ProjectName)_obj\
-
-
- $(SolutionDir)$(Configuration)\$(Platform)\
- $(Configuration)\$(Platform)\$(ProjectName)_obj\
-
-
- $(SolutionDir)$(Configuration)\$(Platform)\
- $(Configuration)\$(Platform)\$(ProjectName)_obj\
-
-
-
- Disabled
- ./;./IDE/WIN;%(AdditionalIncludeDirectories)
- WOLFSSL_LIB;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions)
- true
- EnableFastChecks
- MultiThreadedDebugDLL
-
- Level4
- EditAndContinue
- 4206;4214;4706;%(DisableSpecificWarnings)
-
-
-
-
- Disabled
- ./;./IDE/WIN;%(AdditionalIncludeDirectories)
- WOLFSSL_LIB;BUILDING_WOLFSSL;WOLFSSL_DLL;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions)
- true
- EnableFastChecks
- MultiThreadedDebugDLL
-
-
- Level4
- EditAndContinue
- 4206;4214;4706;%(DisableSpecificWarnings)
-
-
- ws2_32.lib;%(AdditionalDependencies)
- false
- true
- false
-
-
-
-
- Disabled
- ./;./IDE/WIN;%(AdditionalIncludeDirectories)
- WOLFSSL_LIB;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions)
- EnableFastChecks
- MultiThreadedDebugDLL
-
-
- Level4
- ProgramDatabase
- 4206;4214;4706;%(DisableSpecificWarnings)
-
-
-
-
- Disabled
- ./;./IDE/WIN;%(AdditionalIncludeDirectories)
- WOLFSSL_LIB;BUILDING_WOLFSSL;WOLFSSL_DLL;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions)
- EnableFastChecks
- MultiThreadedDebugDLL
-
-
- Level4
- ProgramDatabase
- 4206;4214;4706;%(DisableSpecificWarnings)
-
-
- ws2_32.lib;%(AdditionalDependencies)
- false
- true
-
-
-
-
- Disabled
- ./;./IDE/WIN;%(AdditionalIncludeDirectories)
- WOLFSSL_LIB;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions)
- EnableFastChecks
- MultiThreadedDebugDLL
-
-
- Level4
- ProgramDatabase
- 4206;4214;4706;%(DisableSpecificWarnings)
-
-
-
-
- Disabled
- ./;./IDE/WIN;%(AdditionalIncludeDirectories)
- WOLFSSL_LIB;BUILDING_WOLFSSL;WOLFSSL_DLL;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions)
- EnableFastChecks
- MultiThreadedDebugDLL
-
-
- Level4
- ProgramDatabase
- 4206;4214;4706;%(DisableSpecificWarnings)
-
-
- ws2_32.lib;%(AdditionalDependencies)
- false
- true
-
-
-
-
- MaxSpeed
- true
- ./;./IDE/WIN;%(AdditionalIncludeDirectories)
- WOLFSSL_LIB;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions)
- MultiThreadedDLL
- true
-
- Level3
- ProgramDatabase
-
-
-
-
- MaxSpeed
- true
- ./;./IDE/WIN;%(AdditionalIncludeDirectories)
- WOLFSSL_LIB;BUILDING_WOLFSSL;WOLFSSL_DLL;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions)
- MultiThreadedDLL
- true
-
-
- Level3
- ProgramDatabase
-
-
- ws2_32.lib;%(AdditionalDependencies)
- true
-
-
-
-
- MaxSpeed
- true
- ./;./IDE/WIN;%(AdditionalIncludeDirectories)
- WOLFSSL_LIB;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions)
- MultiThreadedDLL
- true
-
-
- Level3
- ProgramDatabase
-
-
-
-
- MaxSpeed
- true
- ./;./IDE/WIN;%(AdditionalIncludeDirectories)
- WOLFSSL_LIB;BUILDING_WOLFSSL;WOLFSSL_DLL;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions)
- MultiThreadedDLL
- true
-
-
- Level3
- ProgramDatabase
-
-
- ws2_32.lib;%(AdditionalDependencies)
- true
-
-
-
-
- MaxSpeed
- true
- ./;./IDE/WIN;%(AdditionalIncludeDirectories)
- WOLFSSL_LIB;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions)
- MultiThreadedDLL
- true
-
-
- Level3
- ProgramDatabase
-
-
-
-
- MaxSpeed
- true
- ./;./IDE/WIN;%(AdditionalIncludeDirectories)
- WOLFSSL_LIB;BUILDING_WOLFSSL;WOLFSSL_DLL;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions)
- MultiThreadedDLL
- true
-
-
- Level3
- ProgramDatabase
-
-
- ws2_32.lib;%(AdditionalDependencies)
- true
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- false
- false
- ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)
- ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)
- $(OutDir)%(Filename).obj
- $(IntDir)%(Filename).obj
- false
- false
- ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)
- ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)
- $(OutDir)%(Filename).obj
- $(IntDir)%(Filename).obj
-
-
- false
- false
- ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)
- ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)
- $(OutDir)%(Filename).obj
- $(IntDir)%(Filename).obj
- false
- false
- ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)
- ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)
- $(OutDir)%(Filename).obj
- $(IntDir)%(Filename).obj
-
-
- false
- false
- ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)
- ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)
- $(OutDir)%(Filename).obj
- $(IntDir)%(Filename).obj
- false
- false
- ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)
- ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)
- $(OutDir)%(Filename).obj
- $(IntDir)%(Filename).obj
-
-
- false
- false
- ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)
- ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)
- $(OutDir)%(Filename).obj
- $(IntDir)%(Filename).obj
- false
- false
- ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)
- ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)
- $(OutDir)%(Filename).obj
- $(IntDir)%(Filename).obj
-
-
- false
- false
- ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)
- ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)
- $(OutDir)%(Filename).obj
- $(IntDir)%(Filename).obj
- false
- false
- ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)
- ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)
- $(OutDir)%(Filename).obj
- $(IntDir)%(Filename).obj
-
-
- false
- false
- ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)
- ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)
- $(OutDir)%(Filename).obj
- $(IntDir)%(Filename).obj
- false
- false
- ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)
- ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)
- $(OutDir)%(Filename).obj
- $(IntDir)%(Filename).obj
-
-
-
-
-
- true
- true
- true
- true
- true
- true
-
-
-
-
-
-
+
+
+
+
+ Debug
+ Win32
+
+
+ Debug
+ x64
+
+
+ Debug
+ ARM64
+
+
+ DLL Debug
+ Win32
+
+
+ DLL Debug
+ x64
+
+
+ DLL Debug
+ ARM64
+
+
+ DLL Release
+ Win32
+
+
+ DLL Release
+ x64
+
+
+ DLL Release
+ ARM64
+
+
+ Release
+ Win32
+
+
+ Release
+ x64
+
+
+ Release
+ ARM64
+
+
+
+ {12226DBE-7278-4DFA-A119-5A0294CF0B33}
+ wolfssl
+ Win32Proj
+ wolfssl
+
+
+
+ StaticLibrary
+ v143
+ Unicode
+ true
+
+
+ DynamicLibrary
+ v143
+ Unicode
+ true
+
+
+ StaticLibrary
+ v143
+ Unicode
+ true
+
+
+ DynamicLibrary
+ v143
+ Unicode
+ true
+
+
+ StaticLibrary
+ v143
+ Unicode
+ true
+
+
+ DynamicLibrary
+ v143
+ Unicode
+ true
+
+
+ StaticLibrary
+ v143
+ Unicode
+
+
+ DynamicLibrary
+ v143
+ Unicode
+
+
+ StaticLibrary
+ v143
+ Unicode
+
+
+ DynamicLibrary
+ v143
+ Unicode
+
+
+ StaticLibrary
+ v143
+ Unicode
+
+
+ DynamicLibrary
+ v143
+ Unicode
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ $(SolutionDir)$(Configuration)\$(Platform)\
+ $(Configuration)\$(Platform)\$(ProjectName)_obj\
+
+
+ $(SolutionDir)$(Configuration)\$(Platform)\
+ $(Configuration)\$(Platform)\$(ProjectName)_obj\
+
+
+ $(SolutionDir)$(Configuration)\$(Platform)\
+ $(Configuration)\$(Platform)\$(ProjectName)_obj\
+
+
+ $(SolutionDir)$(Configuration)\$(Platform)\
+ $(Configuration)\$(Platform)\$(ProjectName)_obj\
+
+
+ $(SolutionDir)$(Configuration)\$(Platform)\
+ $(Configuration)\$(Platform)\$(ProjectName)_obj\
+
+
+ $(SolutionDir)$(Configuration)\$(Platform)\
+ $(Configuration)\$(Platform)\$(ProjectName)_obj\
+
+
+ $(SolutionDir)$(Configuration)\$(Platform)\
+ $(Configuration)\$(Platform)\$(ProjectName)_obj\
+
+
+ $(SolutionDir)$(Configuration)\$(Platform)\
+ $(Configuration)\$(Platform)\$(ProjectName)_obj\
+
+
+ $(SolutionDir)$(Configuration)\$(Platform)\
+ $(Configuration)\$(Platform)\$(ProjectName)_obj\
+
+
+ $(SolutionDir)$(Configuration)\$(Platform)\
+ $(Configuration)\$(Platform)\$(ProjectName)_obj\
+
+
+ $(SolutionDir)$(Configuration)\$(Platform)\
+ $(Configuration)\$(Platform)\$(ProjectName)_obj\
+
+
+ $(SolutionDir)$(Configuration)\$(Platform)\
+ $(Configuration)\$(Platform)\$(ProjectName)_obj\
+
+
+
+ Disabled
+ ./;./IDE/WIN;%(AdditionalIncludeDirectories)
+ WOLFSSL_LIB;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions)
+ true
+ EnableFastChecks
+ MultiThreadedDebugDLL
+
+ Level4
+ EditAndContinue
+ 4206;4214;4706;%(DisableSpecificWarnings)
+
+
+
+
+ Disabled
+ ./;./IDE/WIN;%(AdditionalIncludeDirectories)
+ WOLFSSL_LIB;BUILDING_WOLFSSL;WOLFSSL_DLL;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions)
+ true
+ EnableFastChecks
+ MultiThreadedDebugDLL
+
+
+ Level4
+ EditAndContinue
+ 4206;4214;4706;%(DisableSpecificWarnings)
+
+
+ ws2_32.lib;%(AdditionalDependencies)
+ false
+ true
+ false
+
+
+
+
+ Disabled
+ ./;./IDE/WIN;%(AdditionalIncludeDirectories)
+ WOLFSSL_LIB;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions)
+ EnableFastChecks
+ MultiThreadedDebugDLL
+
+
+ Level4
+ ProgramDatabase
+ 4206;4214;4706;%(DisableSpecificWarnings)
+
+
+
+
+ Disabled
+ ./;./IDE/WIN;%(AdditionalIncludeDirectories)
+ WOLFSSL_LIB;BUILDING_WOLFSSL;WOLFSSL_DLL;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions)
+ EnableFastChecks
+ MultiThreadedDebugDLL
+
+
+ Level4
+ ProgramDatabase
+ 4206;4214;4706;%(DisableSpecificWarnings)
+
+
+ ws2_32.lib;%(AdditionalDependencies)
+ false
+ true
+
+
+
+
+ Disabled
+ ./;./IDE/WIN;%(AdditionalIncludeDirectories)
+ WOLFSSL_LIB;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions)
+ EnableFastChecks
+ MultiThreadedDebugDLL
+
+
+ Level4
+ ProgramDatabase
+ 4206;4214;4706;%(DisableSpecificWarnings)
+
+
+
+
+ Disabled
+ ./;./IDE/WIN;%(AdditionalIncludeDirectories)
+ WOLFSSL_LIB;BUILDING_WOLFSSL;WOLFSSL_DLL;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions)
+ EnableFastChecks
+ MultiThreadedDebugDLL
+
+
+ Level4
+ ProgramDatabase
+ 4206;4214;4706;%(DisableSpecificWarnings)
+
+
+ ws2_32.lib;%(AdditionalDependencies)
+ false
+ true
+
+
+
+
+ MaxSpeed
+ true
+ ./;./IDE/WIN;%(AdditionalIncludeDirectories)
+ WOLFSSL_LIB;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions)
+ MultiThreadedDLL
+ true
+
+ Level3
+ ProgramDatabase
+
+
+
+
+ MaxSpeed
+ true
+ ./;./IDE/WIN;%(AdditionalIncludeDirectories)
+ WOLFSSL_LIB;BUILDING_WOLFSSL;WOLFSSL_DLL;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions)
+ MultiThreadedDLL
+ true
+
+
+ Level3
+ ProgramDatabase
+
+
+ ws2_32.lib;%(AdditionalDependencies)
+ true
+
+
+
+
+ MaxSpeed
+ true
+ ./;./IDE/WIN;%(AdditionalIncludeDirectories)
+ WOLFSSL_LIB;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions)
+ MultiThreadedDLL
+ true
+
+
+ Level3
+ ProgramDatabase
+
+
+
+
+ MaxSpeed
+ true
+ ./;./IDE/WIN;%(AdditionalIncludeDirectories)
+ WOLFSSL_LIB;BUILDING_WOLFSSL;WOLFSSL_DLL;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions)
+ MultiThreadedDLL
+ true
+
+
+ Level3
+ ProgramDatabase
+
+
+ ws2_32.lib;%(AdditionalDependencies)
+ true
+
+
+
+
+ MaxSpeed
+ true
+ ./;./IDE/WIN;%(AdditionalIncludeDirectories)
+ WOLFSSL_LIB;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions)
+ MultiThreadedDLL
+ true
+
+
+ Level3
+ ProgramDatabase
+
+
+
+
+ MaxSpeed
+ true
+ ./;./IDE/WIN;%(AdditionalIncludeDirectories)
+ WOLFSSL_LIB;BUILDING_WOLFSSL;WOLFSSL_DLL;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions)
+ MultiThreadedDLL
+ true
+
+
+ Level3
+ ProgramDatabase
+
+
+ ws2_32.lib;%(AdditionalDependencies)
+ true
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ false
+ false
+ ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)
+ ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(OutDir)%(Filename).obj
+ $(IntDir)%(Filename).obj
+ false
+ false
+ ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)
+ ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(OutDir)%(Filename).obj
+ $(IntDir)%(Filename).obj
+
+
+ false
+ false
+ ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)
+ ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(OutDir)%(Filename).obj
+ $(IntDir)%(Filename).obj
+ false
+ false
+ ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)
+ ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(OutDir)%(Filename).obj
+ $(IntDir)%(Filename).obj
+
+
+ false
+ false
+ ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)
+ ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(OutDir)%(Filename).obj
+ $(IntDir)%(Filename).obj
+ false
+ false
+ ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)
+ ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(OutDir)%(Filename).obj
+ $(IntDir)%(Filename).obj
+
+
+ false
+ false
+ ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)
+ ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(OutDir)%(Filename).obj
+ $(IntDir)%(Filename).obj
+ false
+ false
+ ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)
+ ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(OutDir)%(Filename).obj
+ $(IntDir)%(Filename).obj
+
+
+ false
+ false
+ ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)
+ ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(OutDir)%(Filename).obj
+ $(IntDir)%(Filename).obj
+ false
+ false
+ ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)
+ ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(OutDir)%(Filename).obj
+ $(IntDir)%(Filename).obj
+
+
+ false
+ false
+ ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)
+ ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(OutDir)%(Filename).obj
+ $(IntDir)%(Filename).obj
+ false
+ false
+ ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)
+ ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(OutDir)%(Filename).obj
+ $(IntDir)%(Filename).obj
+
+
+ false
+ false
+ ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)
+ ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(OutDir)%(Filename).obj
+ $(IntDir)%(Filename).obj
+ false
+ false
+ ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)
+ ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(OutDir)%(Filename).obj
+ $(IntDir)%(Filename).obj
+
+
+
+
+
+ true
+ true
+ true
+ true
+ true
+ true
+
+
+
+
+
+
diff --git a/wolfssl.vcxproj b/wolfssl.vcxproj
index c38bc90b99d..44c23ab74ee 100644
--- a/wolfssl.vcxproj
+++ b/wolfssl.vcxproj
@@ -489,6 +489,20 @@
$(OutDir)%(Filename).obj
$(IntDir)%(Filename).obj
+
+ false
+ false
+ ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)
+ ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(OutDir)%(Filename).obj
+ $(IntDir)%(Filename).obj
+ false
+ false
+ ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)
+ ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(OutDir)%(Filename).obj
+ $(IntDir)%(Filename).obj
+
false
false
diff --git a/wolfssl/wolfcrypt/cpuid.h b/wolfssl/wolfcrypt/cpuid.h
index bb7e68436b4..aada8801191 100644
--- a/wolfssl/wolfcrypt/cpuid.h
+++ b/wolfssl/wolfcrypt/cpuid.h
@@ -67,6 +67,8 @@ typedef word32 cpuid_flags_t;
#define CPUID_MOVBE 0x0080 /* Move and byte swap */
#define CPUID_BMI1 0x0100 /* ANDN */
#define CPUID_SHA 0x0200 /* SHA-1 and SHA-256 instructions */
+ #define CPUID_VAES 0x0400 /* Vector AES (VAES) instructions */
+ #define CPUID_AVX512 0x0800 /* AVX-512 Foundation (AVX512F) instructions */
#define IS_INTEL_AVX1(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_AVX1)
#define IS_INTEL_AVX2(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_AVX2)
@@ -78,6 +80,8 @@ typedef word32 cpuid_flags_t;
#define IS_INTEL_MOVBE(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_MOVBE)
#define IS_INTEL_BMI1(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_BMI1)
#define IS_INTEL_SHA(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_SHA)
+ #define IS_INTEL_VAES(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_VAES)
+ #define IS_INTEL_AVX512(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_AVX512)
#elif defined(HAVE_CPUID_AARCH64)
diff --git a/wrapper/CSharp/wolfssl.vcxproj b/wrapper/CSharp/wolfssl.vcxproj
index 66694f76438..7a963cbd913 100644
--- a/wrapper/CSharp/wolfssl.vcxproj
+++ b/wrapper/CSharp/wolfssl.vcxproj
@@ -371,6 +371,20 @@
$(OutDir)%(Filename).obj
$(IntDir)%(Filename).obj
+
+ false
+ false
+ ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)
+ ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(OutDir)%(Filename).obj
+ $(IntDir)%(Filename).obj
+ false
+ false
+ ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)
+ ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)
+ $(OutDir)%(Filename).obj
+ $(IntDir)%(Filename).obj
+
false
false