From a342eba578b657db9b4a0a8b081f620c346f9391 Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Tue, 23 Jun 2026 10:35:31 +1000 Subject: [PATCH] Intel x64 ASM: Add new assembly for AES Support AES-XTS AVX512/VAES Support AES-GCM AVX512/VAES Support AES-ECB/CBC/CTR AVX512/VAES/AVX1/AES-NI. Remove code from aes_asm.S/aes_asm.asm Add CPU defines for AVX512 and VAES Updated ASM files with new defines for AVX512. Added support for printing out the new CPU Id flags in benchmark. Added new files to Windows projects. aes.c: Supports ECB/CBC/CTR in assembly. Supports calling AVX512/VAES assembly. --- .github/workflows/symbol-prefixes.yml | 2 +- .wolfssl_known_macro_extras | 2 + linuxkm/Kbuild | 2 + src/include.am | 5 + wolfcrypt/benchmark/benchmark.c | 2 + wolfcrypt/src/aes.c | 775 +- wolfcrypt/src/aes_asm.S | 1311 +-- wolfcrypt/src/aes_asm.asm | 1579 +-- wolfcrypt/src/aes_gcm_asm.S | 14249 +++++++++++++++++++++++- wolfcrypt/src/aes_gcm_asm.asm | 14181 ++++++++++++++++++++++- wolfcrypt/src/aes_x86_64_asm.S | 4375 ++++++++ wolfcrypt/src/aes_x86_64_asm.asm | 4283 +++++++ wolfcrypt/src/aes_xts_asm.S | 4412 ++++++++ wolfcrypt/src/aes_xts_asm.asm | 4468 ++++++++ wolfcrypt/src/chacha_asm.S | 10 + wolfcrypt/src/cpuid.c | 2 + wolfcrypt/src/fe_x25519_asm.S | 10 + wolfcrypt/src/include.am | 1 + wolfcrypt/src/poly1305_asm.S | 10 + wolfcrypt/src/sha256_asm.S | 10 + wolfcrypt/src/sha3_asm.S | 10 + wolfcrypt/src/sha512_asm.S | 10 + wolfcrypt/src/wc_mldsa_asm.S | 10 + wolfcrypt/src/wc_mlkem_asm.S | 10 + wolfssl-VS2022.vcxproj | 1168 +- wolfssl.vcxproj | 14 + wolfssl/wolfcrypt/cpuid.h | 4 + wrapper/CSharp/wolfssl.vcxproj | 14 + 28 files changed, 47480 insertions(+), 3449 deletions(-) create mode 100644 wolfcrypt/src/aes_x86_64_asm.S create mode 100644 wolfcrypt/src/aes_x86_64_asm.asm diff --git a/.github/workflows/symbol-prefixes.yml b/.github/workflows/symbol-prefixes.yml index 33142162ccf..fd767ca0092 100644 --- a/.github/workflows/symbol-prefixes.yml +++ 
b/.github/workflows/symbol-prefixes.yml @@ -49,7 +49,7 @@ jobs: { if (($7 !~ /^[0-9]+$/) || ($8 ~ /^(wc_|wolf|WOLF|__pfx|fe_|sp_[a-zA-Z090-0_]*[0-9])/) || - ($8 ~ /(_avx[12]|_AVX[12]|_sse[12]|_SSE[12]|_aesni|_AESNI|_bmi2|_x64$)/)) + ($8 ~ /(_avx[12]|_AVX[12]|_sse[12]|_SSE[12]|_aesni|_AESNI|_vaes|_VAES|_avx512|_AVX512|_bmi2|_x64$)/)) { next; } diff --git a/.wolfssl_known_macro_extras b/.wolfssl_known_macro_extras index 101ebf2fa88..c08a8580379 100644 --- a/.wolfssl_known_macro_extras +++ b/.wolfssl_known_macro_extras @@ -381,6 +381,7 @@ NO_AES_DECRYPT NO_ARDUINO_DEFAULT NO_ASM NO_ASN_OLD_TYPE_NAMES +NO_AVX512_SUPPORT NO_CAMELLIA_CBC NO_CERT NO_CERT_IN_TICKET @@ -459,6 +460,7 @@ NO_STDIO_FGETS_REMAP NO_STM32_HMAC NO_TKERNEL_MEM_POOL NO_TLSX_PSKKEM_PLAIN_ANNOUNCE +NO_VAES_SUPPORT NO_VERIFY_OID NO_WC_DHGENERATEPUBLIC NO_WC_SHE_GETUID diff --git a/linuxkm/Kbuild b/linuxkm/Kbuild index fe3f823942f..831a45c76a4 100644 --- a/linuxkm/Kbuild +++ b/linuxkm/Kbuild @@ -200,6 +200,8 @@ $(obj)/wolfcrypt/src/aes_gcm_asm.o: asflags-y := $(WOLFSSL_ASFLAGS) $(ASFLAGS_FP $(obj)/wolfcrypt/src/aes_gcm_asm.o: OBJECT_FILES_NON_STANDARD := y $(obj)/wolfcrypt/src/aes_xts_asm.o: asflags-y := $(WOLFSSL_ASFLAGS) $(ASFLAGS_FPU_DISABLE_SIMD_ENABLE) $(obj)/wolfcrypt/src/aes_xts_asm.o: OBJECT_FILES_NON_STANDARD := y +$(obj)/wolfcrypt/src/aes_x86_64_asm.o: asflags-y := $(WOLFSSL_ASFLAGS) $(ASFLAGS_FPU_DISABLE_SIMD_ENABLE) +$(obj)/wolfcrypt/src/aes_x86_64_asm.o: OBJECT_FILES_NON_STANDARD := y $(obj)/wolfcrypt/src/sp_x86_64_asm.o: asflags-y := $(WOLFSSL_ASFLAGS) $(ASFLAGS_FPU_DISABLE_SIMD_ENABLE) $(obj)/wolfcrypt/src/sp_x86_64_asm.o: OBJECT_FILES_NON_STANDARD := y $(obj)/wolfcrypt/src/sha256_asm.o: asflags-y := $(WOLFSSL_ASFLAGS) $(ASFLAGS_FPU_DISABLE_SIMD_ENABLE) diff --git a/src/include.am b/src/include.am index 4b80e149bac..2e904706f83 100644 --- a/src/include.am +++ b/src/include.am @@ -109,6 +109,7 @@ endif if BUILD_AESNI src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_asm.S 
+src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_x86_64_asm.S if BUILD_X86_ASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_gcm_x86_asm.S else @@ -259,6 +260,7 @@ endif BUILD_PPC64_ASM if BUILD_AESNI src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_asm.S +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_x86_64_asm.S if BUILD_X86_ASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_gcm_x86_asm.S else @@ -532,6 +534,7 @@ endif BUILD_PPC64_ASM if BUILD_AESNI src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_asm.S +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_x86_64_asm.S if BUILD_X86_ASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_gcm_x86_asm.S else @@ -867,6 +870,7 @@ endif BUILD_AES if BUILD_AESNI src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_asm.S +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_x86_64_asm.S if BUILD_X86_ASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_gcm_x86_asm.S else @@ -1708,6 +1712,7 @@ endif if !BUILD_FIPS_V2_PLUS if BUILD_AESNI src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_asm.S +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_x86_64_asm.S if BUILD_X86_ASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_gcm_x86_asm.S else diff --git a/wolfcrypt/benchmark/benchmark.c b/wolfcrypt/benchmark/benchmark.c index 214d873bc2f..a58fd300a3a 100644 --- a/wolfcrypt/benchmark/benchmark.c +++ b/wolfcrypt/benchmark/benchmark.c @@ -4794,6 +4794,8 @@ static void print_cpu_features(void) if (IS_INTEL_MOVBE(cpuid_flags)) printf(" movbe"); if (IS_INTEL_BMI1(cpuid_flags)) printf(" bmi1"); if (IS_INTEL_SHA(cpuid_flags)) printf(" sha"); + if (IS_INTEL_VAES(cpuid_flags)) printf(" vaes"); + if (IS_INTEL_AVX512(cpuid_flags)) printf(" avx512"); #endif #ifdef __aarch64__ printf("Aarch64 -"); diff --git a/wolfcrypt/src/aes.c b/wolfcrypt/src/aes.c index 6806acbc965..8a630217b9d 100644 --- a/wolfcrypt/src/aes.c +++ 
b/wolfcrypt/src/aes.c @@ -809,6 +809,218 @@ block cipher mechanism that uses n-bit binary string parameter key with 128-bits unsigned char* key_schedule) XASM_LINK("AES_256_Key_Expansion_AESNI"); +#ifdef WOLFSSL_X86_64_BUILD + /* Wide ECB / CBC / CTR variants for x86_64. They share the AES-NI key + * schedule declared above and are selected at runtime from intel_flags. + * AES_CBC_decrypt_AESNI is the single max-width path (the by4/by6/by8 + * variants are only used by the 32-bit x86 build). */ + #if defined(USE_INTEL_SPEEDUP) + #ifndef HAVE_INTEL_AVX1 + #define HAVE_INTEL_AVX1 + #endif + #if !defined(NO_AVX2_SUPPORT) && !defined(HAVE_INTEL_AVX2) + #define HAVE_INTEL_AVX2 + #endif + #if !defined(NO_VAES_SUPPORT) && !defined(HAVE_INTEL_VAES) + #define HAVE_INTEL_VAES + #endif + #if !defined(NO_AVX512_SUPPORT) && !defined(HAVE_INTEL_AVX512) + #define HAVE_INTEL_AVX512 + #endif + #endif + + void AES_CTR_encrypt_AESNI(const unsigned char* in, unsigned char* out, + unsigned long length, const unsigned char* KS, int nr, + unsigned char* ctr) XASM_LINK("AES_CTR_encrypt_AESNI"); + #ifdef HAVE_AES_DECRYPT + void AES_CBC_decrypt_AESNI(const unsigned char* in, unsigned char* out, + unsigned char* ivec, unsigned long length, const unsigned char* KS, + int nr) XASM_LINK("AES_CBC_decrypt_AESNI"); + #endif + + /* Declare the ECB, CBC and CTR encrypt entry points for one implementation suffix; each prototype is bound with XASM_LINK to the symbol of the same name. The expansion has no trailing semicolon, so the use site supplies it. */ #define AES_DECL_VARIANT(suff) \ + void AES_ECB_encrypt_##suff(const unsigned char* in, \ + unsigned char* out, unsigned long length, \ + const unsigned char* KS, int nr) \ + XASM_LINK("AES_ECB_encrypt_" #suff); \ + void AES_CBC_encrypt_##suff(const unsigned char* in, \ + unsigned char* out, unsigned char* ivec, unsigned long length, \ + const unsigned char* KS, int nr) \ + XASM_LINK("AES_CBC_encrypt_" #suff); \ + void AES_CTR_encrypt_##suff(const unsigned char* in, \ + unsigned char* out, unsigned long length, \ + const unsigned char* KS, int nr, unsigned char* ctr) \ + XASM_LINK("AES_CTR_encrypt_" #suff) + #ifdef HAVE_AES_DECRYPT + /* Declare the ECB and CBC decrypt entry points for one implementation suffix; the alternative definition below expands to nothing when HAVE_AES_DECRYPT is not defined. */ #define AES_DECL_VARIANT_DEC(suff) \ +
void AES_ECB_decrypt_##suff(const unsigned char* in, \ + unsigned char* out, unsigned long length, \ + const unsigned char* KS, int nr) \ + XASM_LINK("AES_ECB_decrypt_" #suff); \ + void AES_CBC_decrypt_##suff(const unsigned char* in, \ + unsigned char* out, unsigned char* ivec, \ + unsigned long length, const unsigned char* KS, int nr) \ + XASM_LINK("AES_CBC_decrypt_" #suff) + #else + #define AES_DECL_VARIANT_DEC(suff) /* no decrypt */ + #endif + + #ifdef HAVE_INTEL_AVX1 + AES_DECL_VARIANT(avx1); + AES_DECL_VARIANT_DEC(avx1); + #endif + #ifdef HAVE_INTEL_VAES + AES_DECL_VARIANT(vaes); + AES_DECL_VARIANT_DEC(vaes); + #endif + #ifdef HAVE_INTEL_AVX512 + AES_DECL_VARIANT(avx512); + AES_DECL_VARIANT_DEC(avx512); + #endif + + /* Pick the widest available implementation at runtime. Callers must + * already be inside a VECTOR_REGISTERS_PUSH / SAVE_VECTOR_REGISTERS + * region (all bulk AES-NI call sites are). */ + /* AES-ECB encrypt dispatcher. Forwards in, out, sz, key and nr unchanged to the first routine whose CPU flags are set in intel_flags, tried in this order: avx512 (AVX512 and VAES flags), vaes (AVX2 and VAES flags), avx1 (AVX1 flag), then the AES-NI routine as the fallback. Branches whose HAVE_INTEL_* macro is undefined are compiled out. NOTE(review): the vaes branch tests IS_INTEL_AVX2 at run time but is compiled under HAVE_INTEL_VAES alone, which the gates above define independently of NO_AVX2_SUPPORT - confirm that combination is intended. */ static WC_INLINE void AesEcbEncryptBlocks(const unsigned char* in, + unsigned char* out, word32 sz, const unsigned char* key, int nr) + { + #ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_ECB_encrypt_avx512(in, out, sz, key, nr); + } + else + #endif + #ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_ECB_encrypt_vaes(in, out, sz, key, nr); + } + else + #endif + #ifdef HAVE_INTEL_AVX1 + if (IS_INTEL_AVX1(intel_flags)) { + AES_ECB_encrypt_avx1(in, out, sz, key, nr); + } + else + #endif + { + AES_ECB_encrypt_AESNI(in, out, sz, key, nr); + } + } + + #ifdef HAVE_AES_DECRYPT + /* AES-ECB decrypt dispatcher; same selection order as the encrypt dispatcher (avx512, vaes, avx1, AES-NI fallback). Only compiled when HAVE_AES_DECRYPT is defined. */ static WC_INLINE void AesEcbDecryptBlocks(const unsigned char* in, + unsigned char* out, word32 sz, const unsigned char* key, int nr) + { + #ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_ECB_decrypt_avx512(in, out, sz, key, nr); + } + else + #endif + #ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { +
AES_ECB_decrypt_vaes(in, out, sz, key, nr); + } + else + #endif + #ifdef HAVE_INTEL_AVX1 + if (IS_INTEL_AVX1(intel_flags)) { + AES_ECB_decrypt_avx1(in, out, sz, key, nr); + } + else + #endif + { + AES_ECB_decrypt_AESNI(in, out, sz, key, nr); + } + } + #endif + + #ifdef HAVE_AES_CBC + /* AES-CBC encrypt dispatcher; same selection order (avx512, vaes, avx1, AES-NI fallback). iv is forwarded as the ivec argument. NOTE(review): the wc_AesCbcEncrypt call sites copy the last output block into aes->reg after this returns, so do not assume the assembly updates iv - confirm against the assembly sources. */ static WC_INLINE void AesCbcEncryptBlocks(const unsigned char* in, + unsigned char* out, unsigned char* iv, word32 sz, + const unsigned char* key, int nr) + { + #ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_CBC_encrypt_avx512(in, out, iv, sz, key, nr); + } + else + #endif + #ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_CBC_encrypt_vaes(in, out, iv, sz, key, nr); + } + else + #endif + #ifdef HAVE_INTEL_AVX1 + if (IS_INTEL_AVX1(intel_flags)) { + AES_CBC_encrypt_avx1(in, out, iv, sz, key, nr); + } + else + #endif + { + AES_CBC_encrypt_AESNI(in, out, iv, sz, key, nr); + } + } + #endif /* HAVE_AES_CBC */ + + #ifdef HAVE_AES_DECRYPT + /* AES-CBC decrypt dispatcher; same selection order (avx512, vaes, avx1, AES-NI fallback). NOTE(review): this is guarded by HAVE_AES_DECRYPT only, while the encrypt counterpart is guarded by HAVE_AES_CBC - confirm the asymmetry is intended. */ static WC_INLINE void AesCbcDecryptBlocks(const unsigned char* in, + unsigned char* out, unsigned char* iv, word32 sz, + const unsigned char* key, int nr) + { + #ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_CBC_decrypt_avx512(in, out, iv, sz, key, nr); + } + else + #endif + #ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_CBC_decrypt_vaes(in, out, iv, sz, key, nr); + } + else + #endif + #ifdef HAVE_INTEL_AVX1 + if (IS_INTEL_AVX1(intel_flags)) { + AES_CBC_decrypt_avx1(in, out, iv, sz, key, nr); + } + else + #endif + { + AES_CBC_decrypt_AESNI(in, out, iv, sz, key, nr); + } + } + #endif /* HAVE_AES_DECRYPT */ + + /* AES-CTR dispatcher; same selection order (avx512, vaes, avx1, AES-NI fallback). sz is forwarded as the length argument and ctr as the counter argument. The caller added by this patch passes a multiple of WC_AES_BLOCK_SIZE bytes and aes->reg as ctr; behaviour for other lengths is not visible here. */ static WC_INLINE void AesCtrEncryptBlocks(const unsigned char* in, + unsigned char* out, word32 sz, const unsigned char* key, int nr, + unsigned char* ctr) + { + #ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { +
AES_CTR_encrypt_avx512(in, out, sz, key, nr, ctr); + } + else + #endif + #ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_CTR_encrypt_vaes(in, out, sz, key, nr, ctr); + } + else + #endif + #ifdef HAVE_INTEL_AVX1 + if (IS_INTEL_AVX1(intel_flags)) { + AES_CTR_encrypt_avx1(in, out, sz, key, nr, ctr); + } + else + #endif + { + AES_CTR_encrypt_AESNI(in, out, sz, key, nr, ctr); + } + } +#endif /* WOLFSSL_X86_64_BUILD */ + static WARN_UNUSED_RESULT int AES_set_encrypt_key_AESNI( const unsigned char *userKey, const int bits, Aes* aes) @@ -6858,8 +7070,13 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) else { tmp_align = tmp + (AESNI_ALIGN - ((wc_ptr_t)tmp % AESNI_ALIGN)); XMEMCPY(tmp_align, in, sz); + #ifdef WOLFSSL_X86_64_BUILD + AesCbcEncryptBlocks(tmp_align, tmp_align, (byte*)aes->reg, sz, + (byte*)aes->key, (int)aes->rounds); + #else AES_CBC_encrypt_AESNI(tmp_align, tmp_align, (byte*)aes->reg, sz, (byte*)aes->key, (int)aes->rounds); + #endif /* store iv for next call */ XMEMCPY(aes->reg, tmp_align + sz - WC_AES_BLOCK_SIZE, WC_AES_BLOCK_SIZE); @@ -6873,8 +7090,13 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) ret = BAD_ALIGN_E; #endif } else { + #ifdef WOLFSSL_X86_64_BUILD + AesCbcEncryptBlocks(in, out, (byte*)aes->reg, sz, (byte*)aes->key, + (int)aes->rounds); + #else AES_CBC_encrypt_AESNI(in, out, (byte*)aes->reg, sz, (byte*)aes->key, (int)aes->rounds); + #endif /* store iv for next call */ XMEMCPY(aes->reg, out + sz - WC_AES_BLOCK_SIZE, WC_AES_BLOCK_SIZE); @@ -7056,7 +7278,10 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) /* if input and output same will overwrite input iv */ XMEMCPY(aes->tmp, in + sz - WC_AES_BLOCK_SIZE, WC_AES_BLOCK_SIZE); - #if defined(WOLFSSL_AESNI_BY4) || defined(WOLFSSL_X86_BUILD) + #if defined(WOLFSSL_X86_64_BUILD) + AesCbcDecryptBlocks(in, out, (byte*)aes->reg, sz, (byte*)aes->key, + (int)aes->rounds); + #elif
defined(WOLFSSL_AESNI_BY4) || defined(WOLFSSL_X86_BUILD) AES_CBC_decrypt_AESNI_by4(in, out, (byte*)aes->reg, sz, (byte*)aes->key, aes->rounds); #elif defined(WOLFSSL_AESNI_BY6) @@ -7563,6 +7788,19 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) #else VECTOR_REGISTERS_PUSH; + #if defined(WOLFSSL_AESNI) && defined(WOLFSSL_X86_64_BUILD) + if (aes->use_aesni && sz >= WC_AES_BLOCK_SIZE) { + word32 ctrBlocks = sz / WC_AES_BLOCK_SIZE; + word32 ctrBytes = ctrBlocks * WC_AES_BLOCK_SIZE; + AesCtrEncryptBlocks(in, out, ctrBytes, (byte*)aes->key, + (int)aes->rounds, (byte*)aes->reg); + in += ctrBytes; + out += ctrBytes; + sz -= ctrBytes; + aes->left = 0; + } + #endif + #if defined(HAVE_AES_ECB) && !defined(WOLFSSL_PIC32MZ_CRYPT) && \ !defined(XTRANSFORM_AESCTRBLOCK) if (in != out && sz >= WC_AES_BLOCK_SIZE) { @@ -7910,7 +8148,17 @@ void GenerateM0(Gcm* gcm) #if defined(WOLFSSL_AESNI) && defined(USE_INTEL_SPEEDUP) #define HAVE_INTEL_AVX1 - #define HAVE_INTEL_AVX2 + #ifndef NO_AVX2_SUPPORT + #define HAVE_INTEL_AVX2 + #endif + #ifdef WOLFSSL_X86_64_BUILD + #ifndef NO_VAES_SUPPORT + #define HAVE_INTEL_VAES + #endif + #ifndef NO_AVX512_SUPPORT + #define HAVE_INTEL_AVX512 + #endif + #endif #endif #if defined(WOLFSSL_AESNI) && defined(GCM_TABLE_4BIT) && \ @@ -8128,6 +8376,24 @@ void AES_GCM_encrypt_avx2(const unsigned char *in, unsigned char *out, word32 tbytes, const unsigned char* key, int nr) XASM_LINK("AES_GCM_encrypt_avx2"); +#ifdef HAVE_INTEL_AVX512 +void AES_GCM_encrypt_avx512(const unsigned char *in, unsigned char *out, + const unsigned char* addt, const unsigned char* ivec, + unsigned char *tag, word32 nbytes, + word32 abytes, word32 ibytes, + word32 tbytes, const unsigned char* key, + int nr) + XASM_LINK("AES_GCM_encrypt_avx512"); +#endif +#ifdef HAVE_INTEL_VAES +void AES_GCM_encrypt_vaes(const unsigned char *in, unsigned char *out, + const unsigned char* addt, const unsigned char* ivec, + unsigned char *tag, word32 nbytes, + word32 abytes, word32 
ibytes, + word32 tbytes, const unsigned char* key, + int nr) + XASM_LINK("AES_GCM_encrypt_vaes"); +#endif #endif /* HAVE_INTEL_AVX2 */ #endif /* HAVE_INTEL_AVX1 */ @@ -8152,6 +8418,22 @@ void AES_GCM_decrypt_avx2(const unsigned char *in, unsigned char *out, word32 abytes, word32 ibytes, word32 tbytes, const unsigned char* key, int nr, int* res) XASM_LINK("AES_GCM_decrypt_avx2"); +#ifdef HAVE_INTEL_AVX512 +void AES_GCM_decrypt_avx512(const unsigned char *in, unsigned char *out, + const unsigned char* addt, const unsigned char* ivec, + const unsigned char *tag, word32 nbytes, + word32 abytes, word32 ibytes, word32 tbytes, + const unsigned char* key, int nr, int* res) + XASM_LINK("AES_GCM_decrypt_avx512"); +#endif +#ifdef HAVE_INTEL_VAES +void AES_GCM_decrypt_vaes(const unsigned char *in, unsigned char *out, + const unsigned char* addt, const unsigned char* ivec, + const unsigned char *tag, word32 nbytes, + word32 abytes, word32 ibytes, word32 tbytes, + const unsigned char* key, int nr, int* res) + XASM_LINK("AES_GCM_decrypt_vaes"); +#endif #endif /* HAVE_INTEL_AVX2 */ #endif /* HAVE_INTEL_AVX1 */ #endif /* HAVE_AES_DECRYPT */ @@ -10535,6 +10817,22 @@ int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, #else #ifdef WOLFSSL_AESNI if (aes->use_aesni) { +#ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_encrypt_avx512(in, out, authIn, iv, authTag, sz, authInSz, ivSz, + authTagSz, (const byte*)aes->key, (int)aes->rounds); + ret = 0; + } + else +#endif +#ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_encrypt_vaes(in, out, authIn, iv, authTag, sz, authInSz, ivSz, + authTagSz, (const byte*)aes->key, (int)aes->rounds); + ret = 0; + } + else +#endif #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_AVX2(intel_flags)) { AES_GCM_encrypt_avx2(in, out, authIn, iv, authTag, sz, authInSz, ivSz, @@ -11293,6 +11591,28 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, 
word32 sz, #else #ifdef WOLFSSL_AESNI if (aes->use_aesni) { +#ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_decrypt_avx512(in, out, authIn, iv, authTag, sz, authInSz, ivSz, + authTagSz, (byte*)aes->key, (int)aes->rounds, &res); + if (res == 0) + ret = AES_GCM_AUTH_E; + else + ret = 0; + } + else +#endif +#ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_decrypt_vaes(in, out, authIn, iv, authTag, sz, authInSz, ivSz, + authTagSz, (byte*)aes->key, (int)aes->rounds, &res); + if (res == 0) + ret = AES_GCM_AUTH_E; + else + ret = 0; + } + else +#endif #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_AVX2(intel_flags)) { AES_GCM_decrypt_avx2(in, out, authIn, iv, authTag, sz, authInSz, ivSz, @@ -11513,19 +11833,73 @@ static WARN_UNUSED_RESULT int AesGcmFinal_C( extern void AES_GCM_init_avx2(const unsigned char* key, int nr, const unsigned char* ivec, unsigned int ibytes, unsigned char* h, unsigned char* counter, unsigned char* initCtr); +#ifdef HAVE_INTEL_AVX512 +extern void AES_GCM_init_avx512(const unsigned char* key, int nr, + const unsigned char* ivec, unsigned int ibytes, unsigned char* h, + unsigned char* counter, unsigned char* initCtr); +#endif +#ifdef HAVE_INTEL_VAES +extern void AES_GCM_init_vaes(const unsigned char* key, int nr, + const unsigned char* ivec, unsigned int ibytes, unsigned char* h, + unsigned char* counter, unsigned char* initCtr); +#endif extern void AES_GCM_aad_update_avx2(const unsigned char* addt, unsigned int abytes, unsigned char* tag, unsigned char* h); +#ifdef HAVE_INTEL_AVX512 +extern void AES_GCM_aad_update_avx512(const unsigned char* addt, + unsigned int abytes, unsigned char* tag, unsigned char* h); +#endif +#ifdef HAVE_INTEL_VAES +extern void AES_GCM_aad_update_vaes(const unsigned char* addt, + unsigned int abytes, unsigned char* tag, unsigned char* h); +#endif extern void AES_GCM_encrypt_block_avx2(const unsigned char* key, int nr, unsigned 
char* out, const unsigned char* in, unsigned char* counter); +#ifdef HAVE_INTEL_AVX512 +extern void AES_GCM_encrypt_block_avx512(const unsigned char* key, int nr, + unsigned char* out, const unsigned char* in, unsigned char* counter); +#endif +#ifdef HAVE_INTEL_VAES +extern void AES_GCM_encrypt_block_vaes(const unsigned char* key, int nr, + unsigned char* out, const unsigned char* in, unsigned char* counter); +#endif extern void AES_GCM_ghash_block_avx2(const unsigned char* data, unsigned char* tag, unsigned char* h); +#ifdef HAVE_INTEL_AVX512 +extern void AES_GCM_ghash_block_avx512(const unsigned char* data, + unsigned char* tag, unsigned char* h); +#endif +#ifdef HAVE_INTEL_VAES +extern void AES_GCM_ghash_block_vaes(const unsigned char* data, + unsigned char* tag, unsigned char* h); +#endif extern void AES_GCM_encrypt_update_avx2(const unsigned char* key, int nr, unsigned char* out, const unsigned char* in, unsigned int nbytes, unsigned char* tag, unsigned char* h, unsigned char* counter); +#ifdef HAVE_INTEL_AVX512 +extern void AES_GCM_encrypt_update_avx512(const unsigned char* key, int nr, + unsigned char* out, const unsigned char* in, unsigned int nbytes, + unsigned char* tag, unsigned char* h, unsigned char* counter); +#endif +#ifdef HAVE_INTEL_VAES +extern void AES_GCM_encrypt_update_vaes(const unsigned char* key, int nr, + unsigned char* out, const unsigned char* in, unsigned int nbytes, + unsigned char* tag, unsigned char* h, unsigned char* counter); +#endif extern void AES_GCM_encrypt_final_avx2(unsigned char* tag, unsigned char* authTag, unsigned int tbytes, unsigned int nbytes, unsigned int abytes, unsigned char* h, unsigned char* initCtr); +#ifdef HAVE_INTEL_AVX512 +extern void AES_GCM_encrypt_final_avx512(unsigned char* tag, + unsigned char* authTag, unsigned int tbytes, unsigned int nbytes, + unsigned int abytes, unsigned char* h, unsigned char* initCtr); +#endif +#ifdef HAVE_INTEL_VAES +extern void AES_GCM_encrypt_final_vaes(unsigned char* tag, + 
unsigned char* authTag, unsigned int tbytes, unsigned int nbytes, + unsigned int abytes, unsigned char* h, unsigned char* initCtr); +#endif #endif #ifdef HAVE_INTEL_AVX1 extern void AES_GCM_init_avx1(const unsigned char* key, int nr, @@ -11587,6 +11961,20 @@ static WARN_UNUSED_RESULT int AesGcmInit_aesni( aes->aOver = 0; aes->cOver = 0; +#ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_init_avx512((byte*)aes->key, (int)aes->rounds, iv, ivSz, + aes->gcm.H, AES_COUNTER(aes), AES_INITCTR(aes)); + } + else +#endif +#ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_init_vaes((byte*)aes->key, (int)aes->rounds, iv, ivSz, + aes->gcm.H, AES_COUNTER(aes), AES_INITCTR(aes)); + } + else +#endif #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_AVX2(intel_flags)) { AES_GCM_init_avx2((byte*)aes->key, (int)aes->rounds, iv, ivSz, @@ -11641,6 +12029,20 @@ static WARN_UNUSED_RESULT int AesGcmAadUpdate_aesni( aes->aOver = (byte)(aes->aOver + sz); if (aes->aOver == WC_AES_BLOCK_SIZE) { /* We have filled up the block and can process. */ +#ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_ghash_block_avx512(AES_LASTGBLOCK(aes), AES_TAG(aes), + aes->gcm.H); + } + else +#endif +#ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_ghash_block_vaes(AES_LASTGBLOCK(aes), AES_TAG(aes), + aes->gcm.H); + } + else +#endif #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_AVX2(intel_flags)) { AES_GCM_ghash_block_avx2(AES_LASTGBLOCK(aes), AES_TAG(aes), @@ -11672,6 +12074,20 @@ static WARN_UNUSED_RESULT int AesGcmAadUpdate_aesni( partial = aSz % WC_AES_BLOCK_SIZE; if (blocks > 0) { /* GHASH full blocks now. 
*/ +#ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_aad_update_avx512(a, blocks * WC_AES_BLOCK_SIZE, + AES_TAG(aes), aes->gcm.H); + } + else +#endif +#ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_aad_update_vaes(a, blocks * WC_AES_BLOCK_SIZE, + AES_TAG(aes), aes->gcm.H); + } + else +#endif #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_AVX2(intel_flags)) { AES_GCM_aad_update_avx2(a, blocks * WC_AES_BLOCK_SIZE, @@ -11705,6 +12121,20 @@ static WARN_UNUSED_RESULT int AesGcmAadUpdate_aesni( XMEMSET(AES_LASTGBLOCK(aes) + aes->aOver, 0, (size_t)WC_AES_BLOCK_SIZE - aes->aOver); /* GHASH last AAD block. */ +#ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_ghash_block_avx512(AES_LASTGBLOCK(aes), AES_TAG(aes), + aes->gcm.H); + } + else +#endif +#ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_ghash_block_vaes(AES_LASTGBLOCK(aes), AES_TAG(aes), + aes->gcm.H); + } + else +#endif #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_AVX2(intel_flags)) { AES_GCM_ghash_block_avx2(AES_LASTGBLOCK(aes), AES_TAG(aes), @@ -11772,6 +12202,20 @@ static WARN_UNUSED_RESULT int AesGcmEncryptUpdate_aesni( aes->cOver = (byte)(aes->cOver + sz); if (aes->cOver == WC_AES_BLOCK_SIZE) { /* We have filled up the block and can process. 
*/ +#ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_ghash_block_avx512(AES_LASTGBLOCK(aes), AES_TAG(aes), + aes->gcm.H); + } + else +#endif +#ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_ghash_block_vaes(AES_LASTGBLOCK(aes), AES_TAG(aes), + aes->gcm.H); + } + else +#endif #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_AVX2(intel_flags)) { AES_GCM_ghash_block_avx2(AES_LASTGBLOCK(aes), AES_TAG(aes), @@ -11804,6 +12248,22 @@ static WARN_UNUSED_RESULT int AesGcmEncryptUpdate_aesni( partial = cSz % WC_AES_BLOCK_SIZE; if (blocks > 0) { /* Encrypt and GHASH full blocks now. */ +#ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_encrypt_update_avx512((byte*)aes->key, (int)aes->rounds, + c, p, blocks * WC_AES_BLOCK_SIZE, AES_TAG(aes), aes->gcm.H, + AES_COUNTER(aes)); + } + else +#endif +#ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_encrypt_update_vaes((byte*)aes->key, (int)aes->rounds, + c, p, blocks * WC_AES_BLOCK_SIZE, AES_TAG(aes), aes->gcm.H, + AES_COUNTER(aes)); + } + else +#endif #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_AVX2(intel_flags)) { AES_GCM_encrypt_update_avx2((byte*)aes->key, (int)aes->rounds, @@ -11832,6 +12292,20 @@ static WARN_UNUSED_RESULT int AesGcmEncryptUpdate_aesni( if (partial != 0) { /* Encrypt the counter - XOR in zeros as proxy for plaintext. 
*/ XMEMSET(AES_LASTGBLOCK(aes), 0, WC_AES_BLOCK_SIZE); +#ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_encrypt_block_avx512((byte*)aes->key, (int)aes->rounds, + AES_LASTGBLOCK(aes), AES_LASTGBLOCK(aes), AES_COUNTER(aes)); + } + else +#endif +#ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_encrypt_block_vaes((byte*)aes->key, (int)aes->rounds, + AES_LASTGBLOCK(aes), AES_LASTGBLOCK(aes), AES_COUNTER(aes)); + } + else +#endif #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_AVX2(intel_flags)) { AES_GCM_encrypt_block_avx2((byte*)aes->key, (int)aes->rounds, @@ -11887,6 +12361,20 @@ static WARN_UNUSED_RESULT int AesGcmEncryptFinal_aesni( /* Fill the rest of the block with zeros. */ XMEMSET(AES_LASTGBLOCK(aes) + over, 0, (size_t)WC_AES_BLOCK_SIZE - over); /* GHASH last cipher block. */ +#ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_ghash_block_avx512(AES_LASTGBLOCK(aes), AES_TAG(aes), + aes->gcm.H); + } + else +#endif +#ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_ghash_block_vaes(AES_LASTGBLOCK(aes), AES_TAG(aes), + aes->gcm.H); + } + else +#endif #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_AVX2(intel_flags)) { AES_GCM_ghash_block_avx2(AES_LASTGBLOCK(aes), AES_TAG(aes), @@ -11907,6 +12395,20 @@ static WARN_UNUSED_RESULT int AesGcmEncryptFinal_aesni( } } /* Calculate the authentication tag. 
*/ +#ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_encrypt_final_avx512(AES_TAG(aes), authTag, authTagSz, aes->cSz, + aes->aSz, aes->gcm.H, AES_INITCTR(aes)); + } + else +#endif +#ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_encrypt_final_vaes(AES_TAG(aes), authTag, authTagSz, aes->cSz, + aes->aSz, aes->gcm.H, AES_INITCTR(aes)); + } + else +#endif #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_AVX2(intel_flags)) { AES_GCM_encrypt_final_avx2(AES_TAG(aes), authTag, authTagSz, aes->cSz, @@ -11940,9 +12442,29 @@ static WARN_UNUSED_RESULT int AesGcmEncryptFinal_aesni( extern void AES_GCM_decrypt_update_avx2(const unsigned char* key, int nr, unsigned char* out, const unsigned char* in, unsigned int nbytes, unsigned char* tag, unsigned char* h, unsigned char* counter); +#ifdef HAVE_INTEL_AVX512 +extern void AES_GCM_decrypt_update_avx512(const unsigned char* key, int nr, + unsigned char* out, const unsigned char* in, unsigned int nbytes, + unsigned char* tag, unsigned char* h, unsigned char* counter); +#endif +#ifdef HAVE_INTEL_VAES +extern void AES_GCM_decrypt_update_vaes(const unsigned char* key, int nr, + unsigned char* out, const unsigned char* in, unsigned int nbytes, + unsigned char* tag, unsigned char* h, unsigned char* counter); +#endif extern void AES_GCM_decrypt_final_avx2(unsigned char* tag, const unsigned char* authTag, unsigned int tbytes, unsigned int nbytes, unsigned int abytes, unsigned char* h, unsigned char* initCtr, int* res); +#ifdef HAVE_INTEL_AVX512 +extern void AES_GCM_decrypt_final_avx512(unsigned char* tag, + const unsigned char* authTag, unsigned int tbytes, unsigned int nbytes, + unsigned int abytes, unsigned char* h, unsigned char* initCtr, int* res); +#endif +#ifdef HAVE_INTEL_VAES +extern void AES_GCM_decrypt_final_vaes(unsigned char* tag, + const unsigned char* authTag, unsigned int tbytes, unsigned int nbytes, + unsigned int abytes, unsigned 
char* h, unsigned char* initCtr, int* res); +#endif #endif #ifdef HAVE_INTEL_AVX1 extern void AES_GCM_decrypt_update_avx1(const unsigned char* key, int nr, @@ -12005,6 +12527,20 @@ static WARN_UNUSED_RESULT int AesGcmDecryptUpdate_aesni( aes->cOver = (byte)(aes->cOver + sz); if (aes->cOver == WC_AES_BLOCK_SIZE) { /* We have filled up the block and can process. */ +#ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_ghash_block_avx512(AES_LASTBLOCK(aes), AES_TAG(aes), + aes->gcm.H); + } + else +#endif +#ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_ghash_block_vaes(AES_LASTBLOCK(aes), AES_TAG(aes), + aes->gcm.H); + } + else +#endif #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_AVX2(intel_flags)) { AES_GCM_ghash_block_avx2(AES_LASTBLOCK(aes), AES_TAG(aes), @@ -12037,6 +12573,22 @@ static WARN_UNUSED_RESULT int AesGcmDecryptUpdate_aesni( partial = cSz % WC_AES_BLOCK_SIZE; if (blocks > 0) { /* Decrypt and GHASH full blocks now. */ +#ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_decrypt_update_avx512((byte*)aes->key, (int)aes->rounds, + p, c, blocks * WC_AES_BLOCK_SIZE, AES_TAG(aes), aes->gcm.H, + AES_COUNTER(aes)); + } + else +#endif +#ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_decrypt_update_vaes((byte*)aes->key, (int)aes->rounds, + p, c, blocks * WC_AES_BLOCK_SIZE, AES_TAG(aes), aes->gcm.H, + AES_COUNTER(aes)); + } + else +#endif #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_AVX2(intel_flags)) { AES_GCM_decrypt_update_avx2((byte*)aes->key, (int)aes->rounds, @@ -12065,6 +12617,20 @@ static WARN_UNUSED_RESULT int AesGcmDecryptUpdate_aesni( if (partial != 0) { /* Encrypt the counter - XOR in zeros as proxy for cipher text. 
*/ XMEMSET(AES_LASTGBLOCK(aes), 0, WC_AES_BLOCK_SIZE); +#ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_encrypt_block_avx512((byte*)aes->key, (int)aes->rounds, + AES_LASTGBLOCK(aes), AES_LASTGBLOCK(aes), AES_COUNTER(aes)); + } + else +#endif +#ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_encrypt_block_vaes((byte*)aes->key, (int)aes->rounds, + AES_LASTGBLOCK(aes), AES_LASTGBLOCK(aes), AES_COUNTER(aes)); + } + else +#endif #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_AVX2(intel_flags)) { AES_GCM_encrypt_block_avx2((byte*)aes->key, (int)aes->rounds, @@ -12127,6 +12693,18 @@ static WARN_UNUSED_RESULT int AesGcmDecryptFinal_aesni( /* Zeroize the unused part of the block. */ XMEMSET(lastBlock + over, 0, (size_t)WC_AES_BLOCK_SIZE - over); /* Hash the last block of cipher text. */ +#ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_ghash_block_avx512(lastBlock, AES_TAG(aes), aes->gcm.H); + } + else +#endif +#ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_ghash_block_vaes(lastBlock, AES_TAG(aes), aes->gcm.H); + } + else +#endif #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_AVX2(intel_flags)) { AES_GCM_ghash_block_avx2(lastBlock, AES_TAG(aes), aes->gcm.H); @@ -12144,6 +12722,20 @@ static WARN_UNUSED_RESULT int AesGcmDecryptFinal_aesni( } } /* Calculate and compare the authentication tag. 
*/ +#ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_decrypt_final_avx512(AES_TAG(aes), authTag, authTagSz, aes->cSz, + aes->aSz, aes->gcm.H, AES_INITCTR(aes), &res); + } + else +#endif +#ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_decrypt_final_vaes(AES_TAG(aes), authTag, authTagSz, aes->cSz, + aes->aSz, aes->gcm.H, AES_INITCTR(aes), &res); + } + else +#endif #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_AVX2(intel_flags)) { AES_GCM_decrypt_final_avx2(AES_TAG(aes), authTag, authTagSz, aes->cSz, @@ -14537,7 +15129,11 @@ static WARN_UNUSED_RESULT int _AesEcbEncrypt( #else #ifdef WOLFSSL_AESNI if (aes->use_aesni) { + #ifdef WOLFSSL_X86_64_BUILD + AesEcbEncryptBlocks(in, out, sz, (byte*)aes->key, (int)aes->rounds); + #else AES_ECB_encrypt_AESNI(in, out, sz, (byte*)aes->key, (int)aes->rounds); + #endif } else #endif @@ -14632,7 +15228,11 @@ static WARN_UNUSED_RESULT int _AesEcbDecrypt( #else #ifdef WOLFSSL_AESNI if (aes->use_aesni) { + #ifdef WOLFSSL_X86_64_BUILD + AesEcbDecryptBlocks(in, out, sz, (byte*)aes->key, (int)aes->rounds); + #else AES_ECB_decrypt_AESNI(in, out, sz, (byte*)aes->key, (int)aes->rounds); + #endif } else #endif @@ -15797,6 +16397,37 @@ void AES_XTS_encrypt_update_avx1(const unsigned char *in, unsigned char *out, wo XASM_LINK("AES_XTS_encrypt_update_avx1"); #endif #endif /* HAVE_INTEL_AVX1 */ +#ifdef HAVE_INTEL_VAES +void AES_XTS_encrypt_vaes(const unsigned char *in, unsigned char *out, + word32 sz, const unsigned char* i, + const unsigned char* key, const unsigned char* key2, + int nr) + XASM_LINK("AES_XTS_encrypt_vaes"); +#ifdef WOLFSSL_AESXTS_STREAM +void AES_XTS_init_vaes(unsigned char* i, const unsigned char* tweak_key, + int tweak_nr) + XASM_LINK("AES_XTS_init_vaes"); +void AES_XTS_encrypt_update_vaes(const unsigned char *in, unsigned char *out, word32 sz, + const unsigned char* key, unsigned char *i, int nr) + 
XASM_LINK("AES_XTS_encrypt_update_vaes"); +#endif +#endif /* HAVE_INTEL_VAES */ +#ifdef HAVE_INTEL_AVX512 +void AES_XTS_encrypt_avx512(const unsigned char *in, unsigned char *out, + word32 sz, const unsigned char* i, + const unsigned char* key, const unsigned char* key2, + int nr) + XASM_LINK("AES_XTS_encrypt_avx512"); +#ifdef WOLFSSL_AESXTS_STREAM +void AES_XTS_init_avx512(unsigned char* i, const unsigned char* tweak_key, + int tweak_nr) + XASM_LINK("AES_XTS_init_avx512"); +void AES_XTS_encrypt_update_avx512(const unsigned char *in, unsigned char *out, word32 sz, + const unsigned char* key, unsigned char *i, int nr) + XASM_LINK("AES_XTS_encrypt_update_avx512"); +#endif +#endif /* HAVE_INTEL_AVX512 */ + #ifdef HAVE_AES_DECRYPT void AES_XTS_decrypt_aesni(const unsigned char *in, unsigned char *out, word32 sz, @@ -15820,6 +16451,30 @@ void AES_XTS_decrypt_update_avx1(const unsigned char *in, unsigned char *out, wo XASM_LINK("AES_XTS_decrypt_update_avx1"); #endif #endif /* HAVE_INTEL_AVX1 */ +#ifdef HAVE_INTEL_VAES +void AES_XTS_decrypt_vaes(const unsigned char *in, unsigned char *out, + word32 sz, const unsigned char* i, + const unsigned char* key, const unsigned char* key2, + int nr) + XASM_LINK("AES_XTS_decrypt_vaes"); +#ifdef WOLFSSL_AESXTS_STREAM +void AES_XTS_decrypt_update_vaes(const unsigned char *in, unsigned char *out, word32 sz, + const unsigned char* key, unsigned char *i, int nr) + XASM_LINK("AES_XTS_decrypt_update_vaes"); +#endif +#endif /* HAVE_INTEL_VAES */ +#ifdef HAVE_INTEL_AVX512 +void AES_XTS_decrypt_avx512(const unsigned char *in, unsigned char *out, + word32 sz, const unsigned char* i, + const unsigned char* key, const unsigned char* key2, + int nr) + XASM_LINK("AES_XTS_decrypt_avx512"); +#ifdef WOLFSSL_AESXTS_STREAM +void AES_XTS_decrypt_update_avx512(const unsigned char *in, unsigned char *out, word32 sz, + const unsigned char* key, unsigned char *i, int nr) + XASM_LINK("AES_XTS_decrypt_update_avx512"); +#endif +#endif /* HAVE_INTEL_AVX512 */ 
#endif /* HAVE_AES_DECRYPT */ #endif /* WOLFSSL_AESNI */ @@ -16078,6 +16733,26 @@ int wc_AesXtsEncrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz, #elif defined(WOLFSSL_AESNI) if (aes->use_aesni) { SAVE_VECTOR_REGISTERS(return _svr_ret;); +#if defined(HAVE_INTEL_AVX512) + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_XTS_encrypt_avx512(in, out, sz, i, + (const byte*)aes->key, + (const byte*)xaes->tweak.key, + (int)aes->rounds); + ret = 0; + } + else +#endif +#if defined(HAVE_INTEL_VAES) + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_XTS_encrypt_vaes(in, out, sz, i, + (const byte*)aes->key, + (const byte*)xaes->tweak.key, + (int)aes->rounds); + ret = 0; + } + else +#endif #if defined(HAVE_INTEL_AVX1) if (IS_INTEL_AVX1(intel_flags)) { AES_XTS_encrypt_avx1(in, out, sz, i, @@ -16180,6 +16855,24 @@ int wc_AesXtsEncryptInit(XtsAes* xaes, const byte* i, word32 iSz, #ifdef WOLFSSL_AESNI if (aes->use_aesni) { SAVE_VECTOR_REGISTERS(return _svr_ret;); +#if defined(HAVE_INTEL_AVX512) + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_XTS_init_avx512(stream->tweak_block, + (const byte*)xaes->tweak.key, + (int)xaes->tweak.rounds); + ret = 0; + } + else +#endif +#if defined(HAVE_INTEL_VAES) + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_XTS_init_vaes(stream->tweak_block, + (const byte*)xaes->tweak.key, + (int)xaes->tweak.rounds); + ret = 0; + } + else +#endif #if defined(HAVE_INTEL_AVX1) if (IS_INTEL_AVX1(intel_flags)) { AES_XTS_init_avx1(stream->tweak_block, @@ -16275,6 +16968,26 @@ static int AesXtsEncryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s #ifdef WOLFSSL_AESNI if (aes->use_aesni) { SAVE_VECTOR_REGISTERS(return _svr_ret;); +#if defined(HAVE_INTEL_AVX512) + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_XTS_encrypt_update_avx512(in, out, sz, + (const byte*)aes->key, + stream->tweak_block, + (int)aes->rounds); + ret = 0; + } + 
else +#endif +#if defined(HAVE_INTEL_VAES) + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_XTS_encrypt_update_vaes(in, out, sz, + (const byte*)aes->key, + stream->tweak_block, + (int)aes->rounds); + ret = 0; + } + else +#endif #if defined(HAVE_INTEL_AVX1) if (IS_INTEL_AVX1(intel_flags)) { AES_XTS_encrypt_update_avx1(in, out, sz, @@ -16559,6 +17272,26 @@ int wc_AesXtsDecrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz, #elif defined(WOLFSSL_AESNI) if (aes->use_aesni) { SAVE_VECTOR_REGISTERS(return _svr_ret;); +#if defined(HAVE_INTEL_AVX512) + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_XTS_decrypt_avx512(in, out, sz, i, + (const byte*)aes->key, + (const byte*)xaes->tweak.key, + (int)aes->rounds); + ret = 0; + } + else +#endif +#if defined(HAVE_INTEL_VAES) + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_XTS_decrypt_vaes(in, out, sz, i, + (const byte*)aes->key, + (const byte*)xaes->tweak.key, + (int)aes->rounds); + ret = 0; + } + else +#endif #if defined(HAVE_INTEL_AVX1) if (IS_INTEL_AVX1(intel_flags)) { AES_XTS_decrypt_avx1(in, out, sz, i, @@ -16664,6 +17397,24 @@ int wc_AesXtsDecryptInit(XtsAes* xaes, const byte* i, word32 iSz, #ifdef WOLFSSL_AESNI if (aes->use_aesni) { SAVE_VECTOR_REGISTERS(return _svr_ret;); +#if defined(HAVE_INTEL_AVX512) + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_XTS_init_avx512(stream->tweak_block, + (const byte*)xaes->tweak.key, + (int)xaes->tweak.rounds); + ret = 0; + } + else +#endif +#if defined(HAVE_INTEL_VAES) + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_XTS_init_vaes(stream->tweak_block, + (const byte*)xaes->tweak.key, + (int)xaes->tweak.rounds); + ret = 0; + } + else +#endif #if defined(HAVE_INTEL_AVX1) if (IS_INTEL_AVX1(intel_flags)) { AES_XTS_init_avx1(stream->tweak_block, @@ -16751,6 +17502,26 @@ static int AesXtsDecryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s #ifdef 
WOLFSSL_AESNI if (aes->use_aesni) { SAVE_VECTOR_REGISTERS(return _svr_ret;); +#if defined(HAVE_INTEL_AVX512) + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_XTS_decrypt_update_avx512(in, out, sz, + (const byte*)aes->key, + stream->tweak_block, + (int)aes->rounds); + ret = 0; + } + else +#endif +#if defined(HAVE_INTEL_VAES) + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_XTS_decrypt_update_vaes(in, out, sz, + (const byte*)aes->key, + stream->tweak_block, + (int)aes->rounds); + ret = 0; + } + else +#endif #if defined(HAVE_INTEL_AVX1) if (IS_INTEL_AVX1(intel_flags)) { AES_XTS_decrypt_update_avx1(in, out, sz, diff --git a/wolfcrypt/src/aes_asm.S b/wolfcrypt/src/aes_asm.S index 0371ca8cb22..d4131676542 100644 --- a/wolfcrypt/src/aes_asm.S +++ b/wolfcrypt/src/aes_asm.S @@ -46,1314 +46,7 @@ #endif /* WOLFSSL_USER_SETTINGS_ASM */ #endif /* WOLFSSL_USER_SETTINGS */ -#ifdef WOLFSSL_X86_64_BUILD - -/* -AES_CBC_encrypt_AESNI (const unsigned char *in, - unsigned char *out, - unsigned char ivec[16], - unsigned long length, - const unsigned char *KS, - int nr) -*/ -#ifndef __APPLE__ -.globl AES_CBC_encrypt_AESNI -AES_CBC_encrypt_AESNI: -#else -.globl _AES_CBC_encrypt_AESNI -_AES_CBC_encrypt_AESNI: -#endif -# parameter 1: %rdi -# parameter 2: %rsi -# parameter 3: %rdx -# parameter 4: %rcx -# parameter 5: %r8 -# parameter 6: %r9d -movq %rcx, %r10 -shrq $4, %rcx -shlq $60, %r10 -je NO_PARTS -addq $1, %rcx -NO_PARTS: -subq $16, %rsi -movdqa (%rdx), %xmm1 -LOOP: -pxor (%rdi), %xmm1 -pxor (%r8), %xmm1 -addq $16,%rsi -addq $16,%rdi -cmpl $12, %r9d -aesenc 16(%r8),%xmm1 -aesenc 32(%r8),%xmm1 -aesenc 48(%r8),%xmm1 -aesenc 64(%r8),%xmm1 -aesenc 80(%r8),%xmm1 -aesenc 96(%r8),%xmm1 -aesenc 112(%r8),%xmm1 -aesenc 128(%r8),%xmm1 -aesenc 144(%r8),%xmm1 -movdqa 160(%r8),%xmm2 -jb LAST -cmpl $14, %r9d - -aesenc 160(%r8),%xmm1 -aesenc 176(%r8),%xmm1 -movdqa 192(%r8),%xmm2 -jb LAST -aesenc 192(%r8),%xmm1 -aesenc 208(%r8),%xmm1 -movdqa 224(%r8),%xmm2 
-LAST: -decq %rcx -aesenclast %xmm2,%xmm1 -movdqu %xmm1,(%rsi) -jne LOOP -ret - - -#if defined(WOLFSSL_AESNI_BY4) - -/* -AES_CBC_decrypt_AESNI_by4 (const unsigned char *in, - unsigned char *out, - unsigned char ivec[16], - unsigned long length, - const unsigned char *KS, - int nr) -*/ -#ifndef __APPLE__ -.globl AES_CBC_decrypt_AESNI_by4 -AES_CBC_decrypt_AESNI_by4: -#else -.globl _AES_CBC_decrypt_AESNI_by4 -_AES_CBC_decrypt_AESNI_by4: -#endif -# parameter 1: %rdi -# parameter 2: %rsi -# parameter 3: %rdx -# parameter 4: %rcx -# parameter 5: %r8 -# parameter 6: %r9d - - movq %rcx, %r10 - shrq $4, %rcx - shlq $60, %r10 - je DNO_PARTS_4 - addq $1, %rcx -DNO_PARTS_4: - movq %rcx, %r10 - shlq $62, %r10 - shrq $62, %r10 - shrq $2, %rcx - movdqu (%rdx),%xmm5 - je DREMAINDER_4 - subq $64, %rsi -DLOOP_4: - movdqu (%rdi), %xmm1 - movdqu 16(%rdi), %xmm2 - movdqu 32(%rdi), %xmm3 - movdqu 48(%rdi), %xmm4 - movdqa %xmm1, %xmm6 - movdqa %xmm2, %xmm7 - movdqa %xmm3, %xmm8 - movdqa %xmm4, %xmm15 - movdqa (%r8), %xmm9 - movdqa 16(%r8), %xmm10 - movdqa 32(%r8), %xmm11 - movdqa 48(%r8), %xmm12 - pxor %xmm9, %xmm1 - pxor %xmm9, %xmm2 - pxor %xmm9, %xmm3 - pxor %xmm9, %xmm4 - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm3 - aesdec %xmm10, %xmm4 - aesdec %xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm11, %xmm3 - aesdec %xmm11, %xmm4 - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdec %xmm12, %xmm3 - aesdec %xmm12, %xmm4 - movdqa 64(%r8), %xmm9 - movdqa 80(%r8), %xmm10 - movdqa 96(%r8), %xmm11 - movdqa 112(%r8), %xmm12 - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm3 - aesdec %xmm9, %xmm4 - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm3 - aesdec %xmm10, %xmm4 - aesdec %xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm11, %xmm3 - aesdec %xmm11, %xmm4 - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdec %xmm12, %xmm3 - aesdec %xmm12, %xmm4 - movdqa 128(%r8), %xmm9 - movdqa 144(%r8), %xmm10 - movdqa 160(%r8), %xmm11 - cmpl $12, 
%r9d - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm3 - aesdec %xmm9, %xmm4 - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm3 - aesdec %xmm10, %xmm4 - jb DLAST_4 - movdqa 160(%r8), %xmm9 - movdqa 176(%r8), %xmm10 - movdqa 192(%r8), %xmm11 - cmpl $14, %r9d - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm3 - aesdec %xmm9, %xmm4 - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm3 - aesdec %xmm10, %xmm4 - jb DLAST_4 - movdqa 192(%r8), %xmm9 - movdqa 208(%r8), %xmm10 - movdqa 224(%r8), %xmm11 - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm3 - aesdec %xmm9, %xmm4 - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm3 - aesdec %xmm10, %xmm4 -DLAST_4: - addq $64, %rdi - addq $64, %rsi - decq %rcx - aesdeclast %xmm11, %xmm1 - aesdeclast %xmm11, %xmm2 - aesdeclast %xmm11, %xmm3 - aesdeclast %xmm11, %xmm4 - pxor %xmm5, %xmm1 - pxor %xmm6, %xmm2 - pxor %xmm7, %xmm3 - pxor %xmm8, %xmm4 - movdqu %xmm1, (%rsi) - movdqu %xmm2, 16(%rsi) - movdqu %xmm3, 32(%rsi) - movdqu %xmm4, 48(%rsi) - movdqa %xmm15,%xmm5 - jne DLOOP_4 - addq $64, %rsi -DREMAINDER_4: - cmpq $0, %r10 - je DEND_4 -DLOOP_4_2: - movdqu (%rdi), %xmm1 - movdqa %xmm1, %xmm15 - addq $16, %rdi - pxor (%r8), %xmm1 - movdqu 160(%r8), %xmm2 - cmpl $12, %r9d - aesdec 16(%r8), %xmm1 - aesdec 32(%r8), %xmm1 - aesdec 48(%r8), %xmm1 - aesdec 64(%r8), %xmm1 - aesdec 80(%r8), %xmm1 - aesdec 96(%r8), %xmm1 - aesdec 112(%r8), %xmm1 - aesdec 128(%r8), %xmm1 - aesdec 144(%r8), %xmm1 - jb DLAST_4_2 - movdqu 192(%r8), %xmm2 - cmpl $14, %r9d - aesdec 160(%r8), %xmm1 - aesdec 176(%r8), %xmm1 - jb DLAST_4_2 - movdqu 224(%r8), %xmm2 - aesdec 192(%r8), %xmm1 - aesdec 208(%r8), %xmm1 -DLAST_4_2: - aesdeclast %xmm2, %xmm1 - pxor %xmm5, %xmm1 - movdqa %xmm15, %xmm5 - movdqu %xmm1, (%rsi) - addq $16, %rsi - decq %r10 - jne DLOOP_4_2 -DEND_4: - ret - -#elif defined(WOLFSSL_AESNI_BY6) - -/* -AES_CBC_decrypt_AESNI_by6 (const unsigned char *in, - 
unsigned char *out, - unsigned char ivec[16], - unsigned long length, - const unsigned char *KS, - int nr) -*/ -#ifndef __APPLE__ -.globl AES_CBC_decrypt_AESNI_by6 -AES_CBC_decrypt_AESNI_by6: -#else -.globl _AES_CBC_decrypt_AESNI_by6 -_AES_CBC_decrypt_AESNI_by6: -#endif -# parameter 1: %rdi - in -# parameter 2: %rsi - out -# parameter 3: %rdx - ivec -# parameter 4: %rcx - length -# parameter 5: %r8 - KS -# parameter 6: %r9d - nr - - movq %rcx, %r10 - shrq $4, %rcx - shlq $60, %r10 - je DNO_PARTS_6 - addq $1, %rcx -DNO_PARTS_6: - movq %rax, %r12 - movq %rdx, %r13 - movq %rbx, %r14 - movq $0, %rdx - movq %rcx, %rax - movq $6, %rbx - div %rbx - movq %rax, %rcx - movq %rdx, %r10 - movq %r12, %rax - movq %r13, %rdx - movq %r14, %rbx - cmpq $0, %rcx - movdqu (%rdx), %xmm7 - je DREMAINDER_6 - subq $96, %rsi -DLOOP_6: - movdqu (%rdi), %xmm1 - movdqu 16(%rdi), %xmm2 - movdqu 32(%rdi), %xmm3 - movdqu 48(%rdi), %xmm4 - movdqu 64(%rdi), %xmm5 - movdqu 80(%rdi), %xmm6 - movdqa (%r8), %xmm8 - movdqa 16(%r8), %xmm9 - movdqa 32(%r8), %xmm10 - movdqa 48(%r8), %xmm11 - pxor %xmm8, %xmm1 - pxor %xmm8, %xmm2 - pxor %xmm8, %xmm3 - pxor %xmm8, %xmm4 - pxor %xmm8, %xmm5 - pxor %xmm8, %xmm6 - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm3 - aesdec %xmm9, %xmm4 - aesdec %xmm9, %xmm5 - aesdec %xmm9, %xmm6 - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm3 - aesdec %xmm10, %xmm4 - aesdec %xmm10, %xmm5 - aesdec %xmm10, %xmm6 - aesdec %xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm11, %xmm3 - aesdec %xmm11, %xmm4 - aesdec %xmm11, %xmm5 - aesdec %xmm11, %xmm6 - movdqa 64(%r8), %xmm8 - movdqa 80(%r8), %xmm9 - movdqa 96(%r8), %xmm10 - movdqa 112(%r8), %xmm11 - aesdec %xmm8, %xmm1 - aesdec %xmm8, %xmm2 - aesdec %xmm8, %xmm3 - aesdec %xmm8, %xmm4 - aesdec %xmm8, %xmm5 - aesdec %xmm8, %xmm6 - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm3 - aesdec %xmm9, %xmm4 - aesdec %xmm9, %xmm5 - aesdec %xmm9, %xmm6 - aesdec %xmm10, %xmm1 - aesdec %xmm10, 
%xmm2 - aesdec %xmm10, %xmm3 - aesdec %xmm10, %xmm4 - aesdec %xmm10, %xmm5 - aesdec %xmm10, %xmm6 - aesdec %xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm11, %xmm3 - aesdec %xmm11, %xmm4 - aesdec %xmm11, %xmm5 - aesdec %xmm11, %xmm6 - movdqa 128(%r8), %xmm8 - movdqa 144(%r8), %xmm9 - movdqa 160(%r8), %xmm10 - cmpl $12, %r9d - aesdec %xmm8, %xmm1 - aesdec %xmm8, %xmm2 - aesdec %xmm8, %xmm3 - aesdec %xmm8, %xmm4 - aesdec %xmm8, %xmm5 - aesdec %xmm8, %xmm6 - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm3 - aesdec %xmm9, %xmm4 - aesdec %xmm9, %xmm5 - aesdec %xmm9, %xmm6 - jb DLAST_6 - movdqa 160(%r8), %xmm8 - movdqa 176(%r8), %xmm9 - movdqa 192(%r8), %xmm10 - cmpl $14, %r9d - aesdec %xmm8, %xmm1 - aesdec %xmm8, %xmm2 - aesdec %xmm8, %xmm3 - aesdec %xmm8, %xmm4 - aesdec %xmm8, %xmm5 - aesdec %xmm8, %xmm6 - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm3 - aesdec %xmm9, %xmm4 - aesdec %xmm9, %xmm5 - aesdec %xmm9, %xmm6 - jb DLAST_6 - movdqa 192(%r8), %xmm8 - movdqa 208(%r8), %xmm9 - movdqa 224(%r8), %xmm10 - aesdec %xmm8, %xmm1 - aesdec %xmm8, %xmm2 - aesdec %xmm8, %xmm3 - aesdec %xmm8, %xmm4 - aesdec %xmm8, %xmm5 - aesdec %xmm8, %xmm6 - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm3 - aesdec %xmm9, %xmm4 - aesdec %xmm9, %xmm5 - aesdec %xmm9, %xmm6 -DLAST_6: - addq $96, %rsi - aesdeclast %xmm10, %xmm1 - aesdeclast %xmm10, %xmm2 - aesdeclast %xmm10, %xmm3 - aesdeclast %xmm10, %xmm4 - aesdeclast %xmm10, %xmm5 - aesdeclast %xmm10, %xmm6 - movdqu (%rdi), %xmm8 - movdqu 16(%rdi), %xmm9 - movdqu 32(%rdi), %xmm10 - movdqu 48(%rdi), %xmm11 - movdqu 64(%rdi), %xmm12 - movdqu 80(%rdi), %xmm13 - pxor %xmm7, %xmm1 - pxor %xmm8, %xmm2 - pxor %xmm9, %xmm3 - pxor %xmm10, %xmm4 - pxor %xmm11, %xmm5 - pxor %xmm12, %xmm6 - movdqu %xmm13, %xmm7 - movdqu %xmm1, (%rsi) - movdqu %xmm2, 16(%rsi) - movdqu %xmm3, 32(%rsi) - movdqu %xmm4, 48(%rsi) - movdqu %xmm5, 64(%rsi) - movdqu %xmm6, 80(%rsi) - addq $96, %rdi - decq %rcx - jne DLOOP_6 - 
addq $96, %rsi -DREMAINDER_6: - cmpq $0, %r10 - je DEND_6 -DLOOP_6_2: - movdqu (%rdi), %xmm1 - movdqa %xmm1, %xmm10 - addq $16, %rdi - pxor (%r8), %xmm1 - movdqu 160(%r8), %xmm2 - cmpl $12, %r9d - aesdec 16(%r8), %xmm1 - aesdec 32(%r8), %xmm1 - aesdec 48(%r8), %xmm1 - aesdec 64(%r8), %xmm1 - aesdec 80(%r8), %xmm1 - aesdec 96(%r8), %xmm1 - aesdec 112(%r8), %xmm1 - aesdec 128(%r8), %xmm1 - aesdec 144(%r8), %xmm1 - jb DLAST_6_2 - movdqu 192(%r8), %xmm2 - cmpl $14, %r9d - aesdec 160(%r8), %xmm1 - aesdec 176(%r8), %xmm1 - jb DLAST_6_2 - movdqu 224(%r8), %xmm2 - aesdec 192(%r8), %xmm1 - aesdec 208(%r8), %xmm1 -DLAST_6_2: - aesdeclast %xmm2, %xmm1 - pxor %xmm7, %xmm1 - movdqa %xmm10, %xmm7 - movdqu %xmm1, (%rsi) - addq $16, %rsi - decq %r10 - jne DLOOP_6_2 -DEND_6: - ret - -#else /* WOLFSSL_AESNI_BYx */ - -/* -AES_CBC_decrypt_AESNI_by8 (const unsigned char *in, - unsigned char *out, - unsigned char ivec[16], - unsigned long length, - const unsigned char *KS, - int nr) -*/ -#ifndef __APPLE__ -.globl AES_CBC_decrypt_AESNI_by8 -AES_CBC_decrypt_AESNI_by8: -#else -.globl _AES_CBC_decrypt_AESNI_by8 -_AES_CBC_decrypt_AESNI_by8: -#endif -# parameter 1: %rdi - in -# parameter 2: %rsi - out -# parameter 3: %rdx - ivec -# parameter 4: %rcx - length -# parameter 5: %r8 - KS -# parameter 6: %r9d - nr - - movq %rcx, %r10 - shrq $4, %rcx - shlq $60, %r10 - je DNO_PARTS_8 - addq $1, %rcx -DNO_PARTS_8: - movq %rcx, %r10 - shlq $61, %r10 - shrq $61, %r10 - shrq $3, %rcx - movdqu (%rdx), %xmm9 - je DREMAINDER_8 - subq $128, %rsi -DLOOP_8: - movdqu (%rdi), %xmm1 - movdqu 16(%rdi), %xmm2 - movdqu 32(%rdi), %xmm3 - movdqu 48(%rdi), %xmm4 - movdqu 64(%rdi), %xmm5 - movdqu 80(%rdi), %xmm6 - movdqu 96(%rdi), %xmm7 - movdqu 112(%rdi), %xmm8 - movdqa (%r8), %xmm10 - movdqa 16(%r8), %xmm11 - movdqa 32(%r8), %xmm12 - movdqa 48(%r8), %xmm13 - pxor %xmm10, %xmm1 - pxor %xmm10, %xmm2 - pxor %xmm10, %xmm3 - pxor %xmm10, %xmm4 - pxor %xmm10, %xmm5 - pxor %xmm10, %xmm6 - pxor %xmm10, %xmm7 - pxor %xmm10, 
%xmm8 - aesdec %xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm11, %xmm3 - aesdec %xmm11, %xmm4 - aesdec %xmm11, %xmm5 - aesdec %xmm11, %xmm6 - aesdec %xmm11, %xmm7 - aesdec %xmm11, %xmm8 - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdec %xmm12, %xmm3 - aesdec %xmm12, %xmm4 - aesdec %xmm12, %xmm5 - aesdec %xmm12, %xmm6 - aesdec %xmm12, %xmm7 - aesdec %xmm12, %xmm8 - aesdec %xmm13, %xmm1 - aesdec %xmm13, %xmm2 - aesdec %xmm13, %xmm3 - aesdec %xmm13, %xmm4 - aesdec %xmm13, %xmm5 - aesdec %xmm13, %xmm6 - aesdec %xmm13, %xmm7 - aesdec %xmm13, %xmm8 - movdqa 64(%r8), %xmm10 - movdqa 80(%r8), %xmm11 - movdqa 96(%r8), %xmm12 - movdqa 112(%r8), %xmm13 - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm3 - aesdec %xmm10, %xmm4 - aesdec %xmm10, %xmm5 - aesdec %xmm10, %xmm6 - aesdec %xmm10, %xmm7 - aesdec %xmm10, %xmm8 - aesdec %xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm11, %xmm3 - aesdec %xmm11, %xmm4 - aesdec %xmm11, %xmm5 - aesdec %xmm11, %xmm6 - aesdec %xmm11, %xmm7 - aesdec %xmm11, %xmm8 - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdec %xmm12, %xmm3 - aesdec %xmm12, %xmm4 - aesdec %xmm12, %xmm5 - aesdec %xmm12, %xmm6 - aesdec %xmm12, %xmm7 - aesdec %xmm12, %xmm8 - aesdec %xmm13, %xmm1 - aesdec %xmm13, %xmm2 - aesdec %xmm13, %xmm3 - aesdec %xmm13, %xmm4 - aesdec %xmm13, %xmm5 - aesdec %xmm13, %xmm6 - aesdec %xmm13, %xmm7 - aesdec %xmm13, %xmm8 - movdqa 128(%r8), %xmm10 - movdqa 144(%r8), %xmm11 - movdqa 160(%r8), %xmm12 - cmpl $12, %r9d - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm3 - aesdec %xmm10, %xmm4 - aesdec %xmm10, %xmm5 - aesdec %xmm10, %xmm6 - aesdec %xmm10, %xmm7 - aesdec %xmm10, %xmm8 - aesdec %xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm11, %xmm3 - aesdec %xmm11, %xmm4 - aesdec %xmm11, %xmm5 - aesdec %xmm11, %xmm6 - aesdec %xmm11, %xmm7 - aesdec %xmm11, %xmm8 - jb DLAST_8 - movdqa 160(%r8), %xmm10 - movdqa 176(%r8), %xmm11 - movdqa 192(%r8), %xmm12 - cmpl $14, %r9d - aesdec %xmm10, %xmm1 - aesdec 
%xmm10, %xmm2 - aesdec %xmm10, %xmm3 - aesdec %xmm10, %xmm4 - aesdec %xmm10, %xmm5 - aesdec %xmm10, %xmm6 - aesdec %xmm10, %xmm7 - aesdec %xmm10, %xmm8 - aesdec %xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm11, %xmm3 - aesdec %xmm11, %xmm4 - aesdec %xmm11, %xmm5 - aesdec %xmm11, %xmm6 - aesdec %xmm11, %xmm7 - aesdec %xmm11, %xmm8 - jb DLAST_8 - movdqa 192(%r8), %xmm10 - movdqa 208(%r8), %xmm11 - movdqa 224(%r8), %xmm12 - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm3 - aesdec %xmm10, %xmm4 - aesdec %xmm10, %xmm5 - aesdec %xmm10, %xmm6 - aesdec %xmm10, %xmm7 - aesdec %xmm10, %xmm8 - aesdec %xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm11, %xmm3 - aesdec %xmm11, %xmm4 - aesdec %xmm11, %xmm5 - aesdec %xmm11, %xmm6 - aesdec %xmm11, %xmm7 - aesdec %xmm11, %xmm8 -DLAST_8: - addq $128, %rsi - aesdeclast %xmm12, %xmm1 - aesdeclast %xmm12, %xmm2 - aesdeclast %xmm12, %xmm3 - aesdeclast %xmm12, %xmm4 - aesdeclast %xmm12, %xmm5 - aesdeclast %xmm12, %xmm6 - aesdeclast %xmm12, %xmm7 - aesdeclast %xmm12, %xmm8 - movdqu (%rdi), %xmm10 - movdqu 16(%rdi), %xmm11 - movdqu 32(%rdi), %xmm12 - movdqu 48(%rdi), %xmm13 - pxor %xmm9, %xmm1 - pxor %xmm10, %xmm2 - pxor %xmm11, %xmm3 - pxor %xmm12, %xmm4 - pxor %xmm13, %xmm5 - movdqu 64(%rdi), %xmm10 - movdqu 80(%rdi), %xmm11 - movdqu 96(%rdi), %xmm12 - movdqu 112(%rdi), %xmm9 - pxor %xmm10, %xmm6 - pxor %xmm11, %xmm7 - pxor %xmm12, %xmm8 - movdqu %xmm1, (%rsi) - movdqu %xmm2, 16(%rsi) - movdqu %xmm3, 32(%rsi) - movdqu %xmm4, 48(%rsi) - movdqu %xmm5, 64(%rsi) - movdqu %xmm6, 80(%rsi) - movdqu %xmm7, 96(%rsi) - movdqu %xmm8, 112(%rsi) - addq $128, %rdi - decq %rcx - jne DLOOP_8 - addq $128, %rsi -DREMAINDER_8: - cmpq $0, %r10 - je DEND_8 -DLOOP_8_2: - movdqu (%rdi), %xmm1 - movdqa %xmm1, %xmm10 - addq $16, %rdi - pxor (%r8), %xmm1 - movdqu 160(%r8), %xmm2 - cmpl $12, %r9d - aesdec 16(%r8), %xmm1 - aesdec 32(%r8), %xmm1 - aesdec 48(%r8), %xmm1 - aesdec 64(%r8), %xmm1 - aesdec 80(%r8), %xmm1 - aesdec 96(%r8), %xmm1 - 
aesdec 112(%r8), %xmm1 - aesdec 128(%r8), %xmm1 - aesdec 144(%r8), %xmm1 - jb DLAST_8_2 - movdqu 192(%r8), %xmm2 - cmpl $14, %r9d - aesdec 160(%r8), %xmm1 - aesdec 176(%r8), %xmm1 - jb DLAST_8_2 - movdqu 224(%r8), %xmm2 - aesdec 192(%r8), %xmm1 - aesdec 208(%r8), %xmm1 -DLAST_8_2: - aesdeclast %xmm2, %xmm1 - pxor %xmm9, %xmm1 - movdqa %xmm10, %xmm9 - movdqu %xmm1, (%rsi) - addq $16, %rsi - decq %r10 - jne DLOOP_8_2 -DEND_8: - ret - -#endif /* WOLFSSL_AESNI_BYx */ - - -/* -AES_ECB_encrypt_AESNI (const unsigned char *in, - unsigned char *out, - unsigned long length, - const unsigned char *KS, - int nr) -*/ -#ifndef __APPLE__ -.globl AES_ECB_encrypt_AESNI -AES_ECB_encrypt_AESNI: -#else -.globl _AES_ECB_encrypt_AESNI -_AES_ECB_encrypt_AESNI: -#endif -# parameter 1: %rdi -# parameter 2: %rsi -# parameter 3: %rdx -# parameter 4: %rcx -# parameter 5: %r8d - movq %rdx, %r10 - shrq $4, %rdx - shlq $60, %r10 - je EECB_NO_PARTS_4 - addq $1, %rdx -EECB_NO_PARTS_4: - movq %rdx, %r10 - shlq $62, %r10 - shrq $62, %r10 - shrq $2, %rdx - je EECB_REMAINDER_4 - subq $64, %rsi -EECB_LOOP_4: - movdqu (%rdi), %xmm1 - movdqu 16(%rdi), %xmm2 - movdqu 32(%rdi), %xmm3 - movdqu 48(%rdi), %xmm4 - movdqa (%rcx), %xmm9 - movdqa 16(%rcx), %xmm10 - movdqa 32(%rcx), %xmm11 - movdqa 48(%rcx), %xmm12 - pxor %xmm9, %xmm1 - pxor %xmm9, %xmm2 - pxor %xmm9, %xmm3 - pxor %xmm9, %xmm4 - aesenc %xmm10, %xmm1 - aesenc %xmm10, %xmm2 - aesenc %xmm10, %xmm3 - aesenc %xmm10, %xmm4 - aesenc %xmm11, %xmm1 - aesenc %xmm11, %xmm2 - aesenc %xmm11, %xmm3 - aesenc %xmm11, %xmm4 - aesenc %xmm12, %xmm1 - aesenc %xmm12, %xmm2 - aesenc %xmm12, %xmm3 - aesenc %xmm12, %xmm4 - movdqa 64(%rcx), %xmm9 - movdqa 80(%rcx), %xmm10 - movdqa 96(%rcx), %xmm11 - movdqa 112(%rcx), %xmm12 - aesenc %xmm9, %xmm1 - aesenc %xmm9, %xmm2 - aesenc %xmm9, %xmm3 - aesenc %xmm9, %xmm4 - aesenc %xmm10, %xmm1 - aesenc %xmm10, %xmm2 - aesenc %xmm10, %xmm3 - aesenc %xmm10, %xmm4 - aesenc %xmm11, %xmm1 - aesenc %xmm11, %xmm2 - aesenc %xmm11, %xmm3 - 
aesenc %xmm11, %xmm4 - aesenc %xmm12, %xmm1 - aesenc %xmm12, %xmm2 - aesenc %xmm12, %xmm3 - aesenc %xmm12, %xmm4 - movdqa 128(%rcx), %xmm9 - movdqa 144(%rcx), %xmm10 - movdqa 160(%rcx), %xmm11 - cmpl $12, %r8d - aesenc %xmm9, %xmm1 - aesenc %xmm9, %xmm2 - aesenc %xmm9, %xmm3 - aesenc %xmm9, %xmm4 - aesenc %xmm10, %xmm1 - aesenc %xmm10, %xmm2 - aesenc %xmm10, %xmm3 - aesenc %xmm10, %xmm4 - jb EECB_LAST_4 - movdqa 160(%rcx), %xmm9 - movdqa 176(%rcx), %xmm10 - movdqa 192(%rcx), %xmm11 - cmpl $14, %r8d - aesenc %xmm9, %xmm1 - aesenc %xmm9, %xmm2 - aesenc %xmm9, %xmm3 - aesenc %xmm9, %xmm4 - aesenc %xmm10, %xmm1 - aesenc %xmm10, %xmm2 - aesenc %xmm10, %xmm3 - aesenc %xmm10, %xmm4 - jb EECB_LAST_4 - movdqa 192(%rcx), %xmm9 - movdqa 208(%rcx), %xmm10 - movdqa 224(%rcx), %xmm11 - aesenc %xmm9, %xmm1 - aesenc %xmm9, %xmm2 - aesenc %xmm9, %xmm3 - aesenc %xmm9, %xmm4 - aesenc %xmm10, %xmm1 - aesenc %xmm10, %xmm2 - aesenc %xmm10, %xmm3 - aesenc %xmm10, %xmm4 -EECB_LAST_4: - addq $64, %rdi - addq $64, %rsi - decq %rdx - aesenclast %xmm11, %xmm1 - aesenclast %xmm11, %xmm2 - aesenclast %xmm11, %xmm3 - aesenclast %xmm11, %xmm4 - movdqu %xmm1, (%rsi) - movdqu %xmm2, 16(%rsi) - movdqu %xmm3, 32(%rsi) - movdqu %xmm4, 48(%rsi) - jne EECB_LOOP_4 - addq $64, %rsi -EECB_REMAINDER_4: - cmpq $0, %r10 - je EECB_END_4 -EECB_LOOP_4_2: - movdqu (%rdi), %xmm1 - addq $16, %rdi - pxor (%rcx), %xmm1 - movdqu 160(%rcx), %xmm2 - aesenc 16(%rcx), %xmm1 - aesenc 32(%rcx), %xmm1 - aesenc 48(%rcx), %xmm1 - aesenc 64(%rcx), %xmm1 - aesenc 80(%rcx), %xmm1 - aesenc 96(%rcx), %xmm1 - aesenc 112(%rcx), %xmm1 - aesenc 128(%rcx), %xmm1 - aesenc 144(%rcx), %xmm1 - cmpl $12, %r8d - jb EECB_LAST_4_2 - movdqu 192(%rcx), %xmm2 - aesenc 160(%rcx), %xmm1 - aesenc 176(%rcx), %xmm1 - cmpl $14, %r8d - jb EECB_LAST_4_2 - movdqu 224(%rcx), %xmm2 - aesenc 192(%rcx), %xmm1 - aesenc 208(%rcx), %xmm1 -EECB_LAST_4_2: - aesenclast %xmm2, %xmm1 - movdqu %xmm1, (%rsi) - addq $16, %rsi - decq %r10 - jne EECB_LOOP_4_2 -EECB_END_4: 
- ret - - -/* -AES_ECB_decrypt_AESNI (const unsigned char *in, - unsigned char *out, - unsigned long length, - const unsigned char *KS, - int nr) -*/ -#ifndef __APPLE__ -.globl AES_ECB_decrypt_AESNI -AES_ECB_decrypt_AESNI: -#else -.globl _AES_ECB_decrypt_AESNI -_AES_ECB_decrypt_AESNI: -#endif -# parameter 1: %rdi -# parameter 2: %rsi -# parameter 3: %rdx -# parameter 4: %rcx -# parameter 5: %r8d - - movq %rdx, %r10 - shrq $4, %rdx - shlq $60, %r10 - je DECB_NO_PARTS_4 - addq $1, %rdx -DECB_NO_PARTS_4: - movq %rdx, %r10 - shlq $62, %r10 - shrq $62, %r10 - shrq $2, %rdx - je DECB_REMAINDER_4 - subq $64, %rsi -DECB_LOOP_4: - movdqu (%rdi), %xmm1 - movdqu 16(%rdi), %xmm2 - movdqu 32(%rdi), %xmm3 - movdqu 48(%rdi), %xmm4 - movdqa (%rcx), %xmm9 - movdqa 16(%rcx), %xmm10 - movdqa 32(%rcx), %xmm11 - movdqa 48(%rcx), %xmm12 - pxor %xmm9, %xmm1 - pxor %xmm9, %xmm2 - pxor %xmm9, %xmm3 - pxor %xmm9, %xmm4 - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm3 - aesdec %xmm10, %xmm4 - aesdec %xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm11, %xmm3 - aesdec %xmm11, %xmm4 - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdec %xmm12, %xmm3 - aesdec %xmm12, %xmm4 - movdqa 64(%rcx), %xmm9 - movdqa 80(%rcx), %xmm10 - movdqa 96(%rcx), %xmm11 - movdqa 112(%rcx), %xmm12 - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm3 - aesdec %xmm9, %xmm4 - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm3 - aesdec %xmm10, %xmm4 - aesdec %xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm11, %xmm3 - aesdec %xmm11, %xmm4 - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdec %xmm12, %xmm3 - aesdec %xmm12, %xmm4 - movdqa 128(%rcx), %xmm9 - movdqa 144(%rcx), %xmm10 - movdqa 160(%rcx), %xmm11 - cmpl $12, %r8d - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm3 - aesdec %xmm9, %xmm4 - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm3 - aesdec %xmm10, %xmm4 - jb DECB_LAST_4 - movdqa 160(%rcx), %xmm9 - movdqa 176(%rcx), %xmm10 - 
movdqa 192(%rcx), %xmm11 - cmpl $14, %r8d - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm3 - aesdec %xmm9, %xmm4 - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm3 - aesdec %xmm10, %xmm4 - jb DECB_LAST_4 - movdqa 192(%rcx), %xmm9 - movdqa 208(%rcx), %xmm10 - movdqa 224(%rcx), %xmm11 - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm3 - aesdec %xmm9, %xmm4 - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm3 - aesdec %xmm10, %xmm4 -DECB_LAST_4: - addq $64, %rdi - addq $64, %rsi - decq %rdx - aesdeclast %xmm11, %xmm1 - aesdeclast %xmm11, %xmm2 - aesdeclast %xmm11, %xmm3 - aesdeclast %xmm11, %xmm4 - movdqu %xmm1, (%rsi) - movdqu %xmm2, 16(%rsi) - movdqu %xmm3, 32(%rsi) - movdqu %xmm4, 48(%rsi) - jne DECB_LOOP_4 - addq $64, %rsi -DECB_REMAINDER_4: - cmpq $0, %r10 - je DECB_END_4 -DECB_LOOP_4_2: - movdqu (%rdi), %xmm1 - addq $16, %rdi - pxor (%rcx), %xmm1 - movdqu 160(%rcx), %xmm2 - cmpl $12, %r8d - aesdec 16(%rcx), %xmm1 - aesdec 32(%rcx), %xmm1 - aesdec 48(%rcx), %xmm1 - aesdec 64(%rcx), %xmm1 - aesdec 80(%rcx), %xmm1 - aesdec 96(%rcx), %xmm1 - aesdec 112(%rcx), %xmm1 - aesdec 128(%rcx), %xmm1 - aesdec 144(%rcx), %xmm1 - jb DECB_LAST_4_2 - cmpl $14, %r8d - movdqu 192(%rcx), %xmm2 - aesdec 160(%rcx), %xmm1 - aesdec 176(%rcx), %xmm1 - jb DECB_LAST_4_2 - movdqu 224(%rcx), %xmm2 - aesdec 192(%rcx), %xmm1 - aesdec 208(%rcx), %xmm1 -DECB_LAST_4_2: - aesdeclast %xmm2, %xmm1 - movdqu %xmm1, (%rsi) - addq $16, %rsi - decq %r10 - jne DECB_LOOP_4_2 -DECB_END_4: - ret - - - - -/* -void AES_128_Key_Expansion_AESNI(const unsigned char* userkey, - unsigned char* key_schedule); -*/ -#ifndef __APPLE__ -.globl AES_128_Key_Expansion_AESNI -.align 16,0x90 -AES_128_Key_Expansion_AESNI: -#else -.globl _AES_128_Key_Expansion_AESNI -.p2align 4 -_AES_128_Key_Expansion_AESNI: -#endif -# parameter 1: %rdi -# parameter 2: %rsi - -movdqu (%rdi), %xmm1 -movdqa %xmm1, (%rsi) - - -ASSISTS: -aeskeygenassist $1, %xmm1, %xmm2 -call 
PREPARE_ROUNDKEY_128 -movdqa %xmm1, 16(%rsi) -aeskeygenassist $2, %xmm1, %xmm2 -call PREPARE_ROUNDKEY_128 -movdqa %xmm1, 32(%rsi) -aeskeygenassist $4, %xmm1, %xmm2 -call PREPARE_ROUNDKEY_128 -movdqa %xmm1, 48(%rsi) -aeskeygenassist $8, %xmm1, %xmm2 -call PREPARE_ROUNDKEY_128 -movdqa %xmm1, 64(%rsi) -aeskeygenassist $16, %xmm1, %xmm2 -call PREPARE_ROUNDKEY_128 -movdqa %xmm1, 80(%rsi) -aeskeygenassist $32, %xmm1, %xmm2 -call PREPARE_ROUNDKEY_128 -movdqa %xmm1, 96(%rsi) -aeskeygenassist $64, %xmm1, %xmm2 -call PREPARE_ROUNDKEY_128 -movdqa %xmm1, 112(%rsi) -aeskeygenassist $0x80, %xmm1, %xmm2 -call PREPARE_ROUNDKEY_128 -movdqa %xmm1, 128(%rsi) -aeskeygenassist $0x1b, %xmm1, %xmm2 -call PREPARE_ROUNDKEY_128 -movdqa %xmm1, 144(%rsi) -aeskeygenassist $0x36, %xmm1, %xmm2 -call PREPARE_ROUNDKEY_128 -movdqa %xmm1, 160(%rsi) -ret - -PREPARE_ROUNDKEY_128: -pshufd $255, %xmm2, %xmm2 -movdqa %xmm1, %xmm3 -pslldq $4, %xmm3 -pxor %xmm3, %xmm1 -pslldq $4, %xmm3 -pxor %xmm3, %xmm1 -pslldq $4, %xmm3 -pxor %xmm3, %xmm1 -pxor %xmm2, %xmm1 -ret - - -/* -void AES_192_Key_Expansion_AESNI (const unsigned char *userkey, - unsigned char *key) -*/ -#ifndef __APPLE__ -.globl AES_192_Key_Expansion_AESNI -AES_192_Key_Expansion_AESNI: -#else -.globl _AES_192_Key_Expansion_AESNI -_AES_192_Key_Expansion_AESNI: -#endif -# parameter 1: %rdi -# parameter 2: %rsi - -movdqu (%rdi), %xmm1 -movq 16(%rdi), %xmm3 -movdqa %xmm1, (%rsi) -movdqa %xmm3, %xmm5 - -aeskeygenassist $0x1, %xmm3, %xmm2 -call PREPARE_ROUNDKEY_192 -shufpd $0, %xmm1, %xmm5 -movdqa %xmm5, 16(%rsi) -movdqa %xmm1, %xmm6 -shufpd $1, %xmm3, %xmm6 -movdqa %xmm6, 32(%rsi) - -aeskeygenassist $0x2, %xmm3, %xmm2 -call PREPARE_ROUNDKEY_192 -movdqa %xmm1, 48(%rsi) -movdqa %xmm3, %xmm5 - -aeskeygenassist $0x4, %xmm3, %xmm2 -call PREPARE_ROUNDKEY_192 -shufpd $0, %xmm1, %xmm5 -movdqa %xmm5, 64(%rsi) -movdqa %xmm1, %xmm6 -shufpd $1, %xmm3, %xmm6 -movdqa %xmm6, 80(%rsi) - -aeskeygenassist $0x8, %xmm3, %xmm2 -call PREPARE_ROUNDKEY_192 -movdqa %xmm1, 
96(%rsi) -movdqa %xmm3, %xmm5 - -aeskeygenassist $0x10, %xmm3, %xmm2 -call PREPARE_ROUNDKEY_192 -shufpd $0, %xmm1, %xmm5 -movdqa %xmm5, 112(%rsi) -movdqa %xmm1, %xmm6 -shufpd $1, %xmm3, %xmm6 -movdqa %xmm6, 128(%rsi) - -aeskeygenassist $0x20, %xmm3, %xmm2 -call PREPARE_ROUNDKEY_192 -movdqa %xmm1, 144(%rsi) -movdqa %xmm3, %xmm5 - -aeskeygenassist $0x40, %xmm3, %xmm2 -call PREPARE_ROUNDKEY_192 -shufpd $0, %xmm1, %xmm5 -movdqa %xmm5, 160(%rsi) -movdqa %xmm1, %xmm6 -shufpd $1, %xmm3, %xmm6 -movdqa %xmm6, 176(%rsi) - -aeskeygenassist $0x80, %xmm3, %xmm2 -call PREPARE_ROUNDKEY_192 -movdqa %xmm1, 192(%rsi) -movdqa %xmm3, 208(%rsi) -ret - -PREPARE_ROUNDKEY_192: -pshufd $0x55, %xmm2, %xmm2 -movdqu %xmm1, %xmm4 -pslldq $4, %xmm4 -pxor %xmm4, %xmm1 - -pslldq $4, %xmm4 -pxor %xmm4, %xmm1 -pslldq $4, %xmm4 -pxor %xmm4, %xmm1 -pxor %xmm2, %xmm1 -pshufd $0xff, %xmm1, %xmm2 -movdqu %xmm3, %xmm4 -pslldq $4, %xmm4 -pxor %xmm4, %xmm3 -pxor %xmm2, %xmm3 -ret - - -/* -void AES_256_Key_Expansion_AESNI (const unsigned char *userkey, - unsigned char *key) -*/ -#ifndef __APPLE__ -.globl AES_256_Key_Expansion_AESNI -AES_256_Key_Expansion_AESNI: -#else -.globl _AES_256_Key_Expansion_AESNI -_AES_256_Key_Expansion_AESNI: -#endif -# parameter 1: %rdi -# parameter 2: %rsi - -movdqu (%rdi), %xmm1 -movdqu 16(%rdi), %xmm3 -movdqa %xmm1, (%rsi) -movdqa %xmm3, 16(%rsi) - -aeskeygenassist $0x1, %xmm3, %xmm2 -call MAKE_RK256_a -movdqa %xmm1, 32(%rsi) -aeskeygenassist $0x0, %xmm1, %xmm2 -call MAKE_RK256_b -movdqa %xmm3, 48(%rsi) -aeskeygenassist $0x2, %xmm3, %xmm2 -call MAKE_RK256_a -movdqa %xmm1, 64(%rsi) -aeskeygenassist $0x0, %xmm1, %xmm2 -call MAKE_RK256_b -movdqa %xmm3, 80(%rsi) -aeskeygenassist $0x4, %xmm3, %xmm2 -call MAKE_RK256_a -movdqa %xmm1, 96(%rsi) -aeskeygenassist $0x0, %xmm1, %xmm2 -call MAKE_RK256_b -movdqa %xmm3, 112(%rsi) -aeskeygenassist $0x8, %xmm3, %xmm2 -call MAKE_RK256_a -movdqa %xmm1, 128(%rsi) -aeskeygenassist $0x0, %xmm1, %xmm2 -call MAKE_RK256_b -movdqa %xmm3, 144(%rsi) 
-aeskeygenassist $0x10, %xmm3, %xmm2 -call MAKE_RK256_a -movdqa %xmm1, 160(%rsi) -aeskeygenassist $0x0, %xmm1, %xmm2 -call MAKE_RK256_b -movdqa %xmm3, 176(%rsi) -aeskeygenassist $0x20, %xmm3, %xmm2 -call MAKE_RK256_a -movdqa %xmm1, 192(%rsi) - -aeskeygenassist $0x0, %xmm1, %xmm2 -call MAKE_RK256_b -movdqa %xmm3, 208(%rsi) -aeskeygenassist $0x40, %xmm3, %xmm2 -call MAKE_RK256_a -movdqa %xmm1, 224(%rsi) - -ret - -MAKE_RK256_a: -pshufd $0xff, %xmm2, %xmm2 -movdqa %xmm1, %xmm4 -pslldq $4, %xmm4 -pxor %xmm4, %xmm1 -pslldq $4, %xmm4 -pxor %xmm4, %xmm1 -pslldq $4, %xmm4 -pxor %xmm4, %xmm1 -pxor %xmm2, %xmm1 -ret - -MAKE_RK256_b: -pshufd $0xaa, %xmm2, %xmm2 -movdqa %xmm3, %xmm4 -pslldq $4, %xmm4 -pxor %xmm4, %xmm3 -pslldq $4, %xmm4 -pxor %xmm4, %xmm3 -pslldq $4, %xmm4 -pxor %xmm4, %xmm3 -pxor %xmm2, %xmm3 -ret - -#elif defined WOLFSSL_X86_BUILD +#if defined WOLFSSL_X86_BUILD /* AES_CBC_encrypt_AESNI (const unsigned char *in, @@ -2238,7 +931,7 @@ MAKE_RK256_b: pxor %xmm2, %xmm3 ret -#endif /* WOLFSSL_X86_64_BUILD */ +#endif /* WOLFSSL_X86_BUILD */ #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits diff --git a/wolfcrypt/src/aes_asm.asm b/wolfcrypt/src/aes_asm.asm index 4b5e1250d52..cb06a54ac52 100644 --- a/wolfcrypt/src/aes_asm.asm +++ b/wolfcrypt/src/aes_asm.asm @@ -1,1531 +1,54 @@ -; /* aes_asm.asm -; * -; * Copyright (C) 2006-2026 wolfSSL Inc. -; * -; * This file is part of wolfSSL. -; * -; * wolfSSL is free software; you can redistribute it and/or modify -; * it under the terms of the GNU General Public License as published by -; * the Free Software Foundation; either version 3 of the License, or -; * (at your option) any later version. -; * -; * wolfSSL is distributed in the hope that it will be useful, -; * but WITHOUT ANY WARRANTY; without even the implied warranty of -; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -; * GNU General Public License for more details. 
-; * -; * You should have received a copy of the GNU General Public License -; * along with this program; if not, write to the Free Software -; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA -; */ +; /* aes_asm.asm +; * +; * Copyright (C) 2006-2026 wolfSSL Inc. +; * +; * This file is part of wolfSSL. +; * +; * wolfSSL is free software; you can redistribute it and/or modify +; * it under the terms of the GNU General Public License as published by +; * the Free Software Foundation; either version 3 of the License, or +; * (at your option) any later version. +; * +; * wolfSSL is distributed in the hope that it will be useful, +; * but WITHOUT ANY WARRANTY; without even the implied warranty of +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; * GNU General Public License for more details. +; * +; * You should have received a copy of the GNU General Public License +; * along with this program; if not, write to the Free Software +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA +; */ -; -; -; /* See Intel Advanced Encryption Standard (AES) Instructions Set White Paper -; * by Israel, Intel Mobility Group Development Center, Israel Shay Gueron -; */ -; -; /* This file is in intel asm syntax, see .s for at&t syntax */ -; - - -fips_version = 0 -IFDEF HAVE_FIPS - fips_version = 1 - IFDEF HAVE_FIPS_VERSION - fips_version = HAVE_FIPS_VERSION - ENDIF -ENDIF - -IF fips_version GE 2 - fipsAb SEGMENT ALIAS(".fipsA$b") 'CODE' -ELSE - _text SEGMENT -ENDIF - - -; /* -; AES_CBC_encrypt_AESNI[const ,unsigned char*in -; unsigned ,char*out -; unsigned ,char ivec+16 -; unsigned ,long length -; const ,unsigned char*KS -; int nr] -; */ -AES_CBC_encrypt_AESNI PROC -;# parameter 1: rdi -;# parameter 2: rsi -;# parameter 3: rdx -;# parameter 4: rcx -;# parameter 5: r8 -;# parameter 6: r9d - -; save rdi and rsi to rax and r11, restore before ret - mov rax,rdi - mov r11,rsi - -; convert to what we had for att&t 
convention - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,[rsp+40] - mov r9d,[rsp+48] - - mov r10,rcx - shr rcx,4 - shl r10,60 - je NO_PARTS - add rcx,1 -NO_PARTS: - sub rsi,16 - movdqa xmm1,[rdx] -LOOP_1: - pxor xmm1,[rdi] - pxor xmm1,[r8] - add rsi,16 - add rdi,16 - cmp r9d,12 - aesenc xmm1,16[r8] - aesenc xmm1,32[r8] - aesenc xmm1,48[r8] - aesenc xmm1,64[r8] - aesenc xmm1,80[r8] - aesenc xmm1,96[r8] - aesenc xmm1,112[r8] - aesenc xmm1,128[r8] - aesenc xmm1,144[r8] - movdqa xmm2,160[r8] - jb LAST - cmp r9d,14 - - aesenc xmm1,160[r8] - aesenc xmm1,176[r8] - movdqa xmm2,192[r8] - jb LAST - aesenc xmm1,192[r8] - aesenc xmm1,208[r8] - movdqa xmm2,224[r8] -LAST: - dec rcx - aesenclast xmm1,xmm2 - movdqu [rsi],xmm1 - jne LOOP_1 - ; restore non volatile rdi,rsi - mov rdi,rax - mov rsi,r11 - ret -AES_CBC_encrypt_AESNI ENDP - - -; void AES_CBC_decrypt_AESNI_by4(const unsigned char* in, -; unsigned char* out, -; unsigned char ivec[16], -; unsigned long length, -; const unsigned char* KS, -; int nr) -AES_CBC_decrypt_AESNI_by4 PROC -; parameter 1: rdi -; parameter 2: rsi -; parameter 3: rdx -; parameter 4: rcx -; parameter 5: r8 -; parameter 6: r9d - - ; save rdi and rsi to rax and r11, restore before ret - mov rax, rdi - mov r11, rsi - ; convert to what we had for att&t convention - mov rdi, rcx - mov rsi, rdx - mov rdx, r8 - mov rcx,r9 - mov r8, [rsp+40] - mov r9d, [rsp+48] - ; on microsoft xmm6-xmm15 are non volatile, - ; let's save on stack and restore at end - sub rsp, 8+8*16 ; 8 = align stack , 8 xmm6-12,15 16 bytes each - movdqa [rsp+0], xmm6 - movdqa [rsp+16], xmm7 - movdqa [rsp+32], xmm8 - movdqa [rsp+48], xmm9 - movdqa [rsp+64], xmm10 - movdqa [rsp+80], xmm11 - movdqa [rsp+96], xmm12 - movdqa [rsp+112], xmm15 - ; back to our original code, more or less - mov r10, rcx - shr rcx, 4 - shl r10, 60 - je DNO_PARTS_4 - add rcx, 1 -DNO_PARTS_4: - mov r10, rcx - shl r10, 62 - shr r10, 62 - shr rcx, 2 - movdqu xmm5, [rdx] - je DREMAINDER_4 - sub rsi, 64 
-DLOOP_4: - movdqu xmm1, [rdi] - movdqu xmm2, 16[rdi] - movdqu xmm3, 32[rdi] - movdqu xmm4, 48[rdi] - movdqa xmm6, xmm1 - movdqa xmm7, xmm2 - movdqa xmm8, xmm3 - movdqa xmm15, xmm4 - movdqa xmm9, [r8] - movdqa xmm10, 16[r8] - movdqa xmm11, 32[r8] - movdqa xmm12, 48[r8] - pxor xmm1, xmm9 - pxor xmm2, xmm9 - pxor xmm3, xmm9 - pxor xmm4, xmm9 - aesdec xmm1, xmm10 - aesdec xmm2, xmm10 - aesdec xmm3, xmm10 - aesdec xmm4, xmm10 - aesdec xmm1, xmm11 - aesdec xmm2, xmm11 - aesdec xmm3, xmm11 - aesdec xmm4, xmm11 - aesdec xmm1, xmm12 - aesdec xmm2, xmm12 - aesdec xmm3, xmm12 - aesdec xmm4, xmm12 - movdqa xmm9, 64[r8] - movdqa xmm10, 80[r8] - movdqa xmm11, 96[r8] - movdqa xmm12, 112[r8] - aesdec xmm1, xmm9 - aesdec xmm2, xmm9 - aesdec xmm3, xmm9 - aesdec xmm4, xmm9 - aesdec xmm1, xmm10 - aesdec xmm2, xmm10 - aesdec xmm3, xmm10 - aesdec xmm4, xmm10 - aesdec xmm1, xmm11 - aesdec xmm2, xmm11 - aesdec xmm3, xmm11 - aesdec xmm4, xmm11 - aesdec xmm1, xmm12 - aesdec xmm2, xmm12 - aesdec xmm3, xmm12 - aesdec xmm4, xmm12 - movdqa xmm9, 128[r8] - movdqa xmm10, 144[r8] - movdqa xmm11, 160[r8] - cmp r9d, 12 - aesdec xmm1, xmm9 - aesdec xmm2, xmm9 - aesdec xmm3, xmm9 - aesdec xmm4, xmm9 - aesdec xmm1, xmm10 - aesdec xmm2, xmm10 - aesdec xmm3, xmm10 - aesdec xmm4, xmm10 - jb DLAST_4 - movdqa xmm9, 160[r8] - movdqa xmm10, 176[r8] - movdqa xmm11, 192[r8] - cmp r9d, 14 - aesdec xmm1, xmm9 - aesdec xmm2, xmm9 - aesdec xmm3, xmm9 - aesdec xmm4, xmm9 - aesdec xmm1, xmm10 - aesdec xmm2, xmm10 - aesdec xmm3, xmm10 - aesdec xmm4, xmm10 - jb DLAST_4 - movdqa xmm9, 192[r8] - movdqa xmm10, 208[r8] - movdqa xmm11, 224[r8] - aesdec xmm1, xmm9 - aesdec xmm2, xmm9 - aesdec xmm3, xmm9 - aesdec xmm4, xmm9 - aesdec xmm1, xmm10 - aesdec xmm2, xmm10 - aesdec xmm3, xmm10 - aesdec xmm4, xmm10 -DLAST_4: - add rdi, 64 - add rsi, 64 - dec rcx - aesdeclast xmm1, xmm11 - aesdeclast xmm2, xmm11 - aesdeclast xmm3, xmm11 - aesdeclast xmm4, xmm11 - pxor xmm1, xmm5 - pxor xmm2, xmm6 - pxor xmm3, xmm7 - pxor xmm4, xmm8 - 
movdqu [rsi], xmm1 - movdqu 16[rsi], xmm2 - movdqu 32[rsi], xmm3 - movdqu 48[rsi], xmm4 - movdqa xmm5, xmm15 - jne DLOOP_4 - add rsi, 64 -DREMAINDER_4: - cmp r10, 0 - je DEND_4 -DLOOP_4_2: - movdqu xmm1, [rdi] - movdqa xmm15, xmm1 - add rdi, 16 - pxor xmm1, [r8] - movdqu xmm2, 160[r8] - cmp r9d, 12 - aesdec xmm1, 16[r8] - aesdec xmm1, 32[r8] - aesdec xmm1, 48[r8] - aesdec xmm1, 64[r8] - aesdec xmm1, 80[r8] - aesdec xmm1, 96[r8] - aesdec xmm1, 112[r8] - aesdec xmm1, 128[r8] - aesdec xmm1, 144[r8] - jb DLAST_4_2 - movdqu xmm2, 192[r8] - cmp r9d, 14 - aesdec xmm1, 160[r8] - aesdec xmm1, 176[r8] - jb DLAST_4_2 - movdqu xmm2, 224[r8] - aesdec xmm1, 192[r8] - aesdec xmm1, 208[r8] -DLAST_4_2: - aesdeclast xmm1, xmm2 - pxor xmm1, xmm5 - movdqa xmm5, xmm15 - movdqu [rsi], xmm1 - add rsi, 16 - dec r10 - jne DLOOP_4_2 -DEND_4: - ; restore non volatile rdi,rsi - mov rdi, rax - mov rsi, r11 - ; restore non volatile xmms from stack - movdqa xmm6, [rsp+0] - movdqa xmm7, [rsp+16] - movdqa xmm8, [rsp+32] - movdqa xmm9, [rsp+48] - movdqa xmm10, [rsp+64] - movdqa xmm11, [rsp+80] - movdqa xmm12, [rsp+96] - movdqa xmm15, [rsp+112] - add rsp, 8+8*16 ; 8 = align stack , 8 xmm6-12,15 16 bytes each - ret -AES_CBC_decrypt_AESNI_by4 ENDP - - -; void AES_CBC_decrypt_AESNI_by6(const unsigned char *in, -; unsigned char *out, -; unsigned char ivec[16], -; unsigned long length, -; const unsigned char *KS, -; int nr) -AES_CBC_decrypt_AESNI_by6 PROC -; parameter 1: rdi - in -; parameter 2: rsi - out -; parameter 3: rdx - ivec -; parameter 4: rcx - length -; parameter 5: r8 - KS -; parameter 6: r9d - nr - - ; save rdi and rsi to rax and r11, restore before ret - mov rax, rdi - mov r11, rsi - ; convert to what we had for att&t convention - mov rdi, rcx - mov rsi, rdx - mov rdx, r8 - mov rcx, r9 - mov r8, [rsp+40] - mov r9d, [rsp+48] - ; on microsoft xmm6-xmm15 are non volatile, - ; let's save on stack and restore at end - sub rsp, 8+9*16 ; 8 = align stack , 9 xmm6-14 16 bytes each - movdqa [rsp+0], 
xmm6 - movdqa [rsp+16], xmm7 - movdqa [rsp+32], xmm8 - movdqa [rsp+48], xmm9 - movdqa [rsp+64], xmm10 - movdqa [rsp+80], xmm11 - movdqa [rsp+96], xmm12 - movdqa [rsp+112], xmm13 - movdqa [rsp+128], xmm14 - ; back to our original code, more or less - mov r10, rcx - shr rcx, 4 - shl r10, 60 - je DNO_PARTS_6 - add rcx, 1 -DNO_PARTS_6: - mov r12, rax - mov r13, rdx - mov r14, rbx - mov rdx, 0 - mov rax, rcx - mov rbx, 6 - div rbx - mov rcx, rax - mov r10, rdx - mov rax, r12 - mov rdx, r13 - mov rbx, r14 - cmp rcx, 0 - movdqu xmm7, [rdx] - je DREMAINDER_6 - sub rsi, 96 -DLOOP_6: - movdqu xmm1, [rdi] - movdqu xmm2, 16[rdi] - movdqu xmm3, 32[rdi] - movdqu xmm4, 48[rdi] - movdqu xmm5, 64[rdi] - movdqu xmm6, 80[rdi] - movdqa xmm8, [r8] - movdqa xmm9, 16[r8] - movdqa xmm10, 32[r8] - movdqa xmm11, 48[r8] - pxor xmm1, xmm8 - pxor xmm2, xmm8 - pxor xmm3, xmm8 - pxor xmm4, xmm8 - pxor xmm5, xmm8 - pxor xmm6, xmm8 - aesdec xmm1, xmm9 - aesdec xmm2, xmm9 - aesdec xmm3, xmm9 - aesdec xmm4, xmm9 - aesdec xmm5, xmm9 - aesdec xmm6, xmm9 - aesdec xmm1, xmm10 - aesdec xmm2, xmm10 - aesdec xmm3, xmm10 - aesdec xmm4, xmm10 - aesdec xmm5, xmm10 - aesdec xmm6, xmm10 - aesdec xmm1, xmm11 - aesdec xmm2, xmm11 - aesdec xmm3, xmm11 - aesdec xmm4, xmm11 - aesdec xmm5, xmm11 - aesdec xmm6, xmm11 - movdqa xmm8, 64[r8] - movdqa xmm9, 80[r8] - movdqa xmm10, 96[r8] - movdqa xmm11, 112[r8] - aesdec xmm1, xmm8 - aesdec xmm2, xmm8 - aesdec xmm3, xmm8 - aesdec xmm4, xmm8 - aesdec xmm5, xmm8 - aesdec xmm6, xmm8 - aesdec xmm1, xmm9 - aesdec xmm2, xmm9 - aesdec xmm3, xmm9 - aesdec xmm4, xmm9 - aesdec xmm5, xmm9 - aesdec xmm6, xmm9 - aesdec xmm1, xmm10 - aesdec xmm2, xmm10 - aesdec xmm3, xmm10 - aesdec xmm4, xmm10 - aesdec xmm5, xmm10 - aesdec xmm6, xmm10 - aesdec xmm1, xmm11 - aesdec xmm2, xmm11 - aesdec xmm3, xmm11 - aesdec xmm4, xmm11 - aesdec xmm5, xmm11 - aesdec xmm6, xmm11 - movdqa xmm8, 128[r8] - movdqa xmm9, 144[r8] - movdqa xmm10, 160[r8] - cmp r9d, 12 - aesdec xmm1, xmm8 - aesdec xmm2, xmm8 - 
aesdec xmm3, xmm8 - aesdec xmm4, xmm8 - aesdec xmm5, xmm8 - aesdec xmm6, xmm8 - aesdec xmm1, xmm9 - aesdec xmm2, xmm9 - aesdec xmm3, xmm9 - aesdec xmm4, xmm9 - aesdec xmm5, xmm9 - aesdec xmm6, xmm9 - jb DLAST_6 - movdqa xmm8, 160[r8] - movdqa xmm9, 176[r8] - movdqa xmm10, 192[r8] - cmp r9d, 14 - aesdec xmm1, xmm8 - aesdec xmm2, xmm8 - aesdec xmm3, xmm8 - aesdec xmm4, xmm8 - aesdec xmm5, xmm8 - aesdec xmm6, xmm8 - aesdec xmm1, xmm9 - aesdec xmm2, xmm9 - aesdec xmm3, xmm9 - aesdec xmm4, xmm9 - aesdec xmm5, xmm9 - aesdec xmm6, xmm9 - jb DLAST_6 - movdqa xmm8, 192[r8] - movdqa xmm9, 208[r8] - movdqa xmm10, 224[r8] - aesdec xmm1, xmm8 - aesdec xmm2, xmm8 - aesdec xmm3, xmm8 - aesdec xmm4, xmm8 - aesdec xmm5, xmm8 - aesdec xmm6, xmm8 - aesdec xmm1, xmm9 - aesdec xmm2, xmm9 - aesdec xmm3, xmm9 - aesdec xmm4, xmm9 - aesdec xmm5, xmm9 - aesdec xmm6, xmm9 -DLAST_6: - add rsi, 96 - aesdeclast xmm1, xmm10 - aesdeclast xmm2, xmm10 - aesdeclast xmm3, xmm10 - aesdeclast xmm4, xmm10 - aesdeclast xmm5, xmm10 - aesdeclast xmm6, xmm10 - movdqu xmm8, [rdi] - movdqu xmm9, 16[rdi] - movdqu xmm10, 32[rdi] - movdqu xmm11, 48[rdi] - movdqu xmm12, 64[rdi] - movdqu xmm13, 80[rdi] - pxor xmm1, xmm7 - pxor xmm2, xmm8 - pxor xmm3, xmm9 - pxor xmm4, xmm10 - pxor xmm5, xmm11 - pxor xmm6, xmm12 - movdqu xmm7, xmm13 - movdqu [rsi], xmm1 - movdqu 16[rsi], xmm2 - movdqu 32[rsi], xmm3 - movdqu 48[rsi], xmm4 - movdqu 64[rsi], xmm5 - movdqu 80[rsi], xmm6 - add rdi, 96 - dec rcx - jne DLOOP_6 - add rsi, 96 -DREMAINDER_6: - cmp r10, 0 - je DEND_6 -DLOOP_6_2: - movdqu xmm1, [rdi] - movdqa xmm10, xmm1 - add rdi, 16 - pxor xmm1, [r8] - movdqu xmm2, 160[r8] - cmp r9d, 12 - aesdec xmm1, 16[r8] - aesdec xmm1, 32[r8] - aesdec xmm1, 48[r8] - aesdec xmm1, 64[r8] - aesdec xmm1, 80[r8] - aesdec xmm1, 96[r8] - aesdec xmm1, 112[r8] - aesdec xmm1, 128[r8] - aesdec xmm1, 144[r8] - jb DLAST_6_2 - movdqu xmm2, 192[r8] - cmp r9d, 14 - aesdec xmm1, 160[r8] - aesdec xmm1, 176[r8] - jb DLAST_6_2 - movdqu xmm2, 224[r8] - 
aesdec xmm1, 192[r8] - aesdec xmm1, 208[r8] -DLAST_6_2: - aesdeclast xmm1, xmm2 - pxor xmm1, xmm7 - movdqa xmm7, xmm10 - movdqu [rsi], xmm1 - add rsi, 16 - dec r10 - jne DLOOP_6_2 -DEND_6: - ; restore non volatile rdi,rsi - mov rdi, rax - mov rsi, r11 - ; restore non volatile xmms from stack - movdqa xmm6, [rsp+0] - movdqa xmm7, [rsp+16] - movdqa xmm8, [rsp+32] - movdqa xmm9, [rsp+48] - movdqa xmm10, [rsp+64] - movdqa xmm11, [rsp+80] - movdqa xmm12, [rsp+96] - movdqa xmm13, [rsp+112] - movdqa xmm14, [rsp+128] - add rsp, 8+9*16 ; 8 = align stack , 9 xmm6-14 16 bytes each - ret -AES_CBC_decrypt_AESNI_by6 ENDP - - -; void AES_CBC_decrypt_AESNI_by8(const unsigned char *in, -; unsigned char *out, -; unsigned char ivec[16], -; unsigned long length, -; const unsigned char *KS, -; int nr) -AES_CBC_decrypt_AESNI_by8 PROC -; parameter 1: rdi - in -; parameter 2: rsi - out -; parameter 3: rdx - ivec -; parameter 4: rcx - length -; parameter 5: r8 - KS -; parameter 6: r9d - nr - - ; save rdi and rsi to rax and r11, restore before ret - mov rax, rdi - mov r11, rsi - ; convert to what we had for att&t convention - mov rdi, rcx - mov rsi, rdx - mov rdx, r8 - mov rcx,r9 - mov r8, [rsp+40] - mov r9d, [rsp+48] - ; on microsoft xmm6-xmm15 are non volatile, - ; let's save on stack and restore at end - sub rsp, 8+8*16 ; 8 = align stack , 8 xmm6-13 16 bytes each - movdqa [rsp+0], xmm6 - movdqa [rsp+16], xmm7 - movdqa [rsp+32], xmm8 - movdqa [rsp+48], xmm9 - movdqa [rsp+64], xmm10 - movdqa [rsp+80], xmm11 - movdqa [rsp+96], xmm12 - movdqa [rsp+112], xmm13 - ; back to our original code, more or less - mov r10, rcx - shr rcx, 4 - shl r10, 60 - je DNO_PARTS_8 - add rcx, 1 -DNO_PARTS_8: - mov r10, rcx - shl r10, 61 - shr r10, 61 - shr rcx, 3 - movdqu xmm9, [rdx] - je DREMAINDER_8 - sub rsi, 128 -DLOOP_8: - movdqu xmm1, [rdi] - movdqu xmm2, 16[rdi] - movdqu xmm3, 32[rdi] - movdqu xmm4, 48[rdi] - movdqu xmm5, 64[rdi] - movdqu xmm6, 80[rdi] - movdqu xmm7, 96[rdi] - movdqu xmm8, 112[rdi] - 
movdqa xmm10, [r8] - movdqa xmm11, 16[r8] - movdqa xmm12, 32[r8] - movdqa xmm13, 48[r8] - pxor xmm1, xmm10 - pxor xmm2, xmm10 - pxor xmm3, xmm10 - pxor xmm4, xmm10 - pxor xmm5, xmm10 - pxor xmm6, xmm10 - pxor xmm7, xmm10 - pxor xmm8, xmm10 - aesdec xmm1, xmm11 - aesdec xmm2, xmm11 - aesdec xmm3, xmm11 - aesdec xmm4, xmm11 - aesdec xmm5, xmm11 - aesdec xmm6, xmm11 - aesdec xmm7, xmm11 - aesdec xmm8, xmm11 - aesdec xmm1, xmm12 - aesdec xmm2, xmm12 - aesdec xmm3, xmm12 - aesdec xmm4, xmm12 - aesdec xmm5, xmm12 - aesdec xmm6, xmm12 - aesdec xmm7, xmm12 - aesdec xmm8, xmm12 - aesdec xmm1, xmm13 - aesdec xmm2, xmm13 - aesdec xmm3, xmm13 - aesdec xmm4, xmm13 - aesdec xmm5, xmm13 - aesdec xmm6, xmm13 - aesdec xmm7, xmm13 - aesdec xmm8, xmm13 - movdqa xmm10, 64[r8] - movdqa xmm11, 80[r8] - movdqa xmm12, 96[r8] - movdqa xmm13, 112[r8] - aesdec xmm1, xmm10 - aesdec xmm2, xmm10 - aesdec xmm3, xmm10 - aesdec xmm4, xmm10 - aesdec xmm5, xmm10 - aesdec xmm6, xmm10 - aesdec xmm7, xmm10 - aesdec xmm8, xmm10 - aesdec xmm1, xmm11 - aesdec xmm2, xmm11 - aesdec xmm3, xmm11 - aesdec xmm4, xmm11 - aesdec xmm5, xmm11 - aesdec xmm6, xmm11 - aesdec xmm7, xmm11 - aesdec xmm8, xmm11 - aesdec xmm1, xmm12 - aesdec xmm2, xmm12 - aesdec xmm3, xmm12 - aesdec xmm4, xmm12 - aesdec xmm5, xmm12 - aesdec xmm6, xmm12 - aesdec xmm7, xmm12 - aesdec xmm8, xmm12 - aesdec xmm1, xmm13 - aesdec xmm2, xmm13 - aesdec xmm3, xmm13 - aesdec xmm4, xmm13 - aesdec xmm5, xmm13 - aesdec xmm6, xmm13 - aesdec xmm7, xmm13 - aesdec xmm8, xmm13 - movdqa xmm10, 128[r8] - movdqa xmm11, 144[r8] - movdqa xmm12, 160[r8] - cmp r9d, 12 - aesdec xmm1, xmm10 - aesdec xmm2, xmm10 - aesdec xmm3, xmm10 - aesdec xmm4, xmm10 - aesdec xmm5, xmm10 - aesdec xmm6, xmm10 - aesdec xmm7, xmm10 - aesdec xmm8, xmm10 - aesdec xmm1, xmm11 - aesdec xmm2, xmm11 - aesdec xmm3, xmm11 - aesdec xmm4, xmm11 - aesdec xmm5, xmm11 - aesdec xmm6, xmm11 - aesdec xmm7, xmm11 - aesdec xmm8, xmm11 - jb DLAST_8 - movdqa xmm10, 160[r8] - movdqa xmm11, 176[r8] - 
movdqa xmm12, 192[r8] - cmp r9d, 14 - aesdec xmm1, xmm10 - aesdec xmm2, xmm10 - aesdec xmm3, xmm10 - aesdec xmm4, xmm10 - aesdec xmm5, xmm10 - aesdec xmm6, xmm10 - aesdec xmm7, xmm10 - aesdec xmm8, xmm10 - aesdec xmm1, xmm11 - aesdec xmm2, xmm11 - aesdec xmm3, xmm11 - aesdec xmm4, xmm11 - aesdec xmm5, xmm11 - aesdec xmm6, xmm11 - aesdec xmm7, xmm11 - aesdec xmm8, xmm11 - jb DLAST_8 - movdqa xmm10, 192[r8] - movdqa xmm11, 208[r8] - movdqa xmm12, 224[r8] - aesdec xmm1, xmm10 - aesdec xmm2, xmm10 - aesdec xmm3, xmm10 - aesdec xmm4, xmm10 - aesdec xmm5, xmm10 - aesdec xmm6, xmm10 - aesdec xmm7, xmm10 - aesdec xmm8, xmm10 - aesdec xmm1, xmm11 - aesdec xmm2, xmm11 - aesdec xmm3, xmm11 - aesdec xmm4, xmm11 - aesdec xmm5, xmm11 - aesdec xmm6, xmm11 - aesdec xmm7, xmm11 - aesdec xmm8, xmm11 -DLAST_8: - add rsi, 128 - aesdeclast xmm1, xmm12 - aesdeclast xmm2, xmm12 - aesdeclast xmm3, xmm12 - aesdeclast xmm4, xmm12 - aesdeclast xmm5, xmm12 - aesdeclast xmm6, xmm12 - aesdeclast xmm7, xmm12 - aesdeclast xmm8, xmm12 - movdqu xmm10, [rdi] - movdqu xmm11, 16[rdi] - movdqu xmm12, 32[rdi] - movdqu xmm13, 48[rdi] - pxor xmm1, xmm9 - pxor xmm2, xmm10 - pxor xmm3, xmm11 - pxor xmm4, xmm12 - pxor xmm5, xmm13 - movdqu xmm10, 64[rdi] - movdqu xmm11, 80[rdi] - movdqu xmm12, 96[rdi] - movdqu xmm9, 112[rdi] - pxor xmm6, xmm10 - pxor xmm7, xmm11 - pxor xmm8, xmm12 - movdqu [rsi], xmm1 - movdqu 16[rsi], xmm2 - movdqu 32[rsi], xmm3 - movdqu 48[rsi], xmm4 - movdqu 64[rsi], xmm5 - movdqu 80[rsi], xmm6 - movdqu 96[rsi], xmm7 - movdqu 112[rsi], xmm8 - add rdi, 128 - dec rcx - jne DLOOP_8 - add rsi, 128 -DREMAINDER_8: - cmp r10, 0 - je DEND_8 -DLOOP_8_2: - movdqu xmm1, [rdi] - movdqa xmm10, xmm1 - add rdi, 16 - pxor xmm1, [r8] - movdqu xmm2, 160[r8] - cmp r9d, 12 - aesdec xmm1, 16[r8] - aesdec xmm1, 32[r8] - aesdec xmm1, 48[r8] - aesdec xmm1, 64[r8] - aesdec xmm1, 80[r8] - aesdec xmm1, 96[r8] - aesdec xmm1, 112[r8] - aesdec xmm1, 128[r8] - aesdec xmm1, 144[r8] - jb DLAST_8_2 - movdqu xmm2, 192[r8] - 
cmp r9d, 14 - aesdec xmm1, 160[r8] - aesdec xmm1, 176[r8] - jb DLAST_8_2 - movdqu xmm2, 224[r8] - aesdec xmm1, 192[r8] - aesdec xmm1, 208[r8] -DLAST_8_2: - aesdeclast xmm1, xmm2 - pxor xmm1, xmm9 - movdqa xmm9, xmm10 - movdqu [rsi], xmm1 - add rsi, 16 - dec r10 - jne DLOOP_8_2 -DEND_8: - ; restore non volatile rdi,rsi - mov rdi, rax - mov rsi, r11 - ; restore non volatile xmms from stack - movdqa xmm6, [rsp+0] - movdqa xmm7, [rsp+16] - movdqa xmm8, [rsp+32] - movdqa xmm9, [rsp+48] - movdqa xmm10, [rsp+64] - movdqa xmm11, [rsp+80] - movdqa xmm12, [rsp+96] - movdqa xmm13, [rsp+112] - add rsp, 8+8*16 ; 8 = align stack , 8 xmm6-13 16 bytes each - ret -AES_CBC_decrypt_AESNI_by8 ENDP - - -; /* -; AES_ECB_encrypt_AESNI[const ,unsigned char*in -; unsigned ,char*out -; unsigned ,long length -; const ,unsigned char*KS -; int nr] -; */ -; . globl AES_ECB_encrypt_AESNI -AES_ECB_encrypt_AESNI PROC -;# parameter 1: rdi -;# parameter 2: rsi -;# parameter 3: rdx -;# parameter 4: rcx -;# parameter 5: r8d - -; save rdi and rsi to rax and r11, restore before ret - mov rax,rdi - mov r11,rsi - -; convert to what we had for att&t convention - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8d,[rsp+40] - -; on microsoft xmm6-xmm15 are non volaitle, let's save on stack and restore at end - sub rsp,8+4*16 ; 8 = align stack , 4 xmm9-12, 16 bytes each - movdqa [rsp+0], xmm9 - movdqa [rsp+16], xmm10 - movdqa [rsp+32], xmm11 - movdqa [rsp+48], xmm12 - - - mov r10,rdx - shr rdx,4 - shl r10,60 - je EECB_NO_PARTS_4 - add rdx,1 -EECB_NO_PARTS_4: - mov r10,rdx - shl r10,62 - shr r10,62 - shr rdx,2 - je EECB_REMAINDER_4 - sub rsi,64 -EECB_LOOP_4: - movdqu xmm1,[rdi] - movdqu xmm2,16[rdi] - movdqu xmm3,32[rdi] - movdqu xmm4,48[rdi] - movdqa xmm9,[rcx] - movdqa xmm10,16[rcx] - movdqa xmm11,32[rcx] - movdqa xmm12,48[rcx] - pxor xmm1,xmm9 - pxor xmm2,xmm9 - pxor xmm3,xmm9 - pxor xmm4,xmm9 - aesenc xmm1,xmm10 - aesenc xmm2,xmm10 - aesenc xmm3,xmm10 - aesenc xmm4,xmm10 - aesenc xmm1,xmm11 - 
aesenc xmm2,xmm11 - aesenc xmm3,xmm11 - aesenc xmm4,xmm11 - aesenc xmm1,xmm12 - aesenc xmm2,xmm12 - aesenc xmm3,xmm12 - aesenc xmm4,xmm12 - movdqa xmm9,64[rcx] - movdqa xmm10,80[rcx] - movdqa xmm11,96[rcx] - movdqa xmm12,112[rcx] - aesenc xmm1,xmm9 - aesenc xmm2,xmm9 - aesenc xmm3,xmm9 - aesenc xmm4,xmm9 - aesenc xmm1,xmm10 - aesenc xmm2,xmm10 - aesenc xmm3,xmm10 - aesenc xmm4,xmm10 - aesenc xmm1,xmm11 - aesenc xmm2,xmm11 - aesenc xmm3,xmm11 - aesenc xmm4,xmm11 - aesenc xmm1,xmm12 - aesenc xmm2,xmm12 - aesenc xmm3,xmm12 - aesenc xmm4,xmm12 - movdqa xmm9,128[rcx] - movdqa xmm10,144[rcx] - movdqa xmm11,160[rcx] - cmp r8d,12 - aesenc xmm1,xmm9 - aesenc xmm2,xmm9 - aesenc xmm3,xmm9 - aesenc xmm4,xmm9 - aesenc xmm1,xmm10 - aesenc xmm2,xmm10 - aesenc xmm3,xmm10 - aesenc xmm4,xmm10 - jb EECB_LAST_4 - movdqa xmm9,160[rcx] - movdqa xmm10,176[rcx] - movdqa xmm11,192[rcx] - cmp r8d,14 - aesenc xmm1,xmm9 - aesenc xmm2,xmm9 - aesenc xmm3,xmm9 - aesenc xmm4,xmm9 - aesenc xmm1,xmm10 - aesenc xmm2,xmm10 - aesenc xmm3,xmm10 - aesenc xmm4,xmm10 - jb EECB_LAST_4 - movdqa xmm9,192[rcx] - movdqa xmm10,208[rcx] - movdqa xmm11,224[rcx] - aesenc xmm1,xmm9 - aesenc xmm2,xmm9 - aesenc xmm3,xmm9 - aesenc xmm4,xmm9 - aesenc xmm1,xmm10 - aesenc xmm2,xmm10 - aesenc xmm3,xmm10 - aesenc xmm4,xmm10 -EECB_LAST_4: - add rdi,64 - add rsi,64 - dec rdx - aesenclast xmm1,xmm11 - aesenclast xmm2,xmm11 - aesenclast xmm3,xmm11 - aesenclast xmm4,xmm11 - movdqu [rsi],xmm1 - movdqu 16[rsi],xmm2 - movdqu 32[rsi],xmm3 - movdqu 48[rsi],xmm4 - jne EECB_LOOP_4 - add rsi,64 -EECB_REMAINDER_4: - cmp r10,0 - je EECB_END_4 -EECB_LOOP_4_2: - movdqu xmm1,[rdi] - add rdi,16 - pxor xmm1,[rcx] - movdqu xmm2,160[rcx] - aesenc xmm1,16[rcx] - aesenc xmm1,32[rcx] - aesenc xmm1,48[rcx] - aesenc xmm1,64[rcx] - aesenc xmm1,80[rcx] - aesenc xmm1,96[rcx] - aesenc xmm1,112[rcx] - aesenc xmm1,128[rcx] - aesenc xmm1,144[rcx] - cmp r8d,12 - jb EECB_LAST_4_2 - movdqu xmm2,192[rcx] - aesenc xmm1,160[rcx] - aesenc xmm1,176[rcx] - cmp 
r8d,14 - jb EECB_LAST_4_2 - movdqu xmm2,224[rcx] - aesenc xmm1,192[rcx] - aesenc xmm1,208[rcx] -EECB_LAST_4_2: - aesenclast xmm1,xmm2 - movdqu [rsi],xmm1 - add rsi,16 - dec r10 - jne EECB_LOOP_4_2 -EECB_END_4: - ; restore non volatile rdi,rsi - mov rdi,rax - mov rsi,r11 - ; restore non volatile xmms from stack - movdqa xmm9, [rsp+0] - movdqa xmm10, [rsp+16] - movdqa xmm11, [rsp+32] - movdqa xmm12, [rsp+48] - add rsp,8+4*16 ; 8 = align stack , 4 xmm9-12 16 bytes each - ret -AES_ECB_encrypt_AESNI ENDP - -; /* -; AES_ECB_decrypt_AESNI[const ,unsigned char*in -; unsigned ,char*out -; unsigned ,long length -; const ,unsigned char*KS -; int nr] -; */ -; . globl AES_ECB_decrypt_AESNI -AES_ECB_decrypt_AESNI PROC -;# parameter 1: rdi -;# parameter 2: rsi -;# parameter 3: rdx -;# parameter 4: rcx -;# parameter 5: r8d - -; save rdi and rsi to rax and r11, restore before ret - mov rax,rdi - mov r11,rsi - -; convert to what we had for att&t convention - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8d,[rsp+40] - -; on microsoft xmm6-xmm15 are non volaitle, let's save on stack and restore at end - sub rsp,8+4*16 ; 8 = align stack , 4 xmm9-12, 16 bytes each - movdqa [rsp+0], xmm9 - movdqa [rsp+16], xmm10 - movdqa [rsp+32], xmm11 - movdqa [rsp+48], xmm12 - - mov r10,rdx - shr rdx,4 - shl r10,60 - je DECB_NO_PARTS_4 - add rdx,1 -DECB_NO_PARTS_4: - mov r10,rdx - shl r10,62 - shr r10,62 - shr rdx,2 - je DECB_REMAINDER_4 - sub rsi,64 -DECB_LOOP_4: - movdqu xmm1,[rdi] - movdqu xmm2,16[rdi] - movdqu xmm3,32[rdi] - movdqu xmm4,48[rdi] - movdqa xmm9,[rcx] - movdqa xmm10,16[rcx] - movdqa xmm11,32[rcx] - movdqa xmm12,48[rcx] - pxor xmm1,xmm9 - pxor xmm2,xmm9 - pxor xmm3,xmm9 - pxor xmm4,xmm9 - aesdec xmm1,xmm10 - aesdec xmm2,xmm10 - aesdec xmm3,xmm10 - aesdec xmm4,xmm10 - aesdec xmm1,xmm11 - aesdec xmm2,xmm11 - aesdec xmm3,xmm11 - aesdec xmm4,xmm11 - aesdec xmm1,xmm12 - aesdec xmm2,xmm12 - aesdec xmm3,xmm12 - aesdec xmm4,xmm12 - movdqa xmm9,64[rcx] - movdqa xmm10,80[rcx] - 
movdqa xmm11,96[rcx] - movdqa xmm12,112[rcx] - aesdec xmm1,xmm9 - aesdec xmm2,xmm9 - aesdec xmm3,xmm9 - aesdec xmm4,xmm9 - aesdec xmm1,xmm10 - aesdec xmm2,xmm10 - aesdec xmm3,xmm10 - aesdec xmm4,xmm10 - aesdec xmm1,xmm11 - aesdec xmm2,xmm11 - aesdec xmm3,xmm11 - aesdec xmm4,xmm11 - aesdec xmm1,xmm12 - aesdec xmm2,xmm12 - aesdec xmm3,xmm12 - aesdec xmm4,xmm12 - movdqa xmm9,128[rcx] - movdqa xmm10,144[rcx] - movdqa xmm11,160[rcx] - cmp r8d,12 - aesdec xmm1,xmm9 - aesdec xmm2,xmm9 - aesdec xmm3,xmm9 - aesdec xmm4,xmm9 - aesdec xmm1,xmm10 - aesdec xmm2,xmm10 - aesdec xmm3,xmm10 - aesdec xmm4,xmm10 - jb DECB_LAST_4 - movdqa xmm9,160[rcx] - movdqa xmm10,176[rcx] - movdqa xmm11,192[rcx] - cmp r8d,14 - aesdec xmm1,xmm9 - aesdec xmm2,xmm9 - aesdec xmm3,xmm9 - aesdec xmm4,xmm9 - aesdec xmm1,xmm10 - aesdec xmm2,xmm10 - aesdec xmm3,xmm10 - aesdec xmm4,xmm10 - jb DECB_LAST_4 - movdqa xmm9,192[rcx] - movdqa xmm10,208[rcx] - movdqa xmm11,224[rcx] - aesdec xmm1,xmm9 - aesdec xmm2,xmm9 - aesdec xmm3,xmm9 - aesdec xmm4,xmm9 - aesdec xmm1,xmm10 - aesdec xmm2,xmm10 - aesdec xmm3,xmm10 - aesdec xmm4,xmm10 -DECB_LAST_4: - add rdi,64 - add rsi,64 - dec rdx - aesdeclast xmm1,xmm11 - aesdeclast xmm2,xmm11 - aesdeclast xmm3,xmm11 - aesdeclast xmm4,xmm11 - movdqu [rsi],xmm1 - movdqu 16[rsi],xmm2 - movdqu 32[rsi],xmm3 - movdqu 48[rsi],xmm4 - jne DECB_LOOP_4 - add rsi,64 -DECB_REMAINDER_4: - cmp r10,0 - je DECB_END_4 -DECB_LOOP_4_2: - movdqu xmm1,[rdi] - add rdi,16 - pxor xmm1,[rcx] - movdqu xmm2,160[rcx] - cmp r8d,12 - aesdec xmm1,16[rcx] - aesdec xmm1,32[rcx] - aesdec xmm1,48[rcx] - aesdec xmm1,64[rcx] - aesdec xmm1,80[rcx] - aesdec xmm1,96[rcx] - aesdec xmm1,112[rcx] - aesdec xmm1,128[rcx] - aesdec xmm1,144[rcx] - jb DECB_LAST_4_2 - cmp r8d,14 - movdqu xmm2,192[rcx] - aesdec xmm1,160[rcx] - aesdec xmm1,176[rcx] - jb DECB_LAST_4_2 - movdqu xmm2,224[rcx] - aesdec xmm1,192[rcx] - aesdec xmm1,208[rcx] -DECB_LAST_4_2: - aesdeclast xmm1,xmm2 - movdqu [rsi],xmm1 - add rsi,16 - dec r10 - jne 
DECB_LOOP_4_2 -DECB_END_4: - ; restore non volatile rdi,rsi - mov rdi,rax - mov rsi,r11 - ; restore non volatile xmms from stack - movdqa xmm9, [rsp+0] - movdqa xmm10, [rsp+16] - movdqa xmm11, [rsp+32] - movdqa xmm12, [rsp+48] - add rsp,8+4*16 ; 8 = align stack , 4 xmm9-12 16 bytes each - ret -AES_ECB_decrypt_AESNI ENDP - - - -; /* -; void ,AES_128_Key_Expansion_AESNI[const unsigned char*userkey -; unsigned char*key_schedule]/ -; */ -; . align 16,0x90 -; . globl AES_128_Key_Expansion_AESNI -AES_128_Key_Expansion_AESNI PROC -;# parameter 1: rdi -;# parameter 2: rsi - -; save rdi and rsi to rax and r11, restore before ret - mov rax,rdi - mov r11,rsi - -; convert to what we had for att&t convention - mov rdi,rcx - mov rsi,rdx - - mov dword ptr 240[rsi],10 - - movdqu xmm1,[rdi] - movdqa [rsi],xmm1 - - -ASSISTS: - aeskeygenassist xmm2,xmm1,1 - call PREPARE_ROUNDKEY_128 - movdqa 16[rsi],xmm1 - - aeskeygenassist xmm2,xmm1,2 - call PREPARE_ROUNDKEY_128 - movdqa 32[rsi],xmm1 - - aeskeygenassist xmm2,xmm1,4 - call PREPARE_ROUNDKEY_128 - movdqa 48[rsi],xmm1 - - aeskeygenassist xmm2,xmm1,8 - call PREPARE_ROUNDKEY_128 - movdqa 64[rsi],xmm1 - - aeskeygenassist xmm2,xmm1,16 - call PREPARE_ROUNDKEY_128 - movdqa 80[rsi],xmm1 - - aeskeygenassist xmm2,xmm1,32 - call PREPARE_ROUNDKEY_128 - movdqa 96[rsi],xmm1 - - aeskeygenassist xmm2,xmm1,64 - call PREPARE_ROUNDKEY_128 - movdqa 112[rsi],xmm1 - aeskeygenassist xmm2,xmm1,80h - call PREPARE_ROUNDKEY_128 - movdqa 128[rsi],xmm1 - aeskeygenassist xmm2,xmm1,1bh - call PREPARE_ROUNDKEY_128 - movdqa 144[rsi],xmm1 - aeskeygenassist xmm2,xmm1,36h - call PREPARE_ROUNDKEY_128 - movdqa 160[rsi],xmm1 - ; restore non volatile rdi,rsi - mov rdi,rax - mov rsi,r11 - ret - -PREPARE_ROUNDKEY_128: - pshufd xmm2,xmm2,255 - movdqa xmm3,xmm1 - pslldq xmm3,4 - pxor xmm1,xmm3 - pslldq xmm3,4 - pxor xmm1,xmm3 - pslldq xmm3,4 - pxor xmm1,xmm3 - pxor xmm1,xmm2 - ret -AES_128_Key_Expansion_AESNI ENDP - -; /* -; void ,AES_192_Key_Expansion_AESNI[const unsigned 
char*userkey -; unsigned char*key] -; */ -; . globl AES_192_Key_Expansion_AESNI -AES_192_Key_Expansion_AESNI PROC -;# parameter 1: rdi -;# parameter 2: rsi - -; save rdi and rsi to rax and r11, restore before ret - mov rax,rdi - mov r11,rsi - -; convert to what we had for att&t convention - mov rdi,rcx - mov rsi,rdx - -; on microsoft xmm6-xmm15 are non volaitle, let's save on stack and restore at end - sub rsp,8+1*16 ; 8 = align stack , 1 xmm6, 16 bytes each - movdqa [rsp+0], xmm6 - - movdqu xmm1,[rdi] - movq xmm3,qword ptr 16[rdi] - movdqa [rsi],xmm1 - movdqa xmm5,xmm3 - - aeskeygenassist xmm2,xmm3,1h - call PREPARE_ROUNDKEY_192 - shufpd xmm5,xmm1,0 - movdqa 16[rsi],xmm5 - movdqa xmm6,xmm1 - shufpd xmm6,xmm3,1 - movdqa 32[rsi],xmm6 - - aeskeygenassist xmm2,xmm3,2h - call PREPARE_ROUNDKEY_192 - movdqa 48[rsi],xmm1 - movdqa xmm5,xmm3 - - aeskeygenassist xmm2,xmm3,4h - call PREPARE_ROUNDKEY_192 - shufpd xmm5,xmm1,0 - movdqa 64[rsi],xmm5 - movdqa xmm6,xmm1 - shufpd xmm6,xmm3,1 - movdqa 80[rsi],xmm6 - - aeskeygenassist xmm2,xmm3,8h - call PREPARE_ROUNDKEY_192 - movdqa 96[rsi],xmm1 - movdqa xmm5,xmm3 - - aeskeygenassist xmm2,xmm3,10h - call PREPARE_ROUNDKEY_192 - shufpd xmm5,xmm1,0 - movdqa 112[rsi],xmm5 - movdqa xmm6,xmm1 - shufpd xmm6,xmm3,1 - movdqa 128[rsi],xmm6 - - aeskeygenassist xmm2,xmm3,20h - call PREPARE_ROUNDKEY_192 - movdqa 144[rsi],xmm1 - movdqa xmm5,xmm3 - - aeskeygenassist xmm2,xmm3,40h - call PREPARE_ROUNDKEY_192 - shufpd xmm5,xmm1,0 - movdqa 160[rsi],xmm5 - movdqa xmm6,xmm1 - shufpd xmm6,xmm3,1 - movdqa 176[rsi],xmm6 - - aeskeygenassist xmm2,xmm3,80h - call PREPARE_ROUNDKEY_192 - movdqa 192[rsi],xmm1 - movdqa 208[rsi],xmm3 - ; restore non volatile rdi,rsi - mov rdi,rax - mov rsi,r11 -; restore non volatile xmms from stack - movdqa xmm6, [rsp+0] - add rsp,8+1*16 ; 8 = align stack , 1 xmm6 16 bytes each - ret - -PREPARE_ROUNDKEY_192: - pshufd xmm2,xmm2,55h - movdqu xmm4,xmm1 - pslldq xmm4,4 - pxor xmm1,xmm4 - - pslldq xmm4,4 - pxor xmm1,xmm4 - pslldq 
xmm4,4 - pxor xmm1,xmm4 - pxor xmm1,xmm2 - pshufd xmm2,xmm1,0ffh - movdqu xmm4,xmm3 - pslldq xmm4,4 - pxor xmm3,xmm4 - pxor xmm3,xmm2 - ret -AES_192_Key_Expansion_AESNI ENDP - -; /* -; void ,AES_256_Key_Expansion_AESNI[const unsigned char*userkey -; unsigned char*key] -; */ -; . globl AES_256_Key_Expansion_AESNI -AES_256_Key_Expansion_AESNI PROC -;# parameter 1: rdi -;# parameter 2: rsi - -; save rdi and rsi to rax and r11, restore before ret - mov rax,rdi - mov r11,rsi - -; convert to what we had for att&t convention - mov rdi,rcx - mov rsi,rdx - - movdqu xmm1,[rdi] - movdqu xmm3,16[rdi] - movdqa [rsi],xmm1 - movdqa 16[rsi],xmm3 - - aeskeygenassist xmm2,xmm3,1h - call MAKE_RK256_a - movdqa 32[rsi],xmm1 - aeskeygenassist xmm2,xmm1,0h - call MAKE_RK256_b - movdqa 48[rsi],xmm3 - aeskeygenassist xmm2,xmm3,2h - call MAKE_RK256_a - movdqa 64[rsi],xmm1 - aeskeygenassist xmm2,xmm1,0h - call MAKE_RK256_b - movdqa 80[rsi],xmm3 - aeskeygenassist xmm2,xmm3,4h - call MAKE_RK256_a - movdqa 96[rsi],xmm1 - aeskeygenassist xmm2,xmm1,0h - call MAKE_RK256_b - movdqa 112[rsi],xmm3 - aeskeygenassist xmm2,xmm3,8h - call MAKE_RK256_a - movdqa 128[rsi],xmm1 - aeskeygenassist xmm2,xmm1,0h - call MAKE_RK256_b - movdqa 144[rsi],xmm3 - aeskeygenassist xmm2,xmm3,10h - call MAKE_RK256_a - movdqa 160[rsi],xmm1 - aeskeygenassist xmm2,xmm1,0h - call MAKE_RK256_b - movdqa 176[rsi],xmm3 - aeskeygenassist xmm2,xmm3,20h - call MAKE_RK256_a - movdqa 192[rsi],xmm1 - - aeskeygenassist xmm2,xmm1,0h - call MAKE_RK256_b - movdqa 208[rsi],xmm3 - aeskeygenassist xmm2,xmm3,40h - call MAKE_RK256_a - movdqa 224[rsi],xmm1 - - ; restore non volatile rdi,rsi - mov rdi,rax - mov rsi,r11 - ret -AES_256_Key_Expansion_AESNI ENDP - -MAKE_RK256_a: - pshufd xmm2,xmm2,0ffh - movdqa xmm4,xmm1 - pslldq xmm4,4 - pxor xmm1,xmm4 - pslldq xmm4,4 - pxor xmm1,xmm4 - pslldq xmm4,4 - pxor xmm1,xmm4 - pxor xmm1,xmm2 - ret - -MAKE_RK256_b: - pshufd xmm2,xmm2,0aah - movdqa xmm4,xmm3 - pslldq xmm4,4 - pxor xmm3,xmm4 - pslldq xmm4,4 - 
pxor xmm3,xmm4 - pslldq xmm4,4 - pxor xmm3,xmm4 - pxor xmm3,xmm2 - ret - - -IF fips_version GE 2 - fipsAb ENDS -ELSE - _text ENDS -ENDIF - -END +; +; +; /* See Intel Advanced Encryption Standard (AES) Instructions Set White Paper +; * by Israel, Intel Mobility Group Development Center, Israel Shay Gueron +; */ +; +; /* This file is in intel asm syntax, see .s for at&t syntax */ +; + + +fips_version = 0 +IFDEF HAVE_FIPS + fips_version = 1 + IFDEF HAVE_FIPS_VERSION + fips_version = HAVE_FIPS_VERSION + ENDIF +ENDIF + +IF fips_version GE 2 + fipsAb SEGMENT ALIAS(".fipsA$b") 'CODE' +ELSE + _text SEGMENT +ENDIF + +IF fips_version GE 2 + fipsAb ENDS +ELSE + _text ENDS +ENDIF + +END diff --git a/wolfcrypt/src/aes_gcm_asm.S b/wolfcrypt/src/aes_gcm_asm.S index e75f2c9b942..e82445fca15 100644 --- a/wolfcrypt/src/aes_gcm_asm.S +++ b/wolfcrypt/src/aes_gcm_asm.S @@ -46,6 +46,16 @@ #define HAVE_INTEL_AVX2 #endif /* HAVE_INTEL_AVX2 */ #endif /* NO_AVX2_SUPPORT */ +#ifndef NO_VAES_SUPPORT +#ifndef HAVE_INTEL_VAES +#define HAVE_INTEL_VAES +#endif /* HAVE_INTEL_VAES */ +#endif /* NO_VAES_SUPPORT */ +#ifndef NO_AVX512_SUPPORT +#ifndef HAVE_INTEL_AVX512 +#define HAVE_INTEL_AVX512 +#endif /* HAVE_INTEL_AVX512 */ +#endif /* NO_AVX512_SUPPORT */ #ifdef WOLFSSL_X86_64_BUILD #ifndef __APPLE__ @@ -194,10 +204,10 @@ _GCM_generate_m0_aesni: por %xmm5, %xmm1 por %xmm6, %xmm2 por %xmm7, %xmm3 - vpshufb %xmm9, %xmm0, %xmm0 - vpshufb %xmm9, %xmm1, %xmm1 - vpshufb %xmm9, %xmm2, %xmm2 - vpshufb %xmm9, %xmm3, %xmm3 + pshufb %xmm9, %xmm0 + pshufb %xmm9, %xmm1 + pshufb %xmm9, %xmm2 + pshufb %xmm9, %xmm3 movdqu %xmm0, 256(%rsi) movdqu %xmm1, 272(%rsi) movdqu %xmm2, 288(%rsi) @@ -230,10 +240,10 @@ _GCM_generate_m0_aesni: por %xmm5, %xmm1 por %xmm6, %xmm2 por %xmm7, %xmm3 - vpshufb %xmm9, %xmm0, %xmm0 - vpshufb %xmm9, %xmm1, %xmm1 - vpshufb %xmm9, %xmm2, %xmm2 - vpshufb %xmm9, %xmm3, %xmm3 + pshufb %xmm9, %xmm0 + pshufb %xmm9, %xmm1 + pshufb %xmm9, %xmm2 + pshufb %xmm9, %xmm3 movdqu %xmm0, 320(%rsi) 
movdqu %xmm1, 336(%rsi) movdqu %xmm2, 352(%rsi) @@ -266,10 +276,10 @@ _GCM_generate_m0_aesni: por %xmm5, %xmm1 por %xmm6, %xmm2 por %xmm7, %xmm3 - vpshufb %xmm9, %xmm0, %xmm0 - vpshufb %xmm9, %xmm1, %xmm1 - vpshufb %xmm9, %xmm2, %xmm2 - vpshufb %xmm9, %xmm3, %xmm3 + pshufb %xmm9, %xmm0 + pshufb %xmm9, %xmm1 + pshufb %xmm9, %xmm2 + pshufb %xmm9, %xmm3 movdqu %xmm0, 384(%rsi) movdqu %xmm1, 400(%rsi) movdqu %xmm2, 416(%rsi) @@ -302,10 +312,10 @@ _GCM_generate_m0_aesni: por %xmm5, %xmm1 por %xmm6, %xmm2 por %xmm7, %xmm3 - vpshufb %xmm9, %xmm0, %xmm0 - vpshufb %xmm9, %xmm1, %xmm1 - vpshufb %xmm9, %xmm2, %xmm2 - vpshufb %xmm9, %xmm3, %xmm3 + pshufb %xmm9, %xmm0 + pshufb %xmm9, %xmm1 + pshufb %xmm9, %xmm2 + pshufb %xmm9, %xmm3 movdqu %xmm0, 448(%rsi) movdqu %xmm1, 464(%rsi) movdqu %xmm2, 480(%rsi) @@ -16577,6 +16587,14213 @@ L_AES_GCM_decrypt_final_avx2_cmp_tag_done: #endif /* __APPLE__ */ #endif /* WOLFSSL_AESGCM_STREAM */ #endif /* HAVE_INTEL_AVX2 */ +#ifdef HAVE_INTEL_VAES +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 32 +#else +.p2align 5 +#endif /* __APPLE__ */ +L_vaes_aes_gcm_inc_y0: +.quad 0x0000000000000000,0x0000000000000000 +.quad 0x0000000000000000,0x0000000000000001 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 32 +#else +.p2align 5 +#endif /* __APPLE__ */ +L_vaes_aes_gcm_inc_y1: +.quad 0x0000000000000000,0x0000000000000002 +.quad 0x0000000000000000,0x0000000000000003 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 32 +#else +.p2align 5 +#endif /* __APPLE__ */ +L_vaes_aes_gcm_inc_y2: +.quad 0x0000000000000000,0x0000000000000004 +.quad 0x0000000000000000,0x0000000000000005 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 32 +#else +.p2align 5 +#endif /* __APPLE__ */ +L_vaes_aes_gcm_inc_y3: +.quad 
0x0000000000000000,0x0000000000000006 +.quad 0x0000000000000000,0x0000000000000007 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_GCM_generate_m0_vaes_rev8: +.quad 0x08090a0b0c0d0e0f,0x0001020304050607 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_GCM_generate_m0_vaes_mod2_128: +.quad 0x0000000000000000,0xe100000000000000 +#ifndef __APPLE__ +.text +.globl GCM_generate_m0_vaes +.type GCM_generate_m0_vaes,@function +.align 16 +GCM_generate_m0_vaes: +#else +.section __TEXT,__text +.globl _GCM_generate_m0_vaes +.p2align 4 +_GCM_generate_m0_vaes: +#endif /* __APPLE__ */ + vmovdqu L_GCM_generate_m0_vaes_rev8(%rip), %xmm9 + vmovdqu L_GCM_generate_m0_vaes_mod2_128(%rip), %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqu (%rdi), %xmm0 + vmovdqu %xmm8, (%rsi) + vmovdqu %xmm0, %xmm8 + vpshufb %xmm9, %xmm0, %xmm0 + vpsllq $63, %xmm0, %xmm5 + vpsrlq $0x01, %xmm0, %xmm4 + vpslldq $8, %xmm5, %xmm1 + vpsrldq $8, %xmm5, %xmm5 + vpshufd $0xff, %xmm1, %xmm1 + vpor %xmm5, %xmm4, %xmm4 + vpsrad $31, %xmm1, %xmm1 + vpand %xmm10, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpsllq $63, %xmm1, %xmm5 + vpsrlq $0x01, %xmm1, %xmm4 + vpslldq $8, %xmm5, %xmm2 + vpsrldq $8, %xmm5, %xmm5 + vpshufd $0xff, %xmm2, %xmm2 + vpor %xmm5, %xmm4, %xmm4 + vpsrad $31, %xmm2, %xmm2 + vpand %xmm10, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpsllq $63, %xmm2, %xmm5 + vpsrlq $0x01, %xmm2, %xmm4 + vpslldq $8, %xmm5, %xmm3 + vpsrldq $8, %xmm5, %xmm5 + vpshufd $0xff, %xmm3, %xmm3 + vpor %xmm5, %xmm4, %xmm4 + vpsrad $31, %xmm3, %xmm3 + vpand %xmm10, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpshufb %xmm9, %xmm3, %xmm3 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm0, %xmm0 + vpxor %xmm2, %xmm3, %xmm8 + vmovdqu %xmm3, 16(%rsi) + vmovdqu %xmm2, 32(%rsi) + vmovdqu %xmm8, 
48(%rsi) + vmovdqu %xmm1, 64(%rsi) + vpxor %xmm1, %xmm3, %xmm4 + vpxor %xmm1, %xmm2, %xmm5 + vpxor %xmm1, %xmm8, %xmm6 + vmovdqu %xmm4, 80(%rsi) + vmovdqu %xmm5, 96(%rsi) + vmovdqu %xmm6, 112(%rsi) + vmovdqu %xmm0, 128(%rsi) + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm0, %xmm3, %xmm4 + vpxor %xmm0, %xmm2, %xmm6 + vmovdqu %xmm4, 144(%rsi) + vmovdqu %xmm6, 160(%rsi) + vpxor %xmm6, %xmm3, %xmm6 + vmovdqu %xmm6, 176(%rsi) + vmovdqu %xmm1, 192(%rsi) + vpxor %xmm1, %xmm3, %xmm4 + vpxor %xmm1, %xmm2, %xmm5 + vpxor %xmm1, %xmm8, %xmm6 + vmovdqu %xmm4, 208(%rsi) + vmovdqu %xmm5, 224(%rsi) + vmovdqu %xmm6, 240(%rsi) + vmovdqu (%rsi), %xmm0 + vmovdqu 16(%rsi), %xmm1 + vmovdqu 32(%rsi), %xmm2 + vmovdqu 48(%rsi), %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vpsllq $60, %xmm0, %xmm4 + vpsllq $60, %xmm1, %xmm5 + vpsllq $60, %xmm2, %xmm6 + vpsllq $60, %xmm3, %xmm7 + vpsrlq $4, %xmm0, %xmm0 + vpsrlq $4, %xmm1, %xmm1 + vpsrlq $4, %xmm2, %xmm2 + vpsrlq $4, %xmm3, %xmm3 + vpsrldq $8, %xmm4, %xmm4 + vpsrldq $8, %xmm5, %xmm5 + vpsrldq $8, %xmm6, %xmm6 + vpsrldq $8, %xmm7, %xmm7 + vpor %xmm4, %xmm0, %xmm0 + vpor %xmm5, %xmm1, %xmm1 + vpor %xmm6, %xmm2, %xmm2 + vpor %xmm7, %xmm3, %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vmovdqu %xmm0, 256(%rsi) + vmovdqu %xmm1, 272(%rsi) + vmovdqu %xmm2, 288(%rsi) + vmovdqu %xmm3, 304(%rsi) + vmovdqu 64(%rsi), %xmm0 + vmovdqu 80(%rsi), %xmm1 + vmovdqu 96(%rsi), %xmm2 + vmovdqu 112(%rsi), %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vpsllq $60, %xmm0, %xmm4 + vpsllq $60, %xmm1, %xmm5 + vpsllq $60, %xmm2, %xmm6 + vpsllq $60, %xmm3, %xmm7 + vpsrlq $4, %xmm0, %xmm0 + vpsrlq $4, %xmm1, %xmm1 + vpsrlq $4, %xmm2, %xmm2 + vpsrlq $4, %xmm3, %xmm3 + vpsrldq $8, %xmm4, %xmm4 + vpsrldq $8, %xmm5, %xmm5 + vpsrldq $8, %xmm6, 
%xmm6 + vpsrldq $8, %xmm7, %xmm7 + vpor %xmm4, %xmm0, %xmm0 + vpor %xmm5, %xmm1, %xmm1 + vpor %xmm6, %xmm2, %xmm2 + vpor %xmm7, %xmm3, %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vmovdqu %xmm0, 320(%rsi) + vmovdqu %xmm1, 336(%rsi) + vmovdqu %xmm2, 352(%rsi) + vmovdqu %xmm3, 368(%rsi) + vmovdqu 128(%rsi), %xmm0 + vmovdqu 144(%rsi), %xmm1 + vmovdqu 160(%rsi), %xmm2 + vmovdqu 176(%rsi), %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vpsllq $60, %xmm0, %xmm4 + vpsllq $60, %xmm1, %xmm5 + vpsllq $60, %xmm2, %xmm6 + vpsllq $60, %xmm3, %xmm7 + vpsrlq $4, %xmm0, %xmm0 + vpsrlq $4, %xmm1, %xmm1 + vpsrlq $4, %xmm2, %xmm2 + vpsrlq $4, %xmm3, %xmm3 + vpsrldq $8, %xmm4, %xmm4 + vpsrldq $8, %xmm5, %xmm5 + vpsrldq $8, %xmm6, %xmm6 + vpsrldq $8, %xmm7, %xmm7 + vpor %xmm4, %xmm0, %xmm0 + vpor %xmm5, %xmm1, %xmm1 + vpor %xmm6, %xmm2, %xmm2 + vpor %xmm7, %xmm3, %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vmovdqu %xmm0, 384(%rsi) + vmovdqu %xmm1, 400(%rsi) + vmovdqu %xmm2, 416(%rsi) + vmovdqu %xmm3, 432(%rsi) + vmovdqu 192(%rsi), %xmm0 + vmovdqu 208(%rsi), %xmm1 + vmovdqu 224(%rsi), %xmm2 + vmovdqu 240(%rsi), %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vpsllq $60, %xmm0, %xmm4 + vpsllq $60, %xmm1, %xmm5 + vpsllq $60, %xmm2, %xmm6 + vpsllq $60, %xmm3, %xmm7 + vpsrlq $4, %xmm0, %xmm0 + vpsrlq $4, %xmm1, %xmm1 + vpsrlq $4, %xmm2, %xmm2 + vpsrlq $4, %xmm3, %xmm3 + vpsrldq $8, %xmm4, %xmm4 + vpsrldq $8, %xmm5, %xmm5 + vpsrldq $8, %xmm6, %xmm6 + vpsrldq $8, %xmm7, %xmm7 + vpor %xmm4, %xmm0, %xmm0 + vpor %xmm5, %xmm1, %xmm1 + vpor %xmm6, %xmm2, %xmm2 + vpor %xmm7, %xmm3, %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 
+ vpshufb %xmm9, %xmm3, %xmm3 + vmovdqu %xmm0, 448(%rsi) + vmovdqu %xmm1, 464(%rsi) + vmovdqu %xmm2, 480(%rsi) + vmovdqu %xmm3, 496(%rsi) + repz retq +#ifndef __APPLE__ +.size GCM_generate_m0_vaes,.-GCM_generate_m0_vaes +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_vaes_aes_gcm_one: +.quad 0x0000000000000000,0x0000000000000001 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_vaes_aes_gcm_two: +.quad 0x0000000000000000,0x0000000000000002 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_vaes_aes_gcm_three: +.quad 0x0000000000000000,0x0000000000000003 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_vaes_aes_gcm_four: +.quad 0x0000000000000000,0x0000000000000004 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_vaes_aes_gcm_five: +.quad 0x0000000000000000,0x0000000000000005 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_vaes_aes_gcm_six: +.quad 0x0000000000000000,0x0000000000000006 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_vaes_aes_gcm_seven: +.quad 0x0000000000000000,0x0000000000000007 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_vaes_aes_gcm_eight: +.quad 
0x0000000000000000,0x0000000000000008 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_vaes_aes_gcm_bswap_epi64: +.quad 0x0001020304050607,0x08090a0b0c0d0e0f +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_vaes_aes_gcm_bswap_mask: +.quad 0x08090a0b0c0d0e0f,0x0001020304050607 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_vaes_aes_gcm_mod2_128: +.quad 0x0000000000000001,0xc200000000000000 +#ifndef __APPLE__ +.text +.globl AES_GCM_encrypt_vaes +.type AES_GCM_encrypt_vaes,@function +.align 16 +AES_GCM_encrypt_vaes: +#else +.section __TEXT,__text +.globl _AES_GCM_encrypt_vaes +.p2align 4 +_AES_GCM_encrypt_vaes: +#endif /* __APPLE__ */ + pushq %r13 + pushq %r12 + pushq %rbx + pushq %r14 + pushq %r15 + movq %rdx, %r12 + movq %rcx, %rax + movl 48(%rsp), %r11d + movl 56(%rsp), %ebx + movl 64(%rsp), %r14d + movq 72(%rsp), %r15 + movl 80(%rsp), %r10d + subq $0x230, %rsp + vpxor %xmm5, %xmm5, %xmm5 + vpxor %xmm15, %xmm15, %xmm15 + movl %ebx, %edx + cmpl $12, %edx + jne L_AES_GCM_encrypt_vaes_iv_not_12 + # # Calculate values when IV is 12 bytes + # Set counter based on IV + movl $0x1000000, %ecx + vmovq (%rax), %xmm5 + vpinsrd $2, 8(%rax), %xmm5, %xmm5 + vpinsrd $3, %ecx, %xmm5, %xmm5 + # H = Encrypt X(=0) and T = Encrypt counter + vmovdqa (%r15), %xmm6 + vpxor %xmm6, %xmm5, %xmm1 + vmovdqa 16(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 32(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 48(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 64(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 80(%r15), %xmm4 + vaesenc 
%xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 96(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 112(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 128(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 144(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm4 + jl L_AES_GCM_encrypt_vaes_calc_iv_12_last + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 176(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm4 + jl L_AES_GCM_encrypt_vaes_calc_iv_12_last + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 208(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 224(%r15), %xmm4 +L_AES_GCM_encrypt_vaes_calc_iv_12_last: + vaesenclast %xmm4, %xmm6, %xmm6 + vaesenclast %xmm4, %xmm1, %xmm1 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm6, %xmm6 + vmovdqu %xmm1, 528(%rsp) + jmp L_AES_GCM_encrypt_vaes_iv_done +L_AES_GCM_encrypt_vaes_iv_not_12: + # Calculate values when IV is not 12 bytes + # H = Encrypt X(=0) + vmovdqa (%r15), %xmm6 + vaesenc 16(%r15), %xmm6, %xmm6 + vaesenc 32(%r15), %xmm6, %xmm6 + vaesenc 48(%r15), %xmm6, %xmm6 + vaesenc 64(%r15), %xmm6, %xmm6 + vaesenc 80(%r15), %xmm6, %xmm6 + vaesenc 96(%r15), %xmm6, %xmm6 + vaesenc 112(%r15), %xmm6, %xmm6 + vaesenc 128(%r15), %xmm6, %xmm6 + vaesenc 144(%r15), %xmm6, %xmm6 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm8 + jl L_AES_GCM_encrypt_vaes_calc_iv_1_aesenc_avx_last + vaesenc %xmm8, %xmm6, %xmm6 + vaesenc 176(%r15), %xmm6, %xmm6 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm8 + jl L_AES_GCM_encrypt_vaes_calc_iv_1_aesenc_avx_last + vaesenc %xmm8, %xmm6, %xmm6 + vaesenc 208(%r15), %xmm6, %xmm6 + vmovdqa 224(%r15), %xmm8 +L_AES_GCM_encrypt_vaes_calc_iv_1_aesenc_avx_last: + vaesenclast %xmm8, %xmm6, %xmm6 + vpshufb 
L_vaes_aes_gcm_bswap_mask(%rip), %xmm6, %xmm6 + # Calc counter + # Initialization vector + cmpl $0x00, %edx + movq $0x00, %rcx + je L_AES_GCM_encrypt_vaes_calc_iv_done + cmpl $16, %edx + jl L_AES_GCM_encrypt_vaes_calc_iv_lt16 + andl $0xfffffff0, %edx +L_AES_GCM_encrypt_vaes_calc_iv_16_loop: + vmovdqu (%rax,%rcx,1), %xmm7 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + vpxor %xmm7, %xmm5, %xmm5 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm5, %xmm1 + vpshufd $0x4e, %xmm6, %xmm2 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm3 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm0 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm6, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm4 + vmovdqa %xmm3, %xmm5 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm5, %xmm5 + vpsrld $31, %xmm4, %xmm0 + vpsrld $31, %xmm5, %xmm1 + vpslld $0x01, %xmm4, %xmm4 + vpslld $0x01, %xmm5, %xmm5 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm5, %xmm5 + vpor %xmm0, %xmm4, %xmm4 + vpor %xmm1, %xmm5, %xmm5 + vpslld $31, %xmm4, %xmm0 + vpslld $30, %xmm4, %xmm1 + vpslld $25, %xmm4, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm3 + vpsrld $7, %xmm4, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_encrypt_vaes_calc_iv_16_loop + movl %ebx, %edx + cmpl %edx, %ecx + je L_AES_GCM_encrypt_vaes_calc_iv_done +L_AES_GCM_encrypt_vaes_calc_iv_lt16: + subq $16, %rsp + vpxor %xmm7, %xmm7, %xmm7 + xorl %ebx, %ebx + vmovdqu %xmm7, (%rsp) +L_AES_GCM_encrypt_vaes_calc_iv_loop: + movzbl (%rax,%rcx,1), %r13d + movb %r13b, (%rsp,%rbx,1) + incl 
%ecx + incl %ebx + cmpl %edx, %ecx + jl L_AES_GCM_encrypt_vaes_calc_iv_loop + vmovdqu (%rsp), %xmm7 + addq $16, %rsp + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + vpxor %xmm7, %xmm5, %xmm5 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm5, %xmm1 + vpshufd $0x4e, %xmm6, %xmm2 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm3 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm0 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm6, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm4 + vmovdqa %xmm3, %xmm5 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm5, %xmm5 + vpsrld $31, %xmm4, %xmm0 + vpsrld $31, %xmm5, %xmm1 + vpslld $0x01, %xmm4, %xmm4 + vpslld $0x01, %xmm5, %xmm5 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm5, %xmm5 + vpor %xmm0, %xmm4, %xmm4 + vpor %xmm1, %xmm5, %xmm5 + vpslld $31, %xmm4, %xmm0 + vpslld $30, %xmm4, %xmm1 + vpslld $25, %xmm4, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm3 + vpsrld $7, %xmm4, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 +L_AES_GCM_encrypt_vaes_calc_iv_done: + # T = Encrypt counter + vpxor %xmm0, %xmm0, %xmm0 + shll $3, %edx + vmovq %rdx, %xmm0 + vpxor %xmm0, %xmm5, %xmm5 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm5, %xmm1 + vpshufd $0x4e, %xmm6, %xmm2 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm3 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm0 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm6, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm4 + vmovdqa %xmm3, %xmm5 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, 
%xmm4 + vpxor %xmm1, %xmm5, %xmm5 + vpsrld $31, %xmm4, %xmm0 + vpsrld $31, %xmm5, %xmm1 + vpslld $0x01, %xmm4, %xmm4 + vpslld $0x01, %xmm5, %xmm5 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm5, %xmm5 + vpor %xmm0, %xmm4, %xmm4 + vpor %xmm1, %xmm5, %xmm5 + vpslld $31, %xmm4, %xmm0 + vpslld $30, %xmm4, %xmm1 + vpslld $25, %xmm4, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm3 + vpsrld $7, %xmm4, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5 + # Encrypt counter + vmovdqa (%r15), %xmm7 + vpxor %xmm5, %xmm7, %xmm7 + vaesenc 16(%r15), %xmm7, %xmm7 + vaesenc 32(%r15), %xmm7, %xmm7 + vaesenc 48(%r15), %xmm7, %xmm7 + vaesenc 64(%r15), %xmm7, %xmm7 + vaesenc 80(%r15), %xmm7, %xmm7 + vaesenc 96(%r15), %xmm7, %xmm7 + vaesenc 112(%r15), %xmm7, %xmm7 + vaesenc 128(%r15), %xmm7, %xmm7 + vaesenc 144(%r15), %xmm7, %xmm7 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm8 + jl L_AES_GCM_encrypt_vaes_calc_iv_2_aesenc_avx_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 176(%r15), %xmm7, %xmm7 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm8 + jl L_AES_GCM_encrypt_vaes_calc_iv_2_aesenc_avx_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 208(%r15), %xmm7, %xmm7 + vmovdqa 224(%r15), %xmm8 +L_AES_GCM_encrypt_vaes_calc_iv_2_aesenc_avx_last: + vaesenclast %xmm8, %xmm7, %xmm7 + vmovdqu %xmm7, 528(%rsp) +L_AES_GCM_encrypt_vaes_iv_done: + # Additional authentication data + movl %r11d, %edx + cmpl $0x00, %edx + je L_AES_GCM_encrypt_vaes_calc_aad_done + xorl %ecx, %ecx + cmpl $16, %edx + jl L_AES_GCM_encrypt_vaes_calc_aad_lt16 + andl $0xfffffff0, %edx +L_AES_GCM_encrypt_vaes_calc_aad_16_loop: + vmovdqu (%r12,%rcx,1), %xmm7 + vpshufb 
L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + vpxor %xmm7, %xmm15, %xmm15 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm15, %xmm1 + vpshufd $0x4e, %xmm6, %xmm2 + vpclmulqdq $0x11, %xmm15, %xmm6, %xmm3 + vpclmulqdq $0x00, %xmm15, %xmm6, %xmm0 + vpxor %xmm15, %xmm1, %xmm1 + vpxor %xmm6, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm4 + vmovdqa %xmm3, %xmm15 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm15, %xmm15 + vpsrld $31, %xmm4, %xmm0 + vpsrld $31, %xmm15, %xmm1 + vpslld $0x01, %xmm4, %xmm4 + vpslld $0x01, %xmm15, %xmm15 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm15, %xmm15 + vpor %xmm0, %xmm4, %xmm4 + vpor %xmm1, %xmm15, %xmm15 + vpslld $31, %xmm4, %xmm0 + vpslld $30, %xmm4, %xmm1 + vpslld $25, %xmm4, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm3 + vpsrld $7, %xmm4, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm2, %xmm15, %xmm15 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_encrypt_vaes_calc_aad_16_loop + movl %r11d, %edx + cmpl %edx, %ecx + je L_AES_GCM_encrypt_vaes_calc_aad_done +L_AES_GCM_encrypt_vaes_calc_aad_lt16: + subq $16, %rsp + vpxor %xmm7, %xmm7, %xmm7 + xorl %ebx, %ebx + vmovdqu %xmm7, (%rsp) +L_AES_GCM_encrypt_vaes_calc_aad_loop: + movzbl (%r12,%rcx,1), %r13d + movb %r13b, (%rsp,%rbx,1) + incl %ecx + incl %ebx + cmpl %edx, %ecx + jl L_AES_GCM_encrypt_vaes_calc_aad_loop + vmovdqu (%rsp), %xmm7 + addq $16, %rsp + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + vpxor %xmm7, %xmm15, %xmm15 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm15, %xmm1 + vpshufd $0x4e, %xmm6, %xmm2 + vpclmulqdq $0x11, 
%xmm15, %xmm6, %xmm3 + vpclmulqdq $0x00, %xmm15, %xmm6, %xmm0 + vpxor %xmm15, %xmm1, %xmm1 + vpxor %xmm6, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm4 + vmovdqa %xmm3, %xmm15 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm15, %xmm15 + vpsrld $31, %xmm4, %xmm0 + vpsrld $31, %xmm15, %xmm1 + vpslld $0x01, %xmm4, %xmm4 + vpslld $0x01, %xmm15, %xmm15 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm15, %xmm15 + vpor %xmm0, %xmm4, %xmm4 + vpor %xmm1, %xmm15, %xmm15 + vpslld $31, %xmm4, %xmm0 + vpslld $30, %xmm4, %xmm1 + vpslld $25, %xmm4, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm3 + vpsrld $7, %xmm4, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm2, %xmm15, %xmm15 +L_AES_GCM_encrypt_vaes_calc_aad_done: + # Calculate counter and H + vpsrlq $63, %xmm6, %xmm8 + vpsllq $0x01, %xmm6, %xmm7 + vpslldq $8, %xmm8, %xmm8 + vpor %xmm8, %xmm7, %xmm7 + vpshufd $0xff, %xmm6, %xmm6 + vpsrad $31, %xmm6, %xmm6 + vpshufb L_vaes_aes_gcm_bswap_epi64(%rip), %xmm5, %xmm5 + vpand L_vaes_aes_gcm_mod2_128(%rip), %xmm6, %xmm6 + vpaddd L_vaes_aes_gcm_one(%rip), %xmm5, %xmm5 + vpxor %xmm7, %xmm6, %xmm6 + vmovdqu %xmm5, 512(%rsp) + xorl %ebx, %ebx + cmpl $0x80, %r9d + jl L_AES_GCM_encrypt_vaes_done_128 + vmovdqa %xmm15, %xmm2 + # H ^ 1 + vmovdqu %xmm6, (%rsp) + # H ^ 2 + vpclmulqdq $0x00, %xmm6, %xmm6, %xmm7 + vpclmulqdq $0x11, %xmm6, %xmm6, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + 
vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm0 + vmovdqu %xmm0, 16(%rsp) + # H ^ 3 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm6, %xmm0, %xmm7 + vpclmulqdq $0x01, %xmm6, %xmm0, %xmm8 + vpclmulqdq $16, %xmm6, %xmm0, %xmm9 + vpclmulqdq $0x11, %xmm6, %xmm0, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm1 + vmovdqu %xmm1, 32(%rsp) + # H ^ 4 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm3 + vmovdqu %xmm3, 48(%rsp) + # H ^ 5 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 64(%rsp) + # H ^ 6 + vpclmulqdq $0x00, %xmm1, %xmm1, %xmm7 + vpclmulqdq $0x11, %xmm1, %xmm1, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, 
%xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 80(%rsp) + # H ^ 7 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm1, %xmm3, %xmm7 + vpclmulqdq $0x01, %xmm1, %xmm3, %xmm8 + vpclmulqdq $16, %xmm1, %xmm3, %xmm9 + vpclmulqdq $0x11, %xmm1, %xmm3, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 96(%rsp) + # H ^ 8 + vpclmulqdq $0x00, %xmm3, %xmm3, %xmm7 + vpclmulqdq $0x11, %xmm3, %xmm3, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 112(%rsp) + cmpl $0x100, %r9d + jl L_AES_GCM_encrypt_vaes_no_ext + # H ^ 9 + vmovdqu 48(%rsp), %xmm0 + vmovdqu 64(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, 
%xmm4 + vmovdqu %xmm4, 128(%rsp) + # H ^ 10 + vmovdqu 64(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 144(%rsp) + # H ^ 11 + vmovdqu 64(%rsp), %xmm0 + vmovdqu 80(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 160(%rsp) + # H ^ 12 + vmovdqu 80(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 176(%rsp) + # H ^ 13 + vmovdqu 80(%rsp), %xmm0 + vmovdqu 96(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + 
vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 192(%rsp) + # H ^ 14 + vmovdqu 96(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 208(%rsp) + # H ^ 15 + vmovdqu 96(%rsp), %xmm0 + vmovdqu 112(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 224(%rsp) + # H ^ 16 + vmovdqu 112(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 240(%rsp) + vmovdqu 224(%rsp), %ymm7 + vpermq $0x4e, 
%ymm7, %ymm7 + vmovdqu 192(%rsp), %ymm8 + vpermq $0x4e, %ymm8, %ymm8 + vmovdqu 160(%rsp), %ymm9 + vpermq $0x4e, %ymm9, %ymm9 + vmovdqu 128(%rsp), %ymm10 + vpermq $0x4e, %ymm10, %ymm10 + vmovdqu %ymm7, 256(%rsp) + vmovdqu %ymm8, 288(%rsp) + vmovdqu %ymm9, 320(%rsp) + vmovdqu %ymm10, 352(%rsp) + vmovdqu 96(%rsp), %ymm7 + vpermq $0x4e, %ymm7, %ymm7 + vmovdqu 64(%rsp), %ymm8 + vpermq $0x4e, %ymm8, %ymm8 + vmovdqu 32(%rsp), %ymm9 + vpermq $0x4e, %ymm9, %ymm9 + vmovdqu (%rsp), %ymm10 + vpermq $0x4e, %ymm10, %ymm10 + vmovdqu %ymm7, 384(%rsp) + vmovdqu %ymm8, 416(%rsp) + vmovdqu %ymm9, 448(%rsp) + vmovdqu %ymm10, 480(%rsp) +L_AES_GCM_encrypt_vaes_no_ext: + vbroadcasti128 L_vaes_aes_gcm_mod2_128(%rip), %ymm14 + cmpl $0x100, %r9d + jl L_AES_GCM_encrypt_vaes_after_256 + movl %r9d, %r13d + andl $0xffffff00, %r13d +L_AES_GCM_encrypt_vaes_loop_256: + # 256 bytes of input + leaq (%rsi,%rbx,1), %rcx + movq %rcx, 544(%rsp) + vbroadcasti128 L_vaes_aes_gcm_bswap_epi64(%rip), %ymm6 + vbroadcasti128 512(%rsp), %ymm4 + vpaddd L_vaes_aes_gcm_inc_y0(%rip), %ymm4, %ymm0 + vpshufb %ymm6, %ymm0, %ymm0 + vpaddd L_vaes_aes_gcm_inc_y1(%rip), %ymm4, %ymm1 + vpshufb %ymm6, %ymm1, %ymm1 + vpaddd L_vaes_aes_gcm_inc_y2(%rip), %ymm4, %ymm2 + vpshufb %ymm6, %ymm2, %ymm2 + vpaddd L_vaes_aes_gcm_inc_y3(%rip), %ymm4, %ymm3 + vpshufb %ymm6, %ymm3, %ymm3 + vmovdqu 512(%rsp), %xmm7 + vpaddd L_vaes_aes_gcm_eight(%rip), %xmm7, %xmm7 + vmovdqu %xmm7, 512(%rsp) + vbroadcasti128 (%r15), %ymm4 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm4, %ymm2, %ymm2 + vpxor %ymm4, %ymm3, %ymm3 + vbroadcasti128 16(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 32(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 48(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, 
%ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 64(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 80(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 96(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 112(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 128(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 144(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $11, %r10d + vbroadcasti128 160(%r15), %ymm4 + jl L_AES_GCM_encrypt_vaes_p1_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 176(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $13, %r10d + vbroadcasti128 192(%r15), %ymm4 + jl L_AES_GCM_encrypt_vaes_p1_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 208(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 224(%r15), %ymm4 +L_AES_GCM_encrypt_vaes_p1_vaes_ctr8_last: + vaesenclast %ymm4, %ymm0, %ymm0 + vaesenclast %ymm4, %ymm1, %ymm1 + vaesenclast %ymm4, %ymm2, %ymm2 + vaesenclast %ymm4, %ymm3, %ymm3 + leaq (%rdi,%rbx,1), %rcx + leaq (%rsi,%rbx,1), %rdx + vmovdqu (%rcx), %ymm5 
+ vpxor %ymm5, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vmovdqu 32(%rcx), %ymm5 + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vmovdqu 64(%rcx), %ymm5 + vpxor %ymm5, %ymm2, %ymm2 + vmovdqu %ymm2, 64(%rdx) + vmovdqu 96(%rcx), %ymm5 + vpxor %ymm5, %ymm3, %ymm3 + vmovdqu %ymm3, 96(%rdx) + addl $0x80, %ebx + vbroadcasti128 512(%rsp), %ymm4 + vpaddd L_vaes_aes_gcm_inc_y0(%rip), %ymm4, %ymm0 + vpshufb %ymm6, %ymm0, %ymm0 + vpaddd L_vaes_aes_gcm_inc_y1(%rip), %ymm4, %ymm1 + vpshufb %ymm6, %ymm1, %ymm1 + vpaddd L_vaes_aes_gcm_inc_y2(%rip), %ymm4, %ymm2 + vpshufb %ymm6, %ymm2, %ymm2 + vpaddd L_vaes_aes_gcm_inc_y3(%rip), %ymm4, %ymm3 + vpshufb %ymm6, %ymm3, %ymm3 + vmovdqu 512(%rsp), %xmm7 + vpaddd L_vaes_aes_gcm_eight(%rip), %xmm7, %xmm7 + vmovdqu %xmm7, 512(%rsp) + vbroadcasti128 (%r15), %ymm4 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm4, %ymm2, %ymm2 + vpxor %ymm4, %ymm3, %ymm3 + vbroadcasti128 16(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 32(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 48(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 64(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 80(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 96(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 112(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + 
vbroadcasti128 128(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 144(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $11, %r10d + vbroadcasti128 160(%r15), %ymm4 + jl L_AES_GCM_encrypt_vaes_p2_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 176(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $13, %r10d + vbroadcasti128 192(%r15), %ymm4 + jl L_AES_GCM_encrypt_vaes_p2_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 208(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 224(%r15), %ymm4 +L_AES_GCM_encrypt_vaes_p2_vaes_ctr8_last: + vaesenclast %ymm4, %ymm0, %ymm0 + vaesenclast %ymm4, %ymm1, %ymm1 + vaesenclast %ymm4, %ymm2, %ymm2 + vaesenclast %ymm4, %ymm3, %ymm3 + leaq (%rdi,%rbx,1), %rcx + leaq (%rsi,%rbx,1), %rdx + vmovdqu (%rcx), %ymm5 + vpxor %ymm5, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vmovdqu 32(%rcx), %ymm5 + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vmovdqu 64(%rcx), %ymm5 + vpxor %ymm5, %ymm2, %ymm2 + vmovdqu %ymm2, 64(%rdx) + vmovdqu 96(%rcx), %ymm5 + vpxor %ymm5, %ymm3, %ymm3 + vmovdqu %ymm3, 96(%rdx) + addl $0x80, %ebx + vbroadcasti128 L_vaes_aes_gcm_bswap_mask(%rip), %ymm6 + movq 544(%rsp), %rcx + vpxor %ymm4, %ymm4, %ymm4 + vinserti128 $0x00, %xmm15, %ymm4, %ymm4 + vmovdqu 256(%rsp), %ymm7 + vmovdqu 288(%rsp), %ymm8 + vmovdqu 320(%rsp), %ymm9 + vmovdqu 352(%rsp), %ymm10 + vmovdqu (%rcx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpxor %ymm4, %ymm5, %ymm5 + 
vpclmulqdq $0x00, %ymm7, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm7, %ymm5, %ymm1 + vpclmulqdq $16, %ymm7, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm7, %ymm5, %ymm3 + vmovdqa %ymm0, %ymm11 + vpxor %ymm1, %ymm2, %ymm12 + vmovdqa %ymm3, %ymm13 + vmovdqu 32(%rcx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm8, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm8, %ymm5, %ymm1 + vpclmulqdq $16, %ymm8, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm8, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 64(%rcx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm9, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm9, %ymm5, %ymm1 + vpclmulqdq $16, %ymm9, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm9, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 96(%rcx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm10, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm10, %ymm5, %ymm1 + vpclmulqdq $16, %ymm10, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm10, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 384(%rsp), %ymm7 + vmovdqu 416(%rsp), %ymm8 + vmovdqu 448(%rsp), %ymm9 + vmovdqu 480(%rsp), %ymm10 + vmovdqu 128(%rcx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm7, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm7, %ymm5, %ymm1 + vpclmulqdq $16, %ymm7, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm7, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 160(%rcx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm8, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm8, %ymm5, %ymm1 + vpclmulqdq $16, %ymm8, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm8, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor 
%ymm3, %ymm13, %ymm13 + vmovdqu 192(%rcx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm9, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm9, %ymm5, %ymm1 + vpclmulqdq $16, %ymm9, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm9, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 224(%rcx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm10, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm10, %ymm5, %ymm1 + vpclmulqdq $16, %ymm10, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm10, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vpclmulqdq $0x01, %ymm11, %ymm14, %ymm5 + vpshufd $0x4e, %ymm11, %ymm11 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm11, %ymm12, %ymm12 + vpclmulqdq $0x01, %ymm12, %ymm14, %ymm5 + vpshufd $0x4e, %ymm12, %ymm12 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm12, %ymm13, %ymm13 + vextracti128 $0x01, %ymm13, %xmm0 + vpxor %xmm0, %xmm13, %xmm15 + cmpl %r13d, %ebx + jl L_AES_GCM_encrypt_vaes_loop_256 +L_AES_GCM_encrypt_vaes_after_256: + movl %r9d, %r13d + andl $0xffffff80, %r13d + cmpl %r13d, %ebx + jge L_AES_GCM_encrypt_vaes_after_128 + # 128 bytes of input + leaq (%rsi,%rbx,1), %rcx + movq %rcx, 544(%rsp) + vbroadcasti128 L_vaes_aes_gcm_bswap_epi64(%rip), %ymm6 + vbroadcasti128 512(%rsp), %ymm4 + vpaddd L_vaes_aes_gcm_inc_y0(%rip), %ymm4, %ymm0 + vpshufb %ymm6, %ymm0, %ymm0 + vpaddd L_vaes_aes_gcm_inc_y1(%rip), %ymm4, %ymm1 + vpshufb %ymm6, %ymm1, %ymm1 + vpaddd L_vaes_aes_gcm_inc_y2(%rip), %ymm4, %ymm2 + vpshufb %ymm6, %ymm2, %ymm2 + vpaddd L_vaes_aes_gcm_inc_y3(%rip), %ymm4, %ymm3 + vpshufb %ymm6, %ymm3, %ymm3 + vmovdqu 512(%rsp), %xmm7 + vpaddd L_vaes_aes_gcm_eight(%rip), %xmm7, %xmm7 + vmovdqu %xmm7, 512(%rsp) + vbroadcasti128 (%r15), %ymm4 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm4, %ymm2, %ymm2 + vpxor %ymm4, %ymm3, %ymm3 + vbroadcasti128 16(%r15), %ymm4 + 
vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 32(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 48(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 64(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 80(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 96(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 112(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 128(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 144(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $11, %r10d + vbroadcasti128 160(%r15), %ymm4 + jl L_AES_GCM_encrypt_vaes_8_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 176(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $13, %r10d + vbroadcasti128 192(%r15), %ymm4 + jl L_AES_GCM_encrypt_vaes_8_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 208(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + 
vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 224(%r15), %ymm4 +L_AES_GCM_encrypt_vaes_8_vaes_ctr8_last: + vaesenclast %ymm4, %ymm0, %ymm0 + vaesenclast %ymm4, %ymm1, %ymm1 + vaesenclast %ymm4, %ymm2, %ymm2 + vaesenclast %ymm4, %ymm3, %ymm3 + leaq (%rdi,%rbx,1), %rcx + leaq (%rsi,%rbx,1), %rdx + vmovdqu (%rcx), %ymm5 + vpxor %ymm5, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vmovdqu 32(%rcx), %ymm5 + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vmovdqu 64(%rcx), %ymm5 + vpxor %ymm5, %ymm2, %ymm2 + vmovdqu %ymm2, 64(%rdx) + vmovdqu 96(%rcx), %ymm5 + vpxor %ymm5, %ymm3, %ymm3 + vmovdqu %ymm3, 96(%rdx) + addl $0x80, %ebx + vmovdqu 96(%rsp), %ymm7 + vpermq $0x4e, %ymm7, %ymm7 + vmovdqu 64(%rsp), %ymm8 + vpermq $0x4e, %ymm8, %ymm8 + vmovdqu 32(%rsp), %ymm9 + vpermq $0x4e, %ymm9, %ymm9 + vmovdqu (%rsp), %ymm10 + vpermq $0x4e, %ymm10, %ymm10 + vbroadcasti128 L_vaes_aes_gcm_bswap_mask(%rip), %ymm6 + movq 544(%rsp), %rcx + vpxor %ymm4, %ymm4, %ymm4 + vinserti128 $0x00, %xmm15, %ymm4, %ymm4 + vmovdqu (%rcx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpxor %ymm4, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm7, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm7, %ymm5, %ymm1 + vpclmulqdq $16, %ymm7, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm7, %ymm5, %ymm3 + vmovdqa %ymm0, %ymm11 + vpxor %ymm1, %ymm2, %ymm12 + vmovdqa %ymm3, %ymm13 + vmovdqu 32(%rcx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm8, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm8, %ymm5, %ymm1 + vpclmulqdq $16, %ymm8, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm8, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 64(%rcx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm9, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm9, %ymm5, %ymm1 + vpclmulqdq $16, %ymm9, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm9, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, 
%ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 96(%rcx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm10, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm10, %ymm5, %ymm1 + vpclmulqdq $16, %ymm10, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm10, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vpclmulqdq $0x01, %ymm11, %ymm14, %ymm5 + vpshufd $0x4e, %ymm11, %ymm11 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm11, %ymm12, %ymm12 + vpclmulqdq $0x01, %ymm12, %ymm14, %ymm5 + vpshufd $0x4e, %ymm12, %ymm12 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm12, %ymm13, %ymm13 + vextracti128 $0x01, %ymm13, %xmm0 + vpxor %xmm0, %xmm13, %xmm15 +L_AES_GCM_encrypt_vaes_after_128: + vmovdqu (%rsp), %xmm6 +L_AES_GCM_encrypt_vaes_done_128: + movl %r9d, %edx + cmpl %edx, %ebx + jge L_AES_GCM_encrypt_vaes_done_enc + movl %r9d, %r13d + andl $0xfffffff0, %r13d + cmpl %r13d, %ebx + jge L_AES_GCM_encrypt_vaes_last_block_done + vmovdqu 512(%rsp), %xmm8 + vpshufb L_vaes_aes_gcm_bswap_epi64(%rip), %xmm8, %xmm7 + vpaddd L_vaes_aes_gcm_one(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, 512(%rsp) + vpxor (%r15), %xmm7, %xmm7 + vaesenc 16(%r15), %xmm7, %xmm7 + vaesenc 32(%r15), %xmm7, %xmm7 + vaesenc 48(%r15), %xmm7, %xmm7 + vaesenc 64(%r15), %xmm7, %xmm7 + vaesenc 80(%r15), %xmm7, %xmm7 + vaesenc 96(%r15), %xmm7, %xmm7 + vaesenc 112(%r15), %xmm7, %xmm7 + vaesenc 128(%r15), %xmm7, %xmm7 + vaesenc 144(%r15), %xmm7, %xmm7 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm8 + jl L_AES_GCM_encrypt_vaes_aesenc_block_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 176(%r15), %xmm7, %xmm7 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm8 + jl L_AES_GCM_encrypt_vaes_aesenc_block_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 208(%r15), %xmm7, %xmm7 + vmovdqa 224(%r15), %xmm8 +L_AES_GCM_encrypt_vaes_aesenc_block_last: + vaesenclast %xmm8, %xmm7, %xmm7 + vmovdqu (%rdi,%rbx,1), %xmm8 + vpxor %xmm8, %xmm7, %xmm7 + vmovdqu %xmm7, (%rsi,%rbx,1) + vpshufb 
L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + vpxor %xmm7, %xmm15, %xmm15 + addl $16, %ebx + cmpl %r13d, %ebx + jge L_AES_GCM_encrypt_vaes_last_block_ghash +L_AES_GCM_encrypt_vaes_last_block_start: + vmovdqu (%rdi,%rbx,1), %xmm12 + vmovdqu 512(%rsp), %xmm8 + vpshufb L_vaes_aes_gcm_bswap_epi64(%rip), %xmm8, %xmm7 + vpaddd L_vaes_aes_gcm_one(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, 512(%rsp) + vpxor (%r15), %xmm7, %xmm7 + vpclmulqdq $16, %xmm6, %xmm15, %xmm9 + vaesenc 16(%r15), %xmm7, %xmm7 + vaesenc 32(%r15), %xmm7, %xmm7 + vpclmulqdq $0x01, %xmm6, %xmm15, %xmm10 + vaesenc 48(%r15), %xmm7, %xmm7 + vaesenc 64(%r15), %xmm7, %xmm7 + vpclmulqdq $0x00, %xmm6, %xmm15, %xmm11 + vaesenc 80(%r15), %xmm7, %xmm7 + vpclmulqdq $0x11, %xmm6, %xmm15, %xmm1 + vaesenc 96(%r15), %xmm7, %xmm7 + vpxor %xmm10, %xmm9, %xmm9 + vpslldq $8, %xmm9, %xmm2 + vpsrldq $8, %xmm9, %xmm9 + vaesenc 112(%r15), %xmm7, %xmm7 + vpxor %xmm11, %xmm2, %xmm2 + vpxor %xmm9, %xmm1, %xmm3 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm0 + vpclmulqdq $16, %xmm0, %xmm2, %xmm10 + vaesenc 128(%r15), %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm9 + vpxor %xmm10, %xmm9, %xmm9 + vpclmulqdq $16, %xmm0, %xmm9, %xmm10 + vaesenc 144(%r15), %xmm7, %xmm7 + vpshufd $0x4e, %xmm9, %xmm9 + vpxor %xmm10, %xmm9, %xmm9 + vpxor %xmm3, %xmm9, %xmm15 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm8 + jl L_AES_GCM_encrypt_vaes_aesenc_gfmul_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 176(%r15), %xmm7, %xmm7 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm8 + jl L_AES_GCM_encrypt_vaes_aesenc_gfmul_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 208(%r15), %xmm7, %xmm7 + vmovdqa 224(%r15), %xmm8 +L_AES_GCM_encrypt_vaes_aesenc_gfmul_last: + vaesenclast %xmm8, %xmm7, %xmm7 + vmovdqa %xmm12, %xmm0 + vpxor %xmm0, %xmm7, %xmm7 + vmovdqu %xmm7, (%rsi,%rbx,1) + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + addl $16, %ebx + vpxor %xmm7, %xmm15, %xmm15 + cmpl %r13d, %ebx + jl L_AES_GCM_encrypt_vaes_last_block_start 
+L_AES_GCM_encrypt_vaes_last_block_ghash: + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm6, %xmm15, %xmm7 + vpclmulqdq $0x01, %xmm6, %xmm15, %xmm8 + vpclmulqdq $16, %xmm6, %xmm15, %xmm9 + vpclmulqdq $0x11, %xmm6, %xmm15, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm15 +L_AES_GCM_encrypt_vaes_last_block_done: + movl %r9d, %ecx + movl %ecx, %edx + andl $15, %ecx + jz L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_done + vmovdqu 512(%rsp), %xmm5 + vpshufb L_vaes_aes_gcm_bswap_epi64(%rip), %xmm5, %xmm5 + vpxor (%r15), %xmm5, %xmm5 + vaesenc 16(%r15), %xmm5, %xmm5 + vaesenc 32(%r15), %xmm5, %xmm5 + vaesenc 48(%r15), %xmm5, %xmm5 + vaesenc 64(%r15), %xmm5, %xmm5 + vaesenc 80(%r15), %xmm5, %xmm5 + vaesenc 96(%r15), %xmm5, %xmm5 + vaesenc 112(%r15), %xmm5, %xmm5 + vaesenc 128(%r15), %xmm5, %xmm5 + vaesenc 144(%r15), %xmm5, %xmm5 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm8 + jl L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_aesenc_avx_last + vaesenc %xmm8, %xmm5, %xmm5 + vaesenc 176(%r15), %xmm5, %xmm5 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm8 + jl L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_aesenc_avx_last + vaesenc %xmm8, %xmm5, %xmm5 + vaesenc 208(%r15), %xmm5, %xmm5 + vmovdqa 224(%r15), %xmm8 +L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_aesenc_avx_last: + vaesenclast %xmm8, %xmm5, %xmm5 + subq $16, %rsp + xorl %ecx, %ecx + vmovdqu %xmm5, (%rsp) +L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_loop: + movzbl (%rdi,%rbx,1), %r13d + xorb (%rsp,%rcx,1), %r13b + movb %r13b, (%rsi,%rbx,1) + movb %r13b, (%rsp,%rcx,1) + incl %ebx + incl %ecx + cmpl %edx, %ebx + jl L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_loop + xorq %r13, %r13 + cmpl $16, %ecx + je 
L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_finish_enc +L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_byte_loop: + movb %r13b, (%rsp,%rcx,1) + incl %ecx + cmpl $16, %ecx + jl L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_byte_loop +L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_finish_enc: + vmovdqu (%rsp), %xmm5 + addq $16, %rsp + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5 + vpxor %xmm5, %xmm15, %xmm15 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm6, %xmm15, %xmm7 + vpclmulqdq $0x01, %xmm6, %xmm15, %xmm8 + vpclmulqdq $16, %xmm6, %xmm15, %xmm9 + vpclmulqdq $0x11, %xmm6, %xmm15, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm15 +L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_done: +L_AES_GCM_encrypt_vaes_done_enc: + movl %r9d, %edx + movl %r11d, %ecx + shlq $3, %rdx + shlq $3, %rcx + vmovq %rdx, %xmm0 + vmovq %rcx, %xmm1 + vpunpcklqdq %xmm1, %xmm0, %xmm0 + vpxor %xmm0, %xmm15, %xmm15 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm6, %xmm15, %xmm7 + vpclmulqdq $0x01, %xmm6, %xmm15, %xmm8 + vpclmulqdq $16, %xmm6, %xmm15, %xmm9 + vpclmulqdq $0x11, %xmm6, %xmm15, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm15 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm15, %xmm15 + vmovdqu 528(%rsp), %xmm0 + vpxor %xmm15, %xmm0, %xmm0 + cmpl $16, %r14d + je L_AES_GCM_encrypt_vaes_store_tag_16 + xorq %rcx, %rcx + vmovdqu %xmm0, (%rsp) 
+L_AES_GCM_encrypt_vaes_store_tag_loop: + movzbl (%rsp,%rcx,1), %r13d + movb %r13b, (%r8,%rcx,1) + incl %ecx + cmpl %r14d, %ecx + jne L_AES_GCM_encrypt_vaes_store_tag_loop + jmp L_AES_GCM_encrypt_vaes_store_tag_done +L_AES_GCM_encrypt_vaes_store_tag_16: + vmovdqu %xmm0, (%r8) +L_AES_GCM_encrypt_vaes_store_tag_done: + vzeroupper + addq $0x230, %rsp + popq %r15 + popq %r14 + popq %rbx + popq %r12 + popq %r13 + repz retq +#ifndef __APPLE__ +.size AES_GCM_encrypt_vaes,.-AES_GCM_encrypt_vaes +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_GCM_decrypt_vaes +.type AES_GCM_decrypt_vaes,@function +.align 16 +AES_GCM_decrypt_vaes: +#else +.section __TEXT,__text +.globl _AES_GCM_decrypt_vaes +.p2align 4 +_AES_GCM_decrypt_vaes: +#endif /* __APPLE__ */ + pushq %r13 + pushq %r12 + pushq %rbx + pushq %r14 + pushq %r15 + pushq %rbp + movq %rdx, %r12 + movq %rcx, %rax + movl 56(%rsp), %r11d + movl 64(%rsp), %ebx + movl 72(%rsp), %r14d + movq 80(%rsp), %r15 + movl 88(%rsp), %r10d + movq 96(%rsp), %rbp + subq $0x220, %rsp + vpxor %xmm5, %xmm5, %xmm5 + vpxor %xmm15, %xmm15, %xmm15 + cmpl $12, %ebx + movl %ebx, %edx + jne L_AES_GCM_decrypt_vaes_iv_not_12 + # # Calculate values when IV is 12 bytes + # Set counter based on IV + movl $0x1000000, %ecx + vmovq (%rax), %xmm5 + vpinsrd $2, 8(%rax), %xmm5, %xmm5 + vpinsrd $3, %ecx, %xmm5, %xmm5 + # H = Encrypt X(=0) and T = Encrypt counter + vmovdqa (%r15), %xmm6 + vpxor %xmm6, %xmm5, %xmm1 + vmovdqa 16(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 32(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 48(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 64(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 80(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 96(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 112(%r15), %xmm4 + 
vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 128(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 144(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm4 + jl L_AES_GCM_decrypt_vaes_calc_iv_12_last + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 176(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm4 + jl L_AES_GCM_decrypt_vaes_calc_iv_12_last + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 208(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 224(%r15), %xmm4 +L_AES_GCM_decrypt_vaes_calc_iv_12_last: + vaesenclast %xmm4, %xmm6, %xmm6 + vaesenclast %xmm4, %xmm1, %xmm1 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm6, %xmm6 + vmovdqu %xmm1, 528(%rsp) + jmp L_AES_GCM_decrypt_vaes_iv_done +L_AES_GCM_decrypt_vaes_iv_not_12: + # Calculate values when IV is not 12 bytes + # H = Encrypt X(=0) + vmovdqa (%r15), %xmm6 + vaesenc 16(%r15), %xmm6, %xmm6 + vaesenc 32(%r15), %xmm6, %xmm6 + vaesenc 48(%r15), %xmm6, %xmm6 + vaesenc 64(%r15), %xmm6, %xmm6 + vaesenc 80(%r15), %xmm6, %xmm6 + vaesenc 96(%r15), %xmm6, %xmm6 + vaesenc 112(%r15), %xmm6, %xmm6 + vaesenc 128(%r15), %xmm6, %xmm6 + vaesenc 144(%r15), %xmm6, %xmm6 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm8 + jl L_AES_GCM_decrypt_vaes_calc_iv_1_aesenc_avx_last + vaesenc %xmm8, %xmm6, %xmm6 + vaesenc 176(%r15), %xmm6, %xmm6 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm8 + jl L_AES_GCM_decrypt_vaes_calc_iv_1_aesenc_avx_last + vaesenc %xmm8, %xmm6, %xmm6 + vaesenc 208(%r15), %xmm6, %xmm6 + vmovdqa 224(%r15), %xmm8 +L_AES_GCM_decrypt_vaes_calc_iv_1_aesenc_avx_last: + vaesenclast %xmm8, %xmm6, %xmm6 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm6, %xmm6 + # Calc counter + # Initialization vector + cmpl $0x00, %edx + movq $0x00, %rcx + je 
L_AES_GCM_decrypt_vaes_calc_iv_done + cmpl $16, %edx + jl L_AES_GCM_decrypt_vaes_calc_iv_lt16 + andl $0xfffffff0, %edx +L_AES_GCM_decrypt_vaes_calc_iv_16_loop: + vmovdqu (%rax,%rcx,1), %xmm7 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + vpxor %xmm7, %xmm5, %xmm5 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm5, %xmm1 + vpshufd $0x4e, %xmm6, %xmm2 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm3 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm0 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm6, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm4 + vmovdqa %xmm3, %xmm5 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm5, %xmm5 + vpsrld $31, %xmm4, %xmm0 + vpsrld $31, %xmm5, %xmm1 + vpslld $0x01, %xmm4, %xmm4 + vpslld $0x01, %xmm5, %xmm5 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm5, %xmm5 + vpor %xmm0, %xmm4, %xmm4 + vpor %xmm1, %xmm5, %xmm5 + vpslld $31, %xmm4, %xmm0 + vpslld $30, %xmm4, %xmm1 + vpslld $25, %xmm4, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm3 + vpsrld $7, %xmm4, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_decrypt_vaes_calc_iv_16_loop + movl %ebx, %edx + cmpl %edx, %ecx + je L_AES_GCM_decrypt_vaes_calc_iv_done +L_AES_GCM_decrypt_vaes_calc_iv_lt16: + subq $16, %rsp + vpxor %xmm7, %xmm7, %xmm7 + xorl %ebx, %ebx + vmovdqu %xmm7, (%rsp) +L_AES_GCM_decrypt_vaes_calc_iv_loop: + movzbl (%rax,%rcx,1), %r13d + movb %r13b, (%rsp,%rbx,1) + incl %ecx + incl %ebx + cmpl %edx, %ecx + jl L_AES_GCM_decrypt_vaes_calc_iv_loop + vmovdqu (%rsp), %xmm7 + addq $16, %rsp + vpshufb 
L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + vpxor %xmm7, %xmm5, %xmm5 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm5, %xmm1 + vpshufd $0x4e, %xmm6, %xmm2 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm3 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm0 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm6, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm4 + vmovdqa %xmm3, %xmm5 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm5, %xmm5 + vpsrld $31, %xmm4, %xmm0 + vpsrld $31, %xmm5, %xmm1 + vpslld $0x01, %xmm4, %xmm4 + vpslld $0x01, %xmm5, %xmm5 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm5, %xmm5 + vpor %xmm0, %xmm4, %xmm4 + vpor %xmm1, %xmm5, %xmm5 + vpslld $31, %xmm4, %xmm0 + vpslld $30, %xmm4, %xmm1 + vpslld $25, %xmm4, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm3 + vpsrld $7, %xmm4, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 +L_AES_GCM_decrypt_vaes_calc_iv_done: + # T = Encrypt counter + vpxor %xmm0, %xmm0, %xmm0 + shll $3, %edx + vmovq %rdx, %xmm0 + vpxor %xmm0, %xmm5, %xmm5 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm5, %xmm1 + vpshufd $0x4e, %xmm6, %xmm2 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm3 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm0 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm6, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm4 + vmovdqa %xmm3, %xmm5 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm5, %xmm5 + vpsrld $31, %xmm4, %xmm0 + vpsrld $31, %xmm5, %xmm1 + vpslld $0x01, %xmm4, %xmm4 + vpslld 
$0x01, %xmm5, %xmm5 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm5, %xmm5 + vpor %xmm0, %xmm4, %xmm4 + vpor %xmm1, %xmm5, %xmm5 + vpslld $31, %xmm4, %xmm0 + vpslld $30, %xmm4, %xmm1 + vpslld $25, %xmm4, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm3 + vpsrld $7, %xmm4, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5 + # Encrypt counter + vmovdqa (%r15), %xmm7 + vpxor %xmm5, %xmm7, %xmm7 + vaesenc 16(%r15), %xmm7, %xmm7 + vaesenc 32(%r15), %xmm7, %xmm7 + vaesenc 48(%r15), %xmm7, %xmm7 + vaesenc 64(%r15), %xmm7, %xmm7 + vaesenc 80(%r15), %xmm7, %xmm7 + vaesenc 96(%r15), %xmm7, %xmm7 + vaesenc 112(%r15), %xmm7, %xmm7 + vaesenc 128(%r15), %xmm7, %xmm7 + vaesenc 144(%r15), %xmm7, %xmm7 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm8 + jl L_AES_GCM_decrypt_vaes_calc_iv_2_aesenc_avx_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 176(%r15), %xmm7, %xmm7 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm8 + jl L_AES_GCM_decrypt_vaes_calc_iv_2_aesenc_avx_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 208(%r15), %xmm7, %xmm7 + vmovdqa 224(%r15), %xmm8 +L_AES_GCM_decrypt_vaes_calc_iv_2_aesenc_avx_last: + vaesenclast %xmm8, %xmm7, %xmm7 + vmovdqu %xmm7, 528(%rsp) +L_AES_GCM_decrypt_vaes_iv_done: + # Additional authentication data + movl %r11d, %edx + cmpl $0x00, %edx + je L_AES_GCM_decrypt_vaes_calc_aad_done + xorl %ecx, %ecx + cmpl $16, %edx + jl L_AES_GCM_decrypt_vaes_calc_aad_lt16 + andl $0xfffffff0, %edx +L_AES_GCM_decrypt_vaes_calc_aad_16_loop: + vmovdqu (%r12,%rcx,1), %xmm7 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + vpxor %xmm7, %xmm15, %xmm15 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm15, %xmm1 
+ vpshufd $0x4e, %xmm6, %xmm2 + vpclmulqdq $0x11, %xmm15, %xmm6, %xmm3 + vpclmulqdq $0x00, %xmm15, %xmm6, %xmm0 + vpxor %xmm15, %xmm1, %xmm1 + vpxor %xmm6, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm4 + vmovdqa %xmm3, %xmm15 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm15, %xmm15 + vpsrld $31, %xmm4, %xmm0 + vpsrld $31, %xmm15, %xmm1 + vpslld $0x01, %xmm4, %xmm4 + vpslld $0x01, %xmm15, %xmm15 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm15, %xmm15 + vpor %xmm0, %xmm4, %xmm4 + vpor %xmm1, %xmm15, %xmm15 + vpslld $31, %xmm4, %xmm0 + vpslld $30, %xmm4, %xmm1 + vpslld $25, %xmm4, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm3 + vpsrld $7, %xmm4, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm2, %xmm15, %xmm15 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_decrypt_vaes_calc_aad_16_loop + movl %r11d, %edx + cmpl %edx, %ecx + je L_AES_GCM_decrypt_vaes_calc_aad_done +L_AES_GCM_decrypt_vaes_calc_aad_lt16: + subq $16, %rsp + vpxor %xmm7, %xmm7, %xmm7 + xorl %ebx, %ebx + vmovdqu %xmm7, (%rsp) +L_AES_GCM_decrypt_vaes_calc_aad_loop: + movzbl (%r12,%rcx,1), %r13d + movb %r13b, (%rsp,%rbx,1) + incl %ecx + incl %ebx + cmpl %edx, %ecx + jl L_AES_GCM_decrypt_vaes_calc_aad_loop + vmovdqu (%rsp), %xmm7 + addq $16, %rsp + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + vpxor %xmm7, %xmm15, %xmm15 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm15, %xmm1 + vpshufd $0x4e, %xmm6, %xmm2 + vpclmulqdq $0x11, %xmm15, %xmm6, %xmm3 + vpclmulqdq $0x00, %xmm15, %xmm6, %xmm0 + vpxor %xmm15, %xmm1, %xmm1 + vpxor %xmm6, %xmm2, %xmm2 + vpclmulqdq 
$0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm4 + vmovdqa %xmm3, %xmm15 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm15, %xmm15 + vpsrld $31, %xmm4, %xmm0 + vpsrld $31, %xmm15, %xmm1 + vpslld $0x01, %xmm4, %xmm4 + vpslld $0x01, %xmm15, %xmm15 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm15, %xmm15 + vpor %xmm0, %xmm4, %xmm4 + vpor %xmm1, %xmm15, %xmm15 + vpslld $31, %xmm4, %xmm0 + vpslld $30, %xmm4, %xmm1 + vpslld $25, %xmm4, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm3 + vpsrld $7, %xmm4, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm2, %xmm15, %xmm15 +L_AES_GCM_decrypt_vaes_calc_aad_done: + # Calculate counter and H + vpsrlq $63, %xmm6, %xmm8 + vpsllq $0x01, %xmm6, %xmm7 + vpslldq $8, %xmm8, %xmm8 + vpor %xmm8, %xmm7, %xmm7 + vpshufd $0xff, %xmm6, %xmm6 + vpsrad $31, %xmm6, %xmm6 + vpshufb L_vaes_aes_gcm_bswap_epi64(%rip), %xmm5, %xmm5 + vpand L_vaes_aes_gcm_mod2_128(%rip), %xmm6, %xmm6 + vpaddd L_vaes_aes_gcm_one(%rip), %xmm5, %xmm5 + vpxor %xmm7, %xmm6, %xmm6 + vmovdqu %xmm5, 512(%rsp) + xorl %ebx, %ebx + cmpl $0x80, %r9d + jl L_AES_GCM_decrypt_vaes_done_128 + vmovdqa %xmm15, %xmm2 + # H ^ 1 + vmovdqu %xmm6, (%rsp) + # H ^ 2 + vpclmulqdq $0x00, %xmm6, %xmm6, %xmm7 + vpclmulqdq $0x11, %xmm6, %xmm6, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + 
vmovdqa %xmm10, %xmm0 + vmovdqu %xmm0, 16(%rsp) + # H ^ 3 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm6, %xmm0, %xmm7 + vpclmulqdq $0x01, %xmm6, %xmm0, %xmm8 + vpclmulqdq $16, %xmm6, %xmm0, %xmm9 + vpclmulqdq $0x11, %xmm6, %xmm0, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm1 + vmovdqu %xmm1, 32(%rsp) + # H ^ 4 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm3 + vmovdqu %xmm3, 48(%rsp) + # H ^ 5 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 64(%rsp) + # H ^ 6 + vpclmulqdq $0x00, %xmm1, %xmm1, %xmm7 + vpclmulqdq $0x11, %xmm1, %xmm1, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, 
%xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 80(%rsp) + # H ^ 7 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm1, %xmm3, %xmm7 + vpclmulqdq $0x01, %xmm1, %xmm3, %xmm8 + vpclmulqdq $16, %xmm1, %xmm3, %xmm9 + vpclmulqdq $0x11, %xmm1, %xmm3, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 96(%rsp) + # H ^ 8 + vpclmulqdq $0x00, %xmm3, %xmm3, %xmm7 + vpclmulqdq $0x11, %xmm3, %xmm3, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 112(%rsp) + cmpl $0x100, %r9d + jl L_AES_GCM_decrypt_vaes_no_ext + # H ^ 9 + vmovdqu 48(%rsp), %xmm0 + vmovdqu 64(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 128(%rsp) + # H ^ 10 + vmovdqu 64(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, 
%xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 144(%rsp) + # H ^ 11 + vmovdqu 64(%rsp), %xmm0 + vmovdqu 80(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 160(%rsp) + # H ^ 12 + vmovdqu 80(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 176(%rsp) + # H ^ 13 + vmovdqu 80(%rsp), %xmm0 + vmovdqu 96(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + 
vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 192(%rsp) + # H ^ 14 + vmovdqu 96(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 208(%rsp) + # H ^ 15 + vmovdqu 96(%rsp), %xmm0 + vmovdqu 112(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 224(%rsp) + # H ^ 16 + vmovdqu 112(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 240(%rsp) + vmovdqu 224(%rsp), %ymm7 + vpermq $0x4e, %ymm7, %ymm7 + vmovdqu 192(%rsp), %ymm8 + vpermq $0x4e, %ymm8, %ymm8 + vmovdqu 160(%rsp), %ymm9 + vpermq $0x4e, %ymm9, %ymm9 + 
vmovdqu 128(%rsp), %ymm10 + vpermq $0x4e, %ymm10, %ymm10 + vmovdqu %ymm7, 256(%rsp) + vmovdqu %ymm8, 288(%rsp) + vmovdqu %ymm9, 320(%rsp) + vmovdqu %ymm10, 352(%rsp) + vmovdqu 96(%rsp), %ymm7 + vpermq $0x4e, %ymm7, %ymm7 + vmovdqu 64(%rsp), %ymm8 + vpermq $0x4e, %ymm8, %ymm8 + vmovdqu 32(%rsp), %ymm9 + vpermq $0x4e, %ymm9, %ymm9 + vmovdqu (%rsp), %ymm10 + vpermq $0x4e, %ymm10, %ymm10 + vmovdqu %ymm7, 384(%rsp) + vmovdqu %ymm8, 416(%rsp) + vmovdqu %ymm9, 448(%rsp) + vmovdqu %ymm10, 480(%rsp) +L_AES_GCM_decrypt_vaes_no_ext: + vbroadcasti128 L_vaes_aes_gcm_mod2_128(%rip), %ymm14 + cmpl $0x100, %r9d + jl L_AES_GCM_decrypt_vaes_after_256 + movl %r9d, %r13d + andl $0xffffff00, %r13d +L_AES_GCM_decrypt_vaes_loop_256: + # 256 bytes of input + leaq (%rdi,%rbx,1), %rax + vbroadcasti128 L_vaes_aes_gcm_bswap_mask(%rip), %ymm6 + vpxor %ymm4, %ymm4, %ymm4 + vinserti128 $0x00, %xmm15, %ymm4, %ymm4 + vmovdqu 256(%rsp), %ymm7 + vmovdqu 288(%rsp), %ymm8 + vmovdqu 320(%rsp), %ymm9 + vmovdqu 352(%rsp), %ymm10 + vmovdqu (%rax), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpxor %ymm4, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm7, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm7, %ymm5, %ymm1 + vpclmulqdq $16, %ymm7, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm7, %ymm5, %ymm3 + vmovdqa %ymm0, %ymm11 + vpxor %ymm1, %ymm2, %ymm12 + vmovdqa %ymm3, %ymm13 + vmovdqu 32(%rax), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm8, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm8, %ymm5, %ymm1 + vpclmulqdq $16, %ymm8, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm8, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 64(%rax), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm9, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm9, %ymm5, %ymm1 + vpclmulqdq $16, %ymm9, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm9, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 
96(%rax), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm10, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm10, %ymm5, %ymm1 + vpclmulqdq $16, %ymm10, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm10, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 384(%rsp), %ymm7 + vmovdqu 416(%rsp), %ymm8 + vmovdqu 448(%rsp), %ymm9 + vmovdqu 480(%rsp), %ymm10 + vmovdqu 128(%rax), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm7, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm7, %ymm5, %ymm1 + vpclmulqdq $16, %ymm7, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm7, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 160(%rax), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm8, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm8, %ymm5, %ymm1 + vpclmulqdq $16, %ymm8, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm8, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 192(%rax), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm9, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm9, %ymm5, %ymm1 + vpclmulqdq $16, %ymm9, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm9, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 224(%rax), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm10, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm10, %ymm5, %ymm1 + vpclmulqdq $16, %ymm10, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm10, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vpclmulqdq $0x01, %ymm11, %ymm14, %ymm5 + vpshufd $0x4e, %ymm11, %ymm11 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm11, %ymm12, %ymm12 + vpclmulqdq $0x01, %ymm12, %ymm14, %ymm5 + vpshufd $0x4e, %ymm12, %ymm12 + vpxor 
%ymm5, %ymm13, %ymm13 + vpxor %ymm12, %ymm13, %ymm13 + vextracti128 $0x01, %ymm13, %xmm0 + vpxor %xmm0, %xmm13, %xmm15 + vbroadcasti128 L_vaes_aes_gcm_bswap_epi64(%rip), %ymm6 + vbroadcasti128 512(%rsp), %ymm4 + vpaddd L_vaes_aes_gcm_inc_y0(%rip), %ymm4, %ymm0 + vpshufb %ymm6, %ymm0, %ymm0 + vpaddd L_vaes_aes_gcm_inc_y1(%rip), %ymm4, %ymm1 + vpshufb %ymm6, %ymm1, %ymm1 + vpaddd L_vaes_aes_gcm_inc_y2(%rip), %ymm4, %ymm2 + vpshufb %ymm6, %ymm2, %ymm2 + vpaddd L_vaes_aes_gcm_inc_y3(%rip), %ymm4, %ymm3 + vpshufb %ymm6, %ymm3, %ymm3 + vmovdqu 512(%rsp), %xmm7 + vpaddd L_vaes_aes_gcm_eight(%rip), %xmm7, %xmm7 + vmovdqu %xmm7, 512(%rsp) + vbroadcasti128 (%r15), %ymm4 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm4, %ymm2, %ymm2 + vpxor %ymm4, %ymm3, %ymm3 + vbroadcasti128 16(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 32(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 48(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 64(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 80(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 96(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 112(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 128(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, 
%ymm3, %ymm3 + vbroadcasti128 144(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $11, %r10d + vbroadcasti128 160(%r15), %ymm4 + jl L_AES_GCM_decrypt_vaes_l1_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 176(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $13, %r10d + vbroadcasti128 192(%r15), %ymm4 + jl L_AES_GCM_decrypt_vaes_l1_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 208(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 224(%r15), %ymm4 +L_AES_GCM_decrypt_vaes_l1_vaes_ctr8_last: + vaesenclast %ymm4, %ymm0, %ymm0 + vaesenclast %ymm4, %ymm1, %ymm1 + vaesenclast %ymm4, %ymm2, %ymm2 + vaesenclast %ymm4, %ymm3, %ymm3 + leaq (%rdi,%rbx,1), %rcx + leaq (%rsi,%rbx,1), %rdx + vmovdqu (%rcx), %ymm5 + vpxor %ymm5, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vmovdqu 32(%rcx), %ymm5 + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vmovdqu 64(%rcx), %ymm5 + vpxor %ymm5, %ymm2, %ymm2 + vmovdqu %ymm2, 64(%rdx) + vmovdqu 96(%rcx), %ymm5 + vpxor %ymm5, %ymm3, %ymm3 + vmovdqu %ymm3, 96(%rdx) + addl $0x80, %ebx + vbroadcasti128 512(%rsp), %ymm4 + vpaddd L_vaes_aes_gcm_inc_y0(%rip), %ymm4, %ymm0 + vpshufb %ymm6, %ymm0, %ymm0 + vpaddd L_vaes_aes_gcm_inc_y1(%rip), %ymm4, %ymm1 + vpshufb %ymm6, %ymm1, %ymm1 + vpaddd L_vaes_aes_gcm_inc_y2(%rip), %ymm4, %ymm2 + vpshufb %ymm6, %ymm2, %ymm2 + vpaddd L_vaes_aes_gcm_inc_y3(%rip), %ymm4, %ymm3 + vpshufb %ymm6, %ymm3, %ymm3 + vmovdqu 512(%rsp), %xmm7 + vpaddd L_vaes_aes_gcm_eight(%rip), %xmm7, %xmm7 + vmovdqu %xmm7, 512(%rsp) + vbroadcasti128 (%r15), 
%ymm4 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm4, %ymm2, %ymm2 + vpxor %ymm4, %ymm3, %ymm3 + vbroadcasti128 16(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 32(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 48(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 64(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 80(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 96(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 112(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 128(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 144(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $11, %r10d + vbroadcasti128 160(%r15), %ymm4 + jl L_AES_GCM_decrypt_vaes_l2_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 176(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $13, %r10d + vbroadcasti128 192(%r15), %ymm4 + jl L_AES_GCM_decrypt_vaes_l2_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + 
vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 208(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 224(%r15), %ymm4 +L_AES_GCM_decrypt_vaes_l2_vaes_ctr8_last: + vaesenclast %ymm4, %ymm0, %ymm0 + vaesenclast %ymm4, %ymm1, %ymm1 + vaesenclast %ymm4, %ymm2, %ymm2 + vaesenclast %ymm4, %ymm3, %ymm3 + leaq (%rdi,%rbx,1), %rcx + leaq (%rsi,%rbx,1), %rdx + vmovdqu (%rcx), %ymm5 + vpxor %ymm5, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vmovdqu 32(%rcx), %ymm5 + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vmovdqu 64(%rcx), %ymm5 + vpxor %ymm5, %ymm2, %ymm2 + vmovdqu %ymm2, 64(%rdx) + vmovdqu 96(%rcx), %ymm5 + vpxor %ymm5, %ymm3, %ymm3 + vmovdqu %ymm3, 96(%rdx) + addl $0x80, %ebx + cmpl %r13d, %ebx + jl L_AES_GCM_decrypt_vaes_loop_256 +L_AES_GCM_decrypt_vaes_after_256: + vmovdqu 96(%rsp), %ymm7 + vpermq $0x4e, %ymm7, %ymm7 + vmovdqu 64(%rsp), %ymm8 + vpermq $0x4e, %ymm8, %ymm8 + vmovdqu 32(%rsp), %ymm9 + vpermq $0x4e, %ymm9, %ymm9 + vmovdqu (%rsp), %ymm10 + vpermq $0x4e, %ymm10, %ymm10 + movl %r9d, %r13d + andl $0xffffff80, %r13d + cmpl %r13d, %ebx + jge L_AES_GCM_decrypt_vaes_after_128 + # 128 bytes of input + leaq (%rdi,%rbx,1), %rax + vbroadcasti128 L_vaes_aes_gcm_bswap_mask(%rip), %ymm6 + vpxor %ymm4, %ymm4, %ymm4 + vinserti128 $0x00, %xmm15, %ymm4, %ymm4 + vmovdqu (%rax), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpxor %ymm4, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm7, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm7, %ymm5, %ymm1 + vpclmulqdq $16, %ymm7, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm7, %ymm5, %ymm3 + vmovdqa %ymm0, %ymm11 + vpxor %ymm1, %ymm2, %ymm12 + vmovdqa %ymm3, %ymm13 + vmovdqu 32(%rax), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm8, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm8, %ymm5, %ymm1 + vpclmulqdq $16, %ymm8, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm8, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + 
vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 64(%rax), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm9, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm9, %ymm5, %ymm1 + vpclmulqdq $16, %ymm9, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm9, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 96(%rax), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm10, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm10, %ymm5, %ymm1 + vpclmulqdq $16, %ymm10, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm10, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vpclmulqdq $0x01, %ymm11, %ymm14, %ymm5 + vpshufd $0x4e, %ymm11, %ymm11 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm11, %ymm12, %ymm12 + vpclmulqdq $0x01, %ymm12, %ymm14, %ymm5 + vpshufd $0x4e, %ymm12, %ymm12 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm12, %ymm13, %ymm13 + vextracti128 $0x01, %ymm13, %xmm0 + vpxor %xmm0, %xmm13, %xmm15 + vbroadcasti128 L_vaes_aes_gcm_bswap_epi64(%rip), %ymm6 + vbroadcasti128 512(%rsp), %ymm4 + vpaddd L_vaes_aes_gcm_inc_y0(%rip), %ymm4, %ymm0 + vpshufb %ymm6, %ymm0, %ymm0 + vpaddd L_vaes_aes_gcm_inc_y1(%rip), %ymm4, %ymm1 + vpshufb %ymm6, %ymm1, %ymm1 + vpaddd L_vaes_aes_gcm_inc_y2(%rip), %ymm4, %ymm2 + vpshufb %ymm6, %ymm2, %ymm2 + vpaddd L_vaes_aes_gcm_inc_y3(%rip), %ymm4, %ymm3 + vpshufb %ymm6, %ymm3, %ymm3 + vmovdqu 512(%rsp), %xmm7 + vpaddd L_vaes_aes_gcm_eight(%rip), %xmm7, %xmm7 + vmovdqu %xmm7, 512(%rsp) + vbroadcasti128 (%r15), %ymm4 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm4, %ymm2, %ymm2 + vpxor %ymm4, %ymm3, %ymm3 + vbroadcasti128 16(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 32(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, 
%ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 48(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 64(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 80(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 96(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 112(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 128(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 144(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $11, %r10d + vbroadcasti128 160(%r15), %ymm4 + jl L_AES_GCM_decrypt_vaes_t_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 176(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $13, %r10d + vbroadcasti128 192(%r15), %ymm4 + jl L_AES_GCM_decrypt_vaes_t_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 208(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 224(%r15), %ymm4 +L_AES_GCM_decrypt_vaes_t_vaes_ctr8_last: + vaesenclast %ymm4, %ymm0, %ymm0 + 
vaesenclast %ymm4, %ymm1, %ymm1 + vaesenclast %ymm4, %ymm2, %ymm2 + vaesenclast %ymm4, %ymm3, %ymm3 + leaq (%rdi,%rbx,1), %rcx + leaq (%rsi,%rbx,1), %rdx + vmovdqu (%rcx), %ymm5 + vpxor %ymm5, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vmovdqu 32(%rcx), %ymm5 + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vmovdqu 64(%rcx), %ymm5 + vpxor %ymm5, %ymm2, %ymm2 + vmovdqu %ymm2, 64(%rdx) + vmovdqu 96(%rcx), %ymm5 + vpxor %ymm5, %ymm3, %ymm3 + vmovdqu %ymm3, 96(%rdx) + addl $0x80, %ebx +L_AES_GCM_decrypt_vaes_after_128: + vmovdqu (%rsp), %xmm6 +L_AES_GCM_decrypt_vaes_done_128: + movl %r9d, %edx + cmpl %edx, %ebx + jge L_AES_GCM_decrypt_vaes_done_dec + movl %r9d, %r13d + andl $0xfffffff0, %r13d + cmpl %r13d, %ebx + jge L_AES_GCM_decrypt_vaes_last_block_done +L_AES_GCM_decrypt_vaes_last_block_start: + vmovdqu (%rdi,%rbx,1), %xmm12 + vmovdqa %xmm6, %xmm0 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm12, %xmm1 + vpxor %xmm15, %xmm1, %xmm1 + vmovdqu 512(%rsp), %xmm8 + vpshufb L_vaes_aes_gcm_bswap_epi64(%rip), %xmm8, %xmm7 + vpaddd L_vaes_aes_gcm_one(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, 512(%rsp) + vpxor (%r15), %xmm7, %xmm7 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vaesenc 16(%r15), %xmm7, %xmm7 + vaesenc 32(%r15), %xmm7, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm10 + vaesenc 48(%r15), %xmm7, %xmm7 + vaesenc 64(%r15), %xmm7, %xmm7 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm11 + vaesenc 80(%r15), %xmm7, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm1 + vaesenc 96(%r15), %xmm7, %xmm7 + vpxor %xmm10, %xmm9, %xmm9 + vpslldq $8, %xmm9, %xmm2 + vpsrldq $8, %xmm9, %xmm9 + vaesenc 112(%r15), %xmm7, %xmm7 + vpxor %xmm11, %xmm2, %xmm2 + vpxor %xmm9, %xmm1, %xmm3 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm0 + vpclmulqdq $16, %xmm0, %xmm2, %xmm10 + vaesenc 128(%r15), %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm9 + vpxor %xmm10, %xmm9, %xmm9 + vpclmulqdq $16, %xmm0, %xmm9, %xmm10 + vaesenc 144(%r15), %xmm7, %xmm7 + vpshufd $0x4e, %xmm9, %xmm9 + vpxor %xmm10, %xmm9, %xmm9 + vpxor %xmm3, 
%xmm9, %xmm15 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm8 + jl L_AES_GCM_decrypt_vaes_aesenc_gfmul_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 176(%r15), %xmm7, %xmm7 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm8 + jl L_AES_GCM_decrypt_vaes_aesenc_gfmul_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 208(%r15), %xmm7, %xmm7 + vmovdqa 224(%r15), %xmm8 +L_AES_GCM_decrypt_vaes_aesenc_gfmul_last: + vaesenclast %xmm8, %xmm7, %xmm7 + vmovdqa %xmm12, %xmm0 + vpxor %xmm0, %xmm7, %xmm7 + vmovdqu %xmm7, (%rsi,%rbx,1) + addl $16, %ebx + cmpl %r13d, %ebx + jl L_AES_GCM_decrypt_vaes_last_block_start +L_AES_GCM_decrypt_vaes_last_block_done: + movl %r9d, %ecx + movl %ecx, %edx + andl $15, %ecx + jz L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_done + vmovdqu 512(%rsp), %xmm5 + vpshufb L_vaes_aes_gcm_bswap_epi64(%rip), %xmm5, %xmm5 + vpxor (%r15), %xmm5, %xmm5 + vaesenc 16(%r15), %xmm5, %xmm5 + vaesenc 32(%r15), %xmm5, %xmm5 + vaesenc 48(%r15), %xmm5, %xmm5 + vaesenc 64(%r15), %xmm5, %xmm5 + vaesenc 80(%r15), %xmm5, %xmm5 + vaesenc 96(%r15), %xmm5, %xmm5 + vaesenc 112(%r15), %xmm5, %xmm5 + vaesenc 128(%r15), %xmm5, %xmm5 + vaesenc 144(%r15), %xmm5, %xmm5 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm8 + jl L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_aesenc_avx_last + vaesenc %xmm8, %xmm5, %xmm5 + vaesenc 176(%r15), %xmm5, %xmm5 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm8 + jl L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_aesenc_avx_last + vaesenc %xmm8, %xmm5, %xmm5 + vaesenc 208(%r15), %xmm5, %xmm5 + vmovdqa 224(%r15), %xmm8 +L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_aesenc_avx_last: + vaesenclast %xmm8, %xmm5, %xmm5 + subq $32, %rsp + xorl %ecx, %ecx + vmovdqu %xmm5, (%rsp) + vpxor %xmm0, %xmm0, %xmm0 + vmovdqu %xmm0, 16(%rsp) +L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_loop: + movzbl (%rdi,%rbx,1), %r13d + movb %r13b, 16(%rsp,%rcx,1) + xorb (%rsp,%rcx,1), %r13b + movb %r13b, (%rsi,%rbx,1) + incl %ebx + incl %ecx + cmpl %edx, %ebx + jl 
L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_loop + vmovdqu 16(%rsp), %xmm5 + addq $32, %rsp + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5 + vpxor %xmm5, %xmm15, %xmm15 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm6, %xmm15, %xmm7 + vpclmulqdq $0x01, %xmm6, %xmm15, %xmm8 + vpclmulqdq $16, %xmm6, %xmm15, %xmm9 + vpclmulqdq $0x11, %xmm6, %xmm15, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm15 +L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_done: +L_AES_GCM_decrypt_vaes_done_dec: + movl %r9d, %edx + movl %r11d, %ecx + shlq $3, %rdx + shlq $3, %rcx + vmovq %rdx, %xmm0 + vmovq %rcx, %xmm1 + vpunpcklqdq %xmm1, %xmm0, %xmm0 + vpxor %xmm0, %xmm15, %xmm15 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm6, %xmm15, %xmm7 + vpclmulqdq $0x01, %xmm6, %xmm15, %xmm8 + vpclmulqdq $16, %xmm6, %xmm15, %xmm9 + vpclmulqdq $0x11, %xmm6, %xmm15, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm15 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm15, %xmm15 + vmovdqu 528(%rsp), %xmm0 + vpxor %xmm15, %xmm0, %xmm0 + cmpl $16, %r14d + je L_AES_GCM_decrypt_vaes_cmp_tag_16 + subq $16, %rsp + xorq %rcx, %rcx + xorq %rbx, %rbx + vmovdqu %xmm0, (%rsp) +L_AES_GCM_decrypt_vaes_cmp_tag_loop: + movzbl (%rsp,%rcx,1), %r13d + xorb (%r8,%rcx,1), %r13b + orb %r13b, %bl + incl %ecx + cmpl %r14d, %ecx + jne L_AES_GCM_decrypt_vaes_cmp_tag_loop + cmpb $0x00, %bl + sete %bl + 
addq $16, %rsp + xorq %rcx, %rcx + jmp L_AES_GCM_decrypt_vaes_cmp_tag_done +L_AES_GCM_decrypt_vaes_cmp_tag_16: + vmovdqu (%r8), %xmm1 + vpcmpeqb %xmm1, %xmm0, %xmm0 + vpmovmskb %xmm0, %rdx + # %%edx == 0xFFFF then return 1 else => return 0 + xorl %ebx, %ebx + cmpl $0xffff, %edx + sete %bl +L_AES_GCM_decrypt_vaes_cmp_tag_done: + movl %ebx, (%rbp) + vzeroupper + addq $0x220, %rsp + popq %rbp + popq %r15 + popq %r14 + popq %rbx + popq %r12 + popq %r13 + repz retq +#ifndef __APPLE__ +.size AES_GCM_decrypt_vaes,.-AES_GCM_decrypt_vaes +#endif /* __APPLE__ */ +#ifdef WOLFSSL_AESGCM_STREAM +#ifndef __APPLE__ +.text +.globl AES_GCM_init_vaes +.type AES_GCM_init_vaes,@function +.align 16 +AES_GCM_init_vaes: +#else +.section __TEXT,__text +.globl _AES_GCM_init_vaes +.p2align 4 +_AES_GCM_init_vaes: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + movq %rdx, %r10 + movl %ecx, %r11d + movq 24(%rsp), %rax + subq $16, %rsp + vpxor %xmm4, %xmm4, %xmm4 + movl %r11d, %edx + cmpl $12, %edx + jne L_AES_GCM_init_vaes_iv_not_12 + # # Calculate values when IV is 12 bytes + # Set counter based on IV + movl $0x1000000, %ecx + vmovq (%r10), %xmm4 + vpinsrd $2, 8(%r10), %xmm4, %xmm4 + vpinsrd $3, %ecx, %xmm4, %xmm4 + # H = Encrypt X(=0) and T = Encrypt counter + vmovdqa (%rdi), %xmm5 + vpxor %xmm5, %xmm4, %xmm1 + vmovdqa 16(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 32(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 48(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 64(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 80(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 96(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 112(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 128(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + 
vmovdqa 144(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + cmpl $11, %esi + vmovdqa 160(%rdi), %xmm6 + jl L_AES_GCM_init_vaes_calc_iv_12_last + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 176(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + cmpl $13, %esi + vmovdqa 192(%rdi), %xmm6 + jl L_AES_GCM_init_vaes_calc_iv_12_last + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 208(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 224(%rdi), %xmm6 +L_AES_GCM_init_vaes_calc_iv_12_last: + vaesenclast %xmm6, %xmm5, %xmm5 + vaesenclast %xmm6, %xmm1, %xmm1 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5 + vmovdqu %xmm1, %xmm15 + jmp L_AES_GCM_init_vaes_iv_done +L_AES_GCM_init_vaes_iv_not_12: + # Calculate values when IV is not 12 bytes + # H = Encrypt X(=0) + vmovdqa (%rdi), %xmm5 + vaesenc 16(%rdi), %xmm5, %xmm5 + vaesenc 32(%rdi), %xmm5, %xmm5 + vaesenc 48(%rdi), %xmm5, %xmm5 + vaesenc 64(%rdi), %xmm5, %xmm5 + vaesenc 80(%rdi), %xmm5, %xmm5 + vaesenc 96(%rdi), %xmm5, %xmm5 + vaesenc 112(%rdi), %xmm5, %xmm5 + vaesenc 128(%rdi), %xmm5, %xmm5 + vaesenc 144(%rdi), %xmm5, %xmm5 + cmpl $11, %esi + vmovdqa 160(%rdi), %xmm8 + jl L_AES_GCM_init_vaes_calc_iv_1_aesenc_avx_last + vaesenc %xmm8, %xmm5, %xmm5 + vaesenc 176(%rdi), %xmm5, %xmm5 + cmpl $13, %esi + vmovdqa 192(%rdi), %xmm8 + jl L_AES_GCM_init_vaes_calc_iv_1_aesenc_avx_last + vaesenc %xmm8, %xmm5, %xmm5 + vaesenc 208(%rdi), %xmm5, %xmm5 + vmovdqa 224(%rdi), %xmm8 +L_AES_GCM_init_vaes_calc_iv_1_aesenc_avx_last: + vaesenclast %xmm8, %xmm5, %xmm5 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5 + # Calc counter + # Initialization vector + cmpl $0x00, %edx + movq $0x00, %rcx + je L_AES_GCM_init_vaes_calc_iv_done + cmpl $16, %edx + jl L_AES_GCM_init_vaes_calc_iv_lt16 + andl $0xfffffff0, %edx +L_AES_GCM_init_vaes_calc_iv_16_loop: + vmovdqu (%r10,%rcx,1), %xmm7 + vpshufb 
L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + vpxor %xmm7, %xmm4, %xmm4 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm4, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm6 + vmovdqa %xmm3, %xmm4 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm6, %xmm6 + vpxor %xmm1, %xmm4, %xmm4 + vpsrld $31, %xmm6, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm6, %xmm6 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm6, %xmm6 + vpor %xmm1, %xmm4, %xmm4 + vpslld $31, %xmm6, %xmm0 + vpslld $30, %xmm6, %xmm1 + vpslld $25, %xmm6, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm6, %xmm6 + vpsrld $0x01, %xmm6, %xmm2 + vpsrld $2, %xmm6, %xmm3 + vpsrld $7, %xmm6, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm6, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_init_vaes_calc_iv_16_loop + movl %r11d, %edx + cmpl %edx, %ecx + je L_AES_GCM_init_vaes_calc_iv_done +L_AES_GCM_init_vaes_calc_iv_lt16: + subq $16, %rsp + vpxor %xmm7, %xmm7, %xmm7 + xorl %r13d, %r13d + vmovdqu %xmm7, (%rsp) +L_AES_GCM_init_vaes_calc_iv_loop: + movzbl (%r10,%rcx,1), %r12d + movb %r12b, (%rsp,%r13,1) + incl %ecx + incl %r13d + cmpl %edx, %ecx + jl L_AES_GCM_init_vaes_calc_iv_loop + vmovdqu (%rsp), %xmm7 + addq $16, %rsp + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + vpxor %xmm7, %xmm4, %xmm4 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm4, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpclmulqdq $0x00, 
%xmm4, %xmm5, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm6 + vmovdqa %xmm3, %xmm4 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm6, %xmm6 + vpxor %xmm1, %xmm4, %xmm4 + vpsrld $31, %xmm6, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm6, %xmm6 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm6, %xmm6 + vpor %xmm1, %xmm4, %xmm4 + vpslld $31, %xmm6, %xmm0 + vpslld $30, %xmm6, %xmm1 + vpslld $25, %xmm6, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm6, %xmm6 + vpsrld $0x01, %xmm6, %xmm2 + vpsrld $2, %xmm6, %xmm3 + vpsrld $7, %xmm6, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm6, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 +L_AES_GCM_init_vaes_calc_iv_done: + # T = Encrypt counter + vpxor %xmm0, %xmm0, %xmm0 + shll $3, %edx + vmovq %rdx, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm4, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm6 + vmovdqa %xmm3, %xmm4 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm6, %xmm6 + vpxor %xmm1, %xmm4, %xmm4 + vpsrld $31, %xmm6, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm6, %xmm6 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm6, %xmm6 + vpor %xmm1, %xmm4, %xmm4 + vpslld $31, %xmm6, %xmm0 + vpslld 
$30, %xmm6, %xmm1 + vpslld $25, %xmm6, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm6, %xmm6 + vpsrld $0x01, %xmm6, %xmm2 + vpsrld $2, %xmm6, %xmm3 + vpsrld $7, %xmm6, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm6, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4 + # Encrypt counter + vmovdqa (%rdi), %xmm7 + vpxor %xmm4, %xmm7, %xmm7 + vaesenc 16(%rdi), %xmm7, %xmm7 + vaesenc 32(%rdi), %xmm7, %xmm7 + vaesenc 48(%rdi), %xmm7, %xmm7 + vaesenc 64(%rdi), %xmm7, %xmm7 + vaesenc 80(%rdi), %xmm7, %xmm7 + vaesenc 96(%rdi), %xmm7, %xmm7 + vaesenc 112(%rdi), %xmm7, %xmm7 + vaesenc 128(%rdi), %xmm7, %xmm7 + vaesenc 144(%rdi), %xmm7, %xmm7 + cmpl $11, %esi + vmovdqa 160(%rdi), %xmm8 + jl L_AES_GCM_init_vaes_calc_iv_2_aesenc_avx_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 176(%rdi), %xmm7, %xmm7 + cmpl $13, %esi + vmovdqa 192(%rdi), %xmm8 + jl L_AES_GCM_init_vaes_calc_iv_2_aesenc_avx_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 208(%rdi), %xmm7, %xmm7 + vmovdqa 224(%rdi), %xmm8 +L_AES_GCM_init_vaes_calc_iv_2_aesenc_avx_last: + vaesenclast %xmm8, %xmm7, %xmm7 + vmovdqu %xmm7, %xmm15 +L_AES_GCM_init_vaes_iv_done: + vmovdqa %xmm15, (%rax) + vpshufb L_vaes_aes_gcm_bswap_epi64(%rip), %xmm4, %xmm4 + vpaddd L_vaes_aes_gcm_one(%rip), %xmm4, %xmm4 + vmovdqa %xmm5, (%r8) + vmovdqa %xmm4, (%r9) + addq $16, %rsp + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size AES_GCM_init_vaes,.-AES_GCM_init_vaes +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_GCM_aad_update_vaes +.type AES_GCM_aad_update_vaes,@function +.align 16 +AES_GCM_aad_update_vaes: +#else +.section __TEXT,__text +.globl _AES_GCM_aad_update_vaes +.p2align 4 +_AES_GCM_aad_update_vaes: +#endif /* __APPLE__ */ + movq %rcx, %rax + vmovdqa (%rdx), %xmm5 + vmovdqa (%rax), %xmm6 + xorl %ecx, 
%ecx +L_AES_GCM_aad_update_vaes_16_loop: + vmovdqu (%rdi,%rcx,1), %xmm7 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + vpxor %xmm7, %xmm5, %xmm5 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm5, %xmm1 + vpshufd $0x4e, %xmm6, %xmm2 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm3 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm0 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm6, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm4 + vmovdqa %xmm3, %xmm5 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm5, %xmm5 + vpsrld $31, %xmm4, %xmm0 + vpsrld $31, %xmm5, %xmm1 + vpslld $0x01, %xmm4, %xmm4 + vpslld $0x01, %xmm5, %xmm5 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm5, %xmm5 + vpor %xmm0, %xmm4, %xmm4 + vpor %xmm1, %xmm5, %xmm5 + vpslld $31, %xmm4, %xmm0 + vpslld $30, %xmm4, %xmm1 + vpslld $25, %xmm4, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm3 + vpsrld $7, %xmm4, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + addl $16, %ecx + cmpl %esi, %ecx + jl L_AES_GCM_aad_update_vaes_16_loop + vmovdqa %xmm5, (%rdx) + repz retq +#ifndef __APPLE__ +.size AES_GCM_aad_update_vaes,.-AES_GCM_aad_update_vaes +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_GCM_encrypt_block_vaes +.type AES_GCM_encrypt_block_vaes,@function +.align 16 +AES_GCM_encrypt_block_vaes: +#else +.section __TEXT,__text +.globl _AES_GCM_encrypt_block_vaes +.p2align 4 +_AES_GCM_encrypt_block_vaes: +#endif /* __APPLE__ */ + movq %rdx, %r10 + movq %rcx, %r11 + vmovdqu (%r8), %xmm1 + vpshufb L_vaes_aes_gcm_bswap_epi64(%rip), %xmm1, %xmm0 + vpaddd 
L_vaes_aes_gcm_one(%rip), %xmm1, %xmm1 + vmovdqu %xmm1, (%r8) + vpxor (%rdi), %xmm0, %xmm0 + vaesenc 16(%rdi), %xmm0, %xmm0 + vaesenc 32(%rdi), %xmm0, %xmm0 + vaesenc 48(%rdi), %xmm0, %xmm0 + vaesenc 64(%rdi), %xmm0, %xmm0 + vaesenc 80(%rdi), %xmm0, %xmm0 + vaesenc 96(%rdi), %xmm0, %xmm0 + vaesenc 112(%rdi), %xmm0, %xmm0 + vaesenc 128(%rdi), %xmm0, %xmm0 + vaesenc 144(%rdi), %xmm0, %xmm0 + cmpl $11, %esi + vmovdqa 160(%rdi), %xmm1 + jl L_AES_GCM_encrypt_block_vaes_aesenc_block_last + vaesenc %xmm1, %xmm0, %xmm0 + vaesenc 176(%rdi), %xmm0, %xmm0 + cmpl $13, %esi + vmovdqa 192(%rdi), %xmm1 + jl L_AES_GCM_encrypt_block_vaes_aesenc_block_last + vaesenc %xmm1, %xmm0, %xmm0 + vaesenc 208(%rdi), %xmm0, %xmm0 + vmovdqa 224(%rdi), %xmm1 +L_AES_GCM_encrypt_block_vaes_aesenc_block_last: + vaesenclast %xmm1, %xmm0, %xmm0 + vmovdqu (%r11), %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vmovdqu %xmm0, (%r10) + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 + vzeroupper + repz retq +#ifndef __APPLE__ +.size AES_GCM_encrypt_block_vaes,.-AES_GCM_encrypt_block_vaes +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_GCM_ghash_block_vaes +.type AES_GCM_ghash_block_vaes,@function +.align 16 +AES_GCM_ghash_block_vaes: +#else +.section __TEXT,__text +.globl _AES_GCM_ghash_block_vaes +.p2align 4 +_AES_GCM_ghash_block_vaes: +#endif /* __APPLE__ */ + vmovdqa (%rsi), %xmm4 + vmovdqa (%rdx), %xmm5 + vmovdqu (%rdi), %xmm7 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + vpxor %xmm7, %xmm4, %xmm4 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm4, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm6 + vmovdqa %xmm3, %xmm4 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm6, %xmm6 + vpxor %xmm1, %xmm4, %xmm4 + vpsrld $31, %xmm6, 
%xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm6, %xmm6 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm6, %xmm6 + vpor %xmm1, %xmm4, %xmm4 + vpslld $31, %xmm6, %xmm0 + vpslld $30, %xmm6, %xmm1 + vpslld $25, %xmm6, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm6, %xmm6 + vpsrld $0x01, %xmm6, %xmm2 + vpsrld $2, %xmm6, %xmm3 + vpsrld $7, %xmm6, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm6, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vmovdqa %xmm4, (%rsi) + vzeroupper + repz retq +#ifndef __APPLE__ +.size AES_GCM_ghash_block_vaes,.-AES_GCM_ghash_block_vaes +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_GCM_encrypt_update_vaes +.type AES_GCM_encrypt_update_vaes,@function +.align 16 +AES_GCM_encrypt_update_vaes: +#else +.section __TEXT,__text +.globl _AES_GCM_encrypt_update_vaes +.p2align 4 +_AES_GCM_encrypt_update_vaes: +#endif /* __APPLE__ */ + pushq %r13 + pushq %r12 + pushq %r14 + pushq %r15 + pushq %rbx + movq %rdx, %r10 + movq %rcx, %r11 + movq 48(%rsp), %rax + movq 56(%rsp), %r12 + subq $0x210, %rsp + vmovdqa (%r9), %xmm15 + vmovdqa (%rax), %xmm6 + vpsrlq $63, %xmm6, %xmm8 + vpsllq $0x01, %xmm6, %xmm7 + vpslldq $8, %xmm8, %xmm8 + vpor %xmm8, %xmm7, %xmm7 + vpshufd $0xff, %xmm6, %xmm6 + vpsrad $31, %xmm6, %xmm6 + vpand L_vaes_aes_gcm_mod2_128(%rip), %xmm6, %xmm6 + vpxor %xmm7, %xmm6, %xmm6 + xorl %r14d, %r14d + cmpl $0x80, %r8d + jl L_AES_GCM_encrypt_update_vaes_done_128 + vmovdqa %xmm15, %xmm2 + # H ^ 1 + vmovdqu %xmm6, (%rsp) + # H ^ 2 + vpclmulqdq $0x00, %xmm6, %xmm6, %xmm7 + vpclmulqdq $0x11, %xmm6, %xmm6, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor 
%xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm0 + vmovdqu %xmm0, 16(%rsp) + # H ^ 3 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm6, %xmm0, %xmm7 + vpclmulqdq $0x01, %xmm6, %xmm0, %xmm8 + vpclmulqdq $16, %xmm6, %xmm0, %xmm9 + vpclmulqdq $0x11, %xmm6, %xmm0, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm1 + vmovdqu %xmm1, 32(%rsp) + # H ^ 4 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm3 + vmovdqu %xmm3, 48(%rsp) + # H ^ 5 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 64(%rsp) + # H ^ 6 + vpclmulqdq $0x00, %xmm1, %xmm1, %xmm7 + vpclmulqdq $0x11, %xmm1, %xmm1, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa 
L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 80(%rsp) + # H ^ 7 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm1, %xmm3, %xmm7 + vpclmulqdq $0x01, %xmm1, %xmm3, %xmm8 + vpclmulqdq $16, %xmm1, %xmm3, %xmm9 + vpclmulqdq $0x11, %xmm1, %xmm3, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 96(%rsp) + # H ^ 8 + vpclmulqdq $0x00, %xmm3, %xmm3, %xmm7 + vpclmulqdq $0x11, %xmm3, %xmm3, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 112(%rsp) + cmpl $0x100, %r8d + jl L_AES_GCM_encrypt_update_vaes_no_ext + # H ^ 9 + vmovdqu 48(%rsp), %xmm0 + vmovdqu 64(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, 
%xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 128(%rsp) + # H ^ 10 + vmovdqu 64(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 144(%rsp) + # H ^ 11 + vmovdqu 64(%rsp), %xmm0 + vmovdqu 80(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 160(%rsp) + # H ^ 12 + vmovdqu 80(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 176(%rsp) + # H ^ 13 + vmovdqu 80(%rsp), %xmm0 + vmovdqu 96(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, 
%xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 192(%rsp) + # H ^ 14 + vmovdqu 96(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 208(%rsp) + # H ^ 15 + vmovdqu 96(%rsp), %xmm0 + vmovdqu 112(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 224(%rsp) + # H ^ 16 + vmovdqu 112(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu 
%xmm4, 240(%rsp) + vmovdqu 224(%rsp), %ymm7 + vpermq $0x4e, %ymm7, %ymm7 + vmovdqu 192(%rsp), %ymm8 + vpermq $0x4e, %ymm8, %ymm8 + vmovdqu 160(%rsp), %ymm9 + vpermq $0x4e, %ymm9, %ymm9 + vmovdqu 128(%rsp), %ymm10 + vpermq $0x4e, %ymm10, %ymm10 + vmovdqu %ymm7, 256(%rsp) + vmovdqu %ymm8, 288(%rsp) + vmovdqu %ymm9, 320(%rsp) + vmovdqu %ymm10, 352(%rsp) + vmovdqu 96(%rsp), %ymm7 + vpermq $0x4e, %ymm7, %ymm7 + vmovdqu 64(%rsp), %ymm8 + vpermq $0x4e, %ymm8, %ymm8 + vmovdqu 32(%rsp), %ymm9 + vpermq $0x4e, %ymm9, %ymm9 + vmovdqu (%rsp), %ymm10 + vpermq $0x4e, %ymm10, %ymm10 + vmovdqu %ymm7, 384(%rsp) + vmovdqu %ymm8, 416(%rsp) + vmovdqu %ymm9, 448(%rsp) + vmovdqu %ymm10, 480(%rsp) +L_AES_GCM_encrypt_update_vaes_no_ext: + vbroadcasti128 L_vaes_aes_gcm_mod2_128(%rip), %ymm14 + cmpl $0x100, %r8d + jl L_AES_GCM_encrypt_update_vaes_after_256 + movl %r8d, %r13d + andl $0xffffff00, %r13d +L_AES_GCM_encrypt_update_vaes_loop_256: + # 256 bytes of input + leaq (%r10,%r14,1), %r15 + vbroadcasti128 L_vaes_aes_gcm_bswap_epi64(%rip), %ymm6 + vbroadcasti128 (%r12), %ymm4 + vpaddd L_vaes_aes_gcm_inc_y0(%rip), %ymm4, %ymm0 + vpshufb %ymm6, %ymm0, %ymm0 + vpaddd L_vaes_aes_gcm_inc_y1(%rip), %ymm4, %ymm1 + vpshufb %ymm6, %ymm1, %ymm1 + vpaddd L_vaes_aes_gcm_inc_y2(%rip), %ymm4, %ymm2 + vpshufb %ymm6, %ymm2, %ymm2 + vpaddd L_vaes_aes_gcm_inc_y3(%rip), %ymm4, %ymm3 + vpshufb %ymm6, %ymm3, %ymm3 + vmovdqu (%r12), %xmm7 + vpaddd L_vaes_aes_gcm_eight(%rip), %xmm7, %xmm7 + vmovdqu %xmm7, (%r12) + vbroadcasti128 (%rdi), %ymm4 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm4, %ymm2, %ymm2 + vpxor %ymm4, %ymm3, %ymm3 + vbroadcasti128 16(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 32(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 48(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 
+ vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 64(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 80(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 96(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 112(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 128(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 144(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $11, %esi + vbroadcasti128 160(%rdi), %ymm4 + jl L_AES_GCM_encrypt_update_vaes_p1_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 176(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $13, %esi + vbroadcasti128 192(%rdi), %ymm4 + jl L_AES_GCM_encrypt_update_vaes_p1_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 208(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 224(%rdi), %ymm4 +L_AES_GCM_encrypt_update_vaes_p1_vaes_ctr8_last: + vaesenclast %ymm4, %ymm0, %ymm0 + vaesenclast %ymm4, %ymm1, %ymm1 + vaesenclast %ymm4, %ymm2, %ymm2 + vaesenclast %ymm4, %ymm3, %ymm3 + leaq 
(%r11,%r14,1), %rcx + leaq (%r10,%r14,1), %rdx + vmovdqu (%rcx), %ymm5 + vpxor %ymm5, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vmovdqu 32(%rcx), %ymm5 + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vmovdqu 64(%rcx), %ymm5 + vpxor %ymm5, %ymm2, %ymm2 + vmovdqu %ymm2, 64(%rdx) + vmovdqu 96(%rcx), %ymm5 + vpxor %ymm5, %ymm3, %ymm3 + vmovdqu %ymm3, 96(%rdx) + addl $0x80, %r14d + vbroadcasti128 (%r12), %ymm4 + vpaddd L_vaes_aes_gcm_inc_y0(%rip), %ymm4, %ymm0 + vpshufb %ymm6, %ymm0, %ymm0 + vpaddd L_vaes_aes_gcm_inc_y1(%rip), %ymm4, %ymm1 + vpshufb %ymm6, %ymm1, %ymm1 + vpaddd L_vaes_aes_gcm_inc_y2(%rip), %ymm4, %ymm2 + vpshufb %ymm6, %ymm2, %ymm2 + vpaddd L_vaes_aes_gcm_inc_y3(%rip), %ymm4, %ymm3 + vpshufb %ymm6, %ymm3, %ymm3 + vmovdqu (%r12), %xmm7 + vpaddd L_vaes_aes_gcm_eight(%rip), %xmm7, %xmm7 + vmovdqu %xmm7, (%r12) + vbroadcasti128 (%rdi), %ymm4 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm4, %ymm2, %ymm2 + vpxor %ymm4, %ymm3, %ymm3 + vbroadcasti128 16(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 32(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 48(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 64(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 80(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 96(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 112(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + 
vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 128(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 144(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $11, %esi + vbroadcasti128 160(%rdi), %ymm4 + jl L_AES_GCM_encrypt_update_vaes_p2_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 176(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $13, %esi + vbroadcasti128 192(%rdi), %ymm4 + jl L_AES_GCM_encrypt_update_vaes_p2_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 208(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 224(%rdi), %ymm4 +L_AES_GCM_encrypt_update_vaes_p2_vaes_ctr8_last: + vaesenclast %ymm4, %ymm0, %ymm0 + vaesenclast %ymm4, %ymm1, %ymm1 + vaesenclast %ymm4, %ymm2, %ymm2 + vaesenclast %ymm4, %ymm3, %ymm3 + leaq (%r11,%r14,1), %rcx + leaq (%r10,%r14,1), %rdx + vmovdqu (%rcx), %ymm5 + vpxor %ymm5, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vmovdqu 32(%rcx), %ymm5 + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vmovdqu 64(%rcx), %ymm5 + vpxor %ymm5, %ymm2, %ymm2 + vmovdqu %ymm2, 64(%rdx) + vmovdqu 96(%rcx), %ymm5 + vpxor %ymm5, %ymm3, %ymm3 + vmovdqu %ymm3, 96(%rdx) + addl $0x80, %r14d + vbroadcasti128 L_vaes_aes_gcm_bswap_mask(%rip), %ymm6 + vpxor %ymm4, %ymm4, %ymm4 + vinserti128 $0x00, %xmm15, %ymm4, %ymm4 + vmovdqu 256(%rsp), %ymm7 + vmovdqu 288(%rsp), %ymm8 + vmovdqu 320(%rsp), %ymm9 + vmovdqu 352(%rsp), %ymm10 + vmovdqu (%r15), %ymm5 + 
vpshufb %ymm6, %ymm5, %ymm5 + vpxor %ymm4, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm7, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm7, %ymm5, %ymm1 + vpclmulqdq $16, %ymm7, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm7, %ymm5, %ymm3 + vmovdqa %ymm0, %ymm11 + vpxor %ymm1, %ymm2, %ymm12 + vmovdqa %ymm3, %ymm13 + vmovdqu 32(%r15), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm8, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm8, %ymm5, %ymm1 + vpclmulqdq $16, %ymm8, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm8, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 64(%r15), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm9, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm9, %ymm5, %ymm1 + vpclmulqdq $16, %ymm9, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm9, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 96(%r15), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm10, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm10, %ymm5, %ymm1 + vpclmulqdq $16, %ymm10, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm10, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 384(%rsp), %ymm7 + vmovdqu 416(%rsp), %ymm8 + vmovdqu 448(%rsp), %ymm9 + vmovdqu 480(%rsp), %ymm10 + vmovdqu 128(%r15), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm7, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm7, %ymm5, %ymm1 + vpclmulqdq $16, %ymm7, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm7, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 160(%r15), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm8, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm8, %ymm5, %ymm1 + vpclmulqdq $16, %ymm8, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm8, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor 
%ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 192(%r15), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm9, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm9, %ymm5, %ymm1 + vpclmulqdq $16, %ymm9, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm9, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 224(%r15), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm10, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm10, %ymm5, %ymm1 + vpclmulqdq $16, %ymm10, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm10, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vpclmulqdq $0x01, %ymm11, %ymm14, %ymm5 + vpshufd $0x4e, %ymm11, %ymm11 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm11, %ymm12, %ymm12 + vpclmulqdq $0x01, %ymm12, %ymm14, %ymm5 + vpshufd $0x4e, %ymm12, %ymm12 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm12, %ymm13, %ymm13 + vextracti128 $0x01, %ymm13, %xmm0 + vpxor %xmm0, %xmm13, %xmm15 + cmpl %r13d, %r14d + jl L_AES_GCM_encrypt_update_vaes_loop_256 +L_AES_GCM_encrypt_update_vaes_after_256: + movl %r8d, %r13d + andl $0xffffff80, %r13d + cmpl %r13d, %r14d + jge L_AES_GCM_encrypt_update_vaes_after_128 + # 128 bytes of input + leaq (%r10,%r14,1), %r15 + vbroadcasti128 L_vaes_aes_gcm_bswap_epi64(%rip), %ymm6 + vbroadcasti128 (%r12), %ymm4 + vpaddd L_vaes_aes_gcm_inc_y0(%rip), %ymm4, %ymm0 + vpshufb %ymm6, %ymm0, %ymm0 + vpaddd L_vaes_aes_gcm_inc_y1(%rip), %ymm4, %ymm1 + vpshufb %ymm6, %ymm1, %ymm1 + vpaddd L_vaes_aes_gcm_inc_y2(%rip), %ymm4, %ymm2 + vpshufb %ymm6, %ymm2, %ymm2 + vpaddd L_vaes_aes_gcm_inc_y3(%rip), %ymm4, %ymm3 + vpshufb %ymm6, %ymm3, %ymm3 + vmovdqu (%r12), %xmm7 + vpaddd L_vaes_aes_gcm_eight(%rip), %xmm7, %xmm7 + vmovdqu %xmm7, (%r12) + vbroadcasti128 (%rdi), %ymm4 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm4, %ymm2, %ymm2 + vpxor %ymm4, 
%ymm3, %ymm3 + vbroadcasti128 16(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 32(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 48(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 64(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 80(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 96(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 112(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 128(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 144(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $11, %esi + vbroadcasti128 160(%rdi), %ymm4 + jl L_AES_GCM_encrypt_update_vaes_8_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 176(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $13, %esi + vbroadcasti128 192(%rdi), %ymm4 + jl L_AES_GCM_encrypt_update_vaes_8_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + 
vbroadcasti128 208(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 224(%rdi), %ymm4 +L_AES_GCM_encrypt_update_vaes_8_vaes_ctr8_last: + vaesenclast %ymm4, %ymm0, %ymm0 + vaesenclast %ymm4, %ymm1, %ymm1 + vaesenclast %ymm4, %ymm2, %ymm2 + vaesenclast %ymm4, %ymm3, %ymm3 + leaq (%r11,%r14,1), %rcx + leaq (%r10,%r14,1), %rdx + vmovdqu (%rcx), %ymm5 + vpxor %ymm5, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vmovdqu 32(%rcx), %ymm5 + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vmovdqu 64(%rcx), %ymm5 + vpxor %ymm5, %ymm2, %ymm2 + vmovdqu %ymm2, 64(%rdx) + vmovdqu 96(%rcx), %ymm5 + vpxor %ymm5, %ymm3, %ymm3 + vmovdqu %ymm3, 96(%rdx) + addl $0x80, %r14d + vmovdqu 96(%rsp), %ymm7 + vpermq $0x4e, %ymm7, %ymm7 + vmovdqu 64(%rsp), %ymm8 + vpermq $0x4e, %ymm8, %ymm8 + vmovdqu 32(%rsp), %ymm9 + vpermq $0x4e, %ymm9, %ymm9 + vmovdqu (%rsp), %ymm10 + vpermq $0x4e, %ymm10, %ymm10 + vbroadcasti128 L_vaes_aes_gcm_bswap_mask(%rip), %ymm6 + vpxor %ymm4, %ymm4, %ymm4 + vinserti128 $0x00, %xmm15, %ymm4, %ymm4 + vmovdqu (%r15), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpxor %ymm4, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm7, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm7, %ymm5, %ymm1 + vpclmulqdq $16, %ymm7, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm7, %ymm5, %ymm3 + vmovdqa %ymm0, %ymm11 + vpxor %ymm1, %ymm2, %ymm12 + vmovdqa %ymm3, %ymm13 + vmovdqu 32(%r15), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm8, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm8, %ymm5, %ymm1 + vpclmulqdq $16, %ymm8, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm8, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 64(%r15), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm9, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm9, %ymm5, %ymm1 + vpclmulqdq $16, %ymm9, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm9, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, 
%ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 96(%r15), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm10, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm10, %ymm5, %ymm1 + vpclmulqdq $16, %ymm10, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm10, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vpclmulqdq $0x01, %ymm11, %ymm14, %ymm5 + vpshufd $0x4e, %ymm11, %ymm11 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm11, %ymm12, %ymm12 + vpclmulqdq $0x01, %ymm12, %ymm14, %ymm5 + vpshufd $0x4e, %ymm12, %ymm12 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm12, %ymm13, %ymm13 + vextracti128 $0x01, %ymm13, %xmm0 + vpxor %xmm0, %xmm13, %xmm15 +L_AES_GCM_encrypt_update_vaes_after_128: + vmovdqu (%rsp), %xmm6 +L_AES_GCM_encrypt_update_vaes_done_128: + movl %r8d, %edx + cmpl %edx, %r14d + jge L_AES_GCM_encrypt_update_vaes_done_enc + movl %r8d, %r13d + andl $0xfffffff0, %r13d + cmpl %r13d, %r14d + jge L_AES_GCM_encrypt_update_vaes_last_block_done + vmovdqu (%r12), %xmm8 + vpshufb L_vaes_aes_gcm_bswap_epi64(%rip), %xmm8, %xmm7 + vpaddd L_vaes_aes_gcm_one(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, (%r12) + vpxor (%rdi), %xmm7, %xmm7 + vaesenc 16(%rdi), %xmm7, %xmm7 + vaesenc 32(%rdi), %xmm7, %xmm7 + vaesenc 48(%rdi), %xmm7, %xmm7 + vaesenc 64(%rdi), %xmm7, %xmm7 + vaesenc 80(%rdi), %xmm7, %xmm7 + vaesenc 96(%rdi), %xmm7, %xmm7 + vaesenc 112(%rdi), %xmm7, %xmm7 + vaesenc 128(%rdi), %xmm7, %xmm7 + vaesenc 144(%rdi), %xmm7, %xmm7 + cmpl $11, %esi + vmovdqa 160(%rdi), %xmm8 + jl L_AES_GCM_encrypt_update_vaes_aesenc_block_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 176(%rdi), %xmm7, %xmm7 + cmpl $13, %esi + vmovdqa 192(%rdi), %xmm8 + jl L_AES_GCM_encrypt_update_vaes_aesenc_block_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 208(%rdi), %xmm7, %xmm7 + vmovdqa 224(%rdi), %xmm8 +L_AES_GCM_encrypt_update_vaes_aesenc_block_last: + vaesenclast %xmm8, %xmm7, %xmm7 + 
vmovdqu (%r11,%r14,1), %xmm8 + vpxor %xmm8, %xmm7, %xmm7 + vmovdqu %xmm7, (%r10,%r14,1) + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + vpxor %xmm7, %xmm15, %xmm15 + addl $16, %r14d + cmpl %r13d, %r14d + jge L_AES_GCM_encrypt_update_vaes_last_block_ghash +L_AES_GCM_encrypt_update_vaes_last_block_start: + vmovdqu (%r11,%r14,1), %xmm12 + vmovdqu (%r12), %xmm8 + vpshufb L_vaes_aes_gcm_bswap_epi64(%rip), %xmm8, %xmm7 + vpaddd L_vaes_aes_gcm_one(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, (%r12) + vpxor (%rdi), %xmm7, %xmm7 + vpclmulqdq $16, %xmm6, %xmm15, %xmm9 + vaesenc 16(%rdi), %xmm7, %xmm7 + vaesenc 32(%rdi), %xmm7, %xmm7 + vpclmulqdq $0x01, %xmm6, %xmm15, %xmm10 + vaesenc 48(%rdi), %xmm7, %xmm7 + vaesenc 64(%rdi), %xmm7, %xmm7 + vpclmulqdq $0x00, %xmm6, %xmm15, %xmm11 + vaesenc 80(%rdi), %xmm7, %xmm7 + vpclmulqdq $0x11, %xmm6, %xmm15, %xmm1 + vaesenc 96(%rdi), %xmm7, %xmm7 + vpxor %xmm10, %xmm9, %xmm9 + vpslldq $8, %xmm9, %xmm2 + vpsrldq $8, %xmm9, %xmm9 + vaesenc 112(%rdi), %xmm7, %xmm7 + vpxor %xmm11, %xmm2, %xmm2 + vpxor %xmm9, %xmm1, %xmm3 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm0 + vpclmulqdq $16, %xmm0, %xmm2, %xmm10 + vaesenc 128(%rdi), %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm9 + vpxor %xmm10, %xmm9, %xmm9 + vpclmulqdq $16, %xmm0, %xmm9, %xmm10 + vaesenc 144(%rdi), %xmm7, %xmm7 + vpshufd $0x4e, %xmm9, %xmm9 + vpxor %xmm10, %xmm9, %xmm9 + vpxor %xmm3, %xmm9, %xmm15 + cmpl $11, %esi + vmovdqa 160(%rdi), %xmm8 + jl L_AES_GCM_encrypt_update_vaes_aesenc_gfmul_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 176(%rdi), %xmm7, %xmm7 + cmpl $13, %esi + vmovdqa 192(%rdi), %xmm8 + jl L_AES_GCM_encrypt_update_vaes_aesenc_gfmul_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 208(%rdi), %xmm7, %xmm7 + vmovdqa 224(%rdi), %xmm8 +L_AES_GCM_encrypt_update_vaes_aesenc_gfmul_last: + vaesenclast %xmm8, %xmm7, %xmm7 + vmovdqa %xmm12, %xmm0 + vpxor %xmm0, %xmm7, %xmm7 + vmovdqu %xmm7, (%r10,%r14,1) + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + addl $16, %r14d + 
vpxor %xmm7, %xmm15, %xmm15 + cmpl %r13d, %r14d + jl L_AES_GCM_encrypt_update_vaes_last_block_start +L_AES_GCM_encrypt_update_vaes_last_block_ghash: + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm6, %xmm15, %xmm7 + vpclmulqdq $0x01, %xmm6, %xmm15, %xmm8 + vpclmulqdq $16, %xmm6, %xmm15, %xmm9 + vpclmulqdq $0x11, %xmm6, %xmm15, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm15 +L_AES_GCM_encrypt_update_vaes_last_block_done: +L_AES_GCM_encrypt_update_vaes_done_enc: + vmovdqa %xmm15, (%r9) + vzeroupper + addq $0x210, %rsp + popq %rbx + popq %r15 + popq %r14 + popq %r12 + popq %r13 + repz retq +#ifndef __APPLE__ +.size AES_GCM_encrypt_update_vaes,.-AES_GCM_encrypt_update_vaes +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_GCM_encrypt_final_vaes +.type AES_GCM_encrypt_final_vaes,@function +.align 16 +AES_GCM_encrypt_final_vaes: # entry: XORs two lengths (scaled by 8) into the block at (%rdi), does one ghash_gfmul_red_avx multiply by the adjusted (%r9) value, XORs in the block at the first stack argument, then stores %edx bytes of the result to (%rsi) +#else +.section __TEXT,__text +.globl _AES_GCM_encrypt_final_vaes +.p2align 4 +_AES_GCM_encrypt_final_vaes: # same entry point under the Apple symbol name +#endif /* __APPLE__ */ + pushq %r13 # r13 is the byte temporary of the partial-length copy loop below + movl %edx, %eax # eax = edx on entry: number of result bytes to store + movl %ecx, %r10d # r10d = ecx on entry: first length (presumably in bytes, it is multiplied by 8 below) + movl %r8d, %r11d # r11d = r8d on entry: second length (presumably in bytes, it is multiplied by 8 below) + movq 16(%rsp), %r8 # r8 = first stack argument, located above the saved r13 and the return address + subq $16, %rsp # reserve a 16-byte scratch buffer used only by the partial-length path + vmovdqa (%rdi), %xmm4 # xmm4 = 16 bytes at (%rdi); aligned load + vmovdqa (%r9), %xmm5 # xmm5 = 16 bytes at (%r9); aligned load + vmovdqa (%r8), %xmm6 # xmm6 = 16 bytes at the stack-argument pointer; aligned load + vpsrlq $63, %xmm5, %xmm8 # this and the next 7 instructions: shift xmm5 left by one bit as a 128-bit value and XOR in the mod2_128 constant when bit 127 was set + vpsllq $0x01, %xmm5, %xmm7 # each qword shifted left by one + vpslldq $8, %xmm8, %xmm8 # move the bit carried out of the low qword into the high qword + vpor %xmm8, %xmm7, %xmm7 # xmm7 = xmm5 shifted left one bit across all 128 bits + vpshufd $0xff, %xmm5, %xmm5 # broadcast the top dword of the original value + vpsrad $31, %xmm5, %xmm5 # all-ones mask if the original top bit was set, else zero + vpand L_vaes_aes_gcm_mod2_128(%rip), %xmm5, %xmm5 # keep the constant only when the mask is set + vpxor %xmm7, %xmm5, %xmm5 # xmm5 = multiplier used by the multiply below + movl %r10d, %edx # 32-bit move, upper half of rdx is zeroed + movl %r11d, %ecx # 32-bit move, upper half of rcx is zeroed + shlq $3, %rdx # first length times 8 (presumably bytes to bits) + shlq $3, %rcx # second length times 8 (presumably bytes to bits) + vmovq %rdx, %xmm0 + vmovq %rcx, %xmm1 + vpunpcklqdq %xmm1, %xmm0, %xmm0 # xmm0 = rdx value in the low qword, rcx value in the high qword + vpxor %xmm0, %xmm4, %xmm4 # fold both scaled lengths into the block loaded from (%rdi) + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm5, %xmm4, %xmm7 # four 64x64 carry-less partial products of xmm4 and xmm5 follow + vpclmulqdq $0x01, %xmm5, %xmm4, %xmm8
+ vpclmulqdq $16, %xmm5, %xmm4, %xmm9 + vpclmulqdq $0x11, %xmm5, %xmm4, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 # combine the two cross products + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 # constant used by both folding steps below + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 # first fold: low product into the middle term + vpshufd $0x4e, %xmm7, %xmm7 # swap qword halves + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 # second fold: middle term into the high product + vpshufd $0x4e, %xmm8, %xmm8 # swap qword halves + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 # xmm4 = 128-bit result of the multiply + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4 # byte shuffle with the bswap_mask constant (presumably a full byte reversal; the table is not visible in this section) + vpxor %xmm6, %xmm4, %xmm0 # xmm0 = shuffled result XOR the block loaded from the stack-argument pointer + cmpl $16, %eax # a full 16-byte result takes the single-store path + je L_AES_GCM_encrypt_final_vaes_store_tag_16 + xorq %rcx, %rcx # rcx = byte index, starting at 0 + vmovdqu %xmm0, (%rsp) # spill the result to the scratch buffer so it can be copied one byte at a time +L_AES_GCM_encrypt_final_vaes_store_tag_loop: + movzbl (%rsp,%rcx,1), %r13d # load one byte from scratch + movb %r13b, (%rsi,%rcx,1) # store it to the output at the same index + incl %ecx + cmpl %eax, %ecx # bound is tested after the copy; NOTE(review): a length of 0 in eax would not stop the loop here, confirm callers never pass 0 + jne L_AES_GCM_encrypt_final_vaes_store_tag_loop + jmp L_AES_GCM_encrypt_final_vaes_store_tag_done +L_AES_GCM_encrypt_final_vaes_store_tag_16: + vmovdqu %xmm0, (%rsi) # unaligned 16-byte store of the whole result +L_AES_GCM_encrypt_final_vaes_store_tag_done: + vzeroupper # clear upper vector register halves before returning + addq $16, %rsp # release the scratch buffer + popq %r13 + repz retq +#ifndef __APPLE__ +.size AES_GCM_encrypt_final_vaes,.-AES_GCM_encrypt_final_vaes +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_GCM_decrypt_update_vaes +.type AES_GCM_decrypt_update_vaes,@function +.align 16 +AES_GCM_decrypt_update_vaes: +#else +.section __TEXT,__text +.globl _AES_GCM_decrypt_update_vaes +.p2align 4 +_AES_GCM_decrypt_update_vaes: +#endif /* __APPLE__ */ + pushq %r13 + pushq %r12 + pushq %r14 + pushq %r15 + pushq %rbx + movq %rdx, %r10 + movq %rcx, %r11 + movq 48(%rsp), %rax + movq 56(%rsp), %r12 + subq $0x210, %rsp + vmovdqa (%r9), %xmm15 + vmovdqa (%rax), %xmm6 + vpsrlq $63, %xmm6, %xmm8 + vpsllq $0x01, %xmm6, %xmm7 + vpslldq $8, %xmm8, %xmm8 + vpor %xmm8, %xmm7, %xmm7 + vpshufd $0xff, %xmm6, %xmm6 + vpsrad $31, %xmm6, %xmm6 + vpand L_vaes_aes_gcm_mod2_128(%rip), %xmm6, %xmm6 + vpxor %xmm7, %xmm6, %xmm6 + xorl %r14d, %r14d + cmpl $0x80, %r8d + jl L_AES_GCM_decrypt_update_vaes_done_128 + vmovdqa %xmm15, %xmm2 + # H ^ 1 + 
vmovdqu %xmm6, (%rsp) + # H ^ 2 + vpclmulqdq $0x00, %xmm6, %xmm6, %xmm7 + vpclmulqdq $0x11, %xmm6, %xmm6, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm0 + vmovdqu %xmm0, 16(%rsp) + # H ^ 3 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm6, %xmm0, %xmm7 + vpclmulqdq $0x01, %xmm6, %xmm0, %xmm8 + vpclmulqdq $16, %xmm6, %xmm0, %xmm9 + vpclmulqdq $0x11, %xmm6, %xmm0, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm1 + vmovdqu %xmm1, 32(%rsp) + # H ^ 4 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm3 + vmovdqu %xmm3, 48(%rsp) + # H ^ 5 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd 
$0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 64(%rsp) + # H ^ 6 + vpclmulqdq $0x00, %xmm1, %xmm1, %xmm7 + vpclmulqdq $0x11, %xmm1, %xmm1, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 80(%rsp) + # H ^ 7 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm1, %xmm3, %xmm7 + vpclmulqdq $0x01, %xmm1, %xmm3, %xmm8 + vpclmulqdq $16, %xmm1, %xmm3, %xmm9 + vpclmulqdq $0x11, %xmm1, %xmm3, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 96(%rsp) + # H ^ 8 + vpclmulqdq $0x00, %xmm3, %xmm3, %xmm7 + vpclmulqdq $0x11, %xmm3, %xmm3, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 112(%rsp) + cmpl $0x100, %r8d + jl L_AES_GCM_decrypt_update_vaes_no_ext + # H ^ 9 + vmovdqu 48(%rsp), %xmm0 + vmovdqu 64(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + 
vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 128(%rsp) + # H ^ 10 + vmovdqu 64(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 144(%rsp) + # H ^ 11 + vmovdqu 64(%rsp), %xmm0 + vmovdqu 80(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 160(%rsp) + # H ^ 12 + vmovdqu 80(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 176(%rsp) + # 
H ^ 13 + vmovdqu 80(%rsp), %xmm0 + vmovdqu 96(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 192(%rsp) + # H ^ 14 + vmovdqu 96(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 208(%rsp) + # H ^ 15 + vmovdqu 96(%rsp), %xmm0 + vmovdqu 112(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 224(%rsp) + # H ^ 16 + vmovdqu 112(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 
+ vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 240(%rsp) + vmovdqu 224(%rsp), %ymm7 + vpermq $0x4e, %ymm7, %ymm7 + vmovdqu 192(%rsp), %ymm8 + vpermq $0x4e, %ymm8, %ymm8 + vmovdqu 160(%rsp), %ymm9 + vpermq $0x4e, %ymm9, %ymm9 + vmovdqu 128(%rsp), %ymm10 + vpermq $0x4e, %ymm10, %ymm10 + vmovdqu %ymm7, 256(%rsp) + vmovdqu %ymm8, 288(%rsp) + vmovdqu %ymm9, 320(%rsp) + vmovdqu %ymm10, 352(%rsp) + vmovdqu 96(%rsp), %ymm7 + vpermq $0x4e, %ymm7, %ymm7 + vmovdqu 64(%rsp), %ymm8 + vpermq $0x4e, %ymm8, %ymm8 + vmovdqu 32(%rsp), %ymm9 + vpermq $0x4e, %ymm9, %ymm9 + vmovdqu (%rsp), %ymm10 + vpermq $0x4e, %ymm10, %ymm10 + vmovdqu %ymm7, 384(%rsp) + vmovdqu %ymm8, 416(%rsp) + vmovdqu %ymm9, 448(%rsp) + vmovdqu %ymm10, 480(%rsp) +L_AES_GCM_decrypt_update_vaes_no_ext: + vbroadcasti128 L_vaes_aes_gcm_mod2_128(%rip), %ymm14 + cmpl $0x100, %r8d + jl L_AES_GCM_decrypt_update_vaes_after_256 + movl %r8d, %r13d + andl $0xffffff00, %r13d +L_AES_GCM_decrypt_update_vaes_loop_256: + # 256 bytes of input + leaq (%r11,%r14,1), %rbx + vbroadcasti128 L_vaes_aes_gcm_bswap_mask(%rip), %ymm6 + vpxor %ymm4, %ymm4, %ymm4 + vinserti128 $0x00, %xmm15, %ymm4, %ymm4 + vmovdqu 256(%rsp), %ymm7 + vmovdqu 288(%rsp), %ymm8 + vmovdqu 320(%rsp), %ymm9 + vmovdqu 352(%rsp), %ymm10 + vmovdqu (%rbx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpxor %ymm4, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm7, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm7, %ymm5, %ymm1 + vpclmulqdq $16, %ymm7, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm7, %ymm5, %ymm3 + vmovdqa %ymm0, %ymm11 + vpxor %ymm1, %ymm2, %ymm12 + vmovdqa %ymm3, %ymm13 + vmovdqu 32(%rbx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm8, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm8, %ymm5, %ymm1 + vpclmulqdq $16, %ymm8, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm8, %ymm5, %ymm3 + 
vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 64(%rbx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm9, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm9, %ymm5, %ymm1 + vpclmulqdq $16, %ymm9, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm9, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 96(%rbx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm10, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm10, %ymm5, %ymm1 + vpclmulqdq $16, %ymm10, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm10, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 384(%rsp), %ymm7 + vmovdqu 416(%rsp), %ymm8 + vmovdqu 448(%rsp), %ymm9 + vmovdqu 480(%rsp), %ymm10 + vmovdqu 128(%rbx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm7, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm7, %ymm5, %ymm1 + vpclmulqdq $16, %ymm7, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm7, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 160(%rbx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm8, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm8, %ymm5, %ymm1 + vpclmulqdq $16, %ymm8, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm8, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 192(%rbx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm9, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm9, %ymm5, %ymm1 + vpclmulqdq $16, %ymm9, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm9, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 224(%rbx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm10, %ymm5, 
%ymm0 + vpclmulqdq $0x01, %ymm10, %ymm5, %ymm1 + vpclmulqdq $16, %ymm10, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm10, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vpclmulqdq $0x01, %ymm11, %ymm14, %ymm5 + vpshufd $0x4e, %ymm11, %ymm11 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm11, %ymm12, %ymm12 + vpclmulqdq $0x01, %ymm12, %ymm14, %ymm5 + vpshufd $0x4e, %ymm12, %ymm12 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm12, %ymm13, %ymm13 + vextracti128 $0x01, %ymm13, %xmm0 + vpxor %xmm0, %xmm13, %xmm15 + vbroadcasti128 L_vaes_aes_gcm_bswap_epi64(%rip), %ymm6 + vbroadcasti128 (%r12), %ymm4 + vpaddd L_vaes_aes_gcm_inc_y0(%rip), %ymm4, %ymm0 + vpshufb %ymm6, %ymm0, %ymm0 + vpaddd L_vaes_aes_gcm_inc_y1(%rip), %ymm4, %ymm1 + vpshufb %ymm6, %ymm1, %ymm1 + vpaddd L_vaes_aes_gcm_inc_y2(%rip), %ymm4, %ymm2 + vpshufb %ymm6, %ymm2, %ymm2 + vpaddd L_vaes_aes_gcm_inc_y3(%rip), %ymm4, %ymm3 + vpshufb %ymm6, %ymm3, %ymm3 + vmovdqu (%r12), %xmm7 + vpaddd L_vaes_aes_gcm_eight(%rip), %xmm7, %xmm7 + vmovdqu %xmm7, (%r12) + vbroadcasti128 (%rdi), %ymm4 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm4, %ymm2, %ymm2 + vpxor %ymm4, %ymm3, %ymm3 + vbroadcasti128 16(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 32(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 48(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 64(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 80(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, 
%ymm3 + vbroadcasti128 96(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 112(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 128(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 144(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $11, %esi + vbroadcasti128 160(%rdi), %ymm4 + jl L_AES_GCM_decrypt_update_vaes_l1_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 176(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $13, %esi + vbroadcasti128 192(%rdi), %ymm4 + jl L_AES_GCM_decrypt_update_vaes_l1_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 208(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 224(%rdi), %ymm4 +L_AES_GCM_decrypt_update_vaes_l1_vaes_ctr8_last: + vaesenclast %ymm4, %ymm0, %ymm0 + vaesenclast %ymm4, %ymm1, %ymm1 + vaesenclast %ymm4, %ymm2, %ymm2 + vaesenclast %ymm4, %ymm3, %ymm3 + leaq (%r11,%r14,1), %rcx + leaq (%r10,%r14,1), %rdx + vmovdqu (%rcx), %ymm5 + vpxor %ymm5, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vmovdqu 32(%rcx), %ymm5 + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vmovdqu 64(%rcx), %ymm5 + vpxor %ymm5, %ymm2, %ymm2 + vmovdqu %ymm2, 64(%rdx) + vmovdqu 96(%rcx), %ymm5 + vpxor %ymm5, %ymm3, %ymm3 + vmovdqu %ymm3, 96(%rdx) + addl $0x80, %r14d + 
vbroadcasti128 (%r12), %ymm4 + vpaddd L_vaes_aes_gcm_inc_y0(%rip), %ymm4, %ymm0 + vpshufb %ymm6, %ymm0, %ymm0 + vpaddd L_vaes_aes_gcm_inc_y1(%rip), %ymm4, %ymm1 + vpshufb %ymm6, %ymm1, %ymm1 + vpaddd L_vaes_aes_gcm_inc_y2(%rip), %ymm4, %ymm2 + vpshufb %ymm6, %ymm2, %ymm2 + vpaddd L_vaes_aes_gcm_inc_y3(%rip), %ymm4, %ymm3 + vpshufb %ymm6, %ymm3, %ymm3 + vmovdqu (%r12), %xmm7 + vpaddd L_vaes_aes_gcm_eight(%rip), %xmm7, %xmm7 + vmovdqu %xmm7, (%r12) + vbroadcasti128 (%rdi), %ymm4 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm4, %ymm2, %ymm2 + vpxor %ymm4, %ymm3, %ymm3 + vbroadcasti128 16(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 32(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 48(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 64(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 80(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 96(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 112(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 128(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 144(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $11, %esi + 
vbroadcasti128 160(%rdi), %ymm4 + jl L_AES_GCM_decrypt_update_vaes_l2_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 176(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $13, %esi + vbroadcasti128 192(%rdi), %ymm4 + jl L_AES_GCM_decrypt_update_vaes_l2_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 208(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 224(%rdi), %ymm4 +L_AES_GCM_decrypt_update_vaes_l2_vaes_ctr8_last: + vaesenclast %ymm4, %ymm0, %ymm0 + vaesenclast %ymm4, %ymm1, %ymm1 + vaesenclast %ymm4, %ymm2, %ymm2 + vaesenclast %ymm4, %ymm3, %ymm3 + leaq (%r11,%r14,1), %rcx + leaq (%r10,%r14,1), %rdx + vmovdqu (%rcx), %ymm5 + vpxor %ymm5, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vmovdqu 32(%rcx), %ymm5 + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vmovdqu 64(%rcx), %ymm5 + vpxor %ymm5, %ymm2, %ymm2 + vmovdqu %ymm2, 64(%rdx) + vmovdqu 96(%rcx), %ymm5 + vpxor %ymm5, %ymm3, %ymm3 + vmovdqu %ymm3, 96(%rdx) + addl $0x80, %r14d + cmpl %r13d, %r14d + jl L_AES_GCM_decrypt_update_vaes_loop_256 +L_AES_GCM_decrypt_update_vaes_after_256: + vmovdqu 96(%rsp), %ymm7 + vpermq $0x4e, %ymm7, %ymm7 + vmovdqu 64(%rsp), %ymm8 + vpermq $0x4e, %ymm8, %ymm8 + vmovdqu 32(%rsp), %ymm9 + vpermq $0x4e, %ymm9, %ymm9 + vmovdqu (%rsp), %ymm10 + vpermq $0x4e, %ymm10, %ymm10 + movl %r8d, %r13d + andl $0xffffff80, %r13d + cmpl %r13d, %r14d + jge L_AES_GCM_decrypt_update_vaes_after_128 + # 128 bytes of input + leaq (%r11,%r14,1), %rbx + vbroadcasti128 L_vaes_aes_gcm_bswap_mask(%rip), %ymm6 + vpxor %ymm4, %ymm4, %ymm4 + vinserti128 $0x00, %xmm15, %ymm4, %ymm4 + vmovdqu (%rbx), %ymm5 + vpshufb 
%ymm6, %ymm5, %ymm5 + vpxor %ymm4, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm7, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm7, %ymm5, %ymm1 + vpclmulqdq $16, %ymm7, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm7, %ymm5, %ymm3 + vmovdqa %ymm0, %ymm11 + vpxor %ymm1, %ymm2, %ymm12 + vmovdqa %ymm3, %ymm13 + vmovdqu 32(%rbx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm8, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm8, %ymm5, %ymm1 + vpclmulqdq $16, %ymm8, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm8, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 64(%rbx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm9, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm9, %ymm5, %ymm1 + vpclmulqdq $16, %ymm9, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm9, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 96(%rbx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm10, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm10, %ymm5, %ymm1 + vpclmulqdq $16, %ymm10, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm10, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vpclmulqdq $0x01, %ymm11, %ymm14, %ymm5 + vpshufd $0x4e, %ymm11, %ymm11 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm11, %ymm12, %ymm12 + vpclmulqdq $0x01, %ymm12, %ymm14, %ymm5 + vpshufd $0x4e, %ymm12, %ymm12 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm12, %ymm13, %ymm13 + vextracti128 $0x01, %ymm13, %xmm0 + vpxor %xmm0, %xmm13, %xmm15 + vbroadcasti128 L_vaes_aes_gcm_bswap_epi64(%rip), %ymm6 + vbroadcasti128 (%r12), %ymm4 + vpaddd L_vaes_aes_gcm_inc_y0(%rip), %ymm4, %ymm0 + vpshufb %ymm6, %ymm0, %ymm0 + vpaddd L_vaes_aes_gcm_inc_y1(%rip), %ymm4, %ymm1 + vpshufb %ymm6, %ymm1, %ymm1 + vpaddd L_vaes_aes_gcm_inc_y2(%rip), %ymm4, %ymm2 + vpshufb %ymm6, %ymm2, %ymm2 + vpaddd L_vaes_aes_gcm_inc_y3(%rip), 
%ymm4, %ymm3 + vpshufb %ymm6, %ymm3, %ymm3 + vmovdqu (%r12), %xmm7 + vpaddd L_vaes_aes_gcm_eight(%rip), %xmm7, %xmm7 + vmovdqu %xmm7, (%r12) + vbroadcasti128 (%rdi), %ymm4 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm4, %ymm2, %ymm2 + vpxor %ymm4, %ymm3, %ymm3 + vbroadcasti128 16(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 32(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 48(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 64(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 80(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 96(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 112(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 128(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 144(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $11, %esi + vbroadcasti128 160(%rdi), %ymm4 + jl L_AES_GCM_decrypt_update_vaes_t_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 176(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, 
%ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $13, %esi + vbroadcasti128 192(%rdi), %ymm4 + jl L_AES_GCM_decrypt_update_vaes_t_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 208(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 224(%rdi), %ymm4 +L_AES_GCM_decrypt_update_vaes_t_vaes_ctr8_last: + vaesenclast %ymm4, %ymm0, %ymm0 + vaesenclast %ymm4, %ymm1, %ymm1 + vaesenclast %ymm4, %ymm2, %ymm2 + vaesenclast %ymm4, %ymm3, %ymm3 + leaq (%r11,%r14,1), %rcx + leaq (%r10,%r14,1), %rdx + vmovdqu (%rcx), %ymm5 + vpxor %ymm5, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vmovdqu 32(%rcx), %ymm5 + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vmovdqu 64(%rcx), %ymm5 + vpxor %ymm5, %ymm2, %ymm2 + vmovdqu %ymm2, 64(%rdx) + vmovdqu 96(%rcx), %ymm5 + vpxor %ymm5, %ymm3, %ymm3 + vmovdqu %ymm3, 96(%rdx) + addl $0x80, %r14d +L_AES_GCM_decrypt_update_vaes_after_128: + vmovdqu (%rsp), %xmm6 +L_AES_GCM_decrypt_update_vaes_done_128: + movl %r8d, %edx + cmpl %edx, %r14d + jge L_AES_GCM_decrypt_update_vaes_done_dec + movl %r8d, %r13d + andl $0xfffffff0, %r13d + cmpl %r13d, %r14d + jge L_AES_GCM_decrypt_update_vaes_last_block_done +L_AES_GCM_decrypt_update_vaes_last_block_start: + vmovdqu (%r11,%r14,1), %xmm12 + vmovdqa %xmm6, %xmm0 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm12, %xmm1 + vpxor %xmm15, %xmm1, %xmm1 + vmovdqu (%r12), %xmm8 + vpshufb L_vaes_aes_gcm_bswap_epi64(%rip), %xmm8, %xmm7 + vpaddd L_vaes_aes_gcm_one(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, (%r12) + vpxor (%rdi), %xmm7, %xmm7 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vaesenc 16(%rdi), %xmm7, %xmm7 + vaesenc 32(%rdi), %xmm7, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm10 + vaesenc 48(%rdi), %xmm7, %xmm7 + vaesenc 64(%rdi), %xmm7, %xmm7 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm11 + vaesenc 80(%rdi), %xmm7, %xmm7 + 
vpclmulqdq $0x11, %xmm0, %xmm1, %xmm1 /* Excerpt starts inside the one-block-at-a-time loop of
+ * AES_GCM_decrypt_update_vaes (named by the .size directive below). This is
+ * the last of the four partial products of the hash multiply; the AES
+ * rounds for the current counter block (xmm7) are interleaved with it. */
+    vaesenc 96(%rdi), %xmm7, %xmm7 /* AES round, round key at offset 96 of the schedule in rdi */
+    vpxor %xmm10, %xmm9, %xmm9 /* combine the two middle products */
+    vpslldq $8, %xmm9, %xmm2 /* low half of the middle term, moved up */
+    vpsrldq $8, %xmm9, %xmm9 /* high half of the middle term, moved down */
+    vaesenc 112(%rdi), %xmm7, %xmm7
+    vpxor %xmm11, %xmm2, %xmm2 /* xmm2 = low 128 bits of the 256-bit product */
+    vpxor %xmm9, %xmm1, %xmm3 /* xmm3 = high 128 bits of the 256-bit product */
+    vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm0 /* constant used by the two folding steps below */
+    vpclmulqdq $16, %xmm0, %xmm2, %xmm10 /* first fold */
+    vaesenc 128(%rdi), %xmm7, %xmm7
+    vpshufd $0x4e, %xmm2, %xmm9 /* swap the 64-bit halves */
+    vpxor %xmm10, %xmm9, %xmm9
+    vpclmulqdq $16, %xmm0, %xmm9, %xmm10 /* second fold */
+    vaesenc 144(%rdi), %xmm7, %xmm7
+    vpshufd $0x4e, %xmm9, %xmm9
+    vpxor %xmm10, %xmm9, %xmm9
+    vpxor %xmm3, %xmm9, %xmm15 /* xmm15 = updated hash accumulator (written to (%r9) on exit) */
+    cmpl $11, %esi /* esi selects the last round key: <11 -> 160, <13 -> 192, else 224 (presumably the AES round count 10/12/14) */
+    vmovdqa 160(%rdi), %xmm8
+    jl L_AES_GCM_decrypt_update_vaes_aesenc_gfmul_last
+    vaesenc %xmm8, %xmm7, %xmm7
+    vaesenc 176(%rdi), %xmm7, %xmm7
+    cmpl $13, %esi
+    vmovdqa 192(%rdi), %xmm8
+    jl L_AES_GCM_decrypt_update_vaes_aesenc_gfmul_last
+    vaesenc %xmm8, %xmm7, %xmm7
+    vaesenc 208(%rdi), %xmm7, %xmm7
+    vmovdqa 224(%rdi), %xmm8
+L_AES_GCM_decrypt_update_vaes_aesenc_gfmul_last:
+    vaesenclast %xmm8, %xmm7, %xmm7 /* final AES round: xmm7 = encrypted counter block */
+    vmovdqa %xmm12, %xmm0 /* xmm12 = input block loaded at the top of the loop */
+    vpxor %xmm0, %xmm7, %xmm7 /* encrypted counter XOR input block */
+    vmovdqu %xmm7, (%r10,%r14,1) /* store 16 output bytes at r10 + r14 */
+    addl $16, %r14d /* advance the byte offset */
+    cmpl %r13d, %r14d
+    jl L_AES_GCM_decrypt_update_vaes_last_block_start /* loop while offset < r13d (r8d rounded down to a multiple of 16 before the loop) */
+L_AES_GCM_decrypt_update_vaes_last_block_done:
+L_AES_GCM_decrypt_update_vaes_done_dec:
+    vmovdqa %xmm15, (%r9) /* write back the hash accumulator; aligned store, so (%r9) must be 16-byte aligned */
+    vzeroupper
+    addq $0x210, %rsp /* release 0x210 bytes of stack (the matching allocation is above this excerpt) */
+    popq %rbx /* restore saved registers */
+    popq %r15
+    popq %r14
+    popq %r12
+    popq %r13
+    repz retq
+#ifndef __APPLE__
+.size AES_GCM_decrypt_update_vaes,.-AES_GCM_decrypt_update_vaes
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl AES_GCM_decrypt_final_vaes
+.type AES_GCM_decrypt_final_vaes,@function
+.align 16
+AES_GCM_decrypt_final_vaes:
+#else
+.section __TEXT,__text
+.globl _AES_GCM_decrypt_final_vaes
+.p2align 4
+_AES_GCM_decrypt_final_vaes:
+#endif /* __APPLE__ */ /* AES_GCM_decrypt_final_vaes: fold the length block into the hash, do one
+ * more hash multiply, XOR in a caller-supplied 16-byte block to form the
+ * expected tag, and compare it with the caller's tag.
+ *
+ * Inputs as read by the code (System V AMD64 argument order assumed):
+ *   rdi       16-byte hash state, loaded into xmm6 (aligned load)
+ *   rsi       tag bytes to compare against
+ *   edx       number of tag bytes to compare (copied to eax)
+ *   ecx, r8d  two byte counts; each is multiplied by 8 and placed in the
+ *             low (ecx) and high (r8d) quadword of the length block
+ *   r9        16-byte multiplier, loaded into xmm5 (aligned load)
+ *   32(%rsp)  after the three pushes: pointer to the 16-byte block that is
+ *             XORed into the final hash (aligned load)
+ *   40(%rsp)  after the three pushes: pointer to a 32-bit result
+ * Output: the 32-bit result is set to 1 when the compared bytes all match
+ * and to 0 otherwise.
+ *
+ * NOTE(review): which count is the ciphertext length and which is the AAD
+ * length is not visible here - confirm against the C prototype.
+ * NOTE(review): the three vmovdqa loads fault on addresses that are not
+ * 16-byte aligned - confirm the callers guarantee alignment.
+ */
+    pushq %r13 /* save the registers used as scratch below */
+    pushq %rbp
+    pushq %r12
+    movl %edx, %eax /* eax = number of tag bytes to compare */
+    movl %ecx, %r10d /* first byte count */
+    movl %r8d, %r11d /* second byte count; r8 is reused for a pointer next */
+    movq 32(%rsp), %r8 /* first stack argument (3 pushes + return address = 32 bytes) */
+    movq 40(%rsp), %rbp /* second stack argument: result pointer */
+    subq $16, %rsp /* 16 bytes reserved here and released before return; not referenced in between */
+    vmovdqa (%rdi), %xmm6 /* hash state */
+    vmovdqa (%r9), %xmm5 /* multiplier */
+    vmovdqa (%r8), %xmm15 /* block XORed into the final hash */
+    vpsrlq $63, %xmm5, %xmm8 /* top bit of each 64-bit half */
+    vpsllq $0x01, %xmm5, %xmm7 /* each half shifted left by one */
+    vpslldq $8, %xmm8, %xmm8 /* carry from the low half moves into the high half */
+    vpor %xmm8, %xmm7, %xmm7 /* xmm7 = xmm5 << 1 as a 128-bit value */
+    vpshufd $0xff, %xmm5, %xmm5 /* broadcast the top dword */
+    vpsrad $31, %xmm5, %xmm5 /* all ones if bit 127 was set, else zero */
+    vpand L_vaes_aes_gcm_mod2_128(%rip), %xmm5, %xmm5 /* select the constant only when bit 127 was set */
+    vpxor %xmm7, %xmm5, %xmm5 /* xmm5 = (multiplier << 1) ^ conditional constant */
+    movl %r10d, %edx /* 32-bit move zero-extends into rdx */
+    movl %r11d, %ecx
+    shlq $3, %rdx /* bytes -> bits, done in 64 bits so no overflow */
+    shlq $3, %rcx
+    vmovq %rdx, %xmm0
+    vmovq %rcx, %xmm1
+    vpunpcklqdq %xmm1, %xmm0, %xmm0 /* xmm0 = { low: first count in bits, high: second count in bits } */
+    vpxor %xmm0, %xmm6, %xmm6 /* fold the length block into the hash state */
+    # ghash_gfmul_red_avx
+    vpclmulqdq $0x00, %xmm5, %xmm6, %xmm7 /* four 64x64 carry-less partial products */
+    vpclmulqdq $0x01, %xmm5, %xmm6, %xmm8
+    vpclmulqdq $16, %xmm5, %xmm6, %xmm9
+    vpclmulqdq $0x11, %xmm5, %xmm6, %xmm10
+    vpxor %xmm9, %xmm8, %xmm8 /* sum of the two middle products */
+    vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 /* constant for the two folding steps */
+    vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 /* first fold */
+    vpshufd $0x4e, %xmm7, %xmm7 /* swap the 64-bit halves */
+    vpxor %xmm11, %xmm8, %xmm8
+    vpxor %xmm7, %xmm8, %xmm8
+    vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 /* second fold */
+    vpshufd $0x4e, %xmm8, %xmm8
+    vpxor %xmm11, %xmm10, %xmm10
+    vpxor %xmm8, %xmm10, %xmm10 /* xmm10 = reduced 128-bit product */
+    vmovdqa %xmm10, %xmm6
+    vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm6, %xmm6 /* byte shuffle; mask is defined outside this excerpt (presumably a full 16-byte reversal) */
+    vpxor %xmm15, %xmm6, %xmm0 /* xmm0 = expected tag */
+    cmpl $16, %eax
+    je L_AES_GCM_decrypt_final_vaes_cmp_tag_16 /* full 16-byte tag: vector compare */
+    subq $16, %rsp /* scratch for the byte-wise compare */
+    xorq %rcx, %rcx /* rcx = byte index */
+    xorq %r12, %r12 /* r12b accumulates differences */
+    vmovdqu %xmm0, (%rsp) /* spill the expected tag */
+L_AES_GCM_decrypt_final_vaes_cmp_tag_loop:
+    movzbl (%rsp,%rcx,1), %r13d /* expected byte */
+    xorb (%rsi,%rcx,1), %r13b /* non-zero if it differs from the caller's byte */
+    orb %r13b, %r12b /* accumulate; no early exit on mismatch */
+    incl %ecx
+    cmpl %eax, %ecx
+    jne L_AES_GCM_decrypt_final_vaes_cmp_tag_loop /* NOTE(review): the body runs before the count is tested, so eax == 0 is not handled here - presumably rejected by the caller */
+    cmpb $0x00, %r12b
+    sete %r12b /* r12 = 1 if no byte differed, else 0 (upper bits already zero) */
+    addq $16, %rsp /* drop the scratch */
+    xorq %rcx, %rcx /* clear the byte index */
+    jmp L_AES_GCM_decrypt_final_vaes_cmp_tag_done
+L_AES_GCM_decrypt_final_vaes_cmp_tag_16:
+    vmovdqu (%rsi), %xmm1 /* caller's 16-byte tag (unaligned load) */
+    vpcmpeqb %xmm1, %xmm0, %xmm0 /* 0xff in every byte position that matches */
+    vpmovmskb %xmm0, %rdx /* one mask bit per byte */
+    # %%edx == 0xFFFF then return 1 else => return 0
+    xorl %r12d, %r12d
+    cmpl $0xffff, %edx
+    sete %r12b
+L_AES_GCM_decrypt_final_vaes_cmp_tag_done:
+    movl %r12d, (%rbp) /* store the 32-bit result */
+    vzeroupper
+    addq $16, %rsp /* release the 16 bytes reserved at entry */
+    popq %r12
+    popq %rbp
+    popq %r13
+    repz retq
+#ifndef __APPLE__
+.size AES_GCM_decrypt_final_vaes,.-AES_GCM_decrypt_final_vaes
+#endif /* __APPLE__ */
+#endif /* WOLFSSL_AESGCM_STREAM */
+#endif /* HAVE_INTEL_VAES */
+#ifdef HAVE_INTEL_AVX512
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 32
+#else
+.p2align 5
+#endif /* __APPLE__ */
+L_avx512_aes_gcm_inc_z0: /* 64-byte table, 32-byte aligned. Each .quad pair is one 16-byte entry
+ * whose first quadword is 0 and whose second quadword is the addend; this
+ * table holds addends 0..3.
+ * NOTE(review): presumably added to a broadcast counter to form consecutive
+ * counter blocks - the use site is outside this excerpt. */
+.quad 0x0000000000000000,0x0000000000000000
+.quad 0x0000000000000000,0x0000000000000001
+.quad 0x0000000000000000,0x0000000000000002
+.quad 0x0000000000000000,0x0000000000000003
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 32
+#else
+.p2align 5
+#endif /* __APPLE__ */
+L_avx512_aes_gcm_inc_z1: /* same layout as inc_z0, addends 4..7 */
+.quad 0x0000000000000000,0x0000000000000004
+.quad 0x0000000000000000,0x0000000000000005
+.quad 0x0000000000000000,0x0000000000000006
+.quad 0x0000000000000000,0x0000000000000007
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 32
+#else
+.p2align 5
+#endif /* __APPLE__ */
+L_avx512_aes_gcm_inc_z2: /* same layout as inc_z0, addends 8..11 */
+.quad 0x0000000000000000,0x0000000000000008
+.quad 0x0000000000000000,0x0000000000000009
+.quad 0x0000000000000000,0x000000000000000a
+.quad 0x0000000000000000,0x000000000000000b
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 32
+#else
+.p2align 5
+#endif /* __APPLE__ */
+L_avx512_aes_gcm_inc_z3: /* same layout as inc_z0, addends 12..15 */
+.quad 0x0000000000000000,0x000000000000000c
+.quad 0x0000000000000000,0x000000000000000d
+.quad 0x0000000000000000,0x000000000000000e
+.quad 0x0000000000000000,0x000000000000000f
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_avx512_aes_gcm_sixteen: /* single 16-byte entry with addend 16 in the second quadword (one past the last
+ * addend of inc_z3); use site is outside this excerpt */
+.quad 0x0000000000000000,0x0000000000000010
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_GCM_generate_m0_avx512_rev8: /* vpshufb control: byte i of the mask is 15 - i, i.e. a full 16-byte
+ * reversal; loaded into xmm9 by GCM_generate_m0_avx512 */
+.quad 0x08090a0b0c0d0e0f,0x0001020304050607
+#ifndef
__APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_GCM_generate_m0_avx512_mod2_128: /* 128-bit constant with 0xe1 in its most significant byte and zero
+ * elsewhere; XORed in when a 1 bit is shifted out below */
+.quad 0x0000000000000000,0xe100000000000000
+#ifndef __APPLE__
+.text
+.globl GCM_generate_m0_avx512
+.type GCM_generate_m0_avx512,@function
+.align 16
+GCM_generate_m0_avx512:
+#else
+.section __TEXT,__text
+.globl _GCM_generate_m0_avx512
+.p2align 4
+_GCM_generate_m0_avx512:
+#endif /* __APPLE__ */ /* GCM_generate_m0_avx512: build a table of 32 sixteen-byte entries
+ * (512 bytes) at (%rsi) from the 16-byte block at (%rdi).
+ *
+ * Let V0 be the input block and V1, V2, V3 the results of applying the
+ * "shift step" one, two and three times. The shift step works on the
+ * byte-reversed value: shift right by one bit and, when the bit shifted
+ * out was 1, XOR with the mod2_128 constant (0xe1 in the top byte). This
+ * matches the multiply-by-x step of GCM.
+ *
+ *   entries 0..15  (offsets 0..240): entry i is the XOR of
+ *       V0 if (i & 8), V1 if (i & 4), V2 if (i & 2), V3 if (i & 1);
+ *       entry 0 is all zero.
+ *   entries 16..31 (offsets 256..496): entry 16+i is entry i shifted right
+ *       by four bits as a byte-reversed 128-bit value (the four bits
+ *       shifted out are dropped, no constant is XORed in).
+ *
+ * Uses only xmm0-xmm10 and unaligned loads/stores; no stack and no
+ * general-purpose registers other than rdi and rsi.
+ */
+    vmovdqu L_GCM_generate_m0_avx512_rev8(%rip), %xmm9 /* xmm9 = 16-byte reversal mask */
+    vmovdqu L_GCM_generate_m0_avx512_mod2_128(%rip), %xmm10 /* xmm10 = conditional XOR constant */
+    vpxor %xmm8, %xmm8, %xmm8
+    vmovdqu (%rdi), %xmm0 /* input block */
+    vmovdqu %xmm8, (%rsi) /* entry 0 = 0 */
+    vmovdqu %xmm0, %xmm8 /* NOTE(review): xmm8 is overwritten before it is next read, so this copy appears unused */
+    vpshufb %xmm9, %xmm0, %xmm0 /* work on the byte-reversed value */
+    vpsllq $63, %xmm0, %xmm5 /* shift step #1: bit 0 of each half moved to bit 63 */
+    vpsrlq $0x01, %xmm0, %xmm4 /* each half shifted right by one */
+    vpslldq $8, %xmm5, %xmm1 /* bit 0 of the low half -> top of the register */
+    vpsrldq $8, %xmm5, %xmm5 /* bit 0 of the high half -> top of the low half */
+    vpshufd $0xff, %xmm1, %xmm1 /* broadcast the dword holding the shifted-out bit */
+    vpor %xmm5, %xmm4, %xmm4 /* xmm4 = value >> 1 across all 128 bits */
+    vpsrad $31, %xmm1, %xmm1 /* all ones if the shifted-out bit was 1 */
+    vpand %xmm10, %xmm1, %xmm1
+    vpxor %xmm4, %xmm1, %xmm1 /* xmm1 = V1 (byte-reversed) */
+    vpsllq $63, %xmm1, %xmm5 /* shift step #2, same sequence */
+    vpsrlq $0x01, %xmm1, %xmm4
+    vpslldq $8, %xmm5, %xmm2
+    vpsrldq $8, %xmm5, %xmm5
+    vpshufd $0xff, %xmm2, %xmm2
+    vpor %xmm5, %xmm4, %xmm4
+    vpsrad $31, %xmm2, %xmm2
+    vpand %xmm10, %xmm2, %xmm2
+    vpxor %xmm4, %xmm2, %xmm2 /* xmm2 = V2 (byte-reversed) */
+    vpsllq $63, %xmm2, %xmm5 /* shift step #3, same sequence */
+    vpsrlq $0x01, %xmm2, %xmm4
+    vpslldq $8, %xmm5, %xmm3
+    vpsrldq $8, %xmm5, %xmm5
+    vpshufd $0xff, %xmm3, %xmm3
+    vpor %xmm5, %xmm4, %xmm4
+    vpsrad $31, %xmm3, %xmm3
+    vpand %xmm10, %xmm3, %xmm3
+    vpxor %xmm4, %xmm3, %xmm3 /* xmm3 = V3 (byte-reversed) */
+    vpshufb %xmm9, %xmm3, %xmm3 /* back to the input byte order for all four values */
+    vpshufb %xmm9, %xmm2, %xmm2
+    vpshufb %xmm9, %xmm1, %xmm1
+    vpshufb %xmm9, %xmm0, %xmm0
+    vpxor %xmm2, %xmm3, %xmm8 /* xmm8 = V2 ^ V3 */
+    vmovdqu %xmm3, 16(%rsi) /* entry 1 = V3 */
+    vmovdqu %xmm2, 32(%rsi) /* entry 2 = V2 */
+    vmovdqu %xmm8, 48(%rsi) /* entry 3 = V2 ^ V3 */
+    vmovdqu %xmm1, 64(%rsi) /* entry 4 = V1 */
+    vpxor %xmm1, %xmm3, %xmm4
+    vpxor %xmm1, %xmm2, %xmm5
+    vpxor %xmm1, %xmm8, %xmm6
+    vmovdqu %xmm4, 80(%rsi) /* entry 5 = V1 ^ V3 */
+    vmovdqu %xmm5, 96(%rsi) /* entry 6 = V1 ^ V2 */
+    vmovdqu %xmm6, 112(%rsi) /* entry 7 = V1 ^ V2 ^ V3 */
+    vmovdqu %xmm0, 128(%rsi) /* entry 8 = V0 */
+    vpxor %xmm0, %xmm1, %xmm1 /* xmm1 = V0 ^ V1 from here on */
+    vpxor %xmm0, %xmm3, %xmm4
+    vpxor %xmm0, %xmm2, %xmm6
+    vmovdqu %xmm4, 144(%rsi) /* entry 9 = V0 ^ V3 */
+    vmovdqu %xmm6, 160(%rsi) /* entry 10 = V0 ^ V2 */
+    vpxor %xmm6, %xmm3, %xmm6
+    vmovdqu %xmm6, 176(%rsi) /* entry 11 = V0 ^ V2 ^ V3 */
+    vmovdqu %xmm1, 192(%rsi) /* entry 12 = V0 ^ V1 */
+    vpxor %xmm1, %xmm3, %xmm4
+    vpxor %xmm1, %xmm2, %xmm5
+    vpxor %xmm1, %xmm8, %xmm6
+    vmovdqu %xmm4, 208(%rsi) /* entry 13 = V0 ^ V1 ^ V3 */
+    vmovdqu %xmm5, 224(%rsi) /* entry 14 = V0 ^ V1 ^ V2 */
+    vmovdqu %xmm6, 240(%rsi) /* entry 15 = V0 ^ V1 ^ V2 ^ V3 */
+    vmovdqu (%rsi), %xmm0 /* second half: reload entries 0..3 */
+    vmovdqu 16(%rsi), %xmm1
+    vmovdqu 32(%rsi), %xmm2
+    vmovdqu 48(%rsi), %xmm3
+    vpshufb %xmm9, %xmm0, %xmm0 /* byte-reverse before shifting */
+    vpshufb %xmm9, %xmm1, %xmm1
+    vpshufb %xmm9, %xmm2, %xmm2
+    vpshufb %xmm9, %xmm3, %xmm3
+    vpsllq $60, %xmm0, %xmm4 /* low four bits of each half moved to the top of that half */
+    vpsllq $60, %xmm1, %xmm5
+    vpsllq $60, %xmm2, %xmm6
+    vpsllq $60, %xmm3, %xmm7
+    vpsrlq $4, %xmm0, %xmm0 /* each half shifted right by four */
+    vpsrlq $4, %xmm1, %xmm1
+    vpsrlq $4, %xmm2, %xmm2
+    vpsrlq $4, %xmm3, %xmm3
+    vpsrldq $8, %xmm4, %xmm4 /* keep only the bits crossing from the high half into the low half */
+    vpsrldq $8, %xmm5, %xmm5
+    vpsrldq $8, %xmm6, %xmm6
+    vpsrldq $8, %xmm7, %xmm7
+    vpor %xmm4, %xmm0, %xmm0 /* value >> 4 across all 128 bits */
+    vpor %xmm5, %xmm1, %xmm1
+    vpor %xmm6, %xmm2, %xmm2
+    vpor %xmm7, %xmm3, %xmm3
+    vpshufb %xmm9, %xmm0, %xmm0 /* back to the input byte order */
+    vpshufb %xmm9, %xmm1, %xmm1
+    vpshufb %xmm9, %xmm2, %xmm2
+    vpshufb %xmm9, %xmm3, %xmm3
+    vmovdqu %xmm0, 256(%rsi) /* entries 16..19 */
+    vmovdqu %xmm1, 272(%rsi)
+    vmovdqu %xmm2, 288(%rsi)
+    vmovdqu %xmm3, 304(%rsi)
+    vmovdqu 64(%rsi), %xmm0 /* same 4-bit shift for entries 4..7 */
+    vmovdqu 80(%rsi), %xmm1
+    vmovdqu 96(%rsi), %xmm2
+    vmovdqu 112(%rsi), %xmm3
+    vpshufb %xmm9, %xmm0, %xmm0
+    vpshufb %xmm9, %xmm1, %xmm1
+    vpshufb %xmm9, %xmm2, %xmm2
+    vpshufb %xmm9, %xmm3, %xmm3
+    vpsllq $60, %xmm0, %xmm4
+    vpsllq $60, %xmm1, %xmm5
+    vpsllq $60, %xmm2, %xmm6
+    vpsllq $60, %xmm3, %xmm7
+    vpsrlq $4, %xmm0, %xmm0
+    vpsrlq $4, %xmm1, %xmm1
+    vpsrlq $4, %xmm2, %xmm2
+    vpsrlq $4, %xmm3, %xmm3
+    vpsrldq $8, %xmm4, %xmm4
+    vpsrldq $8, %xmm5, %xmm5
+    vpsrldq $8, %xmm6, %xmm6
+    vpsrldq $8, %xmm7, %xmm7
+    vpor %xmm4, %xmm0, %xmm0
+    vpor %xmm5, %xmm1, %xmm1
+    vpor %xmm6, %xmm2, %xmm2
+    vpor %xmm7, %xmm3, %xmm3
+    vpshufb %xmm9, %xmm0, %xmm0
+    vpshufb %xmm9, %xmm1, %xmm1
+    vpshufb %xmm9, %xmm2, %xmm2
+    vpshufb %xmm9, %xmm3, %xmm3
+    vmovdqu %xmm0, 320(%rsi) /* entries 20..23 */
+    vmovdqu %xmm1, 336(%rsi)
+    vmovdqu %xmm2, 352(%rsi)
+    vmovdqu %xmm3, 368(%rsi)
+    vmovdqu 128(%rsi), %xmm0 /* same 4-bit shift for entries 8..11 */
+    vmovdqu 144(%rsi), %xmm1
+    vmovdqu 160(%rsi), %xmm2
+    vmovdqu 176(%rsi), %xmm3
+    vpshufb %xmm9, %xmm0, %xmm0
+    vpshufb %xmm9, %xmm1, %xmm1
+    vpshufb %xmm9, %xmm2, %xmm2
+    vpshufb %xmm9, %xmm3, %xmm3
+    vpsllq $60, %xmm0, %xmm4
+    vpsllq $60, %xmm1, %xmm5
+    vpsllq $60, %xmm2, %xmm6
+    vpsllq $60, %xmm3, %xmm7
+    vpsrlq $4, %xmm0, %xmm0
+    vpsrlq $4, %xmm1, %xmm1
+    vpsrlq $4, %xmm2, %xmm2
+    vpsrlq $4, %xmm3, %xmm3
+    vpsrldq $8, %xmm4, %xmm4
+    vpsrldq $8, %xmm5, %xmm5
+    vpsrldq $8, %xmm6, %xmm6
+    vpsrldq $8, %xmm7, %xmm7
+    vpor %xmm4, %xmm0, %xmm0
+    vpor %xmm5, %xmm1, %xmm1
+    vpor %xmm6, %xmm2, %xmm2
+    vpor %xmm7, %xmm3, %xmm3
+    vpshufb %xmm9, %xmm0, %xmm0
+    vpshufb %xmm9, %xmm1, %xmm1
+    vpshufb %xmm9, %xmm2, %xmm2
+    vpshufb %xmm9, %xmm3, %xmm3
+    vmovdqu %xmm0, 384(%rsi) /* entries 24..27 */
+    vmovdqu %xmm1, 400(%rsi)
+    vmovdqu %xmm2, 416(%rsi)
+    vmovdqu %xmm3, 432(%rsi)
+    vmovdqu 192(%rsi), %xmm0 /* same 4-bit shift for entries 12..15 */
+    vmovdqu 208(%rsi), %xmm1
+    vmovdqu 224(%rsi), %xmm2
+    vmovdqu 240(%rsi), %xmm3
+    vpshufb %xmm9, %xmm0, %xmm0
+    vpshufb %xmm9, %xmm1, %xmm1
+    vpshufb %xmm9, %xmm2, %xmm2
+    vpshufb %xmm9, %xmm3, %xmm3
+    vpsllq $60, %xmm0, %xmm4
+    vpsllq $60, %xmm1, %xmm5
+    vpsllq $60, %xmm2, %xmm6
+    vpsllq $60, %xmm3, %xmm7
+    vpsrlq $4, %xmm0, %xmm0
+    vpsrlq $4, %xmm1, %xmm1
+    vpsrlq $4, %xmm2, %xmm2
+    vpsrlq $4, %xmm3, %xmm3
+    vpsrldq $8, %xmm4, %xmm4
+    vpsrldq $8, %xmm5, %xmm5
+    vpsrldq $8, %xmm6, %xmm6
+    vpsrldq $8, %xmm7, %xmm7
+    vpor %xmm4, %xmm0, %xmm0
+    vpor %xmm5, %xmm1, %xmm1
+    vpor %xmm6, %xmm2, %xmm2
+    vpor %xmm7, %xmm3, %xmm3
+    vpshufb %xmm9, %xmm0, %xmm0
+    vpshufb %xmm9, %xmm1, %xmm1
+    vpshufb %xmm9, %xmm2, %xmm2
+    vpshufb %xmm9, %xmm3, %xmm3
+    vmovdqu %xmm0, 448(%rsi) /* entries 28..31 */
+    vmovdqu %xmm1, 464(%rsi)
+    vmovdqu %xmm2, 480(%rsi)
+    vmovdqu %xmm3, 496(%rsi)
+    repz retq
+#ifndef __APPLE__
+.size GCM_generate_m0_avx512,.-GCM_generate_m0_avx512
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.data
+#else
+.section
__DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_avx512_aes_gcm_one: /* 16-byte addend: first quadword 0, second quadword 1. Added with vpaddd to
+ * the counter in AES_GCM_encrypt_avx512. The constants two..eight below
+ * have the same layout with addends 2..8; their use sites are outside this
+ * excerpt. */
+.quad 0x0000000000000000,0x0000000000000001
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_avx512_aes_gcm_two: /* addend 2 */
+.quad 0x0000000000000000,0x0000000000000002
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_avx512_aes_gcm_three: /* addend 3 */
+.quad 0x0000000000000000,0x0000000000000003
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_avx512_aes_gcm_four: /* addend 4 */
+.quad 0x0000000000000000,0x0000000000000004
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_avx512_aes_gcm_five: /* addend 5 */
+.quad 0x0000000000000000,0x0000000000000005
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_avx512_aes_gcm_six: /* addend 6 */
+.quad 0x0000000000000000,0x0000000000000006
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_avx512_aes_gcm_seven: /* addend 7 */
+.quad 0x0000000000000000,0x0000000000000007
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_avx512_aes_gcm_eight: /* addend 8 */
+.quad 0x0000000000000000,0x0000000000000008
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_avx512_aes_gcm_bswap_epi64: /* vpshufb control that reverses the bytes inside each 64-bit half
+ * separately (mask byte i is 7 - i for i < 8 and 23 - i otherwise) */
+.quad 0x0001020304050607,0x08090a0b0c0d0e0f
+#ifndef __APPLE__
+.data
+#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_avx512_aes_gcm_bswap_mask: +.quad 0x08090a0b0c0d0e0f,0x0001020304050607 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_avx512_aes_gcm_mod2_128: +.quad 0x0000000000000001,0xc200000000000000 +#ifndef __APPLE__ +.text +.globl AES_GCM_encrypt_avx512 +.type AES_GCM_encrypt_avx512,@function +.align 16 +AES_GCM_encrypt_avx512: +#else +.section __TEXT,__text +.globl _AES_GCM_encrypt_avx512 +.p2align 4 +_AES_GCM_encrypt_avx512: +#endif /* __APPLE__ */ + pushq %r13 + pushq %r12 + pushq %rbx + pushq %r14 + pushq %r15 + movq %rdx, %r12 + movq %rcx, %rax + movl 48(%rsp), %r11d + movl 56(%rsp), %ebx + movl 64(%rsp), %r14d + movq 72(%rsp), %r15 + movl 80(%rsp), %r10d + subq $0x440, %rsp + vpxor %xmm4, %xmm4, %xmm4 + vpxor %xmm6, %xmm6, %xmm6 + movl %ebx, %edx + cmpl $12, %edx + jne L_AES_GCM_encrypt_avx512_iv_not_12 + # # Calculate values when IV is 12 bytes + # Set counter based on IV + movl $0x1000000, %ecx + vmovq (%rax), %xmm4 + vpinsrd $2, 8(%rax), %xmm4, %xmm4 + vpinsrd $3, %ecx, %xmm4, %xmm4 + # H = Encrypt X(=0) and T = Encrypt counter + vmovdqa (%r15), %xmm5 + vpxor %xmm5, %xmm4, %xmm1 + vmovdqa 16(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 32(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 48(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 64(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 80(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 96(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 112(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 128(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + 
vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 144(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm7 + jl L_AES_GCM_encrypt_avx512_calc_iv_12_last + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 176(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm7 + jl L_AES_GCM_encrypt_avx512_calc_iv_12_last + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 208(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 224(%r15), %xmm7 +L_AES_GCM_encrypt_avx512_calc_iv_12_last: + vaesenclast %xmm7, %xmm5, %xmm5 + vaesenclast %xmm7, %xmm1, %xmm1 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5 + vmovdqu %xmm1, 1040(%rsp) + jmp L_AES_GCM_encrypt_avx512_iv_done +L_AES_GCM_encrypt_avx512_iv_not_12: + # Calculate values when IV is not 12 bytes + # H = Encrypt X(=0) + vmovdqa (%r15), %xmm5 + vaesenc 16(%r15), %xmm5, %xmm5 + vaesenc 32(%r15), %xmm5, %xmm5 + vaesenc 48(%r15), %xmm5, %xmm5 + vaesenc 64(%r15), %xmm5, %xmm5 + vaesenc 80(%r15), %xmm5, %xmm5 + vaesenc 96(%r15), %xmm5, %xmm5 + vaesenc 112(%r15), %xmm5, %xmm5 + vaesenc 128(%r15), %xmm5, %xmm5 + vaesenc 144(%r15), %xmm5, %xmm5 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm9 + jl L_AES_GCM_encrypt_avx512_calc_iv_1_aesenc_avx_last + vaesenc %xmm9, %xmm5, %xmm5 + vaesenc 176(%r15), %xmm5, %xmm5 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm9 + jl L_AES_GCM_encrypt_avx512_calc_iv_1_aesenc_avx_last + vaesenc %xmm9, %xmm5, %xmm5 + vaesenc 208(%r15), %xmm5, %xmm5 + vmovdqa 224(%r15), %xmm9 +L_AES_GCM_encrypt_avx512_calc_iv_1_aesenc_avx_last: + vaesenclast %xmm9, %xmm5, %xmm5 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5 + # Calc counter + # Initialization vector + cmpl $0x00, %edx + movq $0x00, %rcx + je L_AES_GCM_encrypt_avx512_calc_iv_done + cmpl $16, %edx + jl L_AES_GCM_encrypt_avx512_calc_iv_lt16 + andl $0xfffffff0, %edx 
+L_AES_GCM_encrypt_avx512_calc_iv_16_loop: + vmovdqu (%rax,%rcx,1), %xmm8 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8 + vpxor %xmm8, %xmm4, %xmm4 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm4, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm7 + vmovdqa %xmm3, %xmm4 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm4, %xmm4 + vpsrld $31, %xmm7, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm7, %xmm7 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm7, %xmm7 + vpor %xmm1, %xmm4, %xmm4 + vpslld $31, %xmm7, %xmm0 + vpslld $30, %xmm7, %xmm1 + vpslld $25, %xmm7, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm7, %xmm7 + vpsrld $0x01, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $7, %xmm7, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_encrypt_avx512_calc_iv_16_loop + movl %ebx, %edx + cmpl %edx, %ecx + je L_AES_GCM_encrypt_avx512_calc_iv_done +L_AES_GCM_encrypt_avx512_calc_iv_lt16: + subq $16, %rsp + vpxor %xmm8, %xmm8, %xmm8 + xorl %ebx, %ebx + vmovdqu %xmm8, (%rsp) +L_AES_GCM_encrypt_avx512_calc_iv_loop: + movzbl (%rax,%rcx,1), %r13d + movb %r13b, (%rsp,%rbx,1) + incl %ecx + incl %ebx + cmpl %edx, %ecx + jl L_AES_GCM_encrypt_avx512_calc_iv_loop + vmovdqu (%rsp), %xmm8 + addq $16, %rsp + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8 + vpxor %xmm8, %xmm4, %xmm4 + # ghash_gfmul_avx + vpshufd 
$0x4e, %xmm4, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm7 + vmovdqa %xmm3, %xmm4 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm4, %xmm4 + vpsrld $31, %xmm7, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm7, %xmm7 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm7, %xmm7 + vpor %xmm1, %xmm4, %xmm4 + vpslld $31, %xmm7, %xmm0 + vpslld $30, %xmm7, %xmm1 + vpslld $25, %xmm7, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm7, %xmm7 + vpsrld $0x01, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $7, %xmm7, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 +L_AES_GCM_encrypt_avx512_calc_iv_done: + # T = Encrypt counter + vpxor %xmm0, %xmm0, %xmm0 + shll $3, %edx + vmovq %rdx, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm4, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm7 + vmovdqa %xmm3, %xmm4 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm4, %xmm4 + vpsrld $31, %xmm7, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm7, %xmm7 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + 
vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm7, %xmm7 + vpor %xmm1, %xmm4, %xmm4 + vpslld $31, %xmm7, %xmm0 + vpslld $30, %xmm7, %xmm1 + vpslld $25, %xmm7, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm7, %xmm7 + vpsrld $0x01, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $7, %xmm7, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4 + # Encrypt counter + vmovdqa (%r15), %xmm8 + vpxor %xmm4, %xmm8, %xmm8 + vaesenc 16(%r15), %xmm8, %xmm8 + vaesenc 32(%r15), %xmm8, %xmm8 + vaesenc 48(%r15), %xmm8, %xmm8 + vaesenc 64(%r15), %xmm8, %xmm8 + vaesenc 80(%r15), %xmm8, %xmm8 + vaesenc 96(%r15), %xmm8, %xmm8 + vaesenc 112(%r15), %xmm8, %xmm8 + vaesenc 128(%r15), %xmm8, %xmm8 + vaesenc 144(%r15), %xmm8, %xmm8 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm9 + jl L_AES_GCM_encrypt_avx512_calc_iv_2_aesenc_avx_last + vaesenc %xmm9, %xmm8, %xmm8 + vaesenc 176(%r15), %xmm8, %xmm8 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm9 + jl L_AES_GCM_encrypt_avx512_calc_iv_2_aesenc_avx_last + vaesenc %xmm9, %xmm8, %xmm8 + vaesenc 208(%r15), %xmm8, %xmm8 + vmovdqa 224(%r15), %xmm9 +L_AES_GCM_encrypt_avx512_calc_iv_2_aesenc_avx_last: + vaesenclast %xmm9, %xmm8, %xmm8 + vmovdqu %xmm8, 1040(%rsp) +L_AES_GCM_encrypt_avx512_iv_done: + # Additional authentication data + movl %r11d, %edx + cmpl $0x00, %edx + je L_AES_GCM_encrypt_avx512_calc_aad_done + xorl %ecx, %ecx + cmpl $16, %edx + jl L_AES_GCM_encrypt_avx512_calc_aad_lt16 + andl $0xfffffff0, %edx +L_AES_GCM_encrypt_avx512_calc_aad_16_loop: + vmovdqu (%r12,%rcx,1), %xmm8 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8 + vpxor %xmm8, %xmm6, %xmm6 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm6, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm6, %xmm5, %xmm3 + vpclmulqdq 
$0x00, %xmm6, %xmm5, %xmm0 + vpxor %xmm6, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm7 + vmovdqa %xmm3, %xmm6 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm6, %xmm6 + vpsrld $31, %xmm7, %xmm0 + vpsrld $31, %xmm6, %xmm1 + vpslld $0x01, %xmm7, %xmm7 + vpslld $0x01, %xmm6, %xmm6 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm6, %xmm6 + vpor %xmm0, %xmm7, %xmm7 + vpor %xmm1, %xmm6, %xmm6 + vpslld $31, %xmm7, %xmm0 + vpslld $30, %xmm7, %xmm1 + vpslld $25, %xmm7, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm7, %xmm7 + vpsrld $0x01, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $7, %xmm7, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm2, %xmm6, %xmm6 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_encrypt_avx512_calc_aad_16_loop + movl %r11d, %edx + cmpl %edx, %ecx + je L_AES_GCM_encrypt_avx512_calc_aad_done +L_AES_GCM_encrypt_avx512_calc_aad_lt16: + subq $16, %rsp + vpxor %xmm8, %xmm8, %xmm8 + xorl %ebx, %ebx + vmovdqu %xmm8, (%rsp) +L_AES_GCM_encrypt_avx512_calc_aad_loop: + movzbl (%r12,%rcx,1), %r13d + movb %r13b, (%rsp,%rbx,1) + incl %ecx + incl %ebx + cmpl %edx, %ecx + jl L_AES_GCM_encrypt_avx512_calc_aad_loop + vmovdqu (%rsp), %xmm8 + addq $16, %rsp + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8 + vpxor %xmm8, %xmm6, %xmm6 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm6, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm6, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm6, %xmm5, %xmm0 + vpxor %xmm6, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa 
%xmm0, %xmm7 + vmovdqa %xmm3, %xmm6 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm6, %xmm6 + vpsrld $31, %xmm7, %xmm0 + vpsrld $31, %xmm6, %xmm1 + vpslld $0x01, %xmm7, %xmm7 + vpslld $0x01, %xmm6, %xmm6 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm6, %xmm6 + vpor %xmm0, %xmm7, %xmm7 + vpor %xmm1, %xmm6, %xmm6 + vpslld $31, %xmm7, %xmm0 + vpslld $30, %xmm7, %xmm1 + vpslld $25, %xmm7, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm7, %xmm7 + vpsrld $0x01, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $7, %xmm7, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm2, %xmm6, %xmm6 +L_AES_GCM_encrypt_avx512_calc_aad_done: + # Calculate counter and H + vpsrlq $63, %xmm5, %xmm9 + vpsllq $0x01, %xmm5, %xmm8 + vpslldq $8, %xmm9, %xmm9 + vpor %xmm9, %xmm8, %xmm8 + vpshufd $0xff, %xmm5, %xmm5 + vpsrad $31, %xmm5, %xmm5 + vpshufb L_avx512_aes_gcm_bswap_epi64(%rip), %xmm4, %xmm4 + vpand L_avx512_aes_gcm_mod2_128(%rip), %xmm5, %xmm5 + vpaddd L_avx512_aes_gcm_one(%rip), %xmm4, %xmm4 + vpxor %xmm8, %xmm5, %xmm5 + vmovdqu %xmm4, 1024(%rsp) + xorl %ebx, %ebx + cmpl $0x100, %r9d + jl L_AES_GCM_encrypt_avx512_done_128 + vmovdqa %xmm6, %xmm2 + # H ^ 1 + vmovdqu %xmm5, (%rsp) + # H ^ 2 + vpclmulqdq $0x00, %xmm5, %xmm5, %xmm8 + vpclmulqdq $0x11, %xmm5, %xmm5, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm0 + vmovdqu %xmm0, 16(%rsp) + # H ^ 3 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm5, %xmm9 + vpxor %xmm5, 
%xmm9, %xmm9 + vpshufd $0x4e, %xmm0, %xmm10 + vpxor %xmm0, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm5, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm5, %xmm0, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm1 + vmovdqu %xmm1, 32(%rsp) + # H ^ 4 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm3 + vmovdqu %xmm3, 48(%rsp) + # H ^ 5 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 64(%rsp) + # H ^ 6 + vpclmulqdq $0x00, %xmm1, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm1, %xmm1, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, 
%xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 80(%rsp) + # H ^ 7 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm1, %xmm9 + vpxor %xmm1, %xmm9, %xmm9 + vpshufd $0x4e, %xmm3, %xmm10 + vpxor %xmm3, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm1, %xmm3, %xmm8 + vpclmulqdq $0x11, %xmm1, %xmm3, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 96(%rsp) + # H ^ 8 + vpclmulqdq $0x00, %xmm3, %xmm3, %xmm8 + vpclmulqdq $0x11, %xmm3, %xmm3, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 112(%rsp) + # H ^ 9 + vmovdqu 48(%rsp), %xmm0 + vmovdqu 64(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 128(%rsp) + # H ^ 10 + vmovdqu 64(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, 
%xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 144(%rsp) + # H ^ 11 + vmovdqu 64(%rsp), %xmm0 + vmovdqu 80(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 160(%rsp) + # H ^ 12 + vmovdqu 80(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 176(%rsp) + # H ^ 13 + vmovdqu 80(%rsp), %xmm0 + vmovdqu 96(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, 
%xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 192(%rsp) + # H ^ 14 + vmovdqu 96(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 208(%rsp) + # H ^ 15 + vmovdqu 96(%rsp), %xmm0 + vmovdqu 112(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 224(%rsp) + # H ^ 16 + vmovdqu 112(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 240(%rsp) + cmpl $0x200, %r9d + jl L_AES_GCM_encrypt_avx512_no_ext + # H ^ 17 + vmovdqu 112(%rsp), %xmm0 + vmovdqu 
128(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 256(%rsp) + # H ^ 18 + vmovdqu 128(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 272(%rsp) + # H ^ 19 + vmovdqu 128(%rsp), %xmm0 + vmovdqu 144(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 288(%rsp) + # H ^ 20 + vmovdqu 144(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa 
L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 304(%rsp) + # H ^ 21 + vmovdqu 144(%rsp), %xmm0 + vmovdqu 160(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 320(%rsp) + # H ^ 22 + vmovdqu 160(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 336(%rsp) + # H ^ 23 + vmovdqu 160(%rsp), %xmm0 + vmovdqu 176(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq 
$0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 352(%rsp) + # H ^ 24 + vmovdqu 176(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 368(%rsp) + # H ^ 25 + vmovdqu 176(%rsp), %xmm0 + vmovdqu 192(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 384(%rsp) + # H ^ 26 + vmovdqu 192(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 400(%rsp) + # H ^ 27 + vmovdqu 192(%rsp), %xmm0 + vmovdqu 208(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd 
$0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 416(%rsp) + # H ^ 28 + vmovdqu 208(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 432(%rsp) + # H ^ 29 + vmovdqu 208(%rsp), %xmm0 + vmovdqu 224(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 448(%rsp) + # H ^ 30 + vmovdqu 224(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq 
$0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 464(%rsp) + # H ^ 31 + vmovdqu 224(%rsp), %xmm0 + vmovdqu 240(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 480(%rsp) + # H ^ 32 + vmovdqu 240(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 496(%rsp) +L_AES_GCM_encrypt_avx512_no_ext: + vbroadcasti32x4 L_avx512_aes_gcm_bswap_epi64(%rip), %zmm22 + vbroadcasti32x4 L_avx512_aes_gcm_bswap_mask(%rip), %zmm30 + vbroadcasti32x4 L_avx512_aes_gcm_mod2_128(%rip), %zmm31 + vbroadcasti32x4 (%r15), %zmm9 + vbroadcasti32x4 16(%r15), %zmm10 + vbroadcasti32x4 32(%r15), %zmm11 + vbroadcasti32x4 48(%r15), %zmm12 + vbroadcasti32x4 64(%r15), %zmm13 + vbroadcasti32x4 80(%r15), %zmm14 + vbroadcasti32x4 96(%r15), %zmm15 + vbroadcasti32x4 112(%r15), %zmm1 + vbroadcasti32x4 128(%r15), %zmm2 + vbroadcasti32x4 144(%r15), %zmm3 + cmpl $0x200, %r9d + jl L_AES_GCM_encrypt_avx512_no_windows + movl 
%r9d, %r13d + andl $0xfffffe00, %r13d + vmovdqu64 448(%rsp), %zmm23 + vshufi64x2 $27, %zmm23, %zmm23, %zmm23 + vmovdqu64 384(%rsp), %zmm24 + vshufi64x2 $27, %zmm24, %zmm24, %zmm24 + vmovdqu64 320(%rsp), %zmm25 + vshufi64x2 $27, %zmm25, %zmm25, %zmm25 + vmovdqu64 256(%rsp), %zmm26 + vshufi64x2 $27, %zmm26, %zmm26, %zmm26 + vmovdqu64 %zmm23, 512(%rsp) + vmovdqu64 %zmm24, 576(%rsp) + vmovdqu64 %zmm25, 640(%rsp) + vmovdqu64 %zmm26, 704(%rsp) + vmovdqu64 192(%rsp), %zmm23 + vshufi64x2 $27, %zmm23, %zmm23, %zmm23 + vmovdqu64 128(%rsp), %zmm24 + vshufi64x2 $27, %zmm24, %zmm24, %zmm24 + vmovdqu64 64(%rsp), %zmm25 + vshufi64x2 $27, %zmm25, %zmm25, %zmm25 + vmovdqu64 (%rsp), %zmm26 + vshufi64x2 $27, %zmm26, %zmm26, %zmm26 + vmovdqu64 %zmm23, 768(%rsp) + vmovdqu64 %zmm24, 832(%rsp) + vmovdqu64 %zmm25, 896(%rsp) + vmovdqu64 %zmm26, 960(%rsp) + # 512 bytes of input + leaq (%rsi,%rbx,1), %rcx + movq %rcx, 1056(%rsp) + vbroadcasti32x4 1024(%rsp), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu 1024(%rsp), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, 1024(%rsp) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, 
%zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %r10d + vbroadcasti32x4 160(%r15), %zmm20 + jl L_AES_GCM_encrypt_avx512_p1_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %r10d + vbroadcasti32x4 192(%r15), %zmm20 + jl L_AES_GCM_encrypt_avx512_p1_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%r15), %zmm20 +L_AES_GCM_encrypt_avx512_p1_avx512_ctr16_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%rdi,%rbx,1), %rcx + leaq (%rsi,%rbx,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + 
vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + addl $0x100, %ebx + vbroadcasti32x4 1024(%rsp), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu 1024(%rsp), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, 1024(%rsp) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, 
%zmm19 + cmpl $11, %r10d + vbroadcasti32x4 160(%r15), %zmm20 + jl L_AES_GCM_encrypt_avx512_p2_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %r10d + vbroadcasti32x4 192(%r15), %zmm20 + jl L_AES_GCM_encrypt_avx512_p2_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%r15), %zmm20 +L_AES_GCM_encrypt_avx512_p2_avx512_ctr16_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%rdi,%rbx,1), %rcx + leaq (%rsi,%rbx,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + addl $0x100, %ebx + cmpl %r13d, %ebx + jge L_AES_GCM_encrypt_avx512_last_win +L_AES_GCM_encrypt_avx512_win_loop: + leaq (%rsi,%rbx,1), %rcx + movq %rcx, 1072(%rsp) + movq 1056(%rsp), %r12 + vpxorq %zmm21, %zmm21, %zmm21 + vinserti32x4 $0x00, %xmm6, %zmm21, %zmm21 + vbroadcasti32x4 1024(%rsp), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + 
vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu 1024(%rsp), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, 1024(%rsp) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vmovdqu64 (%r12), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vpxorq %zmm21, %zmm31, %zmm31 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vpclmulqdq $0x00, 512(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 512(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 512(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 512(%rsp), %zmm31, %zmm26 + vmovdqa64 %zmm23, %zmm27 + vpxorq %zmm24, %zmm25, %zmm28 + vmovdqa64 %zmm26, %zmm29 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vmovdqu64 64(%r12), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vpclmulqdq $0x00, 576(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 576(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 576(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 576(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vmovdqu64 128(%r12), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vpclmulqdq $0x00, 640(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 640(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 640(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 640(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, 
%zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vmovdqu64 192(%r12), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vpclmulqdq $0x00, 704(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 704(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 704(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 704(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %r10d + vbroadcasti32x4 160(%r15), %zmm20 + jl L_AES_GCM_encrypt_avx512_a_il_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %r10d + vbroadcasti32x4 192(%r15), %zmm20 + jl L_AES_GCM_encrypt_avx512_a_il_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%r15), %zmm20 +L_AES_GCM_encrypt_avx512_a_il_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%rdi,%rbx,1), %rcx + leaq 
(%rsi,%rbx,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + addl $0x100, %ebx + vbroadcasti32x4 1024(%rsp), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu 1024(%rsp), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, 1024(%rsp) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vmovdqu64 256(%r12), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vpclmulqdq $0x00, 768(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 768(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 768(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 768(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vmovdqu64 320(%r12), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vpclmulqdq $0x00, 832(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 832(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 832(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 
832(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vmovdqu64 384(%r12), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vpclmulqdq $0x00, 896(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 896(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 896(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 896(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vmovdqu64 448(%r12), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vpclmulqdq $0x00, 960(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 960(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 960(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 960(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %r10d + vbroadcasti32x4 160(%r15), %zmm20 + jl L_AES_GCM_encrypt_avx512_b_il_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, 
%zmm19, %zmm19 + cmpl $13, %r10d + vbroadcasti32x4 192(%r15), %zmm20 + jl L_AES_GCM_encrypt_avx512_b_il_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%r15), %zmm20 +L_AES_GCM_encrypt_avx512_b_il_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%rdi,%rbx,1), %rcx + leaq (%rsi,%rbx,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + addl $0x100, %ebx + vbroadcasti32x4 L_avx512_aes_gcm_mod2_128(%rip), %zmm31 + vpclmulqdq $0x01, %zmm27, %zmm31, %zmm23 + vpshufd $0x4e, %zmm27, %zmm27 + vpternlogq $0x96, %zmm23, %zmm27, %zmm28 + vpclmulqdq $0x01, %zmm28, %zmm31, %zmm23 + vpshufd $0x4e, %zmm28, %zmm28 + vpternlogq $0x96, %zmm23, %zmm28, %zmm29 + vextracti32x4 $0x01, %zmm29, %xmm0 + vextracti32x4 $2, %zmm29, %xmm4 + vextracti32x4 $3, %zmm29, %xmm5 + vpxorq %xmm0, %xmm29, %xmm6 + vpternlogq $0x96, %xmm4, %xmm5, %xmm6 + movq 1072(%rsp), %rcx + movq %rcx, 1056(%rsp) + cmpl %r13d, %ebx + jl L_AES_GCM_encrypt_avx512_win_loop +L_AES_GCM_encrypt_avx512_last_win: + movq 1056(%rsp), %rcx + vpxorq %zmm20, %zmm20, %zmm20 + vinserti32x4 $0x00, %xmm6, %zmm20, %zmm20 + vmovdqu64 512(%rsp), %zmm23 + vmovdqu64 576(%rsp), %zmm24 + vmovdqu64 640(%rsp), %zmm25 + vmovdqu64 704(%rsp), %zmm26 + vmovdqu64 (%rcx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpxorq %zmm20, %zmm21, %zmm21 + vpclmulqdq 
$0x00, %zmm23, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17 + vpclmulqdq $16, %zmm23, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19 + vmovdqa64 %zmm16, %zmm27 + vpxorq %zmm17, %zmm18, %zmm28 + vmovdqa64 %zmm19, %zmm29 + vmovdqu64 64(%rcx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17 + vpclmulqdq $16, %zmm24, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 128(%rcx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17 + vpclmulqdq $16, %zmm25, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 192(%rcx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17 + vpclmulqdq $16, %zmm26, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 768(%rsp), %zmm23 + vmovdqu64 832(%rsp), %zmm24 + vmovdqu64 896(%rsp), %zmm25 + vmovdqu64 960(%rsp), %zmm26 + vmovdqu64 256(%rcx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17 + vpclmulqdq $16, %zmm23, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 320(%rcx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17 + vpclmulqdq $16, %zmm24, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq 
$0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 384(%rcx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17 + vpclmulqdq $16, %zmm25, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 448(%rcx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17 + vpclmulqdq $16, %zmm26, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vpclmulqdq $0x01, %zmm27, %zmm31, %zmm21 + vpshufd $0x4e, %zmm27, %zmm27 + vpternlogq $0x96, %zmm21, %zmm27, %zmm28 + vpclmulqdq $0x01, %zmm28, %zmm31, %zmm21 + vpshufd $0x4e, %zmm28, %zmm28 + vpternlogq $0x96, %zmm21, %zmm28, %zmm29 + vextracti32x4 $0x01, %zmm29, %xmm0 + vextracti32x4 $2, %zmm29, %xmm4 + vextracti32x4 $3, %zmm29, %xmm5 + vpxorq %xmm0, %xmm29, %xmm6 + vpternlogq $0x96, %xmm4, %xmm5, %xmm6 +L_AES_GCM_encrypt_avx512_no_windows: + vmovdqu64 192(%rsp), %zmm23 + vshufi64x2 $27, %zmm23, %zmm23, %zmm23 + vmovdqu64 128(%rsp), %zmm24 + vshufi64x2 $27, %zmm24, %zmm24, %zmm24 + vmovdqu64 64(%rsp), %zmm25 + vshufi64x2 $27, %zmm25, %zmm25, %zmm25 + vmovdqu64 (%rsp), %zmm26 + vshufi64x2 $27, %zmm26, %zmm26, %zmm26 + movl %r9d, %r13d + andl $0xffffff00, %r13d + cmpl %r13d, %ebx + jge L_AES_GCM_encrypt_avx512_after_256 + # 256 bytes of input + vbroadcasti32x4 1024(%rsp), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu 
1024(%rsp), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, 1024(%rsp) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %r10d + vbroadcasti32x4 160(%r15), %zmm20 + jl L_AES_GCM_encrypt_avx512_pro_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %r10d + vbroadcasti32x4 192(%r15), %zmm20 + jl L_AES_GCM_encrypt_avx512_pro_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, 
%zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%r15), %zmm20 +L_AES_GCM_encrypt_avx512_pro_avx512_ctr16_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%rdi,%rbx,1), %rcx + leaq (%rsi,%rbx,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + movq %rdx, 1056(%rsp) + addl $0x100, %ebx + cmpl %r13d, %ebx + jge L_AES_GCM_encrypt_avx512_last_ghash +L_AES_GCM_encrypt_avx512_ghash_128: + vbroadcasti32x4 1024(%rsp), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu 1024(%rsp), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, 1024(%rsp) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + 
vaesenc %zmm12, %zmm19, %zmm19 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %r10d + vbroadcasti32x4 160(%r15), %zmm20 + jl L_AES_GCM_encrypt_avx512_pip_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %r10d + vbroadcasti32x4 192(%r15), %zmm20 + jl L_AES_GCM_encrypt_avx512_pip_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%r15), %zmm20 +L_AES_GCM_encrypt_avx512_pip_avx512_ctr16_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%rdi,%rbx,1), %rcx + leaq (%rsi,%rbx,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + 
vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + movq 1056(%rsp), %rcx + vpxorq %zmm20, %zmm20, %zmm20 + vinserti32x4 $0x00, %xmm6, %zmm20, %zmm20 + vmovdqu64 (%rcx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpxorq %zmm20, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17 + vpclmulqdq $16, %zmm23, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19 + vmovdqa64 %zmm16, %zmm27 + vpxorq %zmm17, %zmm18, %zmm28 + vmovdqa64 %zmm19, %zmm29 + vmovdqu64 64(%rcx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17 + vpclmulqdq $16, %zmm24, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 128(%rcx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17 + vpclmulqdq $16, %zmm25, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 192(%rcx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17 + vpclmulqdq $16, %zmm26, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vpclmulqdq $0x01, %zmm27, %zmm31, %zmm21 + vpshufd $0x4e, %zmm27, %zmm27 + vpternlogq $0x96, %zmm21, %zmm27, %zmm28 + vpclmulqdq $0x01, %zmm28, %zmm31, %zmm21 + vpshufd $0x4e, %zmm28, %zmm28 + vpternlogq $0x96, %zmm21, %zmm28, %zmm29 + vextracti32x4 $0x01, 
%zmm29, %xmm0 + vextracti32x4 $2, %zmm29, %xmm4 + vextracti32x4 $3, %zmm29, %xmm5 + vpxorq %xmm0, %xmm29, %xmm6 + vpternlogq $0x96, %xmm4, %xmm5, %xmm6 + movq %rdx, 1056(%rsp) + addl $0x100, %ebx + cmpl %r13d, %ebx + jl L_AES_GCM_encrypt_avx512_ghash_128 +L_AES_GCM_encrypt_avx512_last_ghash: + movq 1056(%rsp), %rcx + vpxorq %zmm20, %zmm20, %zmm20 + vinserti32x4 $0x00, %xmm6, %zmm20, %zmm20 + vmovdqu64 (%rcx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpxorq %zmm20, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17 + vpclmulqdq $16, %zmm23, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19 + vmovdqa64 %zmm16, %zmm27 + vpxorq %zmm17, %zmm18, %zmm28 + vmovdqa64 %zmm19, %zmm29 + vmovdqu64 64(%rcx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17 + vpclmulqdq $16, %zmm24, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 128(%rcx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17 + vpclmulqdq $16, %zmm25, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 192(%rcx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17 + vpclmulqdq $16, %zmm26, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vpclmulqdq $0x01, %zmm27, %zmm31, %zmm21 + vpshufd $0x4e, %zmm27, %zmm27 + vpternlogq $0x96, %zmm21, %zmm27, %zmm28 + vpclmulqdq $0x01, %zmm28, %zmm31, %zmm21 + vpshufd $0x4e, %zmm28, %zmm28 + vpternlogq $0x96, %zmm21, %zmm28, %zmm29 + 
vextracti32x4 $0x01, %zmm29, %xmm0 + vextracti32x4 $2, %zmm29, %xmm4 + vextracti32x4 $3, %zmm29, %xmm5 + vpxorq %xmm0, %xmm29, %xmm6 + vpternlogq $0x96, %xmm4, %xmm5, %xmm6 +L_AES_GCM_encrypt_avx512_after_256: + vmovdqu (%rsp), %xmm5 +L_AES_GCM_encrypt_avx512_done_128: + movl %r9d, %edx + cmpl %edx, %ebx + jge L_AES_GCM_encrypt_avx512_done_enc + movl %r9d, %r13d + andl $0xfffffff0, %r13d + cmpl %r13d, %ebx + jge L_AES_GCM_encrypt_avx512_last_block_done + vmovdqu 1024(%rsp), %xmm9 + vpshufb L_avx512_aes_gcm_bswap_epi64(%rip), %xmm9, %xmm8 + vpaddd L_avx512_aes_gcm_one(%rip), %xmm9, %xmm9 + vmovdqu %xmm9, 1024(%rsp) + vpxor (%r15), %xmm8, %xmm8 + vaesenc 16(%r15), %xmm8, %xmm8 + vaesenc 32(%r15), %xmm8, %xmm8 + vaesenc 48(%r15), %xmm8, %xmm8 + vaesenc 64(%r15), %xmm8, %xmm8 + vaesenc 80(%r15), %xmm8, %xmm8 + vaesenc 96(%r15), %xmm8, %xmm8 + vaesenc 112(%r15), %xmm8, %xmm8 + vaesenc 128(%r15), %xmm8, %xmm8 + vaesenc 144(%r15), %xmm8, %xmm8 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm9 + jl L_AES_GCM_encrypt_avx512_aesenc_block_last + vaesenc %xmm9, %xmm8, %xmm8 + vaesenc 176(%r15), %xmm8, %xmm8 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm9 + jl L_AES_GCM_encrypt_avx512_aesenc_block_last + vaesenc %xmm9, %xmm8, %xmm8 + vaesenc 208(%r15), %xmm8, %xmm8 + vmovdqa 224(%r15), %xmm9 +L_AES_GCM_encrypt_avx512_aesenc_block_last: + vaesenclast %xmm9, %xmm8, %xmm8 + vmovdqu (%rdi,%rbx,1), %xmm9 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqu %xmm8, (%rsi,%rbx,1) + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8 + vpxor %xmm8, %xmm6, %xmm6 + addl $16, %ebx + cmpl %r13d, %ebx + jge L_AES_GCM_encrypt_avx512_last_block_ghash +L_AES_GCM_encrypt_avx512_last_block_start: + vmovdqu (%rdi,%rbx,1), %xmm13 + vmovdqu 1024(%rsp), %xmm9 + vpshufb L_avx512_aes_gcm_bswap_epi64(%rip), %xmm9, %xmm8 + vpaddd L_avx512_aes_gcm_one(%rip), %xmm9, %xmm9 + vmovdqu %xmm9, 1024(%rsp) + vpxor (%r15), %xmm8, %xmm8 + vpclmulqdq $16, %xmm5, %xmm6, %xmm10 + vaesenc 16(%r15), %xmm8, %xmm8 + vaesenc 32(%r15), %xmm8, 
%xmm8 + vpclmulqdq $0x01, %xmm5, %xmm6, %xmm11 + vaesenc 48(%r15), %xmm8, %xmm8 + vaesenc 64(%r15), %xmm8, %xmm8 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm12 + vaesenc 80(%r15), %xmm8, %xmm8 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm1 + vaesenc 96(%r15), %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpslldq $8, %xmm10, %xmm2 + vpsrldq $8, %xmm10, %xmm10 + vaesenc 112(%r15), %xmm8, %xmm8 + vpxor %xmm12, %xmm2, %xmm2 + vpxor %xmm10, %xmm1, %xmm3 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm0 + vpclmulqdq $16, %xmm0, %xmm2, %xmm11 + vaesenc 128(%r15), %xmm8, %xmm8 + vpshufd $0x4e, %xmm2, %xmm10 + vpxor %xmm11, %xmm10, %xmm10 + vpclmulqdq $16, %xmm0, %xmm10, %xmm11 + vaesenc 144(%r15), %xmm8, %xmm8 + vpshufd $0x4e, %xmm10, %xmm10 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm3, %xmm10, %xmm6 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm9 + jl L_AES_GCM_encrypt_avx512_aesenc_gfmul_last + vaesenc %xmm9, %xmm8, %xmm8 + vaesenc 176(%r15), %xmm8, %xmm8 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm9 + jl L_AES_GCM_encrypt_avx512_aesenc_gfmul_last + vaesenc %xmm9, %xmm8, %xmm8 + vaesenc 208(%r15), %xmm8, %xmm8 + vmovdqa 224(%r15), %xmm9 +L_AES_GCM_encrypt_avx512_aesenc_gfmul_last: + vaesenclast %xmm9, %xmm8, %xmm8 + vmovdqa %xmm13, %xmm0 + vpxor %xmm0, %xmm8, %xmm8 + vmovdqu %xmm8, (%rsi,%rbx,1) + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8 + addl $16, %ebx + vpxor %xmm8, %xmm6, %xmm6 + cmpl %r13d, %ebx + jl L_AES_GCM_encrypt_avx512_last_block_start +L_AES_GCM_encrypt_avx512_last_block_ghash: + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm5, %xmm9 + vpxor %xmm5, %xmm9, %xmm9 + vpshufd $0x4e, %xmm6, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm8 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, 
%xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm6 +L_AES_GCM_encrypt_avx512_last_block_done: + movl %r9d, %ecx + movl %ecx, %edx + andl $15, %ecx + jz L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_done + vmovdqu 1024(%rsp), %xmm4 + vpshufb L_avx512_aes_gcm_bswap_epi64(%rip), %xmm4, %xmm4 + vpxor (%r15), %xmm4, %xmm4 + vaesenc 16(%r15), %xmm4, %xmm4 + vaesenc 32(%r15), %xmm4, %xmm4 + vaesenc 48(%r15), %xmm4, %xmm4 + vaesenc 64(%r15), %xmm4, %xmm4 + vaesenc 80(%r15), %xmm4, %xmm4 + vaesenc 96(%r15), %xmm4, %xmm4 + vaesenc 112(%r15), %xmm4, %xmm4 + vaesenc 128(%r15), %xmm4, %xmm4 + vaesenc 144(%r15), %xmm4, %xmm4 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm9 + jl L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_aesenc_avx_last + vaesenc %xmm9, %xmm4, %xmm4 + vaesenc 176(%r15), %xmm4, %xmm4 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm9 + jl L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_aesenc_avx_last + vaesenc %xmm9, %xmm4, %xmm4 + vaesenc 208(%r15), %xmm4, %xmm4 + vmovdqa 224(%r15), %xmm9 +L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_aesenc_avx_last: + vaesenclast %xmm9, %xmm4, %xmm4 + subq $16, %rsp + xorl %ecx, %ecx + vmovdqu %xmm4, (%rsp) +L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_loop: + movzbl (%rdi,%rbx,1), %r13d + xorb (%rsp,%rcx,1), %r13b + movb %r13b, (%rsi,%rbx,1) + movb %r13b, (%rsp,%rcx,1) + incl %ebx + incl %ecx + cmpl %edx, %ebx + jl L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_loop + xorq %r13, %r13 + cmpl $16, %ecx + je L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_finish_enc +L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_byte_loop: + movb %r13b, (%rsp,%rcx,1) + incl %ecx + cmpl $16, %ecx + jl L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_byte_loop +L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_finish_enc: + vmovdqu (%rsp), %xmm4 + addq $16, %rsp + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4 + vpxor %xmm4, %xmm6, %xmm6 + # ghash_gfmul_red_avx + vpshufd $0x4e, 
%xmm5, %xmm9 + vpxor %xmm5, %xmm9, %xmm9 + vpshufd $0x4e, %xmm6, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm8 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm6 +L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_done: +L_AES_GCM_encrypt_avx512_done_enc: + movl %r9d, %edx + movl %r11d, %ecx + shlq $3, %rdx + shlq $3, %rcx + vmovq %rdx, %xmm0 + vmovq %rcx, %xmm1 + vpunpcklqdq %xmm1, %xmm0, %xmm0 + vpxor %xmm0, %xmm6, %xmm6 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm5, %xmm9 + vpxor %xmm5, %xmm9, %xmm9 + vpshufd $0x4e, %xmm6, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm8 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm6 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm6, %xmm6 + vmovdqu 1040(%rsp), %xmm0 + vpxor %xmm6, %xmm0, %xmm0 + cmpl $16, %r14d + je L_AES_GCM_encrypt_avx512_store_tag_16 + xorq %rcx, %rcx + vmovdqu %xmm0, (%rsp) +L_AES_GCM_encrypt_avx512_store_tag_loop: + movzbl (%rsp,%rcx,1), %r13d + movb %r13b, (%r8,%rcx,1) + incl %ecx + cmpl %r14d, %ecx + jne L_AES_GCM_encrypt_avx512_store_tag_loop + jmp L_AES_GCM_encrypt_avx512_store_tag_done +L_AES_GCM_encrypt_avx512_store_tag_16: + vmovdqu %xmm0, (%r8) +L_AES_GCM_encrypt_avx512_store_tag_done: + vzeroupper + addq 
$0x440, %rsp + popq %r15 + popq %r14 + popq %rbx + popq %r12 + popq %r13 + repz retq +#ifndef __APPLE__ +.size AES_GCM_encrypt_avx512,.-AES_GCM_encrypt_avx512 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_GCM_decrypt_avx512 +.type AES_GCM_decrypt_avx512,@function +.align 16 +AES_GCM_decrypt_avx512: +#else +.section __TEXT,__text +.globl _AES_GCM_decrypt_avx512 +.p2align 4 +_AES_GCM_decrypt_avx512: +#endif /* __APPLE__ */ + pushq %r13 + pushq %r12 + pushq %rbx + pushq %r14 + pushq %r15 + pushq %rbp + movq %rdx, %r12 + movq %rcx, %rax + movl 56(%rsp), %r11d + movl 64(%rsp), %ebx + movl 72(%rsp), %r14d + movq 80(%rsp), %r15 + movl 88(%rsp), %r10d + movq 96(%rsp), %rbp + subq $0x420, %rsp + vpxor %xmm4, %xmm4, %xmm4 + vpxor %xmm6, %xmm6, %xmm6 + cmpl $12, %ebx + movl %ebx, %edx + jne L_AES_GCM_decrypt_avx512_iv_not_12 + # # Calculate values when IV is 12 bytes + # Set counter based on IV + movl $0x1000000, %ecx + vmovq (%rax), %xmm4 + vpinsrd $2, 8(%rax), %xmm4, %xmm4 + vpinsrd $3, %ecx, %xmm4, %xmm4 + # H = Encrypt X(=0) and T = Encrypt counter + vmovdqa (%r15), %xmm5 + vpxor %xmm5, %xmm4, %xmm1 + vmovdqa 16(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 32(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 48(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 64(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 80(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 96(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 112(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 128(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 144(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm7 + jl L_AES_GCM_decrypt_avx512_calc_iv_12_last + 
vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 176(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm7 + jl L_AES_GCM_decrypt_avx512_calc_iv_12_last + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 208(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 224(%r15), %xmm7 +L_AES_GCM_decrypt_avx512_calc_iv_12_last: + vaesenclast %xmm7, %xmm5, %xmm5 + vaesenclast %xmm7, %xmm1, %xmm1 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5 + vmovdqu %xmm1, 1040(%rsp) + jmp L_AES_GCM_decrypt_avx512_iv_done +L_AES_GCM_decrypt_avx512_iv_not_12: + # Calculate values when IV is not 12 bytes + # H = Encrypt X(=0) + vmovdqa (%r15), %xmm5 + vaesenc 16(%r15), %xmm5, %xmm5 + vaesenc 32(%r15), %xmm5, %xmm5 + vaesenc 48(%r15), %xmm5, %xmm5 + vaesenc 64(%r15), %xmm5, %xmm5 + vaesenc 80(%r15), %xmm5, %xmm5 + vaesenc 96(%r15), %xmm5, %xmm5 + vaesenc 112(%r15), %xmm5, %xmm5 + vaesenc 128(%r15), %xmm5, %xmm5 + vaesenc 144(%r15), %xmm5, %xmm5 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm9 + jl L_AES_GCM_decrypt_avx512_calc_iv_1_aesenc_avx_last + vaesenc %xmm9, %xmm5, %xmm5 + vaesenc 176(%r15), %xmm5, %xmm5 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm9 + jl L_AES_GCM_decrypt_avx512_calc_iv_1_aesenc_avx_last + vaesenc %xmm9, %xmm5, %xmm5 + vaesenc 208(%r15), %xmm5, %xmm5 + vmovdqa 224(%r15), %xmm9 +L_AES_GCM_decrypt_avx512_calc_iv_1_aesenc_avx_last: + vaesenclast %xmm9, %xmm5, %xmm5 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5 + # Calc counter + # Initialization vector + cmpl $0x00, %edx + movq $0x00, %rcx + je L_AES_GCM_decrypt_avx512_calc_iv_done + cmpl $16, %edx + jl L_AES_GCM_decrypt_avx512_calc_iv_lt16 + andl $0xfffffff0, %edx +L_AES_GCM_decrypt_avx512_calc_iv_16_loop: + vmovdqu (%rax,%rcx,1), %xmm8 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8 + vpxor %xmm8, %xmm4, %xmm4 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm4, %xmm1 + 
vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm7 + vmovdqa %xmm3, %xmm4 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm4, %xmm4 + vpsrld $31, %xmm7, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm7, %xmm7 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm7, %xmm7 + vpor %xmm1, %xmm4, %xmm4 + vpslld $31, %xmm7, %xmm0 + vpslld $30, %xmm7, %xmm1 + vpslld $25, %xmm7, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm7, %xmm7 + vpsrld $0x01, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $7, %xmm7, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_decrypt_avx512_calc_iv_16_loop + movl %ebx, %edx + cmpl %edx, %ecx + je L_AES_GCM_decrypt_avx512_calc_iv_done +L_AES_GCM_decrypt_avx512_calc_iv_lt16: + subq $16, %rsp + vpxor %xmm8, %xmm8, %xmm8 + xorl %ebx, %ebx + vmovdqu %xmm8, (%rsp) +L_AES_GCM_decrypt_avx512_calc_iv_loop: + movzbl (%rax,%rcx,1), %r13d + movb %r13b, (%rsp,%rbx,1) + incl %ecx + incl %ebx + cmpl %edx, %ecx + jl L_AES_GCM_decrypt_avx512_calc_iv_loop + vmovdqu (%rsp), %xmm8 + addq $16, %rsp + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8 + vpxor %xmm8, %xmm4, %xmm4 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm4, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, 
%xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm7 + vmovdqa %xmm3, %xmm4 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm4, %xmm4 + vpsrld $31, %xmm7, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm7, %xmm7 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm7, %xmm7 + vpor %xmm1, %xmm4, %xmm4 + vpslld $31, %xmm7, %xmm0 + vpslld $30, %xmm7, %xmm1 + vpslld $25, %xmm7, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm7, %xmm7 + vpsrld $0x01, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $7, %xmm7, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 +L_AES_GCM_decrypt_avx512_calc_iv_done: + # T = Encrypt counter + vpxor %xmm0, %xmm0, %xmm0 + shll $3, %edx + vmovq %rdx, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm4, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm7 + vmovdqa %xmm3, %xmm4 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm4, %xmm4 + vpsrld $31, %xmm7, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm7, %xmm7 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm7, %xmm7 + vpor %xmm1, %xmm4, %xmm4 + vpslld $31, %xmm7, %xmm0 + vpslld $30, %xmm7, %xmm1 + vpslld $25, %xmm7, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 
+ vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm7, %xmm7 + vpsrld $0x01, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $7, %xmm7, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4 + # Encrypt counter + vmovdqa (%r15), %xmm8 + vpxor %xmm4, %xmm8, %xmm8 + vaesenc 16(%r15), %xmm8, %xmm8 + vaesenc 32(%r15), %xmm8, %xmm8 + vaesenc 48(%r15), %xmm8, %xmm8 + vaesenc 64(%r15), %xmm8, %xmm8 + vaesenc 80(%r15), %xmm8, %xmm8 + vaesenc 96(%r15), %xmm8, %xmm8 + vaesenc 112(%r15), %xmm8, %xmm8 + vaesenc 128(%r15), %xmm8, %xmm8 + vaesenc 144(%r15), %xmm8, %xmm8 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm9 + jl L_AES_GCM_decrypt_avx512_calc_iv_2_aesenc_avx_last + vaesenc %xmm9, %xmm8, %xmm8 + vaesenc 176(%r15), %xmm8, %xmm8 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm9 + jl L_AES_GCM_decrypt_avx512_calc_iv_2_aesenc_avx_last + vaesenc %xmm9, %xmm8, %xmm8 + vaesenc 208(%r15), %xmm8, %xmm8 + vmovdqa 224(%r15), %xmm9 +L_AES_GCM_decrypt_avx512_calc_iv_2_aesenc_avx_last: + vaesenclast %xmm9, %xmm8, %xmm8 + vmovdqu %xmm8, 1040(%rsp) +L_AES_GCM_decrypt_avx512_iv_done: + # Additional authentication data + movl %r11d, %edx + cmpl $0x00, %edx + je L_AES_GCM_decrypt_avx512_calc_aad_done + xorl %ecx, %ecx + cmpl $16, %edx + jl L_AES_GCM_decrypt_avx512_calc_aad_lt16 + andl $0xfffffff0, %edx +L_AES_GCM_decrypt_avx512_calc_aad_16_loop: + vmovdqu (%r12,%rcx,1), %xmm8 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8 + vpxor %xmm8, %xmm6, %xmm6 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm6, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm6, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm6, %xmm5, %xmm0 + vpxor %xmm6, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm7 + vmovdqa %xmm3, 
%xmm6 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm6, %xmm6 + vpsrld $31, %xmm7, %xmm0 + vpsrld $31, %xmm6, %xmm1 + vpslld $0x01, %xmm7, %xmm7 + vpslld $0x01, %xmm6, %xmm6 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm6, %xmm6 + vpor %xmm0, %xmm7, %xmm7 + vpor %xmm1, %xmm6, %xmm6 + vpslld $31, %xmm7, %xmm0 + vpslld $30, %xmm7, %xmm1 + vpslld $25, %xmm7, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm7, %xmm7 + vpsrld $0x01, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $7, %xmm7, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm2, %xmm6, %xmm6 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_decrypt_avx512_calc_aad_16_loop + movl %r11d, %edx + cmpl %edx, %ecx + je L_AES_GCM_decrypt_avx512_calc_aad_done +L_AES_GCM_decrypt_avx512_calc_aad_lt16: + subq $16, %rsp + vpxor %xmm8, %xmm8, %xmm8 + xorl %ebx, %ebx + vmovdqu %xmm8, (%rsp) +L_AES_GCM_decrypt_avx512_calc_aad_loop: + movzbl (%r12,%rcx,1), %r13d + movb %r13b, (%rsp,%rbx,1) + incl %ecx + incl %ebx + cmpl %edx, %ecx + jl L_AES_GCM_decrypt_avx512_calc_aad_loop + vmovdqu (%rsp), %xmm8 + addq $16, %rsp + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8 + vpxor %xmm8, %xmm6, %xmm6 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm6, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm6, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm6, %xmm5, %xmm0 + vpxor %xmm6, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm7 + vmovdqa %xmm3, %xmm6 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm6, %xmm6 + vpsrld $31, %xmm7, %xmm0 + vpsrld $31, %xmm6, %xmm1 + vpslld $0x01, 
%xmm7, %xmm7 + vpslld $0x01, %xmm6, %xmm6 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm6, %xmm6 + vpor %xmm0, %xmm7, %xmm7 + vpor %xmm1, %xmm6, %xmm6 + vpslld $31, %xmm7, %xmm0 + vpslld $30, %xmm7, %xmm1 + vpslld $25, %xmm7, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm7, %xmm7 + vpsrld $0x01, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $7, %xmm7, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm2, %xmm6, %xmm6 +L_AES_GCM_decrypt_avx512_calc_aad_done: + # Calculate counter and H + vpsrlq $63, %xmm5, %xmm9 + vpsllq $0x01, %xmm5, %xmm8 + vpslldq $8, %xmm9, %xmm9 + vpor %xmm9, %xmm8, %xmm8 + vpshufd $0xff, %xmm5, %xmm5 + vpsrad $31, %xmm5, %xmm5 + vpshufb L_avx512_aes_gcm_bswap_epi64(%rip), %xmm4, %xmm4 + vpand L_avx512_aes_gcm_mod2_128(%rip), %xmm5, %xmm5 + vpaddd L_avx512_aes_gcm_one(%rip), %xmm4, %xmm4 + vpxor %xmm8, %xmm5, %xmm5 + vmovdqu %xmm4, 1024(%rsp) + xorl %ebx, %ebx + cmpl $0x100, %r9d + jl L_AES_GCM_decrypt_avx512_done_128 + vmovdqa %xmm6, %xmm2 + # H ^ 1 + vmovdqu %xmm5, (%rsp) + # H ^ 2 + vpclmulqdq $0x00, %xmm5, %xmm5, %xmm8 + vpclmulqdq $0x11, %xmm5, %xmm5, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm0 + vmovdqu %xmm0, 16(%rsp) + # H ^ 3 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm5, %xmm9 + vpxor %xmm5, %xmm9, %xmm9 + vpshufd $0x4e, %xmm0, %xmm10 + vpxor %xmm0, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm5, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm5, %xmm0, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, 
%xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm1 + vmovdqu %xmm1, 32(%rsp) + # H ^ 4 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm3 + vmovdqu %xmm3, 48(%rsp) + # H ^ 5 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 64(%rsp) + # H ^ 6 + vpclmulqdq $0x00, %xmm1, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm1, %xmm1, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 80(%rsp) + # H ^ 7 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm1, %xmm9 + vpxor %xmm1, %xmm9, %xmm9 + vpshufd $0x4e, %xmm3, 
%xmm10 + vpxor %xmm3, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm1, %xmm3, %xmm8 + vpclmulqdq $0x11, %xmm1, %xmm3, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 96(%rsp) + # H ^ 8 + vpclmulqdq $0x00, %xmm3, %xmm3, %xmm8 + vpclmulqdq $0x11, %xmm3, %xmm3, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 112(%rsp) + # H ^ 9 + vmovdqu 48(%rsp), %xmm0 + vmovdqu 64(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 128(%rsp) + # H ^ 10 + vmovdqu 64(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, 
%xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 144(%rsp) + # H ^ 11 + vmovdqu 64(%rsp), %xmm0 + vmovdqu 80(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 160(%rsp) + # H ^ 12 + vmovdqu 80(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 176(%rsp) + # H ^ 13 + vmovdqu 80(%rsp), %xmm0 + vmovdqu 96(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu 
%xmm7, 192(%rsp) + # H ^ 14 + vmovdqu 96(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 208(%rsp) + # H ^ 15 + vmovdqu 96(%rsp), %xmm0 + vmovdqu 112(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 224(%rsp) + # H ^ 16 + vmovdqu 112(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 240(%rsp) + cmpl $0x200, %r9d + jl L_AES_GCM_decrypt_avx512_no_ext + # H ^ 17 + vmovdqu 112(%rsp), %xmm0 + vmovdqu 128(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, 
%xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 256(%rsp) + # H ^ 18 + vmovdqu 128(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 272(%rsp) + # H ^ 19 + vmovdqu 128(%rsp), %xmm0 + vmovdqu 144(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 288(%rsp) + # H ^ 20 + vmovdqu 144(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, 
%xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 304(%rsp) + # H ^ 21 + vmovdqu 144(%rsp), %xmm0 + vmovdqu 160(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 320(%rsp) + # H ^ 22 + vmovdqu 160(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 336(%rsp) + # H ^ 23 + vmovdqu 160(%rsp), %xmm0 + vmovdqu 176(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 352(%rsp) + # H ^ 24 + vmovdqu 176(%rsp), %xmm0 + vpclmulqdq 
$0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 368(%rsp) + # H ^ 25 + vmovdqu 176(%rsp), %xmm0 + vmovdqu 192(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 384(%rsp) + # H ^ 26 + vmovdqu 192(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 400(%rsp) + # H ^ 27 + vmovdqu 192(%rsp), %xmm0 + vmovdqu 208(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa 
L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 416(%rsp) + # H ^ 28 + vmovdqu 208(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 432(%rsp) + # H ^ 29 + vmovdqu 208(%rsp), %xmm0 + vmovdqu 224(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 448(%rsp) + # H ^ 30 + vmovdqu 224(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 464(%rsp) + # H ^ 31 + vmovdqu 224(%rsp), 
%xmm0 + vmovdqu 240(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 480(%rsp) + # H ^ 32 + vmovdqu 240(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 496(%rsp) +L_AES_GCM_decrypt_avx512_no_ext: + vbroadcasti32x4 L_avx512_aes_gcm_bswap_epi64(%rip), %zmm22 + vbroadcasti32x4 L_avx512_aes_gcm_bswap_mask(%rip), %zmm30 + vbroadcasti32x4 L_avx512_aes_gcm_mod2_128(%rip), %zmm31 + vbroadcasti32x4 (%r15), %zmm9 + vbroadcasti32x4 16(%r15), %zmm10 + vbroadcasti32x4 32(%r15), %zmm11 + vbroadcasti32x4 48(%r15), %zmm12 + vbroadcasti32x4 64(%r15), %zmm13 + vbroadcasti32x4 80(%r15), %zmm14 + vbroadcasti32x4 96(%r15), %zmm15 + vbroadcasti32x4 112(%r15), %zmm1 + vbroadcasti32x4 128(%r15), %zmm2 + vbroadcasti32x4 144(%r15), %zmm3 + cmpl $0x200, %r9d + jl L_AES_GCM_decrypt_avx512_no_windows + movl %r9d, %r13d + andl $0xfffffe00, %r13d + vmovdqu64 448(%rsp), %zmm23 + vshufi64x2 $27, %zmm23, %zmm23, %zmm23 + vmovdqu64 384(%rsp), %zmm24 + vshufi64x2 $27, %zmm24, %zmm24, %zmm24 + vmovdqu64 320(%rsp), %zmm25 + vshufi64x2 $27, 
%zmm25, %zmm25, %zmm25 + vmovdqu64 256(%rsp), %zmm26 + vshufi64x2 $27, %zmm26, %zmm26, %zmm26 + vmovdqu64 %zmm23, 512(%rsp) + vmovdqu64 %zmm24, 576(%rsp) + vmovdqu64 %zmm25, 640(%rsp) + vmovdqu64 %zmm26, 704(%rsp) + vmovdqu64 192(%rsp), %zmm23 + vshufi64x2 $27, %zmm23, %zmm23, %zmm23 + vmovdqu64 128(%rsp), %zmm24 + vshufi64x2 $27, %zmm24, %zmm24, %zmm24 + vmovdqu64 64(%rsp), %zmm25 + vshufi64x2 $27, %zmm25, %zmm25, %zmm25 + vmovdqu64 (%rsp), %zmm26 + vshufi64x2 $27, %zmm26, %zmm26, %zmm26 + vmovdqu64 %zmm23, 768(%rsp) + vmovdqu64 %zmm24, 832(%rsp) + vmovdqu64 %zmm25, 896(%rsp) + vmovdqu64 %zmm26, 960(%rsp) + # 512 bytes of input + xorl %r12d, %r12d + leaq (%rdi,%rbx,1), %rax + vpxorq %zmm20, %zmm20, %zmm20 + vinserti32x4 $0x00, %xmm6, %zmm20, %zmm20 + vmovdqu64 512(%rsp), %zmm23 + vmovdqu64 576(%rsp), %zmm24 + vmovdqu64 640(%rsp), %zmm25 + vmovdqu64 704(%rsp), %zmm26 + vmovdqu64 (%rax), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpxorq %zmm20, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17 + vpclmulqdq $16, %zmm23, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19 + vmovdqa64 %zmm16, %zmm27 + vpxorq %zmm17, %zmm18, %zmm28 + vmovdqa64 %zmm19, %zmm29 + vmovdqu64 64(%rax), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17 + vpclmulqdq $16, %zmm24, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 128(%rax), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17 + vpclmulqdq $16, %zmm25, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 192(%rax), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm26, 
%zmm21, %zmm16 + vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17 + vpclmulqdq $16, %zmm26, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 768(%rsp), %zmm23 + vmovdqu64 832(%rsp), %zmm24 + vmovdqu64 896(%rsp), %zmm25 + vmovdqu64 960(%rsp), %zmm26 + vmovdqu64 256(%rax), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17 + vpclmulqdq $16, %zmm23, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 320(%rax), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17 + vpclmulqdq $16, %zmm24, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 384(%rax), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17 + vpclmulqdq $16, %zmm25, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 448(%rax), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17 + vpclmulqdq $16, %zmm26, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vpclmulqdq $0x01, %zmm27, %zmm31, %zmm21 + vpshufd $0x4e, %zmm27, %zmm27 + vpternlogq $0x96, %zmm21, %zmm27, %zmm28 + vpclmulqdq $0x01, %zmm28, %zmm31, %zmm21 + vpshufd $0x4e, %zmm28, %zmm28 + vpternlogq $0x96, %zmm21, %zmm28, %zmm29 + vextracti32x4 $0x01, %zmm29, 
%xmm0 + vextracti32x4 $2, %zmm29, %xmm4 + vextracti32x4 $3, %zmm29, %xmm5 + vpxorq %xmm0, %xmm29, %xmm6 + vpternlogq $0x96, %xmm4, %xmm5, %xmm6 + addl $0x200, %ebx + cmpl %r13d, %ebx + jge L_AES_GCM_decrypt_avx512_last_aes +L_AES_GCM_decrypt_avx512_win_loop: + leaq (%rdi,%rbx,1), %rax + vpxorq %zmm21, %zmm21, %zmm21 + vinserti32x4 $0x00, %xmm6, %zmm21, %zmm21 + vbroadcasti32x4 1024(%rsp), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu 1024(%rsp), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, 1024(%rsp) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vmovdqu64 (%rax), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vpxorq %zmm21, %zmm31, %zmm31 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vpclmulqdq $0x00, 512(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 512(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 512(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 512(%rsp), %zmm31, %zmm26 + vmovdqa64 %zmm23, %zmm27 + vpxorq %zmm24, %zmm25, %zmm28 + vmovdqa64 %zmm26, %zmm29 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vmovdqu64 64(%rax), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vpclmulqdq $0x00, 576(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 576(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 576(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 576(%rsp), %zmm31, %zmm26 + vpxorq 
%zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vmovdqu64 128(%rax), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vpclmulqdq $0x00, 640(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 640(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 640(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 640(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vmovdqu64 192(%rax), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vpclmulqdq $0x00, 704(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 704(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 704(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 704(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %r10d + vbroadcasti32x4 160(%r15), %zmm20 + jl L_AES_GCM_decrypt_avx512_a_il_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %r10d + 
vbroadcasti32x4 192(%r15), %zmm20 + jl L_AES_GCM_decrypt_avx512_a_il_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%r15), %zmm20 +L_AES_GCM_decrypt_avx512_a_il_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + addl $0x100, %r12d + vbroadcasti32x4 1024(%rsp), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu 1024(%rsp), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, 1024(%rsp) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vmovdqu64 256(%rax), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vpclmulqdq $0x00, 768(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 768(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 768(%rsp), %zmm31, %zmm25 + vpclmulqdq 
$0x11, 768(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vmovdqu64 320(%rax), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vpclmulqdq $0x00, 832(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 832(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 832(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 832(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vmovdqu64 384(%rax), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vpclmulqdq $0x00, 896(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 896(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 896(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 896(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vmovdqu64 448(%rax), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vpclmulqdq $0x00, 960(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 960(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 960(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 960(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc 
%zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %r10d + vbroadcasti32x4 160(%r15), %zmm20 + jl L_AES_GCM_decrypt_avx512_b_il_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %r10d + vbroadcasti32x4 192(%r15), %zmm20 + jl L_AES_GCM_decrypt_avx512_b_il_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%r15), %zmm20 +L_AES_GCM_decrypt_avx512_b_il_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + addl $0x100, %r12d + vbroadcasti32x4 L_avx512_aes_gcm_mod2_128(%rip), %zmm31 + vpclmulqdq $0x01, %zmm27, %zmm31, %zmm23 + vpshufd $0x4e, %zmm27, %zmm27 + vpternlogq $0x96, %zmm23, %zmm27, %zmm28 + vpclmulqdq $0x01, %zmm28, %zmm31, %zmm23 + vpshufd $0x4e, %zmm28, %zmm28 + vpternlogq $0x96, %zmm23, %zmm28, %zmm29 + vextracti32x4 
$0x01, %zmm29, %xmm0 + vextracti32x4 $2, %zmm29, %xmm4 + vextracti32x4 $3, %zmm29, %xmm5 + vpxorq %xmm0, %xmm29, %xmm6 + vpternlogq $0x96, %xmm4, %xmm5, %xmm6 + addl $0x200, %ebx + cmpl %r13d, %ebx + jl L_AES_GCM_decrypt_avx512_win_loop +L_AES_GCM_decrypt_avx512_last_aes: + vbroadcasti32x4 1024(%rsp), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu 1024(%rsp), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, 1024(%rsp) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, 
%zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %r10d + vbroadcasti32x4 160(%r15), %zmm20 + jl L_AES_GCM_decrypt_avx512_l1_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %r10d + vbroadcasti32x4 192(%r15), %zmm20 + jl L_AES_GCM_decrypt_avx512_l1_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%r15), %zmm20 +L_AES_GCM_decrypt_avx512_l1_avx512_ctr16_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + addl $0x100, %r12d + vbroadcasti32x4 1024(%rsp), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu 1024(%rsp), %xmm8 + vpaddd 
L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, 1024(%rsp) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %r10d + vbroadcasti32x4 160(%r15), %zmm20 + jl L_AES_GCM_decrypt_avx512_l2_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %r10d + vbroadcasti32x4 192(%r15), %zmm20 + jl L_AES_GCM_decrypt_avx512_l2_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, 
%zmm19, %zmm19 + vbroadcasti32x4 208(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%r15), %zmm20 +L_AES_GCM_decrypt_avx512_l2_avx512_ctr16_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + addl $0x100, %r12d +L_AES_GCM_decrypt_avx512_no_windows: + vmovdqu64 192(%rsp), %zmm23 + vshufi64x2 $27, %zmm23, %zmm23, %zmm23 + vmovdqu64 128(%rsp), %zmm24 + vshufi64x2 $27, %zmm24, %zmm24, %zmm24 + vmovdqu64 64(%rsp), %zmm25 + vshufi64x2 $27, %zmm25, %zmm25, %zmm25 + vmovdqu64 (%rsp), %zmm26 + vshufi64x2 $27, %zmm26, %zmm26, %zmm26 + movl %r9d, %r13d + andl $0xffffff00, %r13d + cmpl %r13d, %ebx + jge L_AES_GCM_decrypt_avx512_after_256 + # 256 bytes of input + leaq (%rdi,%rbx,1), %rax + vpxorq %zmm20, %zmm20, %zmm20 + vinserti32x4 $0x00, %xmm6, %zmm20, %zmm20 + vmovdqu64 (%rax), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpxorq %zmm20, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17 + vpclmulqdq $16, %zmm23, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19 + vmovdqa64 %zmm16, %zmm27 + vpxorq %zmm17, %zmm18, %zmm28 + vmovdqa64 %zmm19, %zmm29 + vmovdqu64 64(%rax), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17 + vpclmulqdq $16, %zmm24, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19 + vpxorq %zmm16, 
%zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 128(%rax), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17 + vpclmulqdq $16, %zmm25, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 192(%rax), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17 + vpclmulqdq $16, %zmm26, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vpclmulqdq $0x01, %zmm27, %zmm31, %zmm21 + vpshufd $0x4e, %zmm27, %zmm27 + vpternlogq $0x96, %zmm21, %zmm27, %zmm28 + vpclmulqdq $0x01, %zmm28, %zmm31, %zmm21 + vpshufd $0x4e, %zmm28, %zmm28 + vpternlogq $0x96, %zmm21, %zmm28, %zmm29 + vextracti32x4 $0x01, %zmm29, %xmm0 + vextracti32x4 $2, %zmm29, %xmm4 + vextracti32x4 $3, %zmm29, %xmm5 + vpxorq %xmm0, %xmm29, %xmm6 + vpternlogq $0x96, %xmm4, %xmm5, %xmm6 + vbroadcasti32x4 1024(%rsp), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu 1024(%rsp), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, 1024(%rsp) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, 
%zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %r10d + vbroadcasti32x4 160(%r15), %zmm20 + jl L_AES_GCM_decrypt_avx512_t_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %r10d + vbroadcasti32x4 192(%r15), %zmm20 + jl L_AES_GCM_decrypt_avx512_t_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%r15), %zmm20 +L_AES_GCM_decrypt_avx512_t_avx512_ctr16_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + 
vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%rdi,%rbx,1), %rcx + leaq (%rsi,%rbx,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + addl $0x100, %ebx +L_AES_GCM_decrypt_avx512_after_256: + vmovdqu (%rsp), %xmm5 +L_AES_GCM_decrypt_avx512_done_128: + movl %r9d, %edx + cmpl %edx, %ebx + jge L_AES_GCM_decrypt_avx512_done_dec + movl %r9d, %r13d + andl $0xfffffff0, %r13d + cmpl %r13d, %ebx + jge L_AES_GCM_decrypt_avx512_last_block_done +L_AES_GCM_decrypt_avx512_last_block_start: + vmovdqu (%rdi,%rbx,1), %xmm13 + vmovdqa %xmm5, %xmm0 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm13, %xmm1 + vpxor %xmm6, %xmm1, %xmm1 + vmovdqu 1024(%rsp), %xmm9 + vpshufb L_avx512_aes_gcm_bswap_epi64(%rip), %xmm9, %xmm8 + vpaddd L_avx512_aes_gcm_one(%rip), %xmm9, %xmm9 + vmovdqu %xmm9, 1024(%rsp) + vpxor (%r15), %xmm8, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm10 + vaesenc 16(%r15), %xmm8, %xmm8 + vaesenc 32(%r15), %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm11 + vaesenc 48(%r15), %xmm8, %xmm8 + vaesenc 64(%r15), %xmm8, %xmm8 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm12 + vaesenc 80(%r15), %xmm8, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm1 + vaesenc 96(%r15), %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpslldq $8, %xmm10, %xmm2 + vpsrldq $8, %xmm10, %xmm10 + vaesenc 112(%r15), %xmm8, %xmm8 + vpxor %xmm12, %xmm2, %xmm2 + vpxor %xmm10, %xmm1, %xmm3 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm0 + vpclmulqdq $16, %xmm0, %xmm2, %xmm11 + vaesenc 128(%r15), %xmm8, %xmm8 + vpshufd $0x4e, %xmm2, %xmm10 + vpxor %xmm11, %xmm10, %xmm10 + vpclmulqdq $16, %xmm0, %xmm10, %xmm11 + vaesenc 144(%r15), %xmm8, %xmm8 + vpshufd $0x4e, %xmm10, %xmm10 + vpxor %xmm11, %xmm10, %xmm10 + 
vpxor %xmm3, %xmm10, %xmm6 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm9 + jl L_AES_GCM_decrypt_avx512_aesenc_gfmul_last + vaesenc %xmm9, %xmm8, %xmm8 + vaesenc 176(%r15), %xmm8, %xmm8 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm9 + jl L_AES_GCM_decrypt_avx512_aesenc_gfmul_last + vaesenc %xmm9, %xmm8, %xmm8 + vaesenc 208(%r15), %xmm8, %xmm8 + vmovdqa 224(%r15), %xmm9 +L_AES_GCM_decrypt_avx512_aesenc_gfmul_last: + vaesenclast %xmm9, %xmm8, %xmm8 + vmovdqa %xmm13, %xmm0 + vpxor %xmm0, %xmm8, %xmm8 + vmovdqu %xmm8, (%rsi,%rbx,1) + addl $16, %ebx + cmpl %r13d, %ebx + jl L_AES_GCM_decrypt_avx512_last_block_start +L_AES_GCM_decrypt_avx512_last_block_done: + movl %r9d, %ecx + movl %ecx, %edx + andl $15, %ecx + jz L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_done + vmovdqu 1024(%rsp), %xmm4 + vpshufb L_avx512_aes_gcm_bswap_epi64(%rip), %xmm4, %xmm4 + vpxor (%r15), %xmm4, %xmm4 + vaesenc 16(%r15), %xmm4, %xmm4 + vaesenc 32(%r15), %xmm4, %xmm4 + vaesenc 48(%r15), %xmm4, %xmm4 + vaesenc 64(%r15), %xmm4, %xmm4 + vaesenc 80(%r15), %xmm4, %xmm4 + vaesenc 96(%r15), %xmm4, %xmm4 + vaesenc 112(%r15), %xmm4, %xmm4 + vaesenc 128(%r15), %xmm4, %xmm4 + vaesenc 144(%r15), %xmm4, %xmm4 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm9 + jl L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_aesenc_avx_last + vaesenc %xmm9, %xmm4, %xmm4 + vaesenc 176(%r15), %xmm4, %xmm4 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm9 + jl L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_aesenc_avx_last + vaesenc %xmm9, %xmm4, %xmm4 + vaesenc 208(%r15), %xmm4, %xmm4 + vmovdqa 224(%r15), %xmm9 +L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_aesenc_avx_last: + vaesenclast %xmm9, %xmm4, %xmm4 + subq $32, %rsp + xorl %ecx, %ecx + vmovdqu %xmm4, (%rsp) + vpxor %xmm0, %xmm0, %xmm0 + vmovdqu %xmm0, 16(%rsp) +L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_loop: + movzbl (%rdi,%rbx,1), %r13d + movb %r13b, 16(%rsp,%rcx,1) + xorb (%rsp,%rcx,1), %r13b + movb %r13b, (%rsi,%rbx,1) + incl %ebx + incl %ecx + cmpl %edx, %ebx + jl 
L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_loop + vmovdqu 16(%rsp), %xmm4 + addq $32, %rsp + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4 + vpxor %xmm4, %xmm6, %xmm6 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm5, %xmm9 + vpxor %xmm5, %xmm9, %xmm9 + vpshufd $0x4e, %xmm6, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm8 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm6 +L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_done: +L_AES_GCM_decrypt_avx512_done_dec: + movl %r9d, %edx + movl %r11d, %ecx + shlq $3, %rdx + shlq $3, %rcx + vmovq %rdx, %xmm0 + vmovq %rcx, %xmm1 + vpunpcklqdq %xmm1, %xmm0, %xmm0 + vpxor %xmm0, %xmm6, %xmm6 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm5, %xmm9 + vpxor %xmm5, %xmm9, %xmm9 + vpshufd $0x4e, %xmm6, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm8 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm6 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm6, %xmm6 + vmovdqu 1040(%rsp), %xmm0 + vpxor %xmm6, %xmm0, %xmm0 + cmpl $16, %r14d + je L_AES_GCM_decrypt_avx512_cmp_tag_16 + subq $16, %rsp + xorq %rcx, %rcx + xorq %rbx, %rbx + vmovdqu %xmm0, (%rsp) +L_AES_GCM_decrypt_avx512_cmp_tag_loop: + movzbl (%rsp,%rcx,1), %r13d + xorb (%r8,%rcx,1), 
%r13b + orb %r13b, %bl + incl %ecx + cmpl %r14d, %ecx + jne L_AES_GCM_decrypt_avx512_cmp_tag_loop + cmpb $0x00, %bl + sete %bl + addq $16, %rsp + xorq %rcx, %rcx + jmp L_AES_GCM_decrypt_avx512_cmp_tag_done +L_AES_GCM_decrypt_avx512_cmp_tag_16: + vmovdqu (%r8), %xmm1 + vpcmpeqb %xmm1, %xmm0, %xmm0 + vpmovmskb %xmm0, %rdx + # %%edx == 0xFFFF then return 1 else => return 0 + xorl %ebx, %ebx + cmpl $0xffff, %edx + sete %bl +L_AES_GCM_decrypt_avx512_cmp_tag_done: + movl %ebx, (%rbp) + vzeroupper + addq $0x420, %rsp + popq %rbp + popq %r15 + popq %r14 + popq %rbx + popq %r12 + popq %r13 + repz retq +#ifndef __APPLE__ +.size AES_GCM_decrypt_avx512,.-AES_GCM_decrypt_avx512 +#endif /* __APPLE__ */ +#ifdef WOLFSSL_AESGCM_STREAM +#ifndef __APPLE__ +.text +.globl AES_GCM_init_avx512 +.type AES_GCM_init_avx512,@function +.align 16 +AES_GCM_init_avx512: +#else +.section __TEXT,__text +.globl _AES_GCM_init_avx512 +.p2align 4 +_AES_GCM_init_avx512: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + movq %rdx, %r10 + movl %ecx, %r11d + movq 24(%rsp), %rax + subq $16, %rsp + vpxor %xmm4, %xmm4, %xmm4 + movl %r11d, %edx + cmpl $12, %edx + jne L_AES_GCM_init_avx512_iv_not_12 + # # Calculate values when IV is 12 bytes + # Set counter based on IV + movl $0x1000000, %ecx + vmovq (%r10), %xmm4 + vpinsrd $2, 8(%r10), %xmm4, %xmm4 + vpinsrd $3, %ecx, %xmm4, %xmm4 + # H = Encrypt X(=0) and T = Encrypt counter + vmovdqa (%rdi), %xmm5 + vpxor %xmm5, %xmm4, %xmm1 + vmovdqa 16(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 32(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 48(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 64(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 80(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 96(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 112(%rdi), 
%xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 128(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 144(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + cmpl $11, %esi + vmovdqa 160(%rdi), %xmm6 + jl L_AES_GCM_init_avx512_calc_iv_12_last + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 176(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + cmpl $13, %esi + vmovdqa 192(%rdi), %xmm6 + jl L_AES_GCM_init_avx512_calc_iv_12_last + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 208(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 224(%rdi), %xmm6 +L_AES_GCM_init_avx512_calc_iv_12_last: + vaesenclast %xmm6, %xmm5, %xmm5 + vaesenclast %xmm6, %xmm1, %xmm1 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5 + vmovdqu %xmm1, %xmm15 + jmp L_AES_GCM_init_avx512_iv_done +L_AES_GCM_init_avx512_iv_not_12: + # Calculate values when IV is not 12 bytes + # H = Encrypt X(=0) + vmovdqa (%rdi), %xmm5 + vaesenc 16(%rdi), %xmm5, %xmm5 + vaesenc 32(%rdi), %xmm5, %xmm5 + vaesenc 48(%rdi), %xmm5, %xmm5 + vaesenc 64(%rdi), %xmm5, %xmm5 + vaesenc 80(%rdi), %xmm5, %xmm5 + vaesenc 96(%rdi), %xmm5, %xmm5 + vaesenc 112(%rdi), %xmm5, %xmm5 + vaesenc 128(%rdi), %xmm5, %xmm5 + vaesenc 144(%rdi), %xmm5, %xmm5 + cmpl $11, %esi + vmovdqa 160(%rdi), %xmm8 + jl L_AES_GCM_init_avx512_calc_iv_1_aesenc_avx_last + vaesenc %xmm8, %xmm5, %xmm5 + vaesenc 176(%rdi), %xmm5, %xmm5 + cmpl $13, %esi + vmovdqa 192(%rdi), %xmm8 + jl L_AES_GCM_init_avx512_calc_iv_1_aesenc_avx_last + vaesenc %xmm8, %xmm5, %xmm5 + vaesenc 208(%rdi), %xmm5, %xmm5 + vmovdqa 224(%rdi), %xmm8 +L_AES_GCM_init_avx512_calc_iv_1_aesenc_avx_last: + vaesenclast %xmm8, %xmm5, %xmm5 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5 + # Calc counter + # Initialization vector + cmpl $0x00, %edx + movq $0x00, %rcx + je L_AES_GCM_init_avx512_calc_iv_done + 
cmpl $16, %edx + jl L_AES_GCM_init_avx512_calc_iv_lt16 + andl $0xfffffff0, %edx +L_AES_GCM_init_avx512_calc_iv_16_loop: + vmovdqu (%r10,%rcx,1), %xmm7 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + vpxor %xmm7, %xmm4, %xmm4 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm4, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm6 + vmovdqa %xmm3, %xmm4 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm6, %xmm6 + vpxor %xmm1, %xmm4, %xmm4 + vpsrld $31, %xmm6, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm6, %xmm6 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm6, %xmm6 + vpor %xmm1, %xmm4, %xmm4 + vpslld $31, %xmm6, %xmm0 + vpslld $30, %xmm6, %xmm1 + vpslld $25, %xmm6, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm6, %xmm6 + vpsrld $0x01, %xmm6, %xmm2 + vpsrld $2, %xmm6, %xmm3 + vpsrld $7, %xmm6, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm6, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_init_avx512_calc_iv_16_loop + movl %r11d, %edx + cmpl %edx, %ecx + je L_AES_GCM_init_avx512_calc_iv_done +L_AES_GCM_init_avx512_calc_iv_lt16: + subq $16, %rsp + vpxor %xmm7, %xmm7, %xmm7 + xorl %r13d, %r13d + vmovdqu %xmm7, (%rsp) +L_AES_GCM_init_avx512_calc_iv_loop: + movzbl (%r10,%rcx,1), %r12d + movb %r12b, (%rsp,%r13,1) + incl %ecx + incl %r13d + cmpl %edx, %ecx + jl L_AES_GCM_init_avx512_calc_iv_loop + vmovdqu (%rsp), %xmm7 + addq $16, %rsp + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm7, 
%xmm7 + vpxor %xmm7, %xmm4, %xmm4 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm4, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm6 + vmovdqa %xmm3, %xmm4 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm6, %xmm6 + vpxor %xmm1, %xmm4, %xmm4 + vpsrld $31, %xmm6, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm6, %xmm6 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm6, %xmm6 + vpor %xmm1, %xmm4, %xmm4 + vpslld $31, %xmm6, %xmm0 + vpslld $30, %xmm6, %xmm1 + vpslld $25, %xmm6, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm6, %xmm6 + vpsrld $0x01, %xmm6, %xmm2 + vpsrld $2, %xmm6, %xmm3 + vpsrld $7, %xmm6, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm6, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 +L_AES_GCM_init_avx512_calc_iv_done: + # T = Encrypt counter + vpxor %xmm0, %xmm0, %xmm0 + shll $3, %edx + vmovq %rdx, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm4, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm6 + vmovdqa %xmm3, %xmm4 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm6, %xmm6 + vpxor %xmm1, %xmm4, %xmm4 + vpsrld $31, %xmm6, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm6, %xmm6 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, 
%xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm6, %xmm6 + vpor %xmm1, %xmm4, %xmm4 + vpslld $31, %xmm6, %xmm0 + vpslld $30, %xmm6, %xmm1 + vpslld $25, %xmm6, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm6, %xmm6 + vpsrld $0x01, %xmm6, %xmm2 + vpsrld $2, %xmm6, %xmm3 + vpsrld $7, %xmm6, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm6, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4 + # Encrypt counter + vmovdqa (%rdi), %xmm7 + vpxor %xmm4, %xmm7, %xmm7 + vaesenc 16(%rdi), %xmm7, %xmm7 + vaesenc 32(%rdi), %xmm7, %xmm7 + vaesenc 48(%rdi), %xmm7, %xmm7 + vaesenc 64(%rdi), %xmm7, %xmm7 + vaesenc 80(%rdi), %xmm7, %xmm7 + vaesenc 96(%rdi), %xmm7, %xmm7 + vaesenc 112(%rdi), %xmm7, %xmm7 + vaesenc 128(%rdi), %xmm7, %xmm7 + vaesenc 144(%rdi), %xmm7, %xmm7 + cmpl $11, %esi + vmovdqa 160(%rdi), %xmm8 + jl L_AES_GCM_init_avx512_calc_iv_2_aesenc_avx_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 176(%rdi), %xmm7, %xmm7 + cmpl $13, %esi + vmovdqa 192(%rdi), %xmm8 + jl L_AES_GCM_init_avx512_calc_iv_2_aesenc_avx_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 208(%rdi), %xmm7, %xmm7 + vmovdqa 224(%rdi), %xmm8 +L_AES_GCM_init_avx512_calc_iv_2_aesenc_avx_last: + vaesenclast %xmm8, %xmm7, %xmm7 + vmovdqu %xmm7, %xmm15 +L_AES_GCM_init_avx512_iv_done: + vmovdqa %xmm15, (%rax) + vpshufb L_avx512_aes_gcm_bswap_epi64(%rip), %xmm4, %xmm4 + vpaddd L_avx512_aes_gcm_one(%rip), %xmm4, %xmm4 + vmovdqa %xmm5, (%r8) + vmovdqa %xmm4, (%r9) + addq $16, %rsp + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size AES_GCM_init_avx512,.-AES_GCM_init_avx512 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_GCM_aad_update_avx512 +.type AES_GCM_aad_update_avx512,@function +.align 16 +AES_GCM_aad_update_avx512: +#else
+.section __TEXT,__text +.globl _AES_GCM_aad_update_avx512 +.p2align 4 +_AES_GCM_aad_update_avx512: +#endif /* __APPLE__ */ + movq %rcx, %rax /* AES_GCM_aad_update_avx512: for each 16-byte block read from (%rdi), XOR the shuffled block into the 128-bit value held at (%rdx) and carry-less multiply that by the 128-bit value at (%rcx); %esi is the byte count. rcx is copied to rax here so that ecx can be reused as the byte offset. NOTE(review): presumably (%rdx) is the GHASH tag state and (%rcx) the hash key H - confirm against the C prototype */ + vmovdqa (%rdx), %xmm5 /* xmm5 = running 128-bit state; vmovdqa requires (%rdx) to be 16-byte aligned */ + vmovdqa (%rax), %xmm6 /* xmm6 = multiplier, not modified inside the loop; also needs 16-byte alignment */ + xorl %ecx, %ecx /* ecx = byte offset into the input, starts at 0 */ +L_AES_GCM_aad_update_avx512_16_loop: + vmovdqu (%rdi,%rcx,1), %xmm7 /* unaligned load of the next 16 input bytes */ + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 /* reorder bytes with the bswap mask before mixing into the state */ + vpxor %xmm7, %xmm5, %xmm5 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm5, %xmm1 /* 0x4e swaps the two 64-bit halves */ + vpshufd $0x4e, %xmm6, %xmm2 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm3 /* high 64 x high 64 */ + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm0 /* low 64 x low 64 */ + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm6, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 /* (hi^lo) x (hi^lo): Karatsuba middle term, corrected by the two XORs below */ + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm4 + vmovdqa %xmm3, %xmm5 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm5, %xmm5 /* xmm5:xmm4 now hold the 256-bit product as high:low */ + vpsrld $31, %xmm4, %xmm0 /* collect the top bit of every 32-bit lane so it can be carried into the next lane */ + vpsrld $31, %xmm5, %xmm1 + vpslld $0x01, %xmm4, %xmm4 + vpslld $0x01, %xmm5, %xmm5 + vpsrldq $12, %xmm0, %xmm2 /* carry out of the low 128 bits, destined for the bottom of the high 128 bits */ + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm5, %xmm5 + vpor %xmm0, %xmm4, %xmm4 + vpor %xmm1, %xmm5, %xmm5 /* xmm5:xmm4 = product shifted left by one bit */ + vpslld $31, %xmm4, %xmm0 /* fold the low 128 bits into the high 128 bits using shift counts 31/30/25 and then 1/2/7; presumably the reduction modulo the GHASH polynomial - confirm against the generator script */ + vpslld $30, %xmm4, %xmm1 + vpslld $25, %xmm4, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm3 + vpsrld $7, %xmm4, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 /* xmm5 = 128-bit result, used as the state for the next block */ + addl $16, %ecx + cmpl %esi, %ecx + jl L_AES_GCM_aad_update_avx512_16_loop /* NOTE(review): the count is tested after the body with a signed compare, so one full block is always read even when %esi is 0, and a count that is not a multiple of 16 reads past the end - confirm callers only pass a non-zero multiple of 16 */ + vmovdqa %xmm5, (%rdx) /* write the updated state back */ + repz retq /* NOTE(review): returns without vzeroupper, unlike AES_GCM_encrypt_block_avx512 and AES_GCM_ghash_block_avx512; only xmm registers are written in this function */ +#ifndef __APPLE__ +.size AES_GCM_aad_update_avx512,.-AES_GCM_aad_update_avx512 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_GCM_encrypt_block_avx512 +.type AES_GCM_encrypt_block_avx512,@function +.align 16 +AES_GCM_encrypt_block_avx512: +#else +.section __TEXT,__text +.globl
_AES_GCM_encrypt_block_avx512 +.p2align 4 +_AES_GCM_encrypt_block_avx512: +#endif /* __APPLE__ */ + movq %rdx, %r10 /* AES_GCM_encrypt_block_avx512: encrypt one counter block with the round keys at (%rdi), XOR the result with 16 bytes read from (%rcx) and store 16 bytes to (%rdx); the counter at (%r8) is advanced in place and %esi selects 10, 12 or 14 AES rounds. r10 = output pointer */ + movq %rcx, %r11 /* r11 = input pointer */ + vmovdqu (%r8), %xmm1 /* xmm1 = stored counter block */ + vpshufb L_avx512_aes_gcm_bswap_epi64(%rip), %xmm1, %xmm0 /* xmm0 = counter shuffled with the bswap_epi64 mask; this is the block that gets encrypted */ + vpaddd L_avx512_aes_gcm_one(%rip), %xmm1, %xmm1 /* 32-bit lane add of the constant L_avx512_aes_gcm_one to the unshuffled counter */ + vmovdqu %xmm1, (%r8) /* store the advanced counter for the next call */ + vpxor (%rdi), %xmm0, %xmm0 /* initial AddRoundKey with the first round key */ + vaesenc 16(%rdi), %xmm0, %xmm0 + vaesenc 32(%rdi), %xmm0, %xmm0 + vaesenc 48(%rdi), %xmm0, %xmm0 + vaesenc 64(%rdi), %xmm0, %xmm0 + vaesenc 80(%rdi), %xmm0, %xmm0 + vaesenc 96(%rdi), %xmm0, %xmm0 + vaesenc 112(%rdi), %xmm0, %xmm0 + vaesenc 128(%rdi), %xmm0, %xmm0 + vaesenc 144(%rdi), %xmm0, %xmm0 + cmpl $11, %esi /* rounds below 11: the key at offset 160 is the final round key; the flags survive the vmovdqa that follows */ + vmovdqa 160(%rdi), %xmm1 /* aligned load, so the key schedule must be 16-byte aligned */ + jl L_AES_GCM_encrypt_block_avx512_aesenc_block_last + vaesenc %xmm1, %xmm0, %xmm0 + vaesenc 176(%rdi), %xmm0, %xmm0 + cmpl $13, %esi /* rounds below 13: the key at offset 192 is the final round key */ + vmovdqa 192(%rdi), %xmm1 + jl L_AES_GCM_encrypt_block_avx512_aesenc_block_last + vaesenc %xmm1, %xmm0, %xmm0 + vaesenc 208(%rdi), %xmm0, %xmm0 + vmovdqa 224(%rdi), %xmm1 /* otherwise the key at offset 224 is the final round key */ +L_AES_GCM_encrypt_block_avx512_aesenc_block_last: + vaesenclast %xmm1, %xmm0, %xmm0 /* xmm0 = encrypted counter block */ + vmovdqu (%r11), %xmm1 /* unaligned load of 16 input bytes */ + vpxor %xmm1, %xmm0, %xmm0 + vmovdqu %xmm0, (%r10) /* unaligned store of the 16 output bytes */ + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 /* NOTE(review): xmm0 is shuffled after the store and is not written to memory before the return - looks like a leftover from a variant that hashes the block inline; confirm it is dead */ + vzeroupper + repz retq +#ifndef __APPLE__ +.size AES_GCM_encrypt_block_avx512,.-AES_GCM_encrypt_block_avx512 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_GCM_ghash_block_avx512 +.type AES_GCM_ghash_block_avx512,@function +.align 16 +AES_GCM_ghash_block_avx512: +#else +.section __TEXT,__text +.globl _AES_GCM_ghash_block_avx512 +.p2align 4 +_AES_GCM_ghash_block_avx512: +#endif /* __APPLE__ */ + vmovdqa (%rsi), %xmm4 /* AES_GCM_ghash_block_avx512: XOR one shuffled 16-byte block from (%rdi) into the 128-bit state at (%rsi), carry-less multiply by the 128-bit value at (%rdx) and store the result back to (%rsi). xmm4 = state; vmovdqa requires 16-byte alignment. NOTE(review): presumably (%rdx) is the hash key H - confirm against the C prototype */ + vmovdqa (%rdx), %xmm5 /* xmm5 = multiplier, aligned load */ + vmovdqu (%rdi), %xmm7 /* unaligned load of the 16-byte data block */ + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + vpxor %xmm7, %xmm4, %xmm4 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm4, %xmm1 /* 0x4e swaps the two 64-bit halves */ + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 /* high 64 x high 64 */ + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 /* low 64 x low 64 */ + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00,
%xmm2, %xmm1, %xmm1 /* (hi^lo) x (hi^lo): Karatsuba middle term, corrected by the two XORs below */ + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm6 + vmovdqa %xmm3, %xmm4 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm6, %xmm6 + vpxor %xmm1, %xmm4, %xmm4 /* xmm4:xmm6 now hold the 256-bit product as high:low */ + vpsrld $31, %xmm6, %xmm0 /* collect the top bit of every 32-bit lane so it can be carried into the next lane */ + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm6, %xmm6 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 /* carry out of the low 128 bits, destined for the bottom of the high 128 bits */ + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm6, %xmm6 + vpor %xmm1, %xmm4, %xmm4 /* xmm4:xmm6 = product shifted left by one bit */ + vpslld $31, %xmm6, %xmm0 /* fold the low 128 bits into the high 128 bits using shift counts 31/30/25 and then 1/2/7; presumably the reduction modulo the GHASH polynomial - confirm against the generator script */ + vpslld $30, %xmm6, %xmm1 + vpslld $25, %xmm6, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm6, %xmm6 + vpsrld $0x01, %xmm6, %xmm2 + vpsrld $2, %xmm6, %xmm3 + vpsrld $7, %xmm6, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm6, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 /* xmm4 = 128-bit result */ + vmovdqa %xmm4, (%rsi) /* write the updated state back, aligned store */ + vzeroupper + repz retq +#ifndef __APPLE__ +.size AES_GCM_ghash_block_avx512,.-AES_GCM_ghash_block_avx512 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_GCM_encrypt_update_avx512 +.type AES_GCM_encrypt_update_avx512,@function +.align 16 +AES_GCM_encrypt_update_avx512: +#else +.section __TEXT,__text +.globl _AES_GCM_encrypt_update_avx512 +.p2align 4 +_AES_GCM_encrypt_update_avx512: +#endif /* __APPLE__ */ + pushq %r13 + pushq %r12 + pushq %r14 + pushq %r15 + pushq %rbx + pushq %rbp + movq %rdx, %r10 + movq %rcx, %r11 + movq 56(%rsp), %rax + movq 64(%rsp), %r12 + subq $0x410, %rsp + vmovdqa (%r9), %xmm6 + vmovdqa (%rax), %xmm5 + vpsrlq $63, %xmm5, %xmm9 + vpsllq $0x01, %xmm5, %xmm8 + vpslldq $8, %xmm9, %xmm9 + vpor %xmm9, %xmm8, %xmm8 + vpshufd $0xff, %xmm5, %xmm5 + vpsrad $31, %xmm5, %xmm5 + vpand L_avx512_aes_gcm_mod2_128(%rip), %xmm5, %xmm5 + vpxor %xmm8, %xmm5, %xmm5 + xorl %r14d, %r14d + cmpl $0x100, %r8d + jl L_AES_GCM_encrypt_update_avx512_done_128 + vmovdqa %xmm6,
%xmm2 + # H ^ 1 + vmovdqu %xmm5, (%rsp) + # H ^ 2 + vpclmulqdq $0x00, %xmm5, %xmm5, %xmm8 + vpclmulqdq $0x11, %xmm5, %xmm5, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm0 + vmovdqu %xmm0, 16(%rsp) + # H ^ 3 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm5, %xmm9 + vpxor %xmm5, %xmm9, %xmm9 + vpshufd $0x4e, %xmm0, %xmm10 + vpxor %xmm0, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm5, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm5, %xmm0, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm1 + vmovdqu %xmm1, 32(%rsp) + # H ^ 4 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm3 + vmovdqu %xmm3, 48(%rsp) + # H ^ 5 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, 
%xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 64(%rsp) + # H ^ 6 + vpclmulqdq $0x00, %xmm1, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm1, %xmm1, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 80(%rsp) + # H ^ 7 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm1, %xmm9 + vpxor %xmm1, %xmm9, %xmm9 + vpshufd $0x4e, %xmm3, %xmm10 + vpxor %xmm3, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm1, %xmm3, %xmm8 + vpclmulqdq $0x11, %xmm1, %xmm3, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 96(%rsp) + # H ^ 8 + vpclmulqdq $0x00, %xmm3, %xmm3, %xmm8 + vpclmulqdq $0x11, %xmm3, %xmm3, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 112(%rsp) + # H ^ 9 + vmovdqu 48(%rsp), %xmm0 + vmovdqu 64(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 128(%rsp) + # H ^ 10 + vmovdqu 64(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 144(%rsp) + # H ^ 11 + vmovdqu 64(%rsp), %xmm0 + vmovdqu 80(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 160(%rsp) + # H ^ 12 + vmovdqu 80(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, 
%xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 176(%rsp) + # H ^ 13 + vmovdqu 80(%rsp), %xmm0 + vmovdqu 96(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 192(%rsp) + # H ^ 14 + vmovdqu 96(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 208(%rsp) + # H ^ 15 + vmovdqu 96(%rsp), %xmm0 + vmovdqu 112(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 224(%rsp) + # H ^ 16 + vmovdqu 112(%rsp), 
%xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 240(%rsp) + cmpl $0x200, %r8d + jl L_AES_GCM_encrypt_update_avx512_no_ext + # H ^ 17 + vmovdqu 112(%rsp), %xmm0 + vmovdqu 128(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 256(%rsp) + # H ^ 18 + vmovdqu 128(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 272(%rsp) + # H ^ 19 + vmovdqu 128(%rsp), %xmm0 + vmovdqu 144(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, 
%xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 288(%rsp) + # H ^ 20 + vmovdqu 144(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 304(%rsp) + # H ^ 21 + vmovdqu 144(%rsp), %xmm0 + vmovdqu 160(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 320(%rsp) + # H ^ 22 + vmovdqu 160(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, 
%xmm7 + vmovdqu %xmm7, 336(%rsp) + # H ^ 23 + vmovdqu 160(%rsp), %xmm0 + vmovdqu 176(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 352(%rsp) + # H ^ 24 + vmovdqu 176(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 368(%rsp) + # H ^ 25 + vmovdqu 176(%rsp), %xmm0 + vmovdqu 192(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 384(%rsp) + # H ^ 26 + vmovdqu 192(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq 
$0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 400(%rsp) + # H ^ 27 + vmovdqu 192(%rsp), %xmm0 + vmovdqu 208(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 416(%rsp) + # H ^ 28 + vmovdqu 208(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 432(%rsp) + # H ^ 29 + vmovdqu 208(%rsp), %xmm0 + vmovdqu 224(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, 
%xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 448(%rsp) + # H ^ 30 + vmovdqu 224(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 464(%rsp) + # H ^ 31 + vmovdqu 224(%rsp), %xmm0 + vmovdqu 240(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 480(%rsp) + # H ^ 32 + vmovdqu 240(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 496(%rsp) +L_AES_GCM_encrypt_update_avx512_no_ext: + vbroadcasti32x4 
L_avx512_aes_gcm_bswap_epi64(%rip), %zmm22 + vbroadcasti32x4 L_avx512_aes_gcm_bswap_mask(%rip), %zmm30 + vbroadcasti32x4 L_avx512_aes_gcm_mod2_128(%rip), %zmm31 + vbroadcasti32x4 (%rdi), %zmm9 + vbroadcasti32x4 16(%rdi), %zmm10 + vbroadcasti32x4 32(%rdi), %zmm11 + vbroadcasti32x4 48(%rdi), %zmm12 + vbroadcasti32x4 64(%rdi), %zmm13 + vbroadcasti32x4 80(%rdi), %zmm14 + vbroadcasti32x4 96(%rdi), %zmm15 + vbroadcasti32x4 112(%rdi), %zmm1 + vbroadcasti32x4 128(%rdi), %zmm2 + vbroadcasti32x4 144(%rdi), %zmm3 + cmpl $0x200, %r8d + jl L_AES_GCM_encrypt_update_avx512_no_windows + movl %r8d, %ebp + andl $0xfffffe00, %ebp + vmovdqu64 448(%rsp), %zmm23 + vshufi64x2 $27, %zmm23, %zmm23, %zmm23 + vmovdqu64 384(%rsp), %zmm24 + vshufi64x2 $27, %zmm24, %zmm24, %zmm24 + vmovdqu64 320(%rsp), %zmm25 + vshufi64x2 $27, %zmm25, %zmm25, %zmm25 + vmovdqu64 256(%rsp), %zmm26 + vshufi64x2 $27, %zmm26, %zmm26, %zmm26 + vmovdqu64 %zmm23, 512(%rsp) + vmovdqu64 %zmm24, 576(%rsp) + vmovdqu64 %zmm25, 640(%rsp) + vmovdqu64 %zmm26, 704(%rsp) + vmovdqu64 192(%rsp), %zmm23 + vshufi64x2 $27, %zmm23, %zmm23, %zmm23 + vmovdqu64 128(%rsp), %zmm24 + vshufi64x2 $27, %zmm24, %zmm24, %zmm24 + vmovdqu64 64(%rsp), %zmm25 + vshufi64x2 $27, %zmm25, %zmm25, %zmm25 + vmovdqu64 (%rsp), %zmm26 + vshufi64x2 $27, %zmm26, %zmm26, %zmm26 + vmovdqu64 %zmm23, 768(%rsp) + vmovdqu64 %zmm24, 832(%rsp) + vmovdqu64 %zmm25, 896(%rsp) + vmovdqu64 %zmm26, 960(%rsp) + # 512 bytes of input + leaq (%r10,%r14,1), %r15 + vbroadcasti32x4 (%r12), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu (%r12), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, (%r12) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq 
%zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %esi + vbroadcasti32x4 160(%rdi), %zmm20 + jl L_AES_GCM_encrypt_update_avx512_p1_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %esi + vbroadcasti32x4 192(%rdi), %zmm20 + jl L_AES_GCM_encrypt_update_avx512_p1_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc 
%zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%rdi), %zmm20 +L_AES_GCM_encrypt_update_avx512_p1_avx512_ctr16_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%r11,%r14,1), %rcx + leaq (%r10,%r14,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + addl $0x100, %r14d + vbroadcasti32x4 (%r12), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu (%r12), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, (%r12) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, 
%zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %esi + vbroadcasti32x4 160(%rdi), %zmm20 + jl L_AES_GCM_encrypt_update_avx512_p2_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %esi + vbroadcasti32x4 192(%rdi), %zmm20 + jl L_AES_GCM_encrypt_update_avx512_p2_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%rdi), %zmm20 +L_AES_GCM_encrypt_update_avx512_p2_avx512_ctr16_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%r11,%r14,1), %rcx + leaq (%r10,%r14,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, 
%zmm19 + vmovdqu64 %zmm19, 192(%rdx) + addl $0x100, %r14d + cmpl %ebp, %r14d + jge L_AES_GCM_encrypt_update_avx512_last_win +L_AES_GCM_encrypt_update_avx512_win_loop: + leaq (%r10,%r14,1), %rbx + vpxorq %zmm21, %zmm21, %zmm21 + vinserti32x4 $0x00, %xmm6, %zmm21, %zmm21 + vbroadcasti32x4 (%r12), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu (%r12), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, (%r12) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vmovdqu64 (%r15), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vpxorq %zmm21, %zmm31, %zmm31 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vpclmulqdq $0x00, 512(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 512(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 512(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 512(%rsp), %zmm31, %zmm26 + vmovdqa64 %zmm23, %zmm27 + vpxorq %zmm24, %zmm25, %zmm28 + vmovdqa64 %zmm26, %zmm29 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vmovdqu64 64(%r15), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vpclmulqdq $0x00, 576(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 576(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 576(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 576(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc 
%zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vmovdqu64 128(%r15), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vpclmulqdq $0x00, 640(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 640(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 640(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 640(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vmovdqu64 192(%r15), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vpclmulqdq $0x00, 704(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 704(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 704(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 704(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %esi + vbroadcasti32x4 160(%rdi), %zmm20 + jl L_AES_GCM_encrypt_update_avx512_a_il_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %esi + vbroadcasti32x4 192(%rdi), %zmm20 + jl L_AES_GCM_encrypt_update_avx512_a_il_last + vaesenc %zmm20, %zmm16, 
%zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%rdi), %zmm20 +L_AES_GCM_encrypt_update_avx512_a_il_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%r11,%r14,1), %rcx + leaq (%r10,%r14,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + addl $0x100, %r14d + vbroadcasti32x4 (%r12), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu (%r12), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, (%r12) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vmovdqu64 256(%r15), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vpclmulqdq $0x00, 768(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 768(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 768(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 768(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 
+ vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vmovdqu64 320(%r15), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vpclmulqdq $0x00, 832(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 832(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 832(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 832(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vmovdqu64 384(%r15), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vpclmulqdq $0x00, 896(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 896(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 896(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 896(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vmovdqu64 448(%r15), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vpclmulqdq $0x00, 960(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 960(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 960(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 960(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, 
%zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %esi + vbroadcasti32x4 160(%rdi), %zmm20 + jl L_AES_GCM_encrypt_update_avx512_b_il_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %esi + vbroadcasti32x4 192(%rdi), %zmm20 + jl L_AES_GCM_encrypt_update_avx512_b_il_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%rdi), %zmm20 +L_AES_GCM_encrypt_update_avx512_b_il_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%r11,%r14,1), %rcx + leaq (%r10,%r14,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + addl $0x100, %r14d + vbroadcasti32x4 L_avx512_aes_gcm_mod2_128(%rip), %zmm31 + vpclmulqdq $0x01, %zmm27, %zmm31, %zmm23 + vpshufd $0x4e, %zmm27, %zmm27 + vpternlogq $0x96, %zmm23, %zmm27, %zmm28 + vpclmulqdq $0x01, %zmm28, %zmm31, %zmm23 + vpshufd $0x4e, %zmm28, %zmm28 + vpternlogq $0x96, %zmm23, %zmm28, %zmm29 + vextracti32x4 $0x01, %zmm29, %xmm0 + vextracti32x4 $2, %zmm29, %xmm4 + vextracti32x4 $3, %zmm29, %xmm5 + 
vpxorq %xmm0, %xmm29, %xmm6 + vpternlogq $0x96, %xmm4, %xmm5, %xmm6 + movq %rbx, %r15 + cmpl %ebp, %r14d + jl L_AES_GCM_encrypt_update_avx512_win_loop +L_AES_GCM_encrypt_update_avx512_last_win: + vpxorq %zmm20, %zmm20, %zmm20 + vinserti32x4 $0x00, %xmm6, %zmm20, %zmm20 + vmovdqu64 512(%rsp), %zmm23 + vmovdqu64 576(%rsp), %zmm24 + vmovdqu64 640(%rsp), %zmm25 + vmovdqu64 704(%rsp), %zmm26 + vmovdqu64 (%r15), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpxorq %zmm20, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17 + vpclmulqdq $16, %zmm23, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19 + vmovdqa64 %zmm16, %zmm27 + vpxorq %zmm17, %zmm18, %zmm28 + vmovdqa64 %zmm19, %zmm29 + vmovdqu64 64(%r15), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17 + vpclmulqdq $16, %zmm24, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 128(%r15), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17 + vpclmulqdq $16, %zmm25, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 192(%r15), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17 + vpclmulqdq $16, %zmm26, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 768(%rsp), %zmm23 + vmovdqu64 832(%rsp), %zmm24 + vmovdqu64 896(%rsp), %zmm25 + vmovdqu64 960(%rsp), %zmm26 + vmovdqu64 256(%r15), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16 + vpclmulqdq 
$0x01, %zmm23, %zmm21, %zmm17 + vpclmulqdq $16, %zmm23, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 320(%r15), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17 + vpclmulqdq $16, %zmm24, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 384(%r15), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17 + vpclmulqdq $16, %zmm25, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 448(%r15), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17 + vpclmulqdq $16, %zmm26, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vpclmulqdq $0x01, %zmm27, %zmm31, %zmm21 + vpshufd $0x4e, %zmm27, %zmm27 + vpternlogq $0x96, %zmm21, %zmm27, %zmm28 + vpclmulqdq $0x01, %zmm28, %zmm31, %zmm21 + vpshufd $0x4e, %zmm28, %zmm28 + vpternlogq $0x96, %zmm21, %zmm28, %zmm29 + vextracti32x4 $0x01, %zmm29, %xmm0 + vextracti32x4 $2, %zmm29, %xmm4 + vextracti32x4 $3, %zmm29, %xmm5 + vpxorq %xmm0, %xmm29, %xmm6 + vpternlogq $0x96, %xmm4, %xmm5, %xmm6 +L_AES_GCM_encrypt_update_avx512_no_windows: + vmovdqu64 192(%rsp), %zmm23 + vshufi64x2 $27, %zmm23, %zmm23, %zmm23 + vmovdqu64 128(%rsp), %zmm24 + vshufi64x2 $27, %zmm24, %zmm24, %zmm24 + vmovdqu64 64(%rsp), %zmm25 + vshufi64x2 $27, %zmm25, %zmm25, %zmm25 + vmovdqu64 (%rsp), %zmm26 + vshufi64x2 $27, %zmm26, %zmm26, %zmm26 + movl %r8d, %r13d + 
andl $0xffffff00, %r13d + cmpl %r13d, %r14d + jge L_AES_GCM_encrypt_update_avx512_after_256 + # 256 bytes of input + vbroadcasti32x4 (%r12), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu (%r12), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, (%r12) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %esi + vbroadcasti32x4 160(%rdi), %zmm20 + jl 
L_AES_GCM_encrypt_update_avx512_pro_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %esi + vbroadcasti32x4 192(%rdi), %zmm20 + jl L_AES_GCM_encrypt_update_avx512_pro_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%rdi), %zmm20 +L_AES_GCM_encrypt_update_avx512_pro_avx512_ctr16_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%r11,%r14,1), %rcx + leaq (%r10,%r14,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + movq %rdx, %r15 + addl $0x100, %r14d + cmpl %r13d, %r14d + jge L_AES_GCM_encrypt_update_avx512_last_ghash +L_AES_GCM_encrypt_update_avx512_ghash_128: + vbroadcasti32x4 (%r12), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu (%r12), %xmm8 + vpaddd 
L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, (%r12) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %esi + vbroadcasti32x4 160(%rdi), %zmm20 + jl L_AES_GCM_encrypt_update_avx512_pip_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %esi + vbroadcasti32x4 192(%rdi), %zmm20 + jl L_AES_GCM_encrypt_update_avx512_pip_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + 
vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%rdi), %zmm20 +L_AES_GCM_encrypt_update_avx512_pip_avx512_ctr16_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%r11,%r14,1), %rcx + leaq (%r10,%r14,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + vpxorq %zmm20, %zmm20, %zmm20 + vinserti32x4 $0x00, %xmm6, %zmm20, %zmm20 + vmovdqu64 (%r15), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpxorq %zmm20, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17 + vpclmulqdq $16, %zmm23, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19 + vmovdqa64 %zmm16, %zmm27 + vpxorq %zmm17, %zmm18, %zmm28 + vmovdqa64 %zmm19, %zmm29 + vmovdqu64 64(%r15), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17 + vpclmulqdq $16, %zmm24, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 128(%r15), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17 + vpclmulqdq $16, %zmm25, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 192(%r15), %zmm21 + vpshufb 
%zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17 + vpclmulqdq $16, %zmm26, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vpclmulqdq $0x01, %zmm27, %zmm31, %zmm21 + vpshufd $0x4e, %zmm27, %zmm27 + vpternlogq $0x96, %zmm21, %zmm27, %zmm28 + vpclmulqdq $0x01, %zmm28, %zmm31, %zmm21 + vpshufd $0x4e, %zmm28, %zmm28 + vpternlogq $0x96, %zmm21, %zmm28, %zmm29 + vextracti32x4 $0x01, %zmm29, %xmm0 + vextracti32x4 $2, %zmm29, %xmm4 + vextracti32x4 $3, %zmm29, %xmm5 + vpxorq %xmm0, %xmm29, %xmm6 + vpternlogq $0x96, %xmm4, %xmm5, %xmm6 + movq %rdx, %r15 + addl $0x100, %r14d + cmpl %r13d, %r14d + jl L_AES_GCM_encrypt_update_avx512_ghash_128 +L_AES_GCM_encrypt_update_avx512_last_ghash: + vpxorq %zmm20, %zmm20, %zmm20 + vinserti32x4 $0x00, %xmm6, %zmm20, %zmm20 + vmovdqu64 (%r15), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpxorq %zmm20, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17 + vpclmulqdq $16, %zmm23, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19 + vmovdqa64 %zmm16, %zmm27 + vpxorq %zmm17, %zmm18, %zmm28 + vmovdqa64 %zmm19, %zmm29 + vmovdqu64 64(%r15), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17 + vpclmulqdq $16, %zmm24, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 128(%r15), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17 + vpclmulqdq $16, %zmm25, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 192(%r15), %zmm21 + 
vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17 + vpclmulqdq $16, %zmm26, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vpclmulqdq $0x01, %zmm27, %zmm31, %zmm21 + vpshufd $0x4e, %zmm27, %zmm27 + vpternlogq $0x96, %zmm21, %zmm27, %zmm28 + vpclmulqdq $0x01, %zmm28, %zmm31, %zmm21 + vpshufd $0x4e, %zmm28, %zmm28 + vpternlogq $0x96, %zmm21, %zmm28, %zmm29 + vextracti32x4 $0x01, %zmm29, %xmm0 + vextracti32x4 $2, %zmm29, %xmm4 + vextracti32x4 $3, %zmm29, %xmm5 + vpxorq %xmm0, %xmm29, %xmm6 + vpternlogq $0x96, %xmm4, %xmm5, %xmm6 +L_AES_GCM_encrypt_update_avx512_after_256: + vmovdqu (%rsp), %xmm5 +L_AES_GCM_encrypt_update_avx512_done_128: + movl %r8d, %edx + cmpl %edx, %r14d + jge L_AES_GCM_encrypt_update_avx512_done_enc + movl %r8d, %r13d + andl $0xfffffff0, %r13d + cmpl %r13d, %r14d + jge L_AES_GCM_encrypt_update_avx512_last_block_done + vmovdqu (%r12), %xmm9 + vpshufb L_avx512_aes_gcm_bswap_epi64(%rip), %xmm9, %xmm8 + vpaddd L_avx512_aes_gcm_one(%rip), %xmm9, %xmm9 + vmovdqu %xmm9, (%r12) + vpxor (%rdi), %xmm8, %xmm8 + vaesenc 16(%rdi), %xmm8, %xmm8 + vaesenc 32(%rdi), %xmm8, %xmm8 + vaesenc 48(%rdi), %xmm8, %xmm8 + vaesenc 64(%rdi), %xmm8, %xmm8 + vaesenc 80(%rdi), %xmm8, %xmm8 + vaesenc 96(%rdi), %xmm8, %xmm8 + vaesenc 112(%rdi), %xmm8, %xmm8 + vaesenc 128(%rdi), %xmm8, %xmm8 + vaesenc 144(%rdi), %xmm8, %xmm8 + cmpl $11, %esi + vmovdqa 160(%rdi), %xmm9 + jl L_AES_GCM_encrypt_update_avx512_aesenc_block_last + vaesenc %xmm9, %xmm8, %xmm8 + vaesenc 176(%rdi), %xmm8, %xmm8 + cmpl $13, %esi + vmovdqa 192(%rdi), %xmm9 + jl L_AES_GCM_encrypt_update_avx512_aesenc_block_last + vaesenc %xmm9, %xmm8, %xmm8 + vaesenc 208(%rdi), %xmm8, %xmm8 + vmovdqa 224(%rdi), %xmm9 +L_AES_GCM_encrypt_update_avx512_aesenc_block_last: + vaesenclast %xmm9, %xmm8, %xmm8 + vmovdqu (%r11,%r14,1), %xmm9 + vpxor 
%xmm9, %xmm8, %xmm8 + vmovdqu %xmm8, (%r10,%r14,1) + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8 + vpxor %xmm8, %xmm6, %xmm6 + addl $16, %r14d + cmpl %r13d, %r14d + jge L_AES_GCM_encrypt_update_avx512_last_block_ghash +L_AES_GCM_encrypt_update_avx512_last_block_start: + vmovdqu (%r11,%r14,1), %xmm13 + vmovdqu (%r12), %xmm9 + vpshufb L_avx512_aes_gcm_bswap_epi64(%rip), %xmm9, %xmm8 + vpaddd L_avx512_aes_gcm_one(%rip), %xmm9, %xmm9 + vmovdqu %xmm9, (%r12) + vpxor (%rdi), %xmm8, %xmm8 + vpclmulqdq $16, %xmm5, %xmm6, %xmm10 + vaesenc 16(%rdi), %xmm8, %xmm8 + vaesenc 32(%rdi), %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm5, %xmm6, %xmm11 + vaesenc 48(%rdi), %xmm8, %xmm8 + vaesenc 64(%rdi), %xmm8, %xmm8 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm12 + vaesenc 80(%rdi), %xmm8, %xmm8 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm1 + vaesenc 96(%rdi), %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpslldq $8, %xmm10, %xmm2 + vpsrldq $8, %xmm10, %xmm10 + vaesenc 112(%rdi), %xmm8, %xmm8 + vpxor %xmm12, %xmm2, %xmm2 + vpxor %xmm10, %xmm1, %xmm3 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm0 + vpclmulqdq $16, %xmm0, %xmm2, %xmm11 + vaesenc 128(%rdi), %xmm8, %xmm8 + vpshufd $0x4e, %xmm2, %xmm10 + vpxor %xmm11, %xmm10, %xmm10 + vpclmulqdq $16, %xmm0, %xmm10, %xmm11 + vaesenc 144(%rdi), %xmm8, %xmm8 + vpshufd $0x4e, %xmm10, %xmm10 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm3, %xmm10, %xmm6 + cmpl $11, %esi + vmovdqa 160(%rdi), %xmm9 + jl L_AES_GCM_encrypt_update_avx512_aesenc_gfmul_last + vaesenc %xmm9, %xmm8, %xmm8 + vaesenc 176(%rdi), %xmm8, %xmm8 + cmpl $13, %esi + vmovdqa 192(%rdi), %xmm9 + jl L_AES_GCM_encrypt_update_avx512_aesenc_gfmul_last + vaesenc %xmm9, %xmm8, %xmm8 + vaesenc 208(%rdi), %xmm8, %xmm8 + vmovdqa 224(%rdi), %xmm9 +L_AES_GCM_encrypt_update_avx512_aesenc_gfmul_last: + vaesenclast %xmm9, %xmm8, %xmm8 + vmovdqa %xmm13, %xmm0 + vpxor %xmm0, %xmm8, %xmm8 + vmovdqu %xmm8, (%r10,%r14,1) + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8 + addl $16, %r14d + vpxor 
%xmm8, %xmm6, %xmm6
+        cmpl %r13d, %r14d
+        jl L_AES_GCM_encrypt_update_avx512_last_block_start
+L_AES_GCM_encrypt_update_avx512_last_block_ghash:
+        # ghash_gfmul_red_avx
+        vpshufd $0x4e, %xmm5, %xmm9
+        vpxor %xmm5, %xmm9, %xmm9
+        vpshufd $0x4e, %xmm6, %xmm10
+        vpxor %xmm6, %xmm10, %xmm10
+        vpclmulqdq $0x00, %xmm5, %xmm6, %xmm8
+        vpclmulqdq $0x11, %xmm5, %xmm6, %xmm11
+        vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9
+        vpternlogq $0x96, %xmm8, %xmm11, %xmm9
+        vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10
+        vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12
+        vpshufd $0x4e, %xmm8, %xmm8
+        vpternlogq $0x96, %xmm12, %xmm8, %xmm9
+        vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12
+        vpshufd $0x4e, %xmm9, %xmm9
+        vpternlogq $0x96, %xmm12, %xmm9, %xmm11
+        vmovdqa %xmm11, %xmm6
+L_AES_GCM_encrypt_update_avx512_last_block_done:
+L_AES_GCM_encrypt_update_avx512_done_enc:
+        vmovdqa %xmm6, (%r9)
+        vzeroupper
+        addq $0x410, %rsp
+        popq %rbp
+        popq %rbx
+        popq %r15
+        popq %r14
+        popq %r12
+        popq %r13
+        repz retq
+#ifndef __APPLE__
+.size AES_GCM_encrypt_update_avx512,.-AES_GCM_encrypt_update_avx512
+#endif /* __APPLE__ */
+#ifndef __APPLE__ /* AES_GCM_encrypt_final_avx512
+ *
+ * Folds a two-quadword length block into a 128-bit accumulator, multiplies
+ * the accumulator by a 128-bit multiplier in GF(2^128) (carry-less multiply
+ * plus reduction with L_avx512_aes_gcm_mod2_128), byte-shuffles the product,
+ * XORs it with a third 16-byte block and writes the leading bytes of the
+ * result to the output buffer.
+ *
+ * Arguments as the code reads them:
+ *   rdi    pointer to the 16-byte accumulator (vmovdqa: must be 16-byte aligned)
+ *   rsi    output buffer
+ *   edx    number of bytes to write to rsi (16 takes the single-store path)
+ *   ecx    byte count placed, times 8, in the low quadword of the length block
+ *   r8d    byte count placed, times 8, in the high quadword of the length block
+ *   r9     pointer to the 16-byte multiplier (vmovdqa: 16-byte aligned)
+ *   stack  first stack argument: pointer to the 16-byte block XORed into the
+ *          result (vmovdqa: 16-byte aligned)
+ *
+ * NOTE(review): presumably these are, in order, the running GHASH state, the
+ * tag output, the tag size, the ciphertext length, the AAD length, the hash
+ * key H and the encrypted initial counter block - confirm against the C
+ * prototype used by the caller.
+ *
+ * NOTE(review): the byte-copy loop tests its bound after the increment, so it
+ * relies on 1 <= edx <= 16; 0 or a value above 16 would read and write past
+ * the 16-byte scratch area. Presumably the caller validates the tag size.
+ */
+.text
+.globl AES_GCM_encrypt_final_avx512
+.type AES_GCM_encrypt_final_avx512,@function
+.align 16
+AES_GCM_encrypt_final_avx512:
+#else
+.section __TEXT,__text
+.globl _AES_GCM_encrypt_final_avx512
+.p2align 4
+_AES_GCM_encrypt_final_avx512:
+#endif /* __APPLE__ */
+        pushq %r13 /* preserve r13; it is the byte temporary of the copy loop below */
+        movl %edx, %eax /* eax = number of output bytes (edx is reused below) */
+        movl %ecx, %r10d /* park the first byte count; ecx is reused below */
+        movl %r8d, %r11d /* park the second byte count; r8 is reused below */
+        movq 16(%rsp), %r8 /* after the push, 8(%rsp) is the return address and 16(%rsp) the first stack argument */
+        subq $16, %rsp /* 16-byte scratch, used only when fewer than 16 bytes are written */
+        vmovdqa (%rdi), %xmm4 /* xmm4 = accumulator */
+        vmovdqa (%r9), %xmm5 /* xmm5 = multiplier */
+        vmovdqa (%r8), %xmm6 /* xmm6 = block XORed into the final result */
+        vpsrlq $63, %xmm5, %xmm8 /* top bit of each quadword of the multiplier */
+        vpsllq $0x01, %xmm5, %xmm7 /* each quadword shifted left by one bit */
+        vpslldq $8, %xmm8, %xmm8 /* move the low quadword carry up into the high quadword */
+        vpor %xmm8, %xmm7, %xmm7 /* xmm7 = multiplier shifted left by one bit across all 128 bits */
+        vpshufd $0xff, %xmm5, %xmm5 /* broadcast the top dword of the original multiplier */
+        vpsrad $31, %xmm5, %xmm5 /* all-ones if bit 127 was set, otherwise zero */
+        vpand L_avx512_aes_gcm_mod2_128(%rip), %xmm5, %xmm5 /* keep the reduction constant only when bit 127 was shifted out */
+        vpxor %xmm7, %xmm5, %xmm5 /* xmm5 = shifted multiplier with the conditional reduction applied */
+        movl %r10d, %edx /* 32-bit move zero-extends into rdx */
+        movl %r11d, %ecx /* likewise zero-extends into rcx */
+        shlq $3, %rdx /* bytes to bits, in 64 bits so the 32-bit count cannot wrap */
+        shlq $3, %rcx
+        vmovq %rdx, %xmm0
+        vmovq %rcx, %xmm1
+        vpunpcklqdq %xmm1, %xmm0, %xmm0 /* xmm0 = length block: low quadword rdx, high quadword rcx */
+        vpxor %xmm0, %xmm4, %xmm4 /* fold the length block into the accumulator */
+        # ghash_gfmul_red_avx
+        vpshufd $0x4e, %xmm5, %xmm8 /* swap the quadwords of the multiplier */
+        vpxor %xmm5, %xmm8, %xmm8 /* xmm8 = high ^ low of the multiplier (in both halves) */
+        vpshufd $0x4e, %xmm4, %xmm9
+        vpxor %xmm4, %xmm9, %xmm9 /* xmm9 = high ^ low of the accumulator (in both halves) */
+        vpclmulqdq $0x00, %xmm5, %xmm4, %xmm7 /* xmm7 = low x low product */
+        vpclmulqdq $0x11, %xmm5, %xmm4, %xmm10 /* xmm10 = high x high product */
+        vpclmulqdq $0x00, %xmm9, %xmm8, %xmm8 /* product of the two (high ^ low) terms */
+        vpternlogq $0x96, %xmm7, %xmm10, %xmm8 /* 0x96 is a three-way XOR: xmm8 becomes the middle term */
+        vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm9 /* reduction constant */
+        vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 /* first fold: low product times the reduction constant */
+        vpshufd $0x4e, %xmm7, %xmm7
+        vpternlogq $0x96, %xmm11, %xmm7, %xmm8 /* xmm8 ^= fold ^ swapped low product */
+        vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 /* second fold, same pattern on the updated middle term */
+        vpshufd $0x4e, %xmm8, %xmm8
+        vpternlogq $0x96, %xmm11, %xmm8, %xmm10 /* xmm10 ^= fold ^ swapped middle term */
+        vmovdqa %xmm10, %xmm4 /* xmm4 = reduced 128-bit product */
+        vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4 /* byte shuffle by the bswap_mask table (defined elsewhere; presumably a 16-byte reversal) */
+        vpxor %xmm6, %xmm4, %xmm0 /* xmm0 = the 16 bytes to be written out */
+        cmpl $16, %eax
+        je L_AES_GCM_encrypt_final_avx512_store_tag_16 /* exactly 16 bytes requested: one vector store */
+        xorq %rcx, %rcx /* rcx = byte index */
+        vmovdqu %xmm0, (%rsp) /* spill the result to the scratch area for byte-wise copying */
+L_AES_GCM_encrypt_final_avx512_store_tag_loop:
+        movzbl (%rsp,%rcx,1), %r13d
+        movb %r13b, (%rsi,%rcx,1)
+        incl %ecx
+        cmpl %eax, %ecx /* bound is tested after the increment, so at least one byte is always copied */
+        jne L_AES_GCM_encrypt_final_avx512_store_tag_loop
+        jmp L_AES_GCM_encrypt_final_avx512_store_tag_done
+L_AES_GCM_encrypt_final_avx512_store_tag_16:
+        vmovdqu %xmm0, (%rsi) /* unaligned store: no alignment requirement on the output buffer */
+L_AES_GCM_encrypt_final_avx512_store_tag_done:
+        vzeroupper
+        addq $16, %rsp /* release the scratch area */
+        popq %r13
+        repz retq
+#ifndef __APPLE__
+.size AES_GCM_encrypt_final_avx512,.-AES_GCM_encrypt_final_avx512
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl AES_GCM_decrypt_update_avx512
+.type AES_GCM_decrypt_update_avx512,@function
+.align 16
+AES_GCM_decrypt_update_avx512:
+#else
+.section __TEXT,__text
+.globl _AES_GCM_decrypt_update_avx512
+.p2align 4
+_AES_GCM_decrypt_update_avx512:
+#endif /* __APPLE__ */
+        pushq %r13
+        pushq %r12
+        pushq %r14
+        pushq %r15
+        pushq %rbx
+        movq %rdx, %r10
+        movq %rcx, %r11
+        movq 48(%rsp), %rax
+        movq 56(%rsp), %r12
+        subq $0x410, %rsp
+        vmovdqa (%r9), %xmm6
+        vmovdqa (%rax), %xmm5
+        vpsrlq $63, %xmm5, %xmm9
+        vpsllq $0x01, %xmm5, %xmm8
+        vpslldq $8, %xmm9, %xmm9
+        vpor %xmm9, %xmm8, %xmm8
+        vpshufd $0xff, %xmm5, %xmm5
+        vpsrad $31, %xmm5, %xmm5
+        vpand
L_avx512_aes_gcm_mod2_128(%rip), %xmm5, %xmm5 + vpxor %xmm8, %xmm5, %xmm5 + xorl %r14d, %r14d + cmpl $0x100, %r8d + jl L_AES_GCM_decrypt_update_avx512_done_128 + vmovdqa %xmm6, %xmm2 + # H ^ 1 + vmovdqu %xmm5, (%rsp) + # H ^ 2 + vpclmulqdq $0x00, %xmm5, %xmm5, %xmm8 + vpclmulqdq $0x11, %xmm5, %xmm5, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm0 + vmovdqu %xmm0, 16(%rsp) + # H ^ 3 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm5, %xmm9 + vpxor %xmm5, %xmm9, %xmm9 + vpshufd $0x4e, %xmm0, %xmm10 + vpxor %xmm0, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm5, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm5, %xmm0, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm1 + vmovdqu %xmm1, 32(%rsp) + # H ^ 4 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm3 + vmovdqu %xmm3, 48(%rsp) + # H ^ 5 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq 
$0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 64(%rsp) + # H ^ 6 + vpclmulqdq $0x00, %xmm1, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm1, %xmm1, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 80(%rsp) + # H ^ 7 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm1, %xmm9 + vpxor %xmm1, %xmm9, %xmm9 + vpshufd $0x4e, %xmm3, %xmm10 + vpxor %xmm3, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm1, %xmm3, %xmm8 + vpclmulqdq $0x11, %xmm1, %xmm3, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 96(%rsp) + # H ^ 8 + vpclmulqdq $0x00, %xmm3, %xmm3, %xmm8 + vpclmulqdq $0x11, %xmm3, %xmm3, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 112(%rsp) + # H ^ 9 + vmovdqu 48(%rsp), %xmm0 + vmovdqu 64(%rsp), %xmm1 + # 
ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 128(%rsp) + # H ^ 10 + vmovdqu 64(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 144(%rsp) + # H ^ 11 + vmovdqu 64(%rsp), %xmm0 + vmovdqu 80(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 160(%rsp) + # H ^ 12 + vmovdqu 80(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + 
vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 176(%rsp) + # H ^ 13 + vmovdqu 80(%rsp), %xmm0 + vmovdqu 96(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 192(%rsp) + # H ^ 14 + vmovdqu 96(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 208(%rsp) + # H ^ 15 + vmovdqu 96(%rsp), %xmm0 + vmovdqu 112(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, 
%xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 224(%rsp) + # H ^ 16 + vmovdqu 112(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 240(%rsp) + cmpl $0x200, %r8d + jl L_AES_GCM_decrypt_update_avx512_no_ext + # H ^ 17 + vmovdqu 112(%rsp), %xmm0 + vmovdqu 128(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 256(%rsp) + # H ^ 18 + vmovdqu 128(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 272(%rsp) + # H ^ 19 + vmovdqu 128(%rsp), %xmm0 + vmovdqu 144(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, 
%xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 288(%rsp) + # H ^ 20 + vmovdqu 144(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 304(%rsp) + # H ^ 21 + vmovdqu 144(%rsp), %xmm0 + vmovdqu 160(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 320(%rsp) + # H ^ 22 + vmovdqu 160(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, 
%xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 336(%rsp) + # H ^ 23 + vmovdqu 160(%rsp), %xmm0 + vmovdqu 176(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 352(%rsp) + # H ^ 24 + vmovdqu 176(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 368(%rsp) + # H ^ 25 + vmovdqu 176(%rsp), %xmm0 + vmovdqu 192(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + 
vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 384(%rsp) + # H ^ 26 + vmovdqu 192(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 400(%rsp) + # H ^ 27 + vmovdqu 192(%rsp), %xmm0 + vmovdqu 208(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 416(%rsp) + # H ^ 28 + vmovdqu 208(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 432(%rsp) + # H ^ 29 + vmovdqu 208(%rsp), %xmm0 + vmovdqu 224(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + 
vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 448(%rsp) + # H ^ 30 + vmovdqu 224(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 464(%rsp) + # H ^ 31 + vmovdqu 224(%rsp), %xmm0 + vmovdqu 240(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 480(%rsp) + # H ^ 32 + vmovdqu 240(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, 
%xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 496(%rsp) +L_AES_GCM_decrypt_update_avx512_no_ext: + vbroadcasti32x4 L_avx512_aes_gcm_bswap_epi64(%rip), %zmm22 + vbroadcasti32x4 L_avx512_aes_gcm_bswap_mask(%rip), %zmm30 + vbroadcasti32x4 L_avx512_aes_gcm_mod2_128(%rip), %zmm31 + vbroadcasti32x4 (%rdi), %zmm9 + vbroadcasti32x4 16(%rdi), %zmm10 + vbroadcasti32x4 32(%rdi), %zmm11 + vbroadcasti32x4 48(%rdi), %zmm12 + vbroadcasti32x4 64(%rdi), %zmm13 + vbroadcasti32x4 80(%rdi), %zmm14 + vbroadcasti32x4 96(%rdi), %zmm15 + vbroadcasti32x4 112(%rdi), %zmm1 + vbroadcasti32x4 128(%rdi), %zmm2 + vbroadcasti32x4 144(%rdi), %zmm3 + cmpl $0x200, %r8d + jl L_AES_GCM_decrypt_update_avx512_no_windows + movl %r8d, %r13d + andl $0xfffffe00, %r13d + vmovdqu64 448(%rsp), %zmm23 + vshufi64x2 $27, %zmm23, %zmm23, %zmm23 + vmovdqu64 384(%rsp), %zmm24 + vshufi64x2 $27, %zmm24, %zmm24, %zmm24 + vmovdqu64 320(%rsp), %zmm25 + vshufi64x2 $27, %zmm25, %zmm25, %zmm25 + vmovdqu64 256(%rsp), %zmm26 + vshufi64x2 $27, %zmm26, %zmm26, %zmm26 + vmovdqu64 %zmm23, 512(%rsp) + vmovdqu64 %zmm24, 576(%rsp) + vmovdqu64 %zmm25, 640(%rsp) + vmovdqu64 %zmm26, 704(%rsp) + vmovdqu64 192(%rsp), %zmm23 + vshufi64x2 $27, %zmm23, %zmm23, %zmm23 + vmovdqu64 128(%rsp), %zmm24 + vshufi64x2 $27, %zmm24, %zmm24, %zmm24 + vmovdqu64 64(%rsp), %zmm25 + vshufi64x2 $27, %zmm25, %zmm25, %zmm25 + vmovdqu64 (%rsp), %zmm26 + vshufi64x2 $27, %zmm26, %zmm26, %zmm26 + vmovdqu64 %zmm23, 768(%rsp) + vmovdqu64 %zmm24, 832(%rsp) + vmovdqu64 %zmm25, 896(%rsp) + vmovdqu64 %zmm26, 960(%rsp) + # 512 bytes of input + xorl %r15d, %r15d + leaq (%r11,%r14,1), %rbx + vpxorq %zmm20, %zmm20, %zmm20 + vinserti32x4 $0x00, %xmm6, %zmm20, %zmm20 + vmovdqu64 512(%rsp), %zmm23 + vmovdqu64 576(%rsp), %zmm24 + vmovdqu64 640(%rsp), %zmm25 + vmovdqu64 704(%rsp), %zmm26 + vmovdqu64 (%rbx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpxorq %zmm20, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16 + vpclmulqdq 
$0x01, %zmm23, %zmm21, %zmm17 + vpclmulqdq $16, %zmm23, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19 + vmovdqa64 %zmm16, %zmm27 + vpxorq %zmm17, %zmm18, %zmm28 + vmovdqa64 %zmm19, %zmm29 + vmovdqu64 64(%rbx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17 + vpclmulqdq $16, %zmm24, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 128(%rbx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17 + vpclmulqdq $16, %zmm25, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 192(%rbx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17 + vpclmulqdq $16, %zmm26, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 768(%rsp), %zmm23 + vmovdqu64 832(%rsp), %zmm24 + vmovdqu64 896(%rsp), %zmm25 + vmovdqu64 960(%rsp), %zmm26 + vmovdqu64 256(%rbx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17 + vpclmulqdq $16, %zmm23, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 320(%rbx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17 + vpclmulqdq $16, %zmm24, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq 
%zmm19, %zmm29, %zmm29 + vmovdqu64 384(%rbx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17 + vpclmulqdq $16, %zmm25, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 448(%rbx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17 + vpclmulqdq $16, %zmm26, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vpclmulqdq $0x01, %zmm27, %zmm31, %zmm21 + vpshufd $0x4e, %zmm27, %zmm27 + vpternlogq $0x96, %zmm21, %zmm27, %zmm28 + vpclmulqdq $0x01, %zmm28, %zmm31, %zmm21 + vpshufd $0x4e, %zmm28, %zmm28 + vpternlogq $0x96, %zmm21, %zmm28, %zmm29 + vextracti32x4 $0x01, %zmm29, %xmm0 + vextracti32x4 $2, %zmm29, %xmm4 + vextracti32x4 $3, %zmm29, %xmm5 + vpxorq %xmm0, %xmm29, %xmm6 + vpternlogq $0x96, %xmm4, %xmm5, %xmm6 + addl $0x200, %r14d + cmpl %r13d, %r14d + jge L_AES_GCM_decrypt_update_avx512_last_aes +L_AES_GCM_decrypt_update_avx512_win_loop: + leaq (%r11,%r14,1), %rbx + vpxorq %zmm21, %zmm21, %zmm21 + vinserti32x4 $0x00, %xmm6, %zmm21, %zmm21 + vbroadcasti32x4 (%r12), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu (%r12), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, (%r12) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vmovdqu64 (%rbx), %zmm31 + vpshufb 
%zmm30, %zmm31, %zmm31 + vpxorq %zmm21, %zmm31, %zmm31 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vpclmulqdq $0x00, 512(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 512(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 512(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 512(%rsp), %zmm31, %zmm26 + vmovdqa64 %zmm23, %zmm27 + vpxorq %zmm24, %zmm25, %zmm28 + vmovdqa64 %zmm26, %zmm29 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vmovdqu64 64(%rbx), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vpclmulqdq $0x00, 576(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 576(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 576(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 576(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vmovdqu64 128(%rbx), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vpclmulqdq $0x00, 640(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 640(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 640(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 640(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vmovdqu64 192(%rbx), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, 
%zmm19, %zmm19 + vpclmulqdq $0x00, 704(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 704(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 704(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 704(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %esi + vbroadcasti32x4 160(%rdi), %zmm20 + jl L_AES_GCM_decrypt_update_avx512_a_il_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %esi + vbroadcasti32x4 192(%rdi), %zmm20 + jl L_AES_GCM_decrypt_update_avx512_a_il_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%rdi), %zmm20 +L_AES_GCM_decrypt_update_avx512_a_il_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%r11,%r15,1), %rcx + leaq (%r10,%r15,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 
192(%rdx) + addl $0x100, %r15d + vbroadcasti32x4 (%r12), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu (%r12), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, (%r12) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vmovdqu64 256(%rbx), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vpclmulqdq $0x00, 768(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 768(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 768(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 768(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vmovdqu64 320(%rbx), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vpclmulqdq $0x00, 832(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 832(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 832(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 832(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vmovdqu64 384(%rbx), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, 
%zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vpclmulqdq $0x00, 896(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 896(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 896(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 896(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vmovdqu64 448(%rbx), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vpclmulqdq $0x00, 960(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 960(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 960(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 960(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %esi + vbroadcasti32x4 160(%rdi), %zmm20 + jl L_AES_GCM_decrypt_update_avx512_b_il_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %esi + vbroadcasti32x4 192(%rdi), %zmm20 + jl L_AES_GCM_decrypt_update_avx512_b_il_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + 
vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%rdi), %zmm20 +L_AES_GCM_decrypt_update_avx512_b_il_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%r11,%r15,1), %rcx + leaq (%r10,%r15,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + addl $0x100, %r15d + vbroadcasti32x4 L_avx512_aes_gcm_mod2_128(%rip), %zmm31 + vpclmulqdq $0x01, %zmm27, %zmm31, %zmm23 + vpshufd $0x4e, %zmm27, %zmm27 + vpternlogq $0x96, %zmm23, %zmm27, %zmm28 + vpclmulqdq $0x01, %zmm28, %zmm31, %zmm23 + vpshufd $0x4e, %zmm28, %zmm28 + vpternlogq $0x96, %zmm23, %zmm28, %zmm29 + vextracti32x4 $0x01, %zmm29, %xmm0 + vextracti32x4 $2, %zmm29, %xmm4 + vextracti32x4 $3, %zmm29, %xmm5 + vpxorq %xmm0, %xmm29, %xmm6 + vpternlogq $0x96, %xmm4, %xmm5, %xmm6 + addl $0x200, %r14d + cmpl %r13d, %r14d + jl L_AES_GCM_decrypt_update_avx512_win_loop +L_AES_GCM_decrypt_update_avx512_last_aes: + vbroadcasti32x4 (%r12), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu (%r12), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, (%r12) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, 
%zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %esi + vbroadcasti32x4 160(%rdi), %zmm20 + jl L_AES_GCM_decrypt_update_avx512_l1_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %esi + vbroadcasti32x4 192(%rdi), %zmm20 + jl L_AES_GCM_decrypt_update_avx512_l1_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%rdi), %zmm20 +L_AES_GCM_decrypt_update_avx512_l1_avx512_ctr16_last: + 
vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%r11,%r15,1), %rcx + leaq (%r10,%r15,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + addl $0x100, %r15d + vbroadcasti32x4 (%r12), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu (%r12), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, (%r12) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vaesenc 
%zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %esi + vbroadcasti32x4 160(%rdi), %zmm20 + jl L_AES_GCM_decrypt_update_avx512_l2_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %esi + vbroadcasti32x4 192(%rdi), %zmm20 + jl L_AES_GCM_decrypt_update_avx512_l2_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%rdi), %zmm20 +L_AES_GCM_decrypt_update_avx512_l2_avx512_ctr16_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%r11,%r15,1), %rcx + leaq (%r10,%r15,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + addl $0x100, %r15d +L_AES_GCM_decrypt_update_avx512_no_windows: + vmovdqu64 192(%rsp), %zmm23 + vshufi64x2 $27, %zmm23, %zmm23, %zmm23 + vmovdqu64 
128(%rsp), %zmm24 + vshufi64x2 $27, %zmm24, %zmm24, %zmm24 + vmovdqu64 64(%rsp), %zmm25 + vshufi64x2 $27, %zmm25, %zmm25, %zmm25 + vmovdqu64 (%rsp), %zmm26 + vshufi64x2 $27, %zmm26, %zmm26, %zmm26 + movl %r8d, %r13d + andl $0xffffff00, %r13d + cmpl %r13d, %r14d + jge L_AES_GCM_decrypt_update_avx512_after_256 + # 256 bytes of input + leaq (%r11,%r14,1), %rbx + vpxorq %zmm20, %zmm20, %zmm20 + vinserti32x4 $0x00, %xmm6, %zmm20, %zmm20 + vmovdqu64 (%rbx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpxorq %zmm20, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17 + vpclmulqdq $16, %zmm23, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19 + vmovdqa64 %zmm16, %zmm27 + vpxorq %zmm17, %zmm18, %zmm28 + vmovdqa64 %zmm19, %zmm29 + vmovdqu64 64(%rbx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17 + vpclmulqdq $16, %zmm24, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 128(%rbx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17 + vpclmulqdq $16, %zmm25, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 192(%rbx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17 + vpclmulqdq $16, %zmm26, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vpclmulqdq $0x01, %zmm27, %zmm31, %zmm21 + vpshufd $0x4e, %zmm27, %zmm27 + vpternlogq $0x96, %zmm21, %zmm27, %zmm28 + vpclmulqdq $0x01, %zmm28, %zmm31, %zmm21 + vpshufd $0x4e, %zmm28, %zmm28 + 
vpternlogq $0x96, %zmm21, %zmm28, %zmm29 + vextracti32x4 $0x01, %zmm29, %xmm0 + vextracti32x4 $2, %zmm29, %xmm4 + vextracti32x4 $3, %zmm29, %xmm5 + vpxorq %xmm0, %xmm29, %xmm6 + vpternlogq $0x96, %xmm4, %xmm5, %xmm6 + vbroadcasti32x4 (%r12), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu (%r12), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, (%r12) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, 
%zmm19 + cmpl $11, %esi + vbroadcasti32x4 160(%rdi), %zmm20 + jl L_AES_GCM_decrypt_update_avx512_t_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %esi + vbroadcasti32x4 192(%rdi), %zmm20 + jl L_AES_GCM_decrypt_update_avx512_t_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%rdi), %zmm20 +L_AES_GCM_decrypt_update_avx512_t_avx512_ctr16_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%r11,%r14,1), %rcx + leaq (%r10,%r14,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + addl $0x100, %r14d +L_AES_GCM_decrypt_update_avx512_after_256: + vmovdqu (%rsp), %xmm5 +L_AES_GCM_decrypt_update_avx512_done_128: + movl %r8d, %edx + cmpl %edx, %r14d + jge L_AES_GCM_decrypt_update_avx512_done_dec + movl %r8d, %r13d + andl $0xfffffff0, %r13d + cmpl %r13d, %r14d + jge L_AES_GCM_decrypt_update_avx512_last_block_done +L_AES_GCM_decrypt_update_avx512_last_block_start: + vmovdqu (%r11,%r14,1), %xmm13 + vmovdqa %xmm5, %xmm0 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm13, %xmm1 + vpxor %xmm6, 
%xmm1, %xmm1 + vmovdqu (%r12), %xmm9 + vpshufb L_avx512_aes_gcm_bswap_epi64(%rip), %xmm9, %xmm8 + vpaddd L_avx512_aes_gcm_one(%rip), %xmm9, %xmm9 + vmovdqu %xmm9, (%r12) + vpxor (%rdi), %xmm8, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm10 + vaesenc 16(%rdi), %xmm8, %xmm8 + vaesenc 32(%rdi), %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm11 + vaesenc 48(%rdi), %xmm8, %xmm8 + vaesenc 64(%rdi), %xmm8, %xmm8 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm12 + vaesenc 80(%rdi), %xmm8, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm1 + vaesenc 96(%rdi), %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpslldq $8, %xmm10, %xmm2 + vpsrldq $8, %xmm10, %xmm10 + vaesenc 112(%rdi), %xmm8, %xmm8 + vpxor %xmm12, %xmm2, %xmm2 + vpxor %xmm10, %xmm1, %xmm3 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm0 + vpclmulqdq $16, %xmm0, %xmm2, %xmm11 + vaesenc 128(%rdi), %xmm8, %xmm8 + vpshufd $0x4e, %xmm2, %xmm10 + vpxor %xmm11, %xmm10, %xmm10 + vpclmulqdq $16, %xmm0, %xmm10, %xmm11 + vaesenc 144(%rdi), %xmm8, %xmm8 + vpshufd $0x4e, %xmm10, %xmm10 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm3, %xmm10, %xmm6 + cmpl $11, %esi + vmovdqa 160(%rdi), %xmm9 + jl L_AES_GCM_decrypt_update_avx512_aesenc_gfmul_last + vaesenc %xmm9, %xmm8, %xmm8 + vaesenc 176(%rdi), %xmm8, %xmm8 + cmpl $13, %esi + vmovdqa 192(%rdi), %xmm9 + jl L_AES_GCM_decrypt_update_avx512_aesenc_gfmul_last + vaesenc %xmm9, %xmm8, %xmm8 + vaesenc 208(%rdi), %xmm8, %xmm8 + vmovdqa 224(%rdi), %xmm9 +L_AES_GCM_decrypt_update_avx512_aesenc_gfmul_last: + vaesenclast %xmm9, %xmm8, %xmm8 + vmovdqa %xmm13, %xmm0 + vpxor %xmm0, %xmm8, %xmm8 + vmovdqu %xmm8, (%r10,%r14,1) + addl $16, %r14d + cmpl %r13d, %r14d + jl L_AES_GCM_decrypt_update_avx512_last_block_start +L_AES_GCM_decrypt_update_avx512_last_block_done: +L_AES_GCM_decrypt_update_avx512_done_dec: + vmovdqa %xmm6, (%r9) + vzeroupper + addq $0x410, %rsp + popq %rbx + popq %r15 + popq %r14 + popq %r12 + popq %r13 + repz retq +#ifndef __APPLE__ +.size 
AES_GCM_decrypt_update_avx512,.-AES_GCM_decrypt_update_avx512
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl AES_GCM_decrypt_final_avx512
+.type AES_GCM_decrypt_final_avx512,@function
+.align 16
+AES_GCM_decrypt_final_avx512: /* Folds two bit lengths into the hash, multiplies by the key, forms the tag and compares it with the bytes at (%rsi); stores 1 on match, else 0, through the 8th argument. */
+#else
+.section __TEXT,__text
+.globl _AES_GCM_decrypt_final_avx512
+.p2align 4
+_AES_GCM_decrypt_final_avx512: /* Apple symbol name for the same routine. */
+#endif /* __APPLE__ */
+        pushq %r13 /* r13, rbp and r12 are overwritten below and popped again before returning */
+        pushq %rbp
+        pushq %r12
+        movl %edx, %eax /* eax = 3rd argument: number of tag bytes to compare */
+        movl %ecx, %r10d /* r10d = 4th argument: a byte count, converted to a bit count below */
+        movl %r8d, %r11d /* r11d = 5th argument: a second byte count, also converted to bits */
+        movq 32(%rsp), %r8 /* 7th argument: first stack slot above the three pushes and the return address */
+        movq 40(%rsp), %rbp /* 8th argument: address that receives the 0/1 result */
+        subq $16, %rsp
+        vmovdqa (%rdi), %xmm6 /* 16 bytes at the 1st argument (aligned load); presumably the running GHASH value - confirm with caller */
+        vmovdqa (%r9), %xmm5 /* 16 bytes at the 6th argument (aligned load); presumably the hash key H - confirm with caller */
+        vmovdqa (%r8), %xmm15 /* 16 bytes at the 7th argument (aligned load); XORed into the hash to form the tag */
+        vpsrlq $63, %xmm5, %xmm8 /* top bit of each qword, to carry across the qword boundary */
+        vpsllq $0x01, %xmm5, %xmm7 /* each qword shifted left by one bit */
+        vpslldq $8, %xmm8, %xmm8 /* move the low qword carry up into the high qword */
+        vpor %xmm8, %xmm7, %xmm7 /* xmm7 = original xmm5 shifted left one bit as a 128-bit value */
+        vpshufd $0xff, %xmm5, %xmm5 /* broadcast the top dword */
+        vpsrad $31, %xmm5, %xmm5 /* all-ones mask when the original top bit was set */
+        vpand L_avx512_aes_gcm_mod2_128(%rip), %xmm5, %xmm5 /* keep the mod2_128 constant only in that case */
+        vpxor %xmm7, %xmm5, %xmm5 /* xmm5 = shifted value, conditionally XORed with the constant */
+        movl %r10d, %edx
+        movl %r11d, %ecx
+        shlq $3, %rdx /* bytes to bits; 64-bit shift of the zero-extended count */
+        shlq $3, %rcx
+        vmovq %rdx, %xmm0
+        vmovq %rcx, %xmm1
+        vpunpcklqdq %xmm1, %xmm0, %xmm0 /* low qword = bits of 4th argument, high qword = bits of 5th argument */
+        vpxor %xmm0, %xmm6, %xmm6 /* fold the two bit lengths into the hash value */
+        # ghash_gfmul_red_avx
+        vpshufd $0x4e, %xmm5, %xmm8 /* swap qwords of xmm5 */
+        vpxor %xmm5, %xmm8, %xmm8 /* low ^ high of xmm5 */
+        vpshufd $0x4e, %xmm6, %xmm9
+        vpxor %xmm6, %xmm9, %xmm9 /* low ^ high of xmm6 */
+        vpclmulqdq $0x00, %xmm5, %xmm6, %xmm7 /* low x low carry-less product */
+        vpclmulqdq $0x11, %xmm5, %xmm6, %xmm10 /* high x high carry-less product */
+        vpclmulqdq $0x00, %xmm9, %xmm8, %xmm8 /* (low ^ high) x (low ^ high) */
+        vpternlogq $0x96, %xmm7, %xmm10, %xmm8 /* imm 0x96 = three-way XOR: middle term of the product */
+        vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm9
+        vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 /* first folding step with the mod2_128 constant */
+        vpshufd $0x4e, %xmm7, %xmm7
+        vpternlogq $0x96, %xmm11, %xmm7, %xmm8
+        vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 /* second folding step */
+        vpshufd $0x4e, %xmm8, %xmm8
+        vpternlogq $0x96, %xmm11, %xmm8, %xmm10 /* xmm10 = reduced 128-bit result */
+        vmovdqa %xmm10, %xmm6
+        vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm6, %xmm6 /* byte shuffle with the bswap mask */
+        vpxor %xmm15, %xmm6, %xmm0 /* xmm0 = computed tag */
+        cmpl $16, %eax
+        je L_AES_GCM_decrypt_final_avx512_cmp_tag_16 /* full 16-byte tags take the vector compare */
+        subq $16, %rsp /* scratch for a byte-addressable copy of the computed tag */
+        xorq %rcx, %rcx /* rcx = byte index */
+        xorq %r12, %r12 /* r12b accumulates the OR of all byte differences */
+        vmovdqu %xmm0, (%rsp)
+L_AES_GCM_decrypt_final_avx512_cmp_tag_loop: /* compares eax bytes with no early exit on a mismatch */
+        movzbl (%rsp,%rcx,1), %r13d
+        xorb (%rsi,%rcx,1), %r13b /* difference against the tag supplied at the 2nd argument */
+        orb %r13b, %r12b
+        incl %ecx
+        cmpl %eax, %ecx
+        jne L_AES_GCM_decrypt_final_avx512_cmp_tag_loop /* NOTE(review): exits only on equality, so a count of 0 would not stop here - confirm callers never pass 0 */
+        cmpb $0x00, %r12b
+        sete %r12b /* r12 = 1 when every compared byte matched */
+        addq $16, %rsp
+        xorq %rcx, %rcx
+        jmp L_AES_GCM_decrypt_final_avx512_cmp_tag_done
+L_AES_GCM_decrypt_final_avx512_cmp_tag_16:
+        vmovdqu (%rsi), %xmm1 /* unaligned load of the supplied 16-byte tag */
+        vpcmpeqb %xmm1, %xmm0, %xmm0 /* 0xff in every byte position that matches */
+        vpmovmskb %xmm0, %rdx /* one mask bit per byte */
+        # %%edx == 0xFFFF then return 1 else => return 0
+        xorl %r12d, %r12d
+        cmpl $0xffff, %edx
+        sete %r12b
+L_AES_GCM_decrypt_final_avx512_cmp_tag_done:
+        movl %r12d, (%rbp) /* store the 0/1 result through the 8th argument */
+        vzeroupper
+        addq $16, %rsp /* release the 16 bytes reserved in the prologue */
+        popq %r12
+        popq %rbp
+        popq %r13
+        repz retq
+#ifndef __APPLE__
+.size AES_GCM_decrypt_final_avx512,.-AES_GCM_decrypt_final_avx512
+#endif /* __APPLE__ */
+#endif /* WOLFSSL_AESGCM_STREAM */
+#endif /* HAVE_INTEL_AVX512 */
 #endif /* WOLFSSL_X86_64_BUILD */
 #if defined(__linux__) && defined(__ELF__)
diff --git a/wolfcrypt/src/aes_gcm_asm.asm b/wolfcrypt/src/aes_gcm_asm.asm
index d222bc14478..34f68476310 100644
--- a/wolfcrypt/src/aes_gcm_asm.asm
+++ b/wolfcrypt/src/aes_gcm_asm.asm
@@ -171,10 +171,10 @@ GCM_generate_m0_aesni PROC
         por xmm1, xmm5
         por xmm2, xmm6
         por xmm3, xmm7
-        vpshufb xmm0, xmm0, xmm9
-        vpshufb xmm1, xmm1, xmm9
-        vpshufb xmm2, xmm2, xmm9
-        vpshufb xmm3, xmm3, xmm9
+        pshufb xmm0, xmm9
+        pshufb xmm1, xmm9
+        pshufb xmm2, xmm9
+        pshufb xmm3, xmm9
         movdqu OWORD PTR [rdx+256], xmm0
         movdqu OWORD PTR [rdx+272], xmm1
         movdqu OWORD PTR [rdx+288], xmm2
@@ -207,10 +207,10 @@ GCM_generate_m0_aesni PROC
         por xmm1, xmm5
         por xmm2, xmm6
         por xmm3, xmm7
-        vpshufb xmm0, xmm0, xmm9
-        vpshufb xmm1, xmm1, xmm9
-        vpshufb xmm2, xmm2, xmm9
-        vpshufb xmm3, xmm3, xmm9
+        pshufb xmm0, xmm9
+        pshufb xmm1, xmm9
+        pshufb xmm2, xmm9
+        pshufb xmm3, xmm9
         movdqu OWORD PTR [rdx+320], xmm0
         movdqu OWORD PTR [rdx+336], xmm1
         movdqu OWORD PTR [rdx+352], xmm2
@@ -243,10 +243,10 @@ GCM_generate_m0_aesni PROC
         por xmm1, xmm5
         por xmm2, xmm6
         por xmm3, xmm7
-        vpshufb xmm0, xmm0, xmm9
-        vpshufb xmm1, xmm1, xmm9
-        vpshufb xmm2, xmm2, xmm9
-        vpshufb xmm3, xmm3, xmm9
+        pshufb xmm0, xmm9
+        pshufb xmm1, xmm9
+        pshufb
xmm2, xmm9
+        pshufb xmm3, xmm9
         movdqu OWORD PTR [rdx+384], xmm0
         movdqu OWORD PTR [rdx+400], xmm1
         movdqu OWORD PTR [rdx+416], xmm2
@@ -279,10 +279,10 @@ GCM_generate_m0_aesni PROC
         por xmm1, xmm5
         por xmm2, xmm6
         por xmm3, xmm7
-        vpshufb xmm0, xmm0, xmm9
-        vpshufb xmm1, xmm1, xmm9
-        vpshufb xmm2, xmm2, xmm9
-        vpshufb xmm3, xmm3, xmm9
+        pshufb xmm0, xmm9
+        pshufb xmm1, xmm9
+        pshufb xmm2, xmm9
+        pshufb xmm3, xmm9
         movdqu OWORD PTR [rdx+448], xmm0
         movdqu OWORD PTR [rdx+464], xmm1
         movdqu OWORD PTR [rdx+480], xmm2
@@ -16518,4 +16518,14153 @@ L_AES_GCM_decrypt_final_avx2_cmp_tag_done:
 AES_GCM_decrypt_final_avx2 ENDP
 _TEXT ENDS
 ENDIF
+IFDEF HAVE_INTEL_VAES
+_DATA SEGMENT
+ALIGN 16 ; inc_y0: two 128-bit lanes whose upper qwords hold 0 and 1
+L_vaes_aes_gcm_inc_y0 QWORD \
+        0000000000000000h, 0000000000000000h,
+        0000000000000000h, 0000000000000001h
+ptr_L_vaes_aes_gcm_inc_y0 QWORD L_vaes_aes_gcm_inc_y0
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16 ; inc_y1: upper qwords hold 2 and 3
+L_vaes_aes_gcm_inc_y1 QWORD \
+        0000000000000000h, 0000000000000002h,
+        0000000000000000h, 0000000000000003h
+ptr_L_vaes_aes_gcm_inc_y1 QWORD L_vaes_aes_gcm_inc_y1
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16 ; inc_y2: upper qwords hold 4 and 5
+L_vaes_aes_gcm_inc_y2 QWORD \
+        0000000000000000h, 0000000000000004h,
+        0000000000000000h, 0000000000000005h
+ptr_L_vaes_aes_gcm_inc_y2 QWORD L_vaes_aes_gcm_inc_y2
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16 ; inc_y3: upper qwords hold 6 and 7
+L_vaes_aes_gcm_inc_y3 QWORD \
+        0000000000000000h, 0000000000000006h,
+        0000000000000000h, 0000000000000007h
+ptr_L_vaes_aes_gcm_inc_y3 QWORD L_vaes_aes_gcm_inc_y3
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16 ; rev8: shuffle mask with byte indices 15..0, i.e. a full 16-byte reversal
+L_GCM_generate_m0_vaes_rev8 QWORD \
+        08090a0b0c0d0e0fh, 0001020304050607h
+ptr_L_GCM_generate_m0_vaes_rev8 QWORD L_GCM_generate_m0_vaes_rev8
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16 ; mod2_128 for the table generator: only the top byte (0e1h) of the upper qword is set
+L_GCM_generate_m0_vaes_mod2_128 QWORD \
+        0000000000000000h, 0e100000000000000h
+ptr_L_GCM_generate_m0_vaes_mod2_128 QWORD L_GCM_generate_m0_vaes_mod2_128
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+GCM_generate_m0_vaes PROC ; rcx -> 16-byte input block, rdx -> output table; writes 32 entries of 16 bytes (offsets 0..496)
+        sub rsp, 80 ; room to save xmm6-xmm10, which are restored before ret
+        vmovdqu OWORD PTR [rsp], xmm6
+        vmovdqu OWORD PTR [rsp+16], xmm7
+        vmovdqu OWORD PTR [rsp+32], xmm8
+        vmovdqu OWORD PTR [rsp+48], xmm9
+        vmovdqu OWORD PTR [rsp+64], xmm10
+        vmovdqu xmm9, OWORD PTR L_GCM_generate_m0_vaes_rev8 ; xmm9 = byte-reversal mask for the whole routine
+        vmovdqu xmm10, OWORD PTR L_GCM_generate_m0_vaes_mod2_128 ; xmm10 = constant XORed in when a bit is shifted out
+        vpxor xmm8, xmm8, xmm8
+        vmovdqu xmm0, OWORD PTR [rcx] ; xmm0 = input block
+        vmovdqu OWORD PTR [rdx], xmm8 ; entry 0 = all zero
+        vmovdqu xmm8, xmm0 ; NOTE(review): xmm8 is overwritten below before this copy is read
+        vpshufb xmm0, xmm0, xmm9 ; work on the byte-reversed value
+        vpsllq xmm5, xmm0, 63 ; lowest bit of each qword moved to bit 63
+        vpsrlq xmm4, xmm0, 1 ; each qword shifted right by one bit
+        vpslldq xmm1, xmm5, 8 ; bit shifted out of the low qword, placed in the high qword
+        vpsrldq xmm5, xmm5, 8 ; bit shifted out of the high qword, carried into the low qword
+        vpshufd xmm1, xmm1, 255 ; broadcast the dword holding the shifted-out bit
+        vpor xmm4, xmm4, xmm5 ; xmm4 = xmm0 shifted right one bit as a 128-bit value
+        vpsrad xmm1, xmm1, 31 ; all-ones mask when the shifted-out bit was set
+        vpand xmm1, xmm1, xmm10
+        vpxor xmm1, xmm1, xmm4 ; xmm1 = (xmm0 >> 1), XORed with the constant when a bit fell out
+        vpsllq xmm5, xmm1, 63 ; same one-bit step applied to xmm1, giving xmm2
+        vpsrlq xmm4, xmm1, 1
+        vpslldq xmm2, xmm5, 8
+        vpsrldq xmm5, xmm5, 8
+        vpshufd xmm2, xmm2, 255
+        vpor xmm4, xmm4, xmm5
+        vpsrad xmm2, xmm2, 31
+        vpand xmm2, xmm2, xmm10
+        vpxor xmm2, xmm2, xmm4
+        vpsllq xmm5, xmm2, 63 ; same one-bit step applied to xmm2, giving xmm3
+        vpsrlq xmm4, xmm2, 1
+        vpslldq xmm3, xmm5, 8
+        vpsrldq xmm5, xmm5, 8
+        vpshufd xmm3, xmm3, 255
+        vpor xmm4, xmm4, xmm5
+        vpsrad xmm3, xmm3, 31
+        vpand xmm3, xmm3, xmm10
+        vpxor xmm3, xmm3, xmm4
+        vpshufb xmm3, xmm3, xmm9 ; return all four values to the original byte order
+        vpshufb xmm2, xmm2, xmm9
+        vpshufb xmm1, xmm1, xmm9
+        vpshufb xmm0, xmm0, xmm9
+        vpxor xmm8, xmm3, xmm2 ; entries 1..15 are the XOR combinations of xmm3, xmm2, xmm1, xmm0 (index bits 0..3)
+        vmovdqu OWORD PTR [rdx+16], xmm3
+        vmovdqu OWORD PTR [rdx+32], xmm2
+        vmovdqu OWORD PTR [rdx+48], xmm8
+        vmovdqu OWORD PTR [rdx+64], xmm1
+        vpxor xmm4, xmm3, xmm1
+        vpxor xmm5, xmm2, xmm1
+        vpxor xmm6, xmm8, xmm1
+        vmovdqu OWORD PTR [rdx+80], xmm4
+        vmovdqu OWORD PTR [rdx+96], xmm5
+        vmovdqu OWORD PTR [rdx+112], xmm6
+        vmovdqu OWORD PTR [rdx+128], xmm0
+        vpxor xmm1, xmm1, xmm0 ; from here xmm1 holds xmm1 ^ xmm0 for entries 12..15
+        vpxor xmm4, xmm3, xmm0
+        vpxor xmm6, xmm2, xmm0
+        vmovdqu OWORD PTR [rdx+144], xmm4
+        vmovdqu OWORD PTR [rdx+160], xmm6
+        vpxor xmm6, xmm3, xmm6
+        vmovdqu OWORD PTR [rdx+176], xmm6
+        vmovdqu OWORD PTR [rdx+192], xmm1
+        vpxor xmm4, xmm3, xmm1
+        vpxor xmm5, xmm2, xmm1
+        vpxor xmm6, xmm8, xmm1
+        vmovdqu OWORD PTR [rdx+208], xmm4
+        vmovdqu OWORD PTR [rdx+224], xmm5
+        vmovdqu OWORD PTR [rdx+240], xmm6
+        vmovdqu xmm0, OWORD PTR [rdx] ; second half: entries 16..31 are entries 0..15 shifted right by 4 bits, four at a time
+        vmovdqu xmm1, OWORD PTR [rdx+16]
+        vmovdqu xmm2, OWORD PTR [rdx+32]
+        vmovdqu xmm3, OWORD PTR [rdx+48]
+        vpshufb xmm0, xmm0, xmm9 ; byte-reverse before shifting
+        vpshufb xmm1, xmm1, xmm9
+        vpshufb xmm2, xmm2, xmm9
+        vpshufb xmm3, xmm3, xmm9
+        vpsllq xmm4, xmm0, 60 ; low 4 bits of each qword moved to the top of the qword
+        vpsllq xmm5, xmm1, 60
+        vpsllq xmm6, xmm2, 60
+        vpsllq xmm7, xmm3, 60
+        vpsrlq xmm0, xmm0, 4 ; each qword shifted right by 4 bits
+        vpsrlq xmm1, xmm1, 4
+        vpsrlq xmm2, xmm2, 4
+        vpsrlq xmm3, xmm3, 4
+        vpsrldq xmm4, xmm4, 8 ; carry the high qword's 4 bits into the low qword; the low qword's 4 bits are dropped
+        vpsrldq xmm5, xmm5, 8
+        vpsrldq xmm6, xmm6, 8
+        vpsrldq xmm7, xmm7, 8
+        vpor xmm0, xmm0, xmm4 ; 128-bit right shift by 4
+        vpor xmm1, xmm1, xmm5
+        vpor xmm2, xmm2, xmm6
+        vpor xmm3, xmm3, xmm7
+        vpshufb xmm0, xmm0, xmm9 ; back to the original byte order
+        vpshufb xmm1, xmm1, xmm9
+        vpshufb xmm2, xmm2, xmm9
+        vpshufb xmm3, xmm3, xmm9
+        vmovdqu OWORD PTR [rdx+256], xmm0 ; entries 16..19
+        vmovdqu OWORD PTR [rdx+272], xmm1
+        vmovdqu OWORD PTR [rdx+288], xmm2
+        vmovdqu OWORD PTR [rdx+304], xmm3
+        vmovdqu xmm0, OWORD PTR [rdx+64] ; same 4-bit shift for entries 4..7
+        vmovdqu xmm1, OWORD PTR [rdx+80]
+        vmovdqu xmm2, OWORD PTR [rdx+96]
+        vmovdqu xmm3, OWORD PTR [rdx+112]
+        vpshufb xmm0, xmm0, xmm9
+        vpshufb xmm1, xmm1, xmm9
+        vpshufb xmm2, xmm2, xmm9
+        vpshufb xmm3, xmm3, xmm9
+        vpsllq xmm4, xmm0, 60
+        vpsllq xmm5, xmm1, 60
+        vpsllq xmm6, xmm2, 60
+        vpsllq xmm7, xmm3, 60
+        vpsrlq xmm0, xmm0, 4
+        vpsrlq xmm1, xmm1, 4
+        vpsrlq xmm2, xmm2, 4
+        vpsrlq xmm3, xmm3, 4
+        vpsrldq xmm4, xmm4, 8
+        vpsrldq xmm5, xmm5, 8
+        vpsrldq xmm6, xmm6, 8
+        vpsrldq xmm7, xmm7, 8
+        vpor xmm0, xmm0, xmm4
+        vpor xmm1, xmm1, xmm5
+        vpor xmm2, xmm2, xmm6
+        vpor xmm3, xmm3, xmm7
+        vpshufb xmm0, xmm0, xmm9
+        vpshufb xmm1, xmm1, xmm9
+        vpshufb xmm2, xmm2, xmm9
+        vpshufb xmm3, xmm3, xmm9
+        vmovdqu OWORD PTR [rdx+320], xmm0 ; entries 20..23
+        vmovdqu OWORD PTR [rdx+336], xmm1
+        vmovdqu OWORD PTR [rdx+352], xmm2
+        vmovdqu OWORD PTR [rdx+368], xmm3
+        vmovdqu xmm0, OWORD PTR [rdx+128] ; same 4-bit shift for entries 8..11
+        vmovdqu xmm1, OWORD PTR [rdx+144]
+        vmovdqu xmm2, OWORD PTR [rdx+160]
+        vmovdqu xmm3, OWORD PTR [rdx+176]
+        vpshufb xmm0, xmm0, xmm9
+        vpshufb xmm1, xmm1, xmm9
+        vpshufb xmm2, xmm2, xmm9
+        vpshufb xmm3, xmm3, xmm9
+        vpsllq xmm4, xmm0, 60
+        vpsllq xmm5, xmm1, 60
+        vpsllq xmm6, xmm2, 60
+        vpsllq xmm7, xmm3, 60
+        vpsrlq xmm0, xmm0, 4
+        vpsrlq xmm1, xmm1, 4
+        vpsrlq xmm2, xmm2, 4
+        vpsrlq xmm3, xmm3, 4
+        vpsrldq xmm4, xmm4, 8
+        vpsrldq xmm5, xmm5, 8
+        vpsrldq xmm6, xmm6, 8
+        vpsrldq xmm7, xmm7, 8
+        vpor xmm0, xmm0, xmm4
+        vpor xmm1, xmm1, xmm5
+        vpor xmm2, xmm2, xmm6
+        vpor xmm3, xmm3, xmm7
+        vpshufb xmm0, xmm0, xmm9
+        vpshufb xmm1, xmm1, xmm9
+        vpshufb xmm2, xmm2, xmm9
+        vpshufb xmm3, xmm3, xmm9
+        vmovdqu OWORD PTR [rdx+384], xmm0 ; entries 24..27
+        vmovdqu OWORD PTR [rdx+400], xmm1
+        vmovdqu OWORD PTR [rdx+416], xmm2
+        vmovdqu OWORD PTR [rdx+432], xmm3
+        vmovdqu xmm0, OWORD PTR [rdx+192] ; same 4-bit shift for entries 12..15
+        vmovdqu xmm1, OWORD PTR [rdx+208]
+        vmovdqu xmm2, OWORD PTR [rdx+224]
+        vmovdqu xmm3, OWORD PTR [rdx+240]
+        vpshufb xmm0, xmm0, xmm9
+        vpshufb xmm1, xmm1, xmm9
+        vpshufb xmm2, xmm2, xmm9
+        vpshufb xmm3, xmm3, xmm9
+        vpsllq xmm4, xmm0, 60
+        vpsllq xmm5, xmm1, 60
+        vpsllq xmm6, xmm2, 60
+        vpsllq xmm7, xmm3, 60
+        vpsrlq xmm0, xmm0, 4
+        vpsrlq xmm1, xmm1, 4
+        vpsrlq xmm2, xmm2, 4
+        vpsrlq xmm3, xmm3, 4
+        vpsrldq xmm4, xmm4, 8
+        vpsrldq xmm5, xmm5, 8
+        vpsrldq xmm6, xmm6, 8
+        vpsrldq xmm7, xmm7, 8
+        vpor xmm0, xmm0, xmm4
+        vpor xmm1, xmm1, xmm5
+        vpor xmm2, xmm2, xmm6
+        vpor xmm3, xmm3, xmm7
+        vpshufb xmm0, xmm0, xmm9
+        vpshufb xmm1, xmm1, xmm9
+        vpshufb xmm2, xmm2, xmm9
+        vpshufb xmm3, xmm3, xmm9
+        vmovdqu OWORD PTR [rdx+448], xmm0 ; entries 28..31
+        vmovdqu OWORD PTR [rdx+464], xmm1
+        vmovdqu OWORD PTR [rdx+480], xmm2
+        vmovdqu OWORD PTR [rdx+496], xmm3
+        vmovdqu xmm6, OWORD PTR [rsp] ; restore the saved xmm6-xmm10
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        vmovdqu xmm8, OWORD PTR [rsp+32]
+        vmovdqu xmm9, OWORD PTR [rsp+48]
+        vmovdqu xmm10, OWORD PTR [rsp+64]
+        add rsp, 80
+        ret
+GCM_generate_m0_vaes ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16 ; 128-bit constant with 1 in the upper qword
+L_vaes_aes_gcm_one QWORD \
+        0000000000000000h, 0000000000000001h
+ptr_L_vaes_aes_gcm_one QWORD L_vaes_aes_gcm_one
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16 ; 128-bit constant with 2 in the upper qword
+L_vaes_aes_gcm_two QWORD \
+        0000000000000000h, 0000000000000002h
+ptr_L_vaes_aes_gcm_two QWORD L_vaes_aes_gcm_two
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16 +L_vaes_aes_gcm_three QWORD \ + 0000000000000000h, 0000000000000003h +ptr_L_vaes_aes_gcm_three QWORD L_vaes_aes_gcm_three +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_vaes_aes_gcm_four QWORD \ + 0000000000000000h, 0000000000000004h +ptr_L_vaes_aes_gcm_four QWORD L_vaes_aes_gcm_four +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_vaes_aes_gcm_five QWORD \ + 0000000000000000h, 0000000000000005h +ptr_L_vaes_aes_gcm_five QWORD L_vaes_aes_gcm_five +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_vaes_aes_gcm_six QWORD \ + 0000000000000000h, 0000000000000006h +ptr_L_vaes_aes_gcm_six QWORD L_vaes_aes_gcm_six +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_vaes_aes_gcm_seven QWORD \ + 0000000000000000h, 0000000000000007h +ptr_L_vaes_aes_gcm_seven QWORD L_vaes_aes_gcm_seven +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_vaes_aes_gcm_eight QWORD \ + 0000000000000000h, 0000000000000008h +ptr_L_vaes_aes_gcm_eight QWORD L_vaes_aes_gcm_eight +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_vaes_aes_gcm_bswap_epi64 QWORD \ + 0001020304050607h, 08090a0b0c0d0e0fh +ptr_L_vaes_aes_gcm_bswap_epi64 QWORD L_vaes_aes_gcm_bswap_epi64 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_vaes_aes_gcm_bswap_mask QWORD \ + 08090a0b0c0d0e0fh, 0001020304050607h +ptr_L_vaes_aes_gcm_bswap_mask QWORD L_vaes_aes_gcm_bswap_mask +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_vaes_aes_gcm_mod2_128 QWORD \ + 0000000000000001h, 0c200000000000000h +ptr_L_vaes_aes_gcm_mod2_128 QWORD L_vaes_aes_gcm_mod2_128 +_DATA ENDS +_TEXT SEGMENT READONLY PARA +AES_GCM_encrypt_vaes PROC + push r13 + push rdi + push rsi + push r12 + push rbx + push r14 + push r15 + mov rdi, rcx + mov rsi, rdx + mov r12, r8 + mov rax, r9 + mov r8, QWORD PTR [rsp+96] + mov r9d, DWORD PTR [rsp+104] + mov r11d, DWORD PTR [rsp+112] + mov ebx, DWORD PTR [rsp+120] + mov r14d, DWORD PTR [rsp+128] + mov r15, QWORD PTR [rsp+136] + mov r10d, DWORD PTR [rsp+144] + sub rsp, 720 + vmovdqu OWORD PTR [rsp+560], xmm6 + vmovdqu OWORD PTR [rsp+576], xmm7 + vmovdqu OWORD PTR [rsp+592], xmm8 + vmovdqu OWORD PTR 
[rsp+608], xmm9 + vmovdqu OWORD PTR [rsp+624], xmm10 + vmovdqu OWORD PTR [rsp+640], xmm11 + vmovdqu OWORD PTR [rsp+656], xmm12 + vmovdqu OWORD PTR [rsp+672], xmm13 + vmovdqu OWORD PTR [rsp+688], xmm14 + vmovdqu OWORD PTR [rsp+704], xmm15 + vpxor xmm5, xmm5, xmm5 + vpxor xmm15, xmm15, xmm15 + mov edx, ebx + cmp edx, 12 + jne L_AES_GCM_encrypt_vaes_iv_not_12 + ; # Calculate values when IV is 12 bytes + ; Set counter based on IV + mov ecx, 16777216 + vmovq xmm5, QWORD PTR [rax] + vpinsrd xmm5, xmm5, DWORD PTR [rax+8], 2 + vpinsrd xmm5, xmm5, ecx, 3 + ; H = Encrypt X(=0) and T = Encrypt counter + vmovdqa xmm6, OWORD PTR [r15] + vpxor xmm1, xmm5, xmm6 + vmovdqa xmm4, OWORD PTR [r15+16] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+32] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+48] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+64] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+80] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+96] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+112] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+128] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+144] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + cmp r10d, 11 + vmovdqa xmm4, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_vaes_calc_iv_12_last + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+176] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + cmp r10d, 13 + vmovdqa xmm4, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_vaes_calc_iv_12_last + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+208] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+224] 
+L_AES_GCM_encrypt_vaes_calc_iv_12_last: + vaesenclast xmm6, xmm6, xmm4 + vaesenclast xmm1, xmm1, xmm4 + vpshufb xmm6, xmm6, OWORD PTR L_vaes_aes_gcm_bswap_mask + vmovdqu OWORD PTR [rsp+528], xmm1 + jmp L_AES_GCM_encrypt_vaes_iv_done +L_AES_GCM_encrypt_vaes_iv_not_12: + ; Calculate values when IV is not 12 bytes + ; H = Encrypt X(=0) + vmovdqa xmm6, OWORD PTR [r15] + vaesenc xmm6, xmm6, [r15+16] + vaesenc xmm6, xmm6, [r15+32] + vaesenc xmm6, xmm6, [r15+48] + vaesenc xmm6, xmm6, [r15+64] + vaesenc xmm6, xmm6, [r15+80] + vaesenc xmm6, xmm6, [r15+96] + vaesenc xmm6, xmm6, [r15+112] + vaesenc xmm6, xmm6, [r15+128] + vaesenc xmm6, xmm6, [r15+144] + cmp r10d, 11 + vmovdqa xmm8, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_vaes_calc_iv_1_aesenc_avx_last + vaesenc xmm6, xmm6, xmm8 + vaesenc xmm6, xmm6, [r15+176] + cmp r10d, 13 + vmovdqa xmm8, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_vaes_calc_iv_1_aesenc_avx_last + vaesenc xmm6, xmm6, xmm8 + vaesenc xmm6, xmm6, [r15+208] + vmovdqa xmm8, OWORD PTR [r15+224] +L_AES_GCM_encrypt_vaes_calc_iv_1_aesenc_avx_last: + vaesenclast xmm6, xmm6, xmm8 + vpshufb xmm6, xmm6, OWORD PTR L_vaes_aes_gcm_bswap_mask + ; Calc counter + ; Initialization vector + cmp edx, 0 + mov rcx, 0 + je L_AES_GCM_encrypt_vaes_calc_iv_done + cmp edx, 16 + jl L_AES_GCM_encrypt_vaes_calc_iv_lt16 + and edx, 4294967280 +L_AES_GCM_encrypt_vaes_calc_iv_16_loop: + vmovdqu xmm7, OWORD PTR [rax+rcx] + vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm7 + ; ghash_gfmul_avx + vpshufd xmm1, xmm5, 78 + vpshufd xmm2, xmm6, 78 + vpclmulqdq xmm3, xmm6, xmm5, 17 + vpclmulqdq xmm0, xmm6, xmm5, 0 + vpxor xmm1, xmm1, xmm5 + vpxor xmm2, xmm2, xmm6 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm4, xmm0 + vmovdqa xmm5, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm5, xmm5, xmm1 + vpsrld xmm0, xmm4, 31 + vpsrld xmm1, xmm5, 31 + vpslld xmm4, xmm4, 1 + vpslld xmm5, 
xmm5, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm5, xmm5, xmm2 + vpor xmm4, xmm4, xmm0 + vpor xmm5, xmm5, xmm1 + vpslld xmm0, xmm4, 31 + vpslld xmm1, xmm4, 30 + vpslld xmm2, xmm4, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm4, xmm4, xmm0 + vpsrld xmm2, xmm4, 1 + vpsrld xmm3, xmm4, 2 + vpsrld xmm0, xmm4, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm4 + vpxor xmm5, xmm5, xmm2 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_encrypt_vaes_calc_iv_16_loop + mov edx, ebx + cmp ecx, edx + je L_AES_GCM_encrypt_vaes_calc_iv_done +L_AES_GCM_encrypt_vaes_calc_iv_lt16: + sub rsp, 16 + vpxor xmm7, xmm7, xmm7 + xor ebx, ebx + vmovdqu OWORD PTR [rsp], xmm7 +L_AES_GCM_encrypt_vaes_calc_iv_loop: + movzx r13d, BYTE PTR [rax+rcx] + mov BYTE PTR [rsp+rbx], r13b + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_encrypt_vaes_calc_iv_loop + vmovdqu xmm7, OWORD PTR [rsp] + add rsp, 16 + vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm7 + ; ghash_gfmul_avx + vpshufd xmm1, xmm5, 78 + vpshufd xmm2, xmm6, 78 + vpclmulqdq xmm3, xmm6, xmm5, 17 + vpclmulqdq xmm0, xmm6, xmm5, 0 + vpxor xmm1, xmm1, xmm5 + vpxor xmm2, xmm2, xmm6 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm4, xmm0 + vmovdqa xmm5, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm5, xmm5, xmm1 + vpsrld xmm0, xmm4, 31 + vpsrld xmm1, xmm5, 31 + vpslld xmm4, xmm4, 1 + vpslld xmm5, xmm5, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm5, xmm5, xmm2 + vpor xmm4, xmm4, xmm0 + vpor xmm5, xmm5, xmm1 + vpslld xmm0, xmm4, 31 + vpslld xmm1, xmm4, 30 + vpslld xmm2, xmm4, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm4, xmm4, 
xmm0 + vpsrld xmm2, xmm4, 1 + vpsrld xmm3, xmm4, 2 + vpsrld xmm0, xmm4, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm4 + vpxor xmm5, xmm5, xmm2 +L_AES_GCM_encrypt_vaes_calc_iv_done: + ; T = Encrypt counter + vpxor xmm0, xmm0, xmm0 + shl edx, 3 + vmovq xmm0, rdx + vpxor xmm5, xmm5, xmm0 + ; ghash_gfmul_avx + vpshufd xmm1, xmm5, 78 + vpshufd xmm2, xmm6, 78 + vpclmulqdq xmm3, xmm6, xmm5, 17 + vpclmulqdq xmm0, xmm6, xmm5, 0 + vpxor xmm1, xmm1, xmm5 + vpxor xmm2, xmm2, xmm6 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm4, xmm0 + vmovdqa xmm5, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm5, xmm5, xmm1 + vpsrld xmm0, xmm4, 31 + vpsrld xmm1, xmm5, 31 + vpslld xmm4, xmm4, 1 + vpslld xmm5, xmm5, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm5, xmm5, xmm2 + vpor xmm4, xmm4, xmm0 + vpor xmm5, xmm5, xmm1 + vpslld xmm0, xmm4, 31 + vpslld xmm1, xmm4, 30 + vpslld xmm2, xmm4, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm4, xmm4, xmm0 + vpsrld xmm2, xmm4, 1 + vpsrld xmm3, xmm4, 2 + vpsrld xmm0, xmm4, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm4 + vpxor xmm5, xmm5, xmm2 + vpshufb xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_bswap_mask + ; Encrypt counter + vmovdqa xmm7, OWORD PTR [r15] + vpxor xmm7, xmm7, xmm5 + vaesenc xmm7, xmm7, [r15+16] + vaesenc xmm7, xmm7, [r15+32] + vaesenc xmm7, xmm7, [r15+48] + vaesenc xmm7, xmm7, [r15+64] + vaesenc xmm7, xmm7, [r15+80] + vaesenc xmm7, xmm7, [r15+96] + vaesenc xmm7, xmm7, [r15+112] + vaesenc xmm7, xmm7, [r15+128] + vaesenc xmm7, xmm7, [r15+144] + cmp r10d, 11 + vmovdqa xmm8, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_vaes_calc_iv_2_aesenc_avx_last + vaesenc xmm7, xmm7, xmm8 + vaesenc xmm7, xmm7, [r15+176] + cmp r10d, 13 + 
vmovdqa xmm8, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_vaes_calc_iv_2_aesenc_avx_last + vaesenc xmm7, xmm7, xmm8 + vaesenc xmm7, xmm7, [r15+208] + vmovdqa xmm8, OWORD PTR [r15+224] +L_AES_GCM_encrypt_vaes_calc_iv_2_aesenc_avx_last: + vaesenclast xmm7, xmm7, xmm8 + vmovdqu OWORD PTR [rsp+528], xmm7 +L_AES_GCM_encrypt_vaes_iv_done: + ; Additional authentication data + mov edx, r11d + cmp edx, 0 + je L_AES_GCM_encrypt_vaes_calc_aad_done + xor ecx, ecx + cmp edx, 16 + jl L_AES_GCM_encrypt_vaes_calc_aad_lt16 + and edx, 4294967280 +L_AES_GCM_encrypt_vaes_calc_aad_16_loop: + vmovdqu xmm7, OWORD PTR [r12+rcx] + vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask + vpxor xmm15, xmm15, xmm7 + ; ghash_gfmul_avx + vpshufd xmm1, xmm15, 78 + vpshufd xmm2, xmm6, 78 + vpclmulqdq xmm3, xmm6, xmm15, 17 + vpclmulqdq xmm0, xmm6, xmm15, 0 + vpxor xmm1, xmm1, xmm15 + vpxor xmm2, xmm2, xmm6 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm4, xmm0 + vmovdqa xmm15, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm15, xmm15, xmm1 + vpsrld xmm0, xmm4, 31 + vpsrld xmm1, xmm15, 31 + vpslld xmm4, xmm4, 1 + vpslld xmm15, xmm15, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm15, xmm15, xmm2 + vpor xmm4, xmm4, xmm0 + vpor xmm15, xmm15, xmm1 + vpslld xmm0, xmm4, 31 + vpslld xmm1, xmm4, 30 + vpslld xmm2, xmm4, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm4, xmm4, xmm0 + vpsrld xmm2, xmm4, 1 + vpsrld xmm3, xmm4, 2 + vpsrld xmm0, xmm4, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm4 + vpxor xmm15, xmm15, xmm2 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_encrypt_vaes_calc_aad_16_loop + mov edx, r11d + cmp ecx, edx + je L_AES_GCM_encrypt_vaes_calc_aad_done +L_AES_GCM_encrypt_vaes_calc_aad_lt16: + sub rsp, 16 + vpxor xmm7, xmm7, xmm7 + 
xor ebx, ebx + vmovdqu OWORD PTR [rsp], xmm7 +L_AES_GCM_encrypt_vaes_calc_aad_loop: + movzx r13d, BYTE PTR [r12+rcx] + mov BYTE PTR [rsp+rbx], r13b + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_encrypt_vaes_calc_aad_loop + vmovdqu xmm7, OWORD PTR [rsp] + add rsp, 16 + vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask + vpxor xmm15, xmm15, xmm7 + ; ghash_gfmul_avx + vpshufd xmm1, xmm15, 78 + vpshufd xmm2, xmm6, 78 + vpclmulqdq xmm3, xmm6, xmm15, 17 + vpclmulqdq xmm0, xmm6, xmm15, 0 + vpxor xmm1, xmm1, xmm15 + vpxor xmm2, xmm2, xmm6 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm4, xmm0 + vmovdqa xmm15, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm15, xmm15, xmm1 + vpsrld xmm0, xmm4, 31 + vpsrld xmm1, xmm15, 31 + vpslld xmm4, xmm4, 1 + vpslld xmm15, xmm15, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm15, xmm15, xmm2 + vpor xmm4, xmm4, xmm0 + vpor xmm15, xmm15, xmm1 + vpslld xmm0, xmm4, 31 + vpslld xmm1, xmm4, 30 + vpslld xmm2, xmm4, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm4, xmm4, xmm0 + vpsrld xmm2, xmm4, 1 + vpsrld xmm3, xmm4, 2 + vpsrld xmm0, xmm4, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm4 + vpxor xmm15, xmm15, xmm2 +L_AES_GCM_encrypt_vaes_calc_aad_done: + ; Calculate counter and H + vpsrlq xmm8, xmm6, 63 + vpsllq xmm7, xmm6, 1 + vpslldq xmm8, xmm8, 8 + vpor xmm7, xmm7, xmm8 + vpshufd xmm6, xmm6, 255 + vpsrad xmm6, xmm6, 31 + vpshufb xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_bswap_epi64 + vpand xmm6, xmm6, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpaddd xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_one + vpxor xmm6, xmm6, xmm7 + vmovdqu OWORD PTR [rsp+512], xmm5 + xor ebx, ebx + cmp r9d, 128 + jl L_AES_GCM_encrypt_vaes_done_128 + vmovdqa xmm2, xmm15 + ; H ^ 1 + vmovdqu OWORD PTR [rsp], xmm6 
+ ; H ^ 2 + vpclmulqdq xmm7, xmm6, xmm6, 0 + vpclmulqdq xmm10, xmm6, xmm6, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm0, xmm10 + vmovdqu OWORD PTR [rsp+16], xmm0 + ; H ^ 3 + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm0, xmm6, 0 + vpclmulqdq xmm8, xmm0, xmm6, 1 + vpclmulqdq xmm9, xmm0, xmm6, 16 + vpclmulqdq xmm10, xmm0, xmm6, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm1, xmm10 + vmovdqu OWORD PTR [rsp+32], xmm1 + ; H ^ 4 + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm3, xmm10 + vmovdqu OWORD PTR [rsp+48], xmm3 + ; H ^ 5 + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+64], xmm4 + ; H ^ 6 + vpclmulqdq xmm7, xmm1, xmm1, 0 + vpclmulqdq xmm10, xmm1, xmm1, 17 + 
vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+80], xmm4 + ; H ^ 7 + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm3, xmm1, 0 + vpclmulqdq xmm8, xmm3, xmm1, 1 + vpclmulqdq xmm9, xmm3, xmm1, 16 + vpclmulqdq xmm10, xmm3, xmm1, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+96], xmm4 + ; H ^ 8 + vpclmulqdq xmm7, xmm3, xmm3, 0 + vpclmulqdq xmm10, xmm3, xmm3, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+112], xmm4 + cmp r9d, 256 + jl L_AES_GCM_encrypt_vaes_no_ext + ; H ^ 9 + vmovdqu xmm0, OWORD PTR [rsp+48] + vmovdqu xmm1, OWORD PTR [rsp+64] + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+128], xmm4 + ; H ^ 10 + vmovdqu xmm0, OWORD PTR 
[rsp+64] + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+144], xmm4 + ; H ^ 11 + vmovdqu xmm0, OWORD PTR [rsp+64] + vmovdqu xmm1, OWORD PTR [rsp+80] + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+160], xmm4 + ; H ^ 12 + vmovdqu xmm0, OWORD PTR [rsp+80] + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+176], xmm4 + ; H ^ 13 + vmovdqu xmm0, OWORD PTR [rsp+80] + vmovdqu xmm1, OWORD PTR [rsp+96] + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor 
xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+192], xmm4 + ; H ^ 14 + vmovdqu xmm0, OWORD PTR [rsp+96] + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+208], xmm4 + ; H ^ 15 + vmovdqu xmm0, OWORD PTR [rsp+96] + vmovdqu xmm1, OWORD PTR [rsp+112] + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+224], xmm4 + ; H ^ 16 + vmovdqu xmm0, OWORD PTR [rsp+112] + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+240], xmm4 + vmovdqu ymm7, YMMWORD PTR [rsp+224] + vpermq ymm7, ymm7, 78 + vmovdqu ymm8, YMMWORD PTR [rsp+192] + vpermq ymm8, ymm8, 78 + vmovdqu ymm9, YMMWORD PTR [rsp+160] + vpermq ymm9, ymm9, 78 + vmovdqu ymm10, YMMWORD PTR [rsp+128] + vpermq ymm10, ymm10, 78 + vmovdqu YMMWORD PTR [rsp+256], ymm7 + vmovdqu YMMWORD PTR [rsp+288], ymm8 + vmovdqu YMMWORD PTR 
[rsp+320], ymm9 + vmovdqu YMMWORD PTR [rsp+352], ymm10 + vmovdqu ymm7, YMMWORD PTR [rsp+96] + vpermq ymm7, ymm7, 78 + vmovdqu ymm8, YMMWORD PTR [rsp+64] + vpermq ymm8, ymm8, 78 + vmovdqu ymm9, YMMWORD PTR [rsp+32] + vpermq ymm9, ymm9, 78 + vmovdqu ymm10, YMMWORD PTR [rsp] + vpermq ymm10, ymm10, 78 + vmovdqu YMMWORD PTR [rsp+384], ymm7 + vmovdqu YMMWORD PTR [rsp+416], ymm8 + vmovdqu YMMWORD PTR [rsp+448], ymm9 + vmovdqu YMMWORD PTR [rsp+480], ymm10 +L_AES_GCM_encrypt_vaes_no_ext: + vbroadcasti128 ymm14, ptr_L_vaes_aes_gcm_mod2_128 + cmp r9d, 256 + jl L_AES_GCM_encrypt_vaes_after_256 + mov r13d, r9d + and r13d, 4294967040 +L_AES_GCM_encrypt_vaes_loop_256: + ; 256 bytes of input + lea rcx, QWORD PTR [rsi+rbx] + mov QWORD PTR [rsp+544], rcx + vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64 + vbroadcasti128 ymm4, [rsp+512] + vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0 + vpshufb ymm0, ymm0, ymm6 + vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1 + vpshufb ymm1, ymm1, ymm6 + vpaddd ymm2, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y2 + vpshufb ymm2, ymm2, ymm6 + vpaddd ymm3, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y3 + vpshufb ymm3, ymm3, ymm6 + vmovdqu xmm7, OWORD PTR [rsp+512] + vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight + vmovdqu OWORD PTR [rsp+512], xmm7 + vbroadcasti128 ymm4, [r15] + vpxor ymm0, ymm0, ymm4 + vpxor ymm1, ymm1, ymm4 + vpxor ymm2, ymm2, ymm4 + vpxor ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+16] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+32] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+48] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+64] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, 
[r15+80] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+96] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+112] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+128] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+144] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r10d, 11 + vbroadcasti128 ymm4, [r15+160] + jl L_AES_GCM_encrypt_vaes_p1_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+176] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r10d, 13 + vbroadcasti128 ymm4, [r15+192] + jl L_AES_GCM_encrypt_vaes_p1_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+208] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+224] +L_AES_GCM_encrypt_vaes_p1_vaes_ctr8_last: + vaesenclast ymm0, ymm0, ymm4 + vaesenclast ymm1, ymm1, ymm4 + vaesenclast ymm2, ymm2, ymm4 + vaesenclast ymm3, ymm3, ymm4 + lea rcx, QWORD PTR [rdi+rbx] + lea rdx, QWORD PTR [rsi+rbx] + vmovdqu ymm5, YMMWORD PTR [rcx] + vpxor ymm0, ymm0, ymm5 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu ymm5, YMMWORD PTR [rcx+32] + vpxor ymm1, ymm1, ymm5 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vmovdqu ymm5, YMMWORD PTR [rcx+64] + vpxor ymm2, ymm2, ymm5 + vmovdqu YMMWORD PTR [rdx+64], ymm2 + vmovdqu ymm5, YMMWORD PTR [rcx+96] + vpxor ymm3, ymm3, ymm5 + vmovdqu YMMWORD 
PTR [rdx+96], ymm3 + add ebx, 128 + vbroadcasti128 ymm4, [rsp+512] + vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0 + vpshufb ymm0, ymm0, ymm6 + vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1 + vpshufb ymm1, ymm1, ymm6 + vpaddd ymm2, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y2 + vpshufb ymm2, ymm2, ymm6 + vpaddd ymm3, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y3 + vpshufb ymm3, ymm3, ymm6 + vmovdqu xmm7, OWORD PTR [rsp+512] + vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight + vmovdqu OWORD PTR [rsp+512], xmm7 + vbroadcasti128 ymm4, [r15] + vpxor ymm0, ymm0, ymm4 + vpxor ymm1, ymm1, ymm4 + vpxor ymm2, ymm2, ymm4 + vpxor ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+16] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+32] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+48] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+64] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+80] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+96] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+112] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+128] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+144] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r10d, 11 + vbroadcasti128 ymm4, [r15+160] + jl 
L_AES_GCM_encrypt_vaes_p2_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+176] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r10d, 13 + vbroadcasti128 ymm4, [r15+192] + jl L_AES_GCM_encrypt_vaes_p2_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+208] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+224] +L_AES_GCM_encrypt_vaes_p2_vaes_ctr8_last: + vaesenclast ymm0, ymm0, ymm4 + vaesenclast ymm1, ymm1, ymm4 + vaesenclast ymm2, ymm2, ymm4 + vaesenclast ymm3, ymm3, ymm4 + lea rcx, QWORD PTR [rdi+rbx] + lea rdx, QWORD PTR [rsi+rbx] + vmovdqu ymm5, YMMWORD PTR [rcx] + vpxor ymm0, ymm0, ymm5 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu ymm5, YMMWORD PTR [rcx+32] + vpxor ymm1, ymm1, ymm5 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vmovdqu ymm5, YMMWORD PTR [rcx+64] + vpxor ymm2, ymm2, ymm5 + vmovdqu YMMWORD PTR [rdx+64], ymm2 + vmovdqu ymm5, YMMWORD PTR [rcx+96] + vpxor ymm3, ymm3, ymm5 + vmovdqu YMMWORD PTR [rdx+96], ymm3 + add ebx, 128 + vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_mask + mov rcx, QWORD PTR [rsp+544] + vpxor ymm4, ymm4, ymm4 + vinserti128 ymm4, ymm4, xmm15, 0 + vmovdqu ymm7, YMMWORD PTR [rsp+256] + vmovdqu ymm8, YMMWORD PTR [rsp+288] + vmovdqu ymm9, YMMWORD PTR [rsp+320] + vmovdqu ymm10, YMMWORD PTR [rsp+352] + vmovdqu ymm5, YMMWORD PTR [rcx] + vpshufb ymm5, ymm5, ymm6 + vpxor ymm5, ymm5, ymm4 + vpclmulqdq ymm0, ymm5, ymm7, 0 + vpclmulqdq ymm1, ymm5, ymm7, 1 + vpclmulqdq ymm2, ymm5, ymm7, 16 + vpclmulqdq ymm3, ymm5, ymm7, 17 + vmovdqa ymm11, ymm0 + vpxor ymm12, ymm2, ymm1 + vmovdqa ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rcx+32] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm8, 0 + vpclmulqdq 
ymm1, ymm5, ymm8, 1 + vpclmulqdq ymm2, ymm5, ymm8, 16 + vpclmulqdq ymm3, ymm5, ymm8, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rcx+64] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm9, 0 + vpclmulqdq ymm1, ymm5, ymm9, 1 + vpclmulqdq ymm2, ymm5, ymm9, 16 + vpclmulqdq ymm3, ymm5, ymm9, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rcx+96] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm10, 0 + vpclmulqdq ymm1, ymm5, ymm10, 1 + vpclmulqdq ymm2, ymm5, ymm10, 16 + vpclmulqdq ymm3, ymm5, ymm10, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm7, YMMWORD PTR [rsp+384] + vmovdqu ymm8, YMMWORD PTR [rsp+416] + vmovdqu ymm9, YMMWORD PTR [rsp+448] + vmovdqu ymm10, YMMWORD PTR [rsp+480] + vmovdqu ymm5, YMMWORD PTR [rcx+128] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm7, 0 + vpclmulqdq ymm1, ymm5, ymm7, 1 + vpclmulqdq ymm2, ymm5, ymm7, 16 + vpclmulqdq ymm3, ymm5, ymm7, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rcx+160] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm8, 0 + vpclmulqdq ymm1, ymm5, ymm8, 1 + vpclmulqdq ymm2, ymm5, ymm8, 16 + vpclmulqdq ymm3, ymm5, ymm8, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rcx+192] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm9, 0 + vpclmulqdq ymm1, ymm5, ymm9, 1 + vpclmulqdq ymm2, ymm5, ymm9, 16 + vpclmulqdq ymm3, ymm5, ymm9, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rcx+224] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm10, 0 + vpclmulqdq 
ymm1, ymm5, ymm10, 1 + vpclmulqdq ymm2, ymm5, ymm10, 16 + vpclmulqdq ymm3, ymm5, ymm10, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vpclmulqdq ymm5, ymm14, ymm11, 1 + vpshufd ymm11, ymm11, 78 + vpxor ymm12, ymm12, ymm5 + vpxor ymm12, ymm12, ymm11 + vpclmulqdq ymm5, ymm14, ymm12, 1 + vpshufd ymm12, ymm12, 78 + vpxor ymm13, ymm13, ymm5 + vpxor ymm13, ymm13, ymm12 + vextracti128 xmm0, ymm13, 1 + vpxor xmm15, xmm13, xmm0 + cmp ebx, r13d + jl L_AES_GCM_encrypt_vaes_loop_256 +L_AES_GCM_encrypt_vaes_after_256: + mov r13d, r9d + and r13d, 4294967168 + cmp ebx, r13d + jge L_AES_GCM_encrypt_vaes_after_128 + ; 128 bytes of input + lea rcx, QWORD PTR [rsi+rbx] + mov QWORD PTR [rsp+544], rcx + vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64 + vbroadcasti128 ymm4, [rsp+512] + vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0 + vpshufb ymm0, ymm0, ymm6 + vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1 + vpshufb ymm1, ymm1, ymm6 + vpaddd ymm2, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y2 + vpshufb ymm2, ymm2, ymm6 + vpaddd ymm3, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y3 + vpshufb ymm3, ymm3, ymm6 + vmovdqu xmm7, OWORD PTR [rsp+512] + vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight + vmovdqu OWORD PTR [rsp+512], xmm7 + vbroadcasti128 ymm4, [r15] + vpxor ymm0, ymm0, ymm4 + vpxor ymm1, ymm1, ymm4 + vpxor ymm2, ymm2, ymm4 + vpxor ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+16] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+32] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+48] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+64] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + 
vbroadcasti128 ymm4, [r15+80] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+96] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+112] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+128] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+144] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r10d, 11 + vbroadcasti128 ymm4, [r15+160] + jl L_AES_GCM_encrypt_vaes_8_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+176] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r10d, 13 + vbroadcasti128 ymm4, [r15+192] + jl L_AES_GCM_encrypt_vaes_8_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+208] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+224] +L_AES_GCM_encrypt_vaes_8_vaes_ctr8_last: + vaesenclast ymm0, ymm0, ymm4 + vaesenclast ymm1, ymm1, ymm4 + vaesenclast ymm2, ymm2, ymm4 + vaesenclast ymm3, ymm3, ymm4 + lea rcx, QWORD PTR [rdi+rbx] + lea rdx, QWORD PTR [rsi+rbx] + vmovdqu ymm5, YMMWORD PTR [rcx] + vpxor ymm0, ymm0, ymm5 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu ymm5, YMMWORD PTR [rcx+32] + vpxor ymm1, ymm1, ymm5 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vmovdqu ymm5, YMMWORD PTR [rcx+64] + vpxor ymm2, ymm2, ymm5 + vmovdqu YMMWORD PTR [rdx+64], ymm2 + vmovdqu ymm5, YMMWORD PTR [rcx+96] + vpxor ymm3, ymm3, ymm5 
+ vmovdqu YMMWORD PTR [rdx+96], ymm3 + add ebx, 128 + vmovdqu ymm7, YMMWORD PTR [rsp+96] + vpermq ymm7, ymm7, 78 + vmovdqu ymm8, YMMWORD PTR [rsp+64] + vpermq ymm8, ymm8, 78 + vmovdqu ymm9, YMMWORD PTR [rsp+32] + vpermq ymm9, ymm9, 78 + vmovdqu ymm10, YMMWORD PTR [rsp] + vpermq ymm10, ymm10, 78 + vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_mask + mov rcx, QWORD PTR [rsp+544] + vpxor ymm4, ymm4, ymm4 + vinserti128 ymm4, ymm4, xmm15, 0 + vmovdqu ymm5, YMMWORD PTR [rcx] + vpshufb ymm5, ymm5, ymm6 + vpxor ymm5, ymm5, ymm4 + vpclmulqdq ymm0, ymm5, ymm7, 0 + vpclmulqdq ymm1, ymm5, ymm7, 1 + vpclmulqdq ymm2, ymm5, ymm7, 16 + vpclmulqdq ymm3, ymm5, ymm7, 17 + vmovdqa ymm11, ymm0 + vpxor ymm12, ymm2, ymm1 + vmovdqa ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rcx+32] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm8, 0 + vpclmulqdq ymm1, ymm5, ymm8, 1 + vpclmulqdq ymm2, ymm5, ymm8, 16 + vpclmulqdq ymm3, ymm5, ymm8, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rcx+64] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm9, 0 + vpclmulqdq ymm1, ymm5, ymm9, 1 + vpclmulqdq ymm2, ymm5, ymm9, 16 + vpclmulqdq ymm3, ymm5, ymm9, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rcx+96] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm10, 0 + vpclmulqdq ymm1, ymm5, ymm10, 1 + vpclmulqdq ymm2, ymm5, ymm10, 16 + vpclmulqdq ymm3, ymm5, ymm10, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vpclmulqdq ymm5, ymm14, ymm11, 1 + vpshufd ymm11, ymm11, 78 + vpxor ymm12, ymm12, ymm5 + vpxor ymm12, ymm12, ymm11 + vpclmulqdq ymm5, ymm14, ymm12, 1 + vpshufd ymm12, ymm12, 78 + vpxor ymm13, ymm13, ymm5 + vpxor ymm13, ymm13, ymm12 + vextracti128 xmm0, ymm13, 1 + vpxor xmm15, xmm13, xmm0 +L_AES_GCM_encrypt_vaes_after_128: + vmovdqu xmm6, OWORD 
PTR [rsp] +L_AES_GCM_encrypt_vaes_done_128: + mov edx, r9d + cmp ebx, edx + jge L_AES_GCM_encrypt_vaes_done_enc + mov r13d, r9d + and r13d, 4294967280 + cmp ebx, r13d + jge L_AES_GCM_encrypt_vaes_last_block_done + vmovdqu xmm8, OWORD PTR [rsp+512] + vpshufb xmm7, xmm8, OWORD PTR L_vaes_aes_gcm_bswap_epi64 + vpaddd xmm8, xmm8, OWORD PTR L_vaes_aes_gcm_one + vmovdqu OWORD PTR [rsp+512], xmm8 + vpxor xmm7, xmm7, [r15] + vaesenc xmm7, xmm7, [r15+16] + vaesenc xmm7, xmm7, [r15+32] + vaesenc xmm7, xmm7, [r15+48] + vaesenc xmm7, xmm7, [r15+64] + vaesenc xmm7, xmm7, [r15+80] + vaesenc xmm7, xmm7, [r15+96] + vaesenc xmm7, xmm7, [r15+112] + vaesenc xmm7, xmm7, [r15+128] + vaesenc xmm7, xmm7, [r15+144] + cmp r10d, 11 + vmovdqa xmm8, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_vaes_aesenc_block_last + vaesenc xmm7, xmm7, xmm8 + vaesenc xmm7, xmm7, [r15+176] + cmp r10d, 13 + vmovdqa xmm8, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_vaes_aesenc_block_last + vaesenc xmm7, xmm7, xmm8 + vaesenc xmm7, xmm7, [r15+208] + vmovdqa xmm8, OWORD PTR [r15+224] +L_AES_GCM_encrypt_vaes_aesenc_block_last: + vaesenclast xmm7, xmm7, xmm8 + vmovdqu xmm8, OWORD PTR [rdi+rbx] + vpxor xmm7, xmm7, xmm8 + vmovdqu OWORD PTR [rsi+rbx], xmm7 + vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask + vpxor xmm15, xmm15, xmm7 + add ebx, 16 + cmp ebx, r13d + jge L_AES_GCM_encrypt_vaes_last_block_ghash +L_AES_GCM_encrypt_vaes_last_block_start: + vmovdqu xmm12, OWORD PTR [rdi+rbx] + vmovdqu xmm8, OWORD PTR [rsp+512] + vpshufb xmm7, xmm8, OWORD PTR L_vaes_aes_gcm_bswap_epi64 + vpaddd xmm8, xmm8, OWORD PTR L_vaes_aes_gcm_one + vmovdqu OWORD PTR [rsp+512], xmm8 + vpxor xmm7, xmm7, [r15] + vpclmulqdq xmm9, xmm15, xmm6, 16 + vaesenc xmm7, xmm7, [r15+16] + vaesenc xmm7, xmm7, [r15+32] + vpclmulqdq xmm10, xmm15, xmm6, 1 + vaesenc xmm7, xmm7, [r15+48] + vaesenc xmm7, xmm7, [r15+64] + vpclmulqdq xmm11, xmm15, xmm6, 0 + vaesenc xmm7, xmm7, [r15+80] + vpclmulqdq xmm1, xmm15, xmm6, 17 + vaesenc xmm7, xmm7, [r15+96] + 
vpxor xmm9, xmm9, xmm10 + vpslldq xmm2, xmm9, 8 + vpsrldq xmm9, xmm9, 8 + vaesenc xmm7, xmm7, [r15+112] + vpxor xmm2, xmm2, xmm11 + vpxor xmm3, xmm1, xmm9 + vmovdqa xmm0, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm10, xmm2, xmm0, 16 + vaesenc xmm7, xmm7, [r15+128] + vpshufd xmm9, xmm2, 78 + vpxor xmm9, xmm9, xmm10 + vpclmulqdq xmm10, xmm9, xmm0, 16 + vaesenc xmm7, xmm7, [r15+144] + vpshufd xmm9, xmm9, 78 + vpxor xmm9, xmm9, xmm10 + vpxor xmm15, xmm9, xmm3 + cmp r10d, 11 + vmovdqa xmm8, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_vaes_aesenc_gfmul_last + vaesenc xmm7, xmm7, xmm8 + vaesenc xmm7, xmm7, [r15+176] + cmp r10d, 13 + vmovdqa xmm8, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_vaes_aesenc_gfmul_last + vaesenc xmm7, xmm7, xmm8 + vaesenc xmm7, xmm7, [r15+208] + vmovdqa xmm8, OWORD PTR [r15+224] +L_AES_GCM_encrypt_vaes_aesenc_gfmul_last: + vaesenclast xmm7, xmm7, xmm8 + vmovdqa xmm0, xmm12 + vpxor xmm7, xmm7, xmm0 + vmovdqu OWORD PTR [rsi+rbx], xmm7 + vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask + add ebx, 16 + vpxor xmm15, xmm15, xmm7 + cmp ebx, r13d + jl L_AES_GCM_encrypt_vaes_last_block_start +L_AES_GCM_encrypt_vaes_last_block_ghash: + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm15, xmm6, 0 + vpclmulqdq xmm8, xmm15, xmm6, 1 + vpclmulqdq xmm9, xmm15, xmm6, 16 + vpclmulqdq xmm10, xmm15, xmm6, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm15, xmm10 +L_AES_GCM_encrypt_vaes_last_block_done: + mov ecx, r9d + mov edx, ecx + and ecx, 15 + jz L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_done + vmovdqu xmm5, OWORD PTR [rsp+512] + vpshufb xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_bswap_epi64 + vpxor xmm5, xmm5, [r15] + vaesenc xmm5, xmm5, [r15+16] + vaesenc xmm5, xmm5, [r15+32] + vaesenc xmm5, 
xmm5, [r15+48] + vaesenc xmm5, xmm5, [r15+64] + vaesenc xmm5, xmm5, [r15+80] + vaesenc xmm5, xmm5, [r15+96] + vaesenc xmm5, xmm5, [r15+112] + vaesenc xmm5, xmm5, [r15+128] + vaesenc xmm5, xmm5, [r15+144] + cmp r10d, 11 + vmovdqa xmm8, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_aesenc_avx_last + vaesenc xmm5, xmm5, xmm8 + vaesenc xmm5, xmm5, [r15+176] + cmp r10d, 13 + vmovdqa xmm8, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_aesenc_avx_last + vaesenc xmm5, xmm5, xmm8 + vaesenc xmm5, xmm5, [r15+208] + vmovdqa xmm8, OWORD PTR [r15+224] +L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_aesenc_avx_last: + vaesenclast xmm5, xmm5, xmm8 + sub rsp, 16 + xor ecx, ecx + vmovdqu OWORD PTR [rsp], xmm5 +L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_loop: + movzx r13d, BYTE PTR [rdi+rbx] + xor r13b, BYTE PTR [rsp+rcx] + mov BYTE PTR [rsi+rbx], r13b + mov BYTE PTR [rsp+rcx], r13b + inc ebx + inc ecx + cmp ebx, edx + jl L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_loop + xor r13, r13 + cmp ecx, 16 + je L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_finish_enc +L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_byte_loop: + mov BYTE PTR [rsp+rcx], r13b + inc ecx + cmp ecx, 16 + jl L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_byte_loop +L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_finish_enc: + vmovdqu xmm5, OWORD PTR [rsp] + add rsp, 16 + vpshufb xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_bswap_mask + vpxor xmm15, xmm15, xmm5 + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm15, xmm6, 0 + vpclmulqdq xmm8, xmm15, xmm6, 1 + vpclmulqdq xmm9, xmm15, xmm6, 16 + vpclmulqdq xmm10, xmm15, xmm6, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm15, xmm10 
+L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_done: +L_AES_GCM_encrypt_vaes_done_enc: + mov edx, r9d + mov ecx, r11d + shl rdx, 3 + shl rcx, 3 + vmovq xmm0, rdx + vmovq xmm1, rcx + vpunpcklqdq xmm0, xmm0, xmm1 + vpxor xmm15, xmm15, xmm0 + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm15, xmm6, 0 + vpclmulqdq xmm8, xmm15, xmm6, 1 + vpclmulqdq xmm9, xmm15, xmm6, 16 + vpclmulqdq xmm10, xmm15, xmm6, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm15, xmm10 + vpshufb xmm15, xmm15, OWORD PTR L_vaes_aes_gcm_bswap_mask + vmovdqu xmm0, OWORD PTR [rsp+528] + vpxor xmm0, xmm0, xmm15 + cmp r14d, 16 + je L_AES_GCM_encrypt_vaes_store_tag_16 + xor rcx, rcx + vmovdqu OWORD PTR [rsp], xmm0 +L_AES_GCM_encrypt_vaes_store_tag_loop: + movzx r13d, BYTE PTR [rsp+rcx] + mov BYTE PTR [r8+rcx], r13b + inc ecx + cmp ecx, r14d + jne L_AES_GCM_encrypt_vaes_store_tag_loop + jmp L_AES_GCM_encrypt_vaes_store_tag_done +L_AES_GCM_encrypt_vaes_store_tag_16: + vmovdqu OWORD PTR [r8], xmm0 +L_AES_GCM_encrypt_vaes_store_tag_done: + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp+560] + vmovdqu xmm7, OWORD PTR [rsp+576] + vmovdqu xmm8, OWORD PTR [rsp+592] + vmovdqu xmm9, OWORD PTR [rsp+608] + vmovdqu xmm10, OWORD PTR [rsp+624] + vmovdqu xmm11, OWORD PTR [rsp+640] + vmovdqu xmm12, OWORD PTR [rsp+656] + vmovdqu xmm13, OWORD PTR [rsp+672] + vmovdqu xmm14, OWORD PTR [rsp+688] + vmovdqu xmm15, OWORD PTR [rsp+704] + add rsp, 720 + pop r15 + pop r14 + pop rbx + pop r12 + pop rsi + pop rdi + pop r13 + ret +AES_GCM_encrypt_vaes ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +AES_GCM_decrypt_vaes PROC + push r13 + push rdi + push rsi + push r12 + push rbx + push r14 + push r15 + push rbp + mov rdi, rcx + mov rsi, rdx + mov r12, r8 + mov rax, r9 + mov r8, QWORD 
PTR [rsp+104] + mov r9d, DWORD PTR [rsp+112] + mov r11d, DWORD PTR [rsp+120] + mov ebx, DWORD PTR [rsp+128] + mov r14d, DWORD PTR [rsp+136] + mov r15, QWORD PTR [rsp+144] + mov r10d, DWORD PTR [rsp+152] + mov rbp, QWORD PTR [rsp+160] + sub rsp, 704 + vmovdqu OWORD PTR [rsp+544], xmm6 + vmovdqu OWORD PTR [rsp+560], xmm7 + vmovdqu OWORD PTR [rsp+576], xmm8 + vmovdqu OWORD PTR [rsp+592], xmm9 + vmovdqu OWORD PTR [rsp+608], xmm10 + vmovdqu OWORD PTR [rsp+624], xmm11 + vmovdqu OWORD PTR [rsp+640], xmm12 + vmovdqu OWORD PTR [rsp+656], xmm13 + vmovdqu OWORD PTR [rsp+672], xmm14 + vmovdqu OWORD PTR [rsp+688], xmm15 + vpxor xmm5, xmm5, xmm5 + vpxor xmm15, xmm15, xmm15 + cmp ebx, 12 + mov edx, ebx + jne L_AES_GCM_decrypt_vaes_iv_not_12 + ; # Calculate values when IV is 12 bytes + ; Set counter based on IV + mov ecx, 16777216 + vmovq xmm5, QWORD PTR [rax] + vpinsrd xmm5, xmm5, DWORD PTR [rax+8], 2 + vpinsrd xmm5, xmm5, ecx, 3 + ; H = Encrypt X(=0) and T = Encrypt counter + vmovdqa xmm6, OWORD PTR [r15] + vpxor xmm1, xmm5, xmm6 + vmovdqa xmm4, OWORD PTR [r15+16] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+32] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+48] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+64] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+80] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+96] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+112] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+128] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+144] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + cmp r10d, 11 + vmovdqa xmm4, OWORD PTR [r15+160] + jl L_AES_GCM_decrypt_vaes_calc_iv_12_last + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa 
xmm4, OWORD PTR [r15+176] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + cmp r10d, 13 + vmovdqa xmm4, OWORD PTR [r15+192] + jl L_AES_GCM_decrypt_vaes_calc_iv_12_last + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+208] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+224] +L_AES_GCM_decrypt_vaes_calc_iv_12_last: + vaesenclast xmm6, xmm6, xmm4 + vaesenclast xmm1, xmm1, xmm4 + vpshufb xmm6, xmm6, OWORD PTR L_vaes_aes_gcm_bswap_mask + vmovdqu OWORD PTR [rsp+528], xmm1 + jmp L_AES_GCM_decrypt_vaes_iv_done +L_AES_GCM_decrypt_vaes_iv_not_12: + ; Calculate values when IV is not 12 bytes + ; H = Encrypt X(=0) + vmovdqa xmm6, OWORD PTR [r15] + vaesenc xmm6, xmm6, [r15+16] + vaesenc xmm6, xmm6, [r15+32] + vaesenc xmm6, xmm6, [r15+48] + vaesenc xmm6, xmm6, [r15+64] + vaesenc xmm6, xmm6, [r15+80] + vaesenc xmm6, xmm6, [r15+96] + vaesenc xmm6, xmm6, [r15+112] + vaesenc xmm6, xmm6, [r15+128] + vaesenc xmm6, xmm6, [r15+144] + cmp r10d, 11 + vmovdqa xmm8, OWORD PTR [r15+160] + jl L_AES_GCM_decrypt_vaes_calc_iv_1_aesenc_avx_last + vaesenc xmm6, xmm6, xmm8 + vaesenc xmm6, xmm6, [r15+176] + cmp r10d, 13 + vmovdqa xmm8, OWORD PTR [r15+192] + jl L_AES_GCM_decrypt_vaes_calc_iv_1_aesenc_avx_last + vaesenc xmm6, xmm6, xmm8 + vaesenc xmm6, xmm6, [r15+208] + vmovdqa xmm8, OWORD PTR [r15+224] +L_AES_GCM_decrypt_vaes_calc_iv_1_aesenc_avx_last: + vaesenclast xmm6, xmm6, xmm8 + vpshufb xmm6, xmm6, OWORD PTR L_vaes_aes_gcm_bswap_mask + ; Calc counter + ; Initialization vector + cmp edx, 0 + mov rcx, 0 + je L_AES_GCM_decrypt_vaes_calc_iv_done + cmp edx, 16 + jl L_AES_GCM_decrypt_vaes_calc_iv_lt16 + and edx, 4294967280 +L_AES_GCM_decrypt_vaes_calc_iv_16_loop: + vmovdqu xmm7, OWORD PTR [rax+rcx] + vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm7 + ; ghash_gfmul_avx + vpshufd xmm1, xmm5, 78 + vpshufd xmm2, xmm6, 78 + vpclmulqdq xmm3, xmm6, xmm5, 17 + vpclmulqdq xmm0, xmm6, xmm5, 0 + 
vpxor xmm1, xmm1, xmm5 + vpxor xmm2, xmm2, xmm6 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm4, xmm0 + vmovdqa xmm5, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm5, xmm5, xmm1 + vpsrld xmm0, xmm4, 31 + vpsrld xmm1, xmm5, 31 + vpslld xmm4, xmm4, 1 + vpslld xmm5, xmm5, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm5, xmm5, xmm2 + vpor xmm4, xmm4, xmm0 + vpor xmm5, xmm5, xmm1 + vpslld xmm0, xmm4, 31 + vpslld xmm1, xmm4, 30 + vpslld xmm2, xmm4, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm4, xmm4, xmm0 + vpsrld xmm2, xmm4, 1 + vpsrld xmm3, xmm4, 2 + vpsrld xmm0, xmm4, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm4 + vpxor xmm5, xmm5, xmm2 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_decrypt_vaes_calc_iv_16_loop + mov edx, ebx + cmp ecx, edx + je L_AES_GCM_decrypt_vaes_calc_iv_done +L_AES_GCM_decrypt_vaes_calc_iv_lt16: + sub rsp, 16 + vpxor xmm7, xmm7, xmm7 + xor ebx, ebx + vmovdqu OWORD PTR [rsp], xmm7 +L_AES_GCM_decrypt_vaes_calc_iv_loop: + movzx r13d, BYTE PTR [rax+rcx] + mov BYTE PTR [rsp+rbx], r13b + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_decrypt_vaes_calc_iv_loop + vmovdqu xmm7, OWORD PTR [rsp] + add rsp, 16 + vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm7 + ; ghash_gfmul_avx + vpshufd xmm1, xmm5, 78 + vpshufd xmm2, xmm6, 78 + vpclmulqdq xmm3, xmm6, xmm5, 17 + vpclmulqdq xmm0, xmm6, xmm5, 0 + vpxor xmm1, xmm1, xmm5 + vpxor xmm2, xmm2, xmm6 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm4, xmm0 + vmovdqa xmm5, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm5, xmm5, xmm1 + vpsrld xmm0, xmm4, 31 + vpsrld xmm1, xmm5, 31 + vpslld xmm4, xmm4, 1 + vpslld xmm5, xmm5, 1 
+ vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm5, xmm5, xmm2 + vpor xmm4, xmm4, xmm0 + vpor xmm5, xmm5, xmm1 + vpslld xmm0, xmm4, 31 + vpslld xmm1, xmm4, 30 + vpslld xmm2, xmm4, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm4, xmm4, xmm0 + vpsrld xmm2, xmm4, 1 + vpsrld xmm3, xmm4, 2 + vpsrld xmm0, xmm4, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm4 + vpxor xmm5, xmm5, xmm2 +L_AES_GCM_decrypt_vaes_calc_iv_done: + ; T = Encrypt counter + vpxor xmm0, xmm0, xmm0 + shl edx, 3 + vmovq xmm0, rdx + vpxor xmm5, xmm5, xmm0 + ; ghash_gfmul_avx + vpshufd xmm1, xmm5, 78 + vpshufd xmm2, xmm6, 78 + vpclmulqdq xmm3, xmm6, xmm5, 17 + vpclmulqdq xmm0, xmm6, xmm5, 0 + vpxor xmm1, xmm1, xmm5 + vpxor xmm2, xmm2, xmm6 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm4, xmm0 + vmovdqa xmm5, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm5, xmm5, xmm1 + vpsrld xmm0, xmm4, 31 + vpsrld xmm1, xmm5, 31 + vpslld xmm4, xmm4, 1 + vpslld xmm5, xmm5, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm5, xmm5, xmm2 + vpor xmm4, xmm4, xmm0 + vpor xmm5, xmm5, xmm1 + vpslld xmm0, xmm4, 31 + vpslld xmm1, xmm4, 30 + vpslld xmm2, xmm4, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm4, xmm4, xmm0 + vpsrld xmm2, xmm4, 1 + vpsrld xmm3, xmm4, 2 + vpsrld xmm0, xmm4, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm4 + vpxor xmm5, xmm5, xmm2 + vpshufb xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_bswap_mask + ; Encrypt counter + vmovdqa xmm7, OWORD PTR [r15] + vpxor xmm7, xmm7, xmm5 + vaesenc xmm7, xmm7, [r15+16] + vaesenc xmm7, xmm7, [r15+32] + vaesenc xmm7, xmm7, [r15+48] + vaesenc 
xmm7, xmm7, [r15+64] + vaesenc xmm7, xmm7, [r15+80] + vaesenc xmm7, xmm7, [r15+96] + vaesenc xmm7, xmm7, [r15+112] + vaesenc xmm7, xmm7, [r15+128] + vaesenc xmm7, xmm7, [r15+144] + cmp r10d, 11 + vmovdqa xmm8, OWORD PTR [r15+160] + jl L_AES_GCM_decrypt_vaes_calc_iv_2_aesenc_avx_last + vaesenc xmm7, xmm7, xmm8 + vaesenc xmm7, xmm7, [r15+176] + cmp r10d, 13 + vmovdqa xmm8, OWORD PTR [r15+192] + jl L_AES_GCM_decrypt_vaes_calc_iv_2_aesenc_avx_last + vaesenc xmm7, xmm7, xmm8 + vaesenc xmm7, xmm7, [r15+208] + vmovdqa xmm8, OWORD PTR [r15+224] +L_AES_GCM_decrypt_vaes_calc_iv_2_aesenc_avx_last: + vaesenclast xmm7, xmm7, xmm8 + vmovdqu OWORD PTR [rsp+528], xmm7 +L_AES_GCM_decrypt_vaes_iv_done: + ; Additional authentication data + mov edx, r11d + cmp edx, 0 + je L_AES_GCM_decrypt_vaes_calc_aad_done + xor ecx, ecx + cmp edx, 16 + jl L_AES_GCM_decrypt_vaes_calc_aad_lt16 + and edx, 4294967280 +L_AES_GCM_decrypt_vaes_calc_aad_16_loop: + vmovdqu xmm7, OWORD PTR [r12+rcx] + vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask + vpxor xmm15, xmm15, xmm7 + ; ghash_gfmul_avx + vpshufd xmm1, xmm15, 78 + vpshufd xmm2, xmm6, 78 + vpclmulqdq xmm3, xmm6, xmm15, 17 + vpclmulqdq xmm0, xmm6, xmm15, 0 + vpxor xmm1, xmm1, xmm15 + vpxor xmm2, xmm2, xmm6 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm4, xmm0 + vmovdqa xmm15, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm15, xmm15, xmm1 + vpsrld xmm0, xmm4, 31 + vpsrld xmm1, xmm15, 31 + vpslld xmm4, xmm4, 1 + vpslld xmm15, xmm15, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm15, xmm15, xmm2 + vpor xmm4, xmm4, xmm0 + vpor xmm15, xmm15, xmm1 + vpslld xmm0, xmm4, 31 + vpslld xmm1, xmm4, 30 + vpslld xmm2, xmm4, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm4, xmm4, xmm0 + vpsrld xmm2, xmm4, 1 + vpsrld xmm3, xmm4, 2 + vpsrld xmm0, 
xmm4, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm4 + vpxor xmm15, xmm15, xmm2 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_decrypt_vaes_calc_aad_16_loop + mov edx, r11d + cmp ecx, edx + je L_AES_GCM_decrypt_vaes_calc_aad_done +L_AES_GCM_decrypt_vaes_calc_aad_lt16: + sub rsp, 16 + vpxor xmm7, xmm7, xmm7 + xor ebx, ebx + vmovdqu OWORD PTR [rsp], xmm7 +L_AES_GCM_decrypt_vaes_calc_aad_loop: + movzx r13d, BYTE PTR [r12+rcx] + mov BYTE PTR [rsp+rbx], r13b + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_decrypt_vaes_calc_aad_loop + vmovdqu xmm7, OWORD PTR [rsp] + add rsp, 16 + vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask + vpxor xmm15, xmm15, xmm7 + ; ghash_gfmul_avx + vpshufd xmm1, xmm15, 78 + vpshufd xmm2, xmm6, 78 + vpclmulqdq xmm3, xmm6, xmm15, 17 + vpclmulqdq xmm0, xmm6, xmm15, 0 + vpxor xmm1, xmm1, xmm15 + vpxor xmm2, xmm2, xmm6 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm4, xmm0 + vmovdqa xmm15, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm15, xmm15, xmm1 + vpsrld xmm0, xmm4, 31 + vpsrld xmm1, xmm15, 31 + vpslld xmm4, xmm4, 1 + vpslld xmm15, xmm15, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm15, xmm15, xmm2 + vpor xmm4, xmm4, xmm0 + vpor xmm15, xmm15, xmm1 + vpslld xmm0, xmm4, 31 + vpslld xmm1, xmm4, 30 + vpslld xmm2, xmm4, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm4, xmm4, xmm0 + vpsrld xmm2, xmm4, 1 + vpsrld xmm3, xmm4, 2 + vpsrld xmm0, xmm4, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm4 + vpxor xmm15, xmm15, xmm2 +L_AES_GCM_decrypt_vaes_calc_aad_done: + ; Calculate counter and H + vpsrlq xmm8, xmm6, 63 + vpsllq xmm7, xmm6, 1 + vpslldq xmm8, xmm8, 8 + vpor xmm7, xmm7, xmm8 + vpshufd xmm6, xmm6, 255 + vpsrad xmm6, 
xmm6, 31 + vpshufb xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_bswap_epi64 + vpand xmm6, xmm6, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpaddd xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_one + vpxor xmm6, xmm6, xmm7 + vmovdqu OWORD PTR [rsp+512], xmm5 + xor ebx, ebx + cmp r9d, 128 + jl L_AES_GCM_decrypt_vaes_done_128 + vmovdqa xmm2, xmm15 + ; H ^ 1 + vmovdqu OWORD PTR [rsp], xmm6 + ; H ^ 2 + vpclmulqdq xmm7, xmm6, xmm6, 0 + vpclmulqdq xmm10, xmm6, xmm6, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm0, xmm10 + vmovdqu OWORD PTR [rsp+16], xmm0 + ; H ^ 3 + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm0, xmm6, 0 + vpclmulqdq xmm8, xmm0, xmm6, 1 + vpclmulqdq xmm9, xmm0, xmm6, 16 + vpclmulqdq xmm10, xmm0, xmm6, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm1, xmm10 + vmovdqu OWORD PTR [rsp+32], xmm1 + ; H ^ 4 + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm3, xmm10 + vmovdqu OWORD PTR [rsp+48], xmm3 + ; H ^ 5 + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR 
L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+64], xmm4 + ; H ^ 6 + vpclmulqdq xmm7, xmm1, xmm1, 0 + vpclmulqdq xmm10, xmm1, xmm1, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+80], xmm4 + ; H ^ 7 + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm3, xmm1, 0 + vpclmulqdq xmm8, xmm3, xmm1, 1 + vpclmulqdq xmm9, xmm3, xmm1, 16 + vpclmulqdq xmm10, xmm3, xmm1, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+96], xmm4 + ; H ^ 8 + vpclmulqdq xmm7, xmm3, xmm3, 0 + vpclmulqdq xmm10, xmm3, xmm3, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+112], xmm4 + cmp r9d, 256 + jl L_AES_GCM_decrypt_vaes_no_ext + ; H ^ 9 + vmovdqu xmm0, OWORD PTR [rsp+48] + vmovdqu xmm1, OWORD PTR [rsp+64] + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor 
xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+128], xmm4 + ; H ^ 10 + vmovdqu xmm0, OWORD PTR [rsp+64] + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+144], xmm4 + ; H ^ 11 + vmovdqu xmm0, OWORD PTR [rsp+64] + vmovdqu xmm1, OWORD PTR [rsp+80] + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+160], xmm4 + ; H ^ 12 + vmovdqu xmm0, OWORD PTR [rsp+80] + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+176], xmm4 + ; H ^ 13 + vmovdqu xmm0, OWORD PTR [rsp+80] + vmovdqu xmm1, OWORD PTR [rsp+96] + ; ghash_gfmul_red_avx + 
vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+192], xmm4 + ; H ^ 14 + vmovdqu xmm0, OWORD PTR [rsp+96] + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+208], xmm4 + ; H ^ 15 + vmovdqu xmm0, OWORD PTR [rsp+96] + vmovdqu xmm1, OWORD PTR [rsp+112] + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+224], xmm4 + ; H ^ 16 + vmovdqu xmm0, OWORD PTR [rsp+112] + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + 
vmovdqu OWORD PTR [rsp+240], xmm4 + vmovdqu ymm7, YMMWORD PTR [rsp+224] + vpermq ymm7, ymm7, 78 + vmovdqu ymm8, YMMWORD PTR [rsp+192] + vpermq ymm8, ymm8, 78 + vmovdqu ymm9, YMMWORD PTR [rsp+160] + vpermq ymm9, ymm9, 78 + vmovdqu ymm10, YMMWORD PTR [rsp+128] + vpermq ymm10, ymm10, 78 + vmovdqu YMMWORD PTR [rsp+256], ymm7 + vmovdqu YMMWORD PTR [rsp+288], ymm8 + vmovdqu YMMWORD PTR [rsp+320], ymm9 + vmovdqu YMMWORD PTR [rsp+352], ymm10 + vmovdqu ymm7, YMMWORD PTR [rsp+96] + vpermq ymm7, ymm7, 78 + vmovdqu ymm8, YMMWORD PTR [rsp+64] + vpermq ymm8, ymm8, 78 + vmovdqu ymm9, YMMWORD PTR [rsp+32] + vpermq ymm9, ymm9, 78 + vmovdqu ymm10, YMMWORD PTR [rsp] + vpermq ymm10, ymm10, 78 + vmovdqu YMMWORD PTR [rsp+384], ymm7 + vmovdqu YMMWORD PTR [rsp+416], ymm8 + vmovdqu YMMWORD PTR [rsp+448], ymm9 + vmovdqu YMMWORD PTR [rsp+480], ymm10 +L_AES_GCM_decrypt_vaes_no_ext: + vbroadcasti128 ymm14, ptr_L_vaes_aes_gcm_mod2_128 + cmp r9d, 256 + jl L_AES_GCM_decrypt_vaes_after_256 + mov r13d, r9d + and r13d, 4294967040 +L_AES_GCM_decrypt_vaes_loop_256: + ; 256 bytes of input + lea rax, QWORD PTR [rdi+rbx] + vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_mask + vpxor ymm4, ymm4, ymm4 + vinserti128 ymm4, ymm4, xmm15, 0 + vmovdqu ymm7, YMMWORD PTR [rsp+256] + vmovdqu ymm8, YMMWORD PTR [rsp+288] + vmovdqu ymm9, YMMWORD PTR [rsp+320] + vmovdqu ymm10, YMMWORD PTR [rsp+352] + vmovdqu ymm5, YMMWORD PTR [rax] + vpshufb ymm5, ymm5, ymm6 + vpxor ymm5, ymm5, ymm4 + vpclmulqdq ymm0, ymm5, ymm7, 0 + vpclmulqdq ymm1, ymm5, ymm7, 1 + vpclmulqdq ymm2, ymm5, ymm7, 16 + vpclmulqdq ymm3, ymm5, ymm7, 17 + vmovdqa ymm11, ymm0 + vpxor ymm12, ymm2, ymm1 + vmovdqa ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rax+32] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm8, 0 + vpclmulqdq ymm1, ymm5, ymm8, 1 + vpclmulqdq ymm2, ymm5, ymm8, 16 + vpclmulqdq ymm3, ymm5, ymm8, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rax+64] + 
vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm9, 0 + vpclmulqdq ymm1, ymm5, ymm9, 1 + vpclmulqdq ymm2, ymm5, ymm9, 16 + vpclmulqdq ymm3, ymm5, ymm9, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rax+96] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm10, 0 + vpclmulqdq ymm1, ymm5, ymm10, 1 + vpclmulqdq ymm2, ymm5, ymm10, 16 + vpclmulqdq ymm3, ymm5, ymm10, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm7, YMMWORD PTR [rsp+384] + vmovdqu ymm8, YMMWORD PTR [rsp+416] + vmovdqu ymm9, YMMWORD PTR [rsp+448] + vmovdqu ymm10, YMMWORD PTR [rsp+480] + vmovdqu ymm5, YMMWORD PTR [rax+128] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm7, 0 + vpclmulqdq ymm1, ymm5, ymm7, 1 + vpclmulqdq ymm2, ymm5, ymm7, 16 + vpclmulqdq ymm3, ymm5, ymm7, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rax+160] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm8, 0 + vpclmulqdq ymm1, ymm5, ymm8, 1 + vpclmulqdq ymm2, ymm5, ymm8, 16 + vpclmulqdq ymm3, ymm5, ymm8, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rax+192] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm9, 0 + vpclmulqdq ymm1, ymm5, ymm9, 1 + vpclmulqdq ymm2, ymm5, ymm9, 16 + vpclmulqdq ymm3, ymm5, ymm9, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rax+224] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm10, 0 + vpclmulqdq ymm1, ymm5, ymm10, 1 + vpclmulqdq ymm2, ymm5, ymm10, 16 + vpclmulqdq ymm3, ymm5, ymm10, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vpclmulqdq ymm5, ymm14, ymm11, 1 + 
vpshufd ymm11, ymm11, 78 + vpxor ymm12, ymm12, ymm5 + vpxor ymm12, ymm12, ymm11 + vpclmulqdq ymm5, ymm14, ymm12, 1 + vpshufd ymm12, ymm12, 78 + vpxor ymm13, ymm13, ymm5 + vpxor ymm13, ymm13, ymm12 + vextracti128 xmm0, ymm13, 1 + vpxor xmm15, xmm13, xmm0 + vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64 + vbroadcasti128 ymm4, [rsp+512] + vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0 + vpshufb ymm0, ymm0, ymm6 + vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1 + vpshufb ymm1, ymm1, ymm6 + vpaddd ymm2, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y2 + vpshufb ymm2, ymm2, ymm6 + vpaddd ymm3, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y3 + vpshufb ymm3, ymm3, ymm6 + vmovdqu xmm7, OWORD PTR [rsp+512] + vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight + vmovdqu OWORD PTR [rsp+512], xmm7 + vbroadcasti128 ymm4, [r15] + vpxor ymm0, ymm0, ymm4 + vpxor ymm1, ymm1, ymm4 + vpxor ymm2, ymm2, ymm4 + vpxor ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+16] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+32] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+48] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+64] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+80] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+96] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+112] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+128] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc 
ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+144] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r10d, 11 + vbroadcasti128 ymm4, [r15+160] + jl L_AES_GCM_decrypt_vaes_l1_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+176] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r10d, 13 + vbroadcasti128 ymm4, [r15+192] + jl L_AES_GCM_decrypt_vaes_l1_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+208] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+224] +L_AES_GCM_decrypt_vaes_l1_vaes_ctr8_last: + vaesenclast ymm0, ymm0, ymm4 + vaesenclast ymm1, ymm1, ymm4 + vaesenclast ymm2, ymm2, ymm4 + vaesenclast ymm3, ymm3, ymm4 + lea rcx, QWORD PTR [rdi+rbx] + lea rdx, QWORD PTR [rsi+rbx] + vmovdqu ymm5, YMMWORD PTR [rcx] + vpxor ymm0, ymm0, ymm5 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu ymm5, YMMWORD PTR [rcx+32] + vpxor ymm1, ymm1, ymm5 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vmovdqu ymm5, YMMWORD PTR [rcx+64] + vpxor ymm2, ymm2, ymm5 + vmovdqu YMMWORD PTR [rdx+64], ymm2 + vmovdqu ymm5, YMMWORD PTR [rcx+96] + vpxor ymm3, ymm3, ymm5 + vmovdqu YMMWORD PTR [rdx+96], ymm3 + add ebx, 128 + vbroadcasti128 ymm4, [rsp+512] + vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0 + vpshufb ymm0, ymm0, ymm6 + vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1 + vpshufb ymm1, ymm1, ymm6 + vpaddd ymm2, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y2 + vpshufb ymm2, ymm2, ymm6 + vpaddd ymm3, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y3 + vpshufb ymm3, ymm3, ymm6 + vmovdqu xmm7, OWORD PTR [rsp+512] + vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight + vmovdqu 
OWORD PTR [rsp+512], xmm7 + vbroadcasti128 ymm4, [r15] + vpxor ymm0, ymm0, ymm4 + vpxor ymm1, ymm1, ymm4 + vpxor ymm2, ymm2, ymm4 + vpxor ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+16] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+32] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+48] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+64] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+80] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+96] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+112] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+128] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+144] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r10d, 11 + vbroadcasti128 ymm4, [r15+160] + jl L_AES_GCM_decrypt_vaes_l2_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+176] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r10d, 13 + vbroadcasti128 ymm4, [r15+192] + jl L_AES_GCM_decrypt_vaes_l2_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+208] + 
vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+224] +L_AES_GCM_decrypt_vaes_l2_vaes_ctr8_last: + vaesenclast ymm0, ymm0, ymm4 + vaesenclast ymm1, ymm1, ymm4 + vaesenclast ymm2, ymm2, ymm4 + vaesenclast ymm3, ymm3, ymm4 + lea rcx, QWORD PTR [rdi+rbx] + lea rdx, QWORD PTR [rsi+rbx] + vmovdqu ymm5, YMMWORD PTR [rcx] + vpxor ymm0, ymm0, ymm5 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu ymm5, YMMWORD PTR [rcx+32] + vpxor ymm1, ymm1, ymm5 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vmovdqu ymm5, YMMWORD PTR [rcx+64] + vpxor ymm2, ymm2, ymm5 + vmovdqu YMMWORD PTR [rdx+64], ymm2 + vmovdqu ymm5, YMMWORD PTR [rcx+96] + vpxor ymm3, ymm3, ymm5 + vmovdqu YMMWORD PTR [rdx+96], ymm3 + add ebx, 128 + cmp ebx, r13d + jl L_AES_GCM_decrypt_vaes_loop_256 +L_AES_GCM_decrypt_vaes_after_256: + vmovdqu ymm7, YMMWORD PTR [rsp+96] + vpermq ymm7, ymm7, 78 + vmovdqu ymm8, YMMWORD PTR [rsp+64] + vpermq ymm8, ymm8, 78 + vmovdqu ymm9, YMMWORD PTR [rsp+32] + vpermq ymm9, ymm9, 78 + vmovdqu ymm10, YMMWORD PTR [rsp] + vpermq ymm10, ymm10, 78 + mov r13d, r9d + and r13d, 4294967168 + cmp ebx, r13d + jge L_AES_GCM_decrypt_vaes_after_128 + ; 128 bytes of input + lea rax, QWORD PTR [rdi+rbx] + vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_mask + vpxor ymm4, ymm4, ymm4 + vinserti128 ymm4, ymm4, xmm15, 0 + vmovdqu ymm5, YMMWORD PTR [rax] + vpshufb ymm5, ymm5, ymm6 + vpxor ymm5, ymm5, ymm4 + vpclmulqdq ymm0, ymm5, ymm7, 0 + vpclmulqdq ymm1, ymm5, ymm7, 1 + vpclmulqdq ymm2, ymm5, ymm7, 16 + vpclmulqdq ymm3, ymm5, ymm7, 17 + vmovdqa ymm11, ymm0 + vpxor ymm12, ymm2, ymm1 + vmovdqa ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rax+32] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm8, 0 + vpclmulqdq ymm1, ymm5, ymm8, 1 + vpclmulqdq ymm2, ymm5, ymm8, 16 + vpclmulqdq ymm3, ymm5, ymm8, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rax+64] 
+ vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm9, 0 + vpclmulqdq ymm1, ymm5, ymm9, 1 + vpclmulqdq ymm2, ymm5, ymm9, 16 + vpclmulqdq ymm3, ymm5, ymm9, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rax+96] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm10, 0 + vpclmulqdq ymm1, ymm5, ymm10, 1 + vpclmulqdq ymm2, ymm5, ymm10, 16 + vpclmulqdq ymm3, ymm5, ymm10, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vpclmulqdq ymm5, ymm14, ymm11, 1 + vpshufd ymm11, ymm11, 78 + vpxor ymm12, ymm12, ymm5 + vpxor ymm12, ymm12, ymm11 + vpclmulqdq ymm5, ymm14, ymm12, 1 + vpshufd ymm12, ymm12, 78 + vpxor ymm13, ymm13, ymm5 + vpxor ymm13, ymm13, ymm12 + vextracti128 xmm0, ymm13, 1 + vpxor xmm15, xmm13, xmm0 + vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64 + vbroadcasti128 ymm4, [rsp+512] + vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0 + vpshufb ymm0, ymm0, ymm6 + vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1 + vpshufb ymm1, ymm1, ymm6 + vpaddd ymm2, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y2 + vpshufb ymm2, ymm2, ymm6 + vpaddd ymm3, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y3 + vpshufb ymm3, ymm3, ymm6 + vmovdqu xmm7, OWORD PTR [rsp+512] + vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight + vmovdqu OWORD PTR [rsp+512], xmm7 + vbroadcasti128 ymm4, [r15] + vpxor ymm0, ymm0, ymm4 + vpxor ymm1, ymm1, ymm4 + vpxor ymm2, ymm2, ymm4 + vpxor ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+16] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+32] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+48] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+64] + vaesenc 
ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+80] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+96] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+112] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+128] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+144] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r10d, 11 + vbroadcasti128 ymm4, [r15+160] + jl L_AES_GCM_decrypt_vaes_t_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+176] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r10d, 13 + vbroadcasti128 ymm4, [r15+192] + jl L_AES_GCM_decrypt_vaes_t_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+208] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+224] +L_AES_GCM_decrypt_vaes_t_vaes_ctr8_last: + vaesenclast ymm0, ymm0, ymm4 + vaesenclast ymm1, ymm1, ymm4 + vaesenclast ymm2, ymm2, ymm4 + vaesenclast ymm3, ymm3, ymm4 + lea rcx, QWORD PTR [rdi+rbx] + lea rdx, QWORD PTR [rsi+rbx] + vmovdqu ymm5, YMMWORD PTR [rcx] + vpxor ymm0, ymm0, ymm5 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu ymm5, YMMWORD PTR [rcx+32] + vpxor ymm1, ymm1, ymm5 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vmovdqu ymm5, YMMWORD PTR [rcx+64] + vpxor ymm2, ymm2, ymm5 
+ vmovdqu YMMWORD PTR [rdx+64], ymm2 + vmovdqu ymm5, YMMWORD PTR [rcx+96] + vpxor ymm3, ymm3, ymm5 + vmovdqu YMMWORD PTR [rdx+96], ymm3 + add ebx, 128 +L_AES_GCM_decrypt_vaes_after_128: + vmovdqu xmm6, OWORD PTR [rsp] +L_AES_GCM_decrypt_vaes_done_128: + mov edx, r9d + cmp ebx, edx + jge L_AES_GCM_decrypt_vaes_done_dec + mov r13d, r9d + and r13d, 4294967280 + cmp ebx, r13d + jge L_AES_GCM_decrypt_vaes_last_block_done +L_AES_GCM_decrypt_vaes_last_block_start: + vmovdqu xmm12, OWORD PTR [rdi+rbx] + vmovdqa xmm0, xmm6 + vpshufb xmm1, xmm12, OWORD PTR L_vaes_aes_gcm_bswap_mask + vpxor xmm1, xmm1, xmm15 + vmovdqu xmm8, OWORD PTR [rsp+512] + vpshufb xmm7, xmm8, OWORD PTR L_vaes_aes_gcm_bswap_epi64 + vpaddd xmm8, xmm8, OWORD PTR L_vaes_aes_gcm_one + vmovdqu OWORD PTR [rsp+512], xmm8 + vpxor xmm7, xmm7, [r15] + vpclmulqdq xmm9, xmm1, xmm0, 16 + vaesenc xmm7, xmm7, [r15+16] + vaesenc xmm7, xmm7, [r15+32] + vpclmulqdq xmm10, xmm1, xmm0, 1 + vaesenc xmm7, xmm7, [r15+48] + vaesenc xmm7, xmm7, [r15+64] + vpclmulqdq xmm11, xmm1, xmm0, 0 + vaesenc xmm7, xmm7, [r15+80] + vpclmulqdq xmm1, xmm1, xmm0, 17 + vaesenc xmm7, xmm7, [r15+96] + vpxor xmm9, xmm9, xmm10 + vpslldq xmm2, xmm9, 8 + vpsrldq xmm9, xmm9, 8 + vaesenc xmm7, xmm7, [r15+112] + vpxor xmm2, xmm2, xmm11 + vpxor xmm3, xmm1, xmm9 + vmovdqa xmm0, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm10, xmm2, xmm0, 16 + vaesenc xmm7, xmm7, [r15+128] + vpshufd xmm9, xmm2, 78 + vpxor xmm9, xmm9, xmm10 + vpclmulqdq xmm10, xmm9, xmm0, 16 + vaesenc xmm7, xmm7, [r15+144] + vpshufd xmm9, xmm9, 78 + vpxor xmm9, xmm9, xmm10 + vpxor xmm15, xmm9, xmm3 + cmp r10d, 11 + vmovdqa xmm8, OWORD PTR [r15+160] + jl L_AES_GCM_decrypt_vaes_aesenc_gfmul_last + vaesenc xmm7, xmm7, xmm8 + vaesenc xmm7, xmm7, [r15+176] + cmp r10d, 13 + vmovdqa xmm8, OWORD PTR [r15+192] + jl L_AES_GCM_decrypt_vaes_aesenc_gfmul_last + vaesenc xmm7, xmm7, xmm8 + vaesenc xmm7, xmm7, [r15+208] + vmovdqa xmm8, OWORD PTR [r15+224] +L_AES_GCM_decrypt_vaes_aesenc_gfmul_last: + 
vaesenclast xmm7, xmm7, xmm8 + vmovdqa xmm0, xmm12 + vpxor xmm7, xmm7, xmm0 + vmovdqu OWORD PTR [rsi+rbx], xmm7 + add ebx, 16 + cmp ebx, r13d + jl L_AES_GCM_decrypt_vaes_last_block_start +L_AES_GCM_decrypt_vaes_last_block_done: + mov ecx, r9d + mov edx, ecx + and ecx, 15 + jz L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_done + vmovdqu xmm5, OWORD PTR [rsp+512] + vpshufb xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_bswap_epi64 + vpxor xmm5, xmm5, [r15] + vaesenc xmm5, xmm5, [r15+16] + vaesenc xmm5, xmm5, [r15+32] + vaesenc xmm5, xmm5, [r15+48] + vaesenc xmm5, xmm5, [r15+64] + vaesenc xmm5, xmm5, [r15+80] + vaesenc xmm5, xmm5, [r15+96] + vaesenc xmm5, xmm5, [r15+112] + vaesenc xmm5, xmm5, [r15+128] + vaesenc xmm5, xmm5, [r15+144] + cmp r10d, 11 + vmovdqa xmm8, OWORD PTR [r15+160] + jl L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_aesenc_avx_last + vaesenc xmm5, xmm5, xmm8 + vaesenc xmm5, xmm5, [r15+176] + cmp r10d, 13 + vmovdqa xmm8, OWORD PTR [r15+192] + jl L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_aesenc_avx_last + vaesenc xmm5, xmm5, xmm8 + vaesenc xmm5, xmm5, [r15+208] + vmovdqa xmm8, OWORD PTR [r15+224] +L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_aesenc_avx_last: + vaesenclast xmm5, xmm5, xmm8 + sub rsp, 32 + xor ecx, ecx + vmovdqu OWORD PTR [rsp], xmm5 + vpxor xmm0, xmm0, xmm0 + vmovdqu OWORD PTR [rsp+16], xmm0 +L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_loop: + movzx r13d, BYTE PTR [rdi+rbx] + mov BYTE PTR [rsp+rcx+16], r13b + xor r13b, BYTE PTR [rsp+rcx] + mov BYTE PTR [rsi+rbx], r13b + inc ebx + inc ecx + cmp ebx, edx + jl L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_loop + vmovdqu xmm5, OWORD PTR [rsp+16] + add rsp, 32 + vpshufb xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_bswap_mask + vpxor xmm15, xmm15, xmm5 + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm15, xmm6, 0 + vpclmulqdq xmm8, xmm15, xmm6, 1 + vpclmulqdq xmm9, xmm15, xmm6, 16 + vpclmulqdq xmm10, xmm15, xmm6, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, 
xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm15, xmm10 +L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_done: +L_AES_GCM_decrypt_vaes_done_dec: + mov edx, r9d + mov ecx, r11d + shl rdx, 3 + shl rcx, 3 + vmovq xmm0, rdx + vmovq xmm1, rcx + vpunpcklqdq xmm0, xmm0, xmm1 + vpxor xmm15, xmm15, xmm0 + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm15, xmm6, 0 + vpclmulqdq xmm8, xmm15, xmm6, 1 + vpclmulqdq xmm9, xmm15, xmm6, 16 + vpclmulqdq xmm10, xmm15, xmm6, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm15, xmm10 + vpshufb xmm15, xmm15, OWORD PTR L_vaes_aes_gcm_bswap_mask + vmovdqu xmm0, OWORD PTR [rsp+528] + vpxor xmm0, xmm0, xmm15 + cmp r14d, 16 + je L_AES_GCM_decrypt_vaes_cmp_tag_16 + sub rsp, 16 + xor rcx, rcx + xor rbx, rbx + vmovdqu OWORD PTR [rsp], xmm0 +L_AES_GCM_decrypt_vaes_cmp_tag_loop: + movzx r13d, BYTE PTR [rsp+rcx] + xor r13b, BYTE PTR [r8+rcx] + or bl, r13b + inc ecx + cmp ecx, r14d + jne L_AES_GCM_decrypt_vaes_cmp_tag_loop + cmp bl, 0 + sete bl + add rsp, 16 + xor rcx, rcx + jmp L_AES_GCM_decrypt_vaes_cmp_tag_done +L_AES_GCM_decrypt_vaes_cmp_tag_16: + vmovdqu xmm1, OWORD PTR [r8] + vpcmpeqb xmm0, xmm0, xmm1 + vpmovmskb rdx, xmm0 + ; %%edx == 0xFFFF then return 1 else => return 0 + xor ebx, ebx + cmp edx, 65535 + sete bl +L_AES_GCM_decrypt_vaes_cmp_tag_done: + mov DWORD PTR [rbp], ebx + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp+544] + vmovdqu xmm7, OWORD PTR [rsp+560] + vmovdqu xmm8, OWORD PTR [rsp+576] + vmovdqu xmm9, OWORD PTR [rsp+592] + vmovdqu xmm10, OWORD PTR [rsp+608] + vmovdqu xmm11, OWORD PTR [rsp+624] + vmovdqu xmm12, 
OWORD PTR [rsp+640] + vmovdqu xmm13, OWORD PTR [rsp+656] + vmovdqu xmm14, OWORD PTR [rsp+672] + vmovdqu xmm15, OWORD PTR [rsp+688] + add rsp, 704 + pop rbp + pop r15 + pop r14 + pop rbx + pop r12 + pop rsi + pop rdi + pop r13 + ret +AES_GCM_decrypt_vaes ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +; AES_GCM_init_vaes: from the AES round keys and the IV, compute the GHASH key H, the encrypted initial counter block T and the counter for the first data block. +; Arguments (Microsoft x64 convention): rcx = round keys, edx = number of rounds, r8 = IV, r9d = IV length in bytes, then three stack arguments: pointer receiving H, pointer receiving the counter, pointer receiving T. +; The round keys and the three outputs are accessed with vmovdqa, so they must be 16-byte aligned. +AES_GCM_init_vaes PROC + push rdi + push rsi + push r12 + push r13 + ; Move the register arguments so rcx and rdx can be used as scratch + mov rdi, rcx + mov rsi, rdx + mov r10, r8 + mov r11d, r9d + ; Stack arguments start at rsp+72: 32 bytes of pushes, 8 of return address, 32 of home space + mov rax, QWORD PTR [rsp+72] + mov r8, QWORD PTR [rsp+80] + mov r9, QWORD PTR [rsp+88] + ; Save the callee-saved XMM registers that are overwritten below + sub rsp, 80 + vmovdqu OWORD PTR [rsp+16], xmm6 + vmovdqu OWORD PTR [rsp+32], xmm7 + vmovdqu OWORD PTR [rsp+48], xmm8 + vmovdqu OWORD PTR [rsp+64], xmm15 + vpxor xmm4, xmm4, xmm4 + mov edx, r11d + cmp edx, 12 + jne L_AES_GCM_init_vaes_iv_not_12 + ; # Calculate values when IV is 12 bytes + ; Set counter based on IV + ; 16777216 is 0x01000000: as the top dword it is the big-endian 32-bit value 1 + mov ecx, 16777216 + vmovq xmm4, QWORD PTR [r10] + vpinsrd xmm4, xmm4, DWORD PTR [r10+8], 2 + vpinsrd xmm4, xmm4, ecx, 3 + ; H = Encrypt X(=0) and T = Encrypt counter + ; xmm5 starts as round key 0 (zero block XOR key), xmm1 as counter XOR key; both go through the same rounds + vmovdqa xmm5, OWORD PTR [rdi] + vpxor xmm1, xmm4, xmm5 + vmovdqa xmm6, OWORD PTR [rdi+16] + vaesenc xmm5, xmm5, xmm6 + vaesenc xmm1, xmm1, xmm6 + vmovdqa xmm6, OWORD PTR [rdi+32] + vaesenc xmm5, xmm5, xmm6 + vaesenc xmm1, xmm1, xmm6 + vmovdqa xmm6, OWORD PTR [rdi+48] + vaesenc xmm5, xmm5, xmm6 + vaesenc xmm1, xmm1, xmm6 + vmovdqa xmm6, OWORD PTR [rdi+64] + vaesenc xmm5, xmm5, xmm6 + vaesenc xmm1, xmm1, xmm6 + vmovdqa xmm6, OWORD PTR [rdi+80] + vaesenc xmm5, xmm5, xmm6 + vaesenc xmm1, xmm1, xmm6 + vmovdqa xmm6, OWORD PTR [rdi+96] + vaesenc xmm5, xmm5, xmm6 + vaesenc xmm1, xmm1, xmm6 + vmovdqa xmm6, OWORD PTR [rdi+112] + vaesenc xmm5, xmm5, xmm6 + vaesenc xmm1, xmm1, xmm6 + vmovdqa xmm6, OWORD PTR [rdi+128] + vaesenc xmm5, xmm5, xmm6 + vaesenc xmm1, xmm1, xmm6 + vmovdqa xmm6, OWORD PTR [rdi+144] + vaesenc xmm5, xmm5, xmm6 + vaesenc xmm1, xmm1, xmm6 + ; Fewer than 11 rounds: the key at offset 160 is the last round key; otherwise continue + cmp esi, 11 + vmovdqa xmm6, OWORD PTR [rdi+160] + jl L_AES_GCM_init_vaes_calc_iv_12_last + vaesenc xmm5, xmm5, xmm6 + vaesenc xmm1,
xmm1, xmm6 + vmovdqa xmm6, OWORD PTR [rdi+176] + vaesenc xmm5, xmm5, xmm6 + vaesenc xmm1, xmm1, xmm6 + ; Fewer than 13 rounds: the key at offset 192 is the last round key; otherwise use 224 + cmp esi, 13 + vmovdqa xmm6, OWORD PTR [rdi+192] + jl L_AES_GCM_init_vaes_calc_iv_12_last + vaesenc xmm5, xmm5, xmm6 + vaesenc xmm1, xmm1, xmm6 + vmovdqa xmm6, OWORD PTR [rdi+208] + vaesenc xmm5, xmm5, xmm6 + vaesenc xmm1, xmm1, xmm6 + vmovdqa xmm6, OWORD PTR [rdi+224] +L_AES_GCM_init_vaes_calc_iv_12_last: + vaesenclast xmm5, xmm5, xmm6 + vaesenclast xmm1, xmm1, xmm6 + vpshufb xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_bswap_mask + vmovdqu xmm15, xmm1 + jmp L_AES_GCM_init_vaes_iv_done +L_AES_GCM_init_vaes_iv_not_12: + ; Calculate values when IV is not 12 bytes + ; H = Encrypt X(=0) + vmovdqa xmm5, OWORD PTR [rdi] + vaesenc xmm5, xmm5, [rdi+16] + vaesenc xmm5, xmm5, [rdi+32] + vaesenc xmm5, xmm5, [rdi+48] + vaesenc xmm5, xmm5, [rdi+64] + vaesenc xmm5, xmm5, [rdi+80] + vaesenc xmm5, xmm5, [rdi+96] + vaesenc xmm5, xmm5, [rdi+112] + vaesenc xmm5, xmm5, [rdi+128] + vaesenc xmm5, xmm5, [rdi+144] + cmp esi, 11 + vmovdqa xmm8, OWORD PTR [rdi+160] + jl L_AES_GCM_init_vaes_calc_iv_1_aesenc_avx_last + vaesenc xmm5, xmm5, xmm8 + vaesenc xmm5, xmm5, [rdi+176] + cmp esi, 13 + vmovdqa xmm8, OWORD PTR [rdi+192] + jl L_AES_GCM_init_vaes_calc_iv_1_aesenc_avx_last + vaesenc xmm5, xmm5, xmm8 + vaesenc xmm5, xmm5, [rdi+208] + vmovdqa xmm8, OWORD PTR [rdi+224] +L_AES_GCM_init_vaes_calc_iv_1_aesenc_avx_last: + vaesenclast xmm5, xmm5, xmm8 + vpshufb xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_bswap_mask + ; Calc counter + ; Initialization vector + ; rcx is the IV byte offset; mov does not change the flags, so the je below still tests cmp edx, 0 + cmp edx, 0 + mov rcx, 0 + je L_AES_GCM_init_vaes_calc_iv_done + cmp edx, 16 + jl L_AES_GCM_init_vaes_calc_iv_lt16 + ; 4294967280 is 0xFFFFFFF0: round the length down to whole 16-byte blocks + and edx, 4294967280 +L_AES_GCM_init_vaes_calc_iv_16_loop: + vmovdqu xmm7, OWORD PTR [r10+rcx] + vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask + vpxor xmm4, xmm4, xmm7 + ; ghash_gfmul_avx + vpshufd xmm1, xmm4, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpxor xmm1, xmm1, xmm4 + vpxor
xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm6, xmm0 + vmovdqa xmm4, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm6, xmm6, xmm2 + vpxor xmm4, xmm4, xmm1 + vpsrld xmm0, xmm6, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm6, xmm6, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm6, xmm6, xmm0 + vpor xmm4, xmm4, xmm1 + vpslld xmm0, xmm6, 31 + vpslld xmm1, xmm6, 30 + vpslld xmm2, xmm6, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm6, xmm6, xmm0 + vpsrld xmm2, xmm6, 1 + vpsrld xmm3, xmm6, 2 + vpsrld xmm0, xmm6, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm6 + vpxor xmm4, xmm4, xmm2 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_init_vaes_calc_iv_16_loop + ; Reload the full IV length and finish if no partial block remains + mov edx, r11d + cmp ecx, edx + je L_AES_GCM_init_vaes_calc_iv_done +L_AES_GCM_init_vaes_calc_iv_lt16: + ; Gather the remaining IV bytes into a zeroed 16-byte stack buffer so the last block is zero padded + sub rsp, 16 + vpxor xmm7, xmm7, xmm7 + xor r13d, r13d + vmovdqu OWORD PTR [rsp], xmm7 +L_AES_GCM_init_vaes_calc_iv_loop: + movzx r12d, BYTE PTR [r10+rcx] + mov BYTE PTR [rsp+r13], r12b + inc ecx + inc r13d + cmp ecx, edx + jl L_AES_GCM_init_vaes_calc_iv_loop + vmovdqu xmm7, OWORD PTR [rsp] + add rsp, 16 + vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask + vpxor xmm4, xmm4, xmm7 + ; ghash_gfmul_avx + vpshufd xmm1, xmm4, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpxor xmm1, xmm1, xmm4 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm6, xmm0 + vmovdqa xmm4, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm6, xmm6, xmm2 + vpxor xmm4, xmm4, xmm1 + vpsrld xmm0, xmm6, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm6, xmm6, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0,
xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm6, xmm6, xmm0 + vpor xmm4, xmm4, xmm1 + vpslld xmm0, xmm6, 31 + vpslld xmm1, xmm6, 30 + vpslld xmm2, xmm6, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm6, xmm6, xmm0 + vpsrld xmm2, xmm6, 1 + vpsrld xmm3, xmm6, 2 + vpsrld xmm0, xmm6, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm6 + vpxor xmm4, xmm4, xmm2 +L_AES_GCM_init_vaes_calc_iv_done: + ; T = Encrypt counter + ; Before encrypting, hash in the IV length in bits (edx shifted left by 3) via the low qword + vpxor xmm0, xmm0, xmm0 + shl edx, 3 + vmovq xmm0, rdx + vpxor xmm4, xmm4, xmm0 + ; ghash_gfmul_avx + vpshufd xmm1, xmm4, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpxor xmm1, xmm1, xmm4 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm6, xmm0 + vmovdqa xmm4, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm6, xmm6, xmm2 + vpxor xmm4, xmm4, xmm1 + vpsrld xmm0, xmm6, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm6, xmm6, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm6, xmm6, xmm0 + vpor xmm4, xmm4, xmm1 + vpslld xmm0, xmm6, 31 + vpslld xmm1, xmm6, 30 + vpslld xmm2, xmm6, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm6, xmm6, xmm0 + vpsrld xmm2, xmm6, 1 + vpsrld xmm3, xmm6, 2 + vpsrld xmm0, xmm6, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm6 + vpxor xmm4, xmm4, xmm2 + vpshufb xmm4, xmm4, OWORD PTR L_vaes_aes_gcm_bswap_mask + ; Encrypt counter + vmovdqa xmm7, OWORD PTR [rdi] + vpxor xmm7, xmm7, xmm4 + vaesenc xmm7, xmm7, [rdi+16] + vaesenc xmm7, xmm7, [rdi+32] + vaesenc xmm7, xmm7, [rdi+48] + vaesenc xmm7, xmm7, [rdi+64] + vaesenc xmm7, xmm7,
[rdi+80] + vaesenc xmm7, xmm7, [rdi+96] + vaesenc xmm7, xmm7, [rdi+112] + vaesenc xmm7, xmm7, [rdi+128] + vaesenc xmm7, xmm7, [rdi+144] + cmp esi, 11 + vmovdqa xmm8, OWORD PTR [rdi+160] + jl L_AES_GCM_init_vaes_calc_iv_2_aesenc_avx_last + vaesenc xmm7, xmm7, xmm8 + vaesenc xmm7, xmm7, [rdi+176] + cmp esi, 13 + vmovdqa xmm8, OWORD PTR [rdi+192] + jl L_AES_GCM_init_vaes_calc_iv_2_aesenc_avx_last + vaesenc xmm7, xmm7, xmm8 + vaesenc xmm7, xmm7, [rdi+208] + vmovdqa xmm8, OWORD PTR [rdi+224] +L_AES_GCM_init_vaes_calc_iv_2_aesenc_avx_last: + vaesenclast xmm7, xmm7, xmm8 + vmovdqu xmm15, xmm7 +L_AES_GCM_init_vaes_iv_done: + ; Outputs: [r9] gets the encrypted counter, [r8] the shuffled counter plus L_vaes_aes_gcm_one, [rax] the shuffled H + vmovdqa OWORD PTR [r9], xmm15 + vpshufb xmm4, xmm4, OWORD PTR L_vaes_aes_gcm_bswap_epi64 + vpaddd xmm4, xmm4, OWORD PTR L_vaes_aes_gcm_one + vmovdqa OWORD PTR [rax], xmm5 + vmovdqa OWORD PTR [r8], xmm4 + ; Restore the saved XMM registers, then the pushed general registers + vmovdqu xmm6, OWORD PTR [rsp+16] + vmovdqu xmm7, OWORD PTR [rsp+32] + vmovdqu xmm8, OWORD PTR [rsp+48] + vmovdqu xmm15, OWORD PTR [rsp+64] + add rsp, 80 + pop r13 + pop r12 + pop rsi + pop rdi + ret +AES_GCM_init_vaes ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +; AES_GCM_aad_update_vaes: fold 16-byte blocks of data into a GHASH state. +; Arguments: rcx = data, edx = length in bytes, r8 = 16-byte hash state (read, then written back), r9 = hash key. +; The length is tested only after a block is processed, so at least one block is always consumed. +AES_GCM_aad_update_vaes PROC + mov rax, rcx + sub rsp, 32 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + ; xmm5 = running hash state, xmm6 = hash key, rcx = byte offset into the data + vmovdqa xmm5, OWORD PTR [r8] + vmovdqa xmm6, OWORD PTR [r9] + xor ecx, ecx +L_AES_GCM_aad_update_vaes_16_loop: + vmovdqu xmm7, OWORD PTR [rax+rcx] + vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm7 + ; ghash_gfmul_avx + vpshufd xmm1, xmm5, 78 + vpshufd xmm2, xmm6, 78 + vpclmulqdq xmm3, xmm6, xmm5, 17 + vpclmulqdq xmm0, xmm6, xmm5, 0 + vpxor xmm1, xmm1, xmm5 + vpxor xmm2, xmm2, xmm6 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm4, xmm0 + vmovdqa xmm5, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm5, xmm5, xmm1 + vpsrld xmm0, xmm4, 31 + vpsrld xmm1, xmm5, 31 + vpslld xmm4, xmm4, 1 + vpslld xmm5, xmm5, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0,
xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm5, xmm5, xmm2 + vpor xmm4, xmm4, xmm0 + vpor xmm5, xmm5, xmm1 + vpslld xmm0, xmm4, 31 + vpslld xmm1, xmm4, 30 + vpslld xmm2, xmm4, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm4, xmm4, xmm0 + vpsrld xmm2, xmm4, 1 + vpsrld xmm3, xmm4, 2 + vpsrld xmm0, xmm4, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm4 + vpxor xmm5, xmm5, xmm2 + ; Advance to the next 16-byte block; the length in edx is only compared after a block has been hashed + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_aad_update_vaes_16_loop + ; Write the updated hash state back through r8 and restore xmm6 and xmm7 + vmovdqa OWORD PTR [r8], xmm5 + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + add rsp, 32 + ret +AES_GCM_aad_update_vaes ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +; AES_GCM_encrypt_block_vaes: encrypt one 16-byte block in counter mode. +; Arguments: rcx = round keys, edx = number of rounds, r8 = output block, r9 = input block, fifth argument at rsp+40 = pointer to the counter. +; The stored counter is advanced by L_vaes_aes_gcm_one; the value read before the update is the one encrypted. +AES_GCM_encrypt_block_vaes PROC + mov r10, r8 + mov r11, r9 + mov rax, QWORD PTR [rsp+40] + ; xmm0 = shuffled copy of the current counter to encrypt, xmm1 = counter plus one, stored back + vmovdqu xmm1, OWORD PTR [rax] + vpshufb xmm0, xmm1, OWORD PTR L_vaes_aes_gcm_bswap_epi64 + vpaddd xmm1, xmm1, OWORD PTR L_vaes_aes_gcm_one + vmovdqu OWORD PTR [rax], xmm1 + vpxor xmm0, xmm0, [rcx] + vaesenc xmm0, xmm0, [rcx+16] + vaesenc xmm0, xmm0, [rcx+32] + vaesenc xmm0, xmm0, [rcx+48] + vaesenc xmm0, xmm0, [rcx+64] + vaesenc xmm0, xmm0, [rcx+80] + vaesenc xmm0, xmm0, [rcx+96] + vaesenc xmm0, xmm0, [rcx+112] + vaesenc xmm0, xmm0, [rcx+128] + vaesenc xmm0, xmm0, [rcx+144] + ; Fewer than 11 rounds: the key at offset 160 is the last round key; fewer than 13: offset 192; otherwise 224 + cmp edx, 11 + vmovdqa xmm1, OWORD PTR [rcx+160] + jl L_AES_GCM_encrypt_block_vaes_aesenc_block_last + vaesenc xmm0, xmm0, xmm1 + vaesenc xmm0, xmm0, [rcx+176] + cmp edx, 13 + vmovdqa xmm1, OWORD PTR [rcx+192] + jl L_AES_GCM_encrypt_block_vaes_aesenc_block_last + vaesenc xmm0, xmm0, xmm1 + vaesenc xmm0, xmm0, [rcx+208] + vmovdqa xmm1, OWORD PTR [rcx+224] +L_AES_GCM_encrypt_block_vaes_aesenc_block_last: + vaesenclast xmm0, xmm0, xmm1 + ; XOR the encrypted counter with the input block and store the result + vmovdqu xmm1, OWORD PTR [r11] + vpxor xmm0, xmm0, xmm1 + vmovdqu OWORD PTR [r10], xmm0 + ; NOTE(review): the shuffled result below stays in xmm0 and is not stored anywhere in this procedure + vpshufb xmm0, xmm0, OWORD PTR L_vaes_aes_gcm_bswap_mask + vzeroupper + ret +AES_GCM_encrypt_block_vaes ENDP +_TEXT ENDS +_TEXT
SEGMENT READONLY PARA +; AES_GCM_ghash_block_vaes: fold one 16-byte block into a GHASH state. +; Arguments: rcx = data block, rdx = 16-byte hash state (updated in place), r8 = hash key. +; The state and key are accessed with vmovdqa, so they must be 16-byte aligned; the data block may be unaligned. +AES_GCM_ghash_block_vaes PROC + sub rsp, 32 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqa xmm4, OWORD PTR [rdx] + vmovdqa xmm5, OWORD PTR [r8] + vmovdqu xmm7, OWORD PTR [rcx] + vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask + vpxor xmm4, xmm4, xmm7 + ; ghash_gfmul_avx + ; Three carry-less multiplies: high halves (17), low halves (0) and the XOR of the two halves as the middle term + vpshufd xmm1, xmm4, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpxor xmm1, xmm1, xmm4 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm6, xmm0 + vmovdqa xmm4, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm6, xmm6, xmm2 + vpxor xmm4, xmm4, xmm1 + ; Shift the 256-bit product (xmm4 high, xmm6 low) left by one bit, carrying between dwords + vpsrld xmm0, xmm6, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm6, xmm6, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm6, xmm6, xmm0 + vpor xmm4, xmm4, xmm1 + ; Reduction step: fold the low half into the high half using shifts by 31, 30, 25 and then 1, 2, 7 + vpslld xmm0, xmm6, 31 + vpslld xmm1, xmm6, 30 + vpslld xmm2, xmm6, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm6, xmm6, xmm0 + vpsrld xmm2, xmm6, 1 + vpsrld xmm3, xmm6, 2 + vpsrld xmm0, xmm6, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm6 + vpxor xmm4, xmm4, xmm2 + ; Store the new hash state, then restore the saved xmm6 and xmm7 + vmovdqa OWORD PTR [rdx], xmm4 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + add rsp, 32 + ret +AES_GCM_ghash_block_vaes ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +AES_GCM_encrypt_update_vaes PROC + push r13 + push r12 + push r14 + push r15 + push rdi + push rsi + push rbx + mov rax, rcx + mov r10, r8 + mov r8d, edx + mov r11, r9 + mov r9d, DWORD PTR [rsp+96] + mov r12, QWORD PTR [rsp+104] + mov r14, QWORD PTR [rsp+112] + mov r15, QWORD PTR [rsp+120] + sub rsp, 688 + vmovdqu OWORD PTR [rsp+528], xmm6 + vmovdqu OWORD PTR [rsp+544], xmm7 + vmovdqu OWORD PTR [rsp+560], xmm8 +
vmovdqu OWORD PTR [rsp+576], xmm9 + vmovdqu OWORD PTR [rsp+592], xmm10 + vmovdqu OWORD PTR [rsp+608], xmm11 + vmovdqu OWORD PTR [rsp+624], xmm12 + vmovdqu OWORD PTR [rsp+640], xmm13 + vmovdqu OWORD PTR [rsp+656], xmm14 + vmovdqu OWORD PTR [rsp+672], xmm15 + vmovdqa xmm15, OWORD PTR [r12] + vmovdqa xmm6, OWORD PTR [r14] + vpsrlq xmm8, xmm6, 63 + vpsllq xmm7, xmm6, 1 + vpslldq xmm8, xmm8, 8 + vpor xmm7, xmm7, xmm8 + vpshufd xmm6, xmm6, 255 + vpsrad xmm6, xmm6, 31 + vpand xmm6, xmm6, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpxor xmm6, xmm6, xmm7 + xor edi, edi + cmp r9d, 128 + jl L_AES_GCM_encrypt_update_vaes_done_128 + vmovdqa xmm2, xmm15 + ; H ^ 1 + vmovdqu OWORD PTR [rsp], xmm6 + ; H ^ 2 + vpclmulqdq xmm7, xmm6, xmm6, 0 + vpclmulqdq xmm10, xmm6, xmm6, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm0, xmm10 + vmovdqu OWORD PTR [rsp+16], xmm0 + ; H ^ 3 + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm0, xmm6, 0 + vpclmulqdq xmm8, xmm0, xmm6, 1 + vpclmulqdq xmm9, xmm0, xmm6, 16 + vpclmulqdq xmm10, xmm0, xmm6, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm1, xmm10 + vmovdqu OWORD PTR [rsp+32], xmm1 + ; H ^ 4 + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor 
xmm10, xmm10, xmm8 + vmovdqa xmm3, xmm10 + vmovdqu OWORD PTR [rsp+48], xmm3 + ; H ^ 5 + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+64], xmm4 + ; H ^ 6 + vpclmulqdq xmm7, xmm1, xmm1, 0 + vpclmulqdq xmm10, xmm1, xmm1, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+80], xmm4 + ; H ^ 7 + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm3, xmm1, 0 + vpclmulqdq xmm8, xmm3, xmm1, 1 + vpclmulqdq xmm9, xmm3, xmm1, 16 + vpclmulqdq xmm10, xmm3, xmm1, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+96], xmm4 + ; H ^ 8 + vpclmulqdq xmm7, xmm3, xmm3, 0 + vpclmulqdq xmm10, xmm3, xmm3, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+112], xmm4 + 
cmp r9d, 256 + jl L_AES_GCM_encrypt_update_vaes_no_ext + ; H ^ 9 + vmovdqu xmm0, OWORD PTR [rsp+48] + vmovdqu xmm1, OWORD PTR [rsp+64] + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+128], xmm4 + ; H ^ 10 + vmovdqu xmm0, OWORD PTR [rsp+64] + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+144], xmm4 + ; H ^ 11 + vmovdqu xmm0, OWORD PTR [rsp+64] + vmovdqu xmm1, OWORD PTR [rsp+80] + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+160], xmm4 + ; H ^ 12 + vmovdqu xmm0, OWORD PTR [rsp+80] + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + 
vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+176], xmm4 + ; H ^ 13 + vmovdqu xmm0, OWORD PTR [rsp+80] + vmovdqu xmm1, OWORD PTR [rsp+96] + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+192], xmm4 + ; H ^ 14 + vmovdqu xmm0, OWORD PTR [rsp+96] + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+208], xmm4 + ; H ^ 15 + vmovdqu xmm0, OWORD PTR [rsp+96] + vmovdqu xmm1, OWORD PTR [rsp+112] + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+224], xmm4 + ; H ^ 16 + vmovdqu xmm0, OWORD PTR [rsp+112] + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, 
xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+240], xmm4 + vmovdqu ymm7, YMMWORD PTR [rsp+224] + vpermq ymm7, ymm7, 78 + vmovdqu ymm8, YMMWORD PTR [rsp+192] + vpermq ymm8, ymm8, 78 + vmovdqu ymm9, YMMWORD PTR [rsp+160] + vpermq ymm9, ymm9, 78 + vmovdqu ymm10, YMMWORD PTR [rsp+128] + vpermq ymm10, ymm10, 78 + vmovdqu YMMWORD PTR [rsp+256], ymm7 + vmovdqu YMMWORD PTR [rsp+288], ymm8 + vmovdqu YMMWORD PTR [rsp+320], ymm9 + vmovdqu YMMWORD PTR [rsp+352], ymm10 + vmovdqu ymm7, YMMWORD PTR [rsp+96] + vpermq ymm7, ymm7, 78 + vmovdqu ymm8, YMMWORD PTR [rsp+64] + vpermq ymm8, ymm8, 78 + vmovdqu ymm9, YMMWORD PTR [rsp+32] + vpermq ymm9, ymm9, 78 + vmovdqu ymm10, YMMWORD PTR [rsp] + vpermq ymm10, ymm10, 78 + vmovdqu YMMWORD PTR [rsp+384], ymm7 + vmovdqu YMMWORD PTR [rsp+416], ymm8 + vmovdqu YMMWORD PTR [rsp+448], ymm9 + vmovdqu YMMWORD PTR [rsp+480], ymm10 +L_AES_GCM_encrypt_update_vaes_no_ext: + vbroadcasti128 ymm14, ptr_L_vaes_aes_gcm_mod2_128 + cmp r9d, 256 + jl L_AES_GCM_encrypt_update_vaes_after_256 + mov r13d, r9d + and r13d, 4294967040 +L_AES_GCM_encrypt_update_vaes_loop_256: + ; 256 bytes of input + lea rsi, QWORD PTR [r10+rdi] + vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64 + vbroadcasti128 ymm4, [r15] + vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0 + vpshufb ymm0, ymm0, ymm6 + vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1 + vpshufb ymm1, ymm1, ymm6 + vpaddd ymm2, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y2 + vpshufb ymm2, ymm2, ymm6 + vpaddd ymm3, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y3 + vpshufb ymm3, ymm3, ymm6 + vmovdqu xmm7, OWORD PTR [r15] + vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight + vmovdqu OWORD PTR [r15], xmm7 + vbroadcasti128 ymm4, [rax] + vpxor ymm0, ymm0, 
ymm4 + vpxor ymm1, ymm1, ymm4 + vpxor ymm2, ymm2, ymm4 + vpxor ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+16] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+32] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+48] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+64] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+80] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+96] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+112] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+128] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+144] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r8d, 11 + vbroadcasti128 ymm4, [rax+160] + jl L_AES_GCM_encrypt_update_vaes_p1_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+176] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r8d, 13 + vbroadcasti128 ymm4, [rax+192] + jl L_AES_GCM_encrypt_update_vaes_p1_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+208] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc 
ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+224] +L_AES_GCM_encrypt_update_vaes_p1_vaes_ctr8_last: + vaesenclast ymm0, ymm0, ymm4 + vaesenclast ymm1, ymm1, ymm4 + vaesenclast ymm2, ymm2, ymm4 + vaesenclast ymm3, ymm3, ymm4 + lea rcx, QWORD PTR [r11+rdi] + lea rdx, QWORD PTR [r10+rdi] + vmovdqu ymm5, YMMWORD PTR [rcx] + vpxor ymm0, ymm0, ymm5 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu ymm5, YMMWORD PTR [rcx+32] + vpxor ymm1, ymm1, ymm5 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vmovdqu ymm5, YMMWORD PTR [rcx+64] + vpxor ymm2, ymm2, ymm5 + vmovdqu YMMWORD PTR [rdx+64], ymm2 + vmovdqu ymm5, YMMWORD PTR [rcx+96] + vpxor ymm3, ymm3, ymm5 + vmovdqu YMMWORD PTR [rdx+96], ymm3 + add edi, 128 + vbroadcasti128 ymm4, [r15] + vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0 + vpshufb ymm0, ymm0, ymm6 + vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1 + vpshufb ymm1, ymm1, ymm6 + vpaddd ymm2, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y2 + vpshufb ymm2, ymm2, ymm6 + vpaddd ymm3, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y3 + vpshufb ymm3, ymm3, ymm6 + vmovdqu xmm7, OWORD PTR [r15] + vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight + vmovdqu OWORD PTR [r15], xmm7 + vbroadcasti128 ymm4, [rax] + vpxor ymm0, ymm0, ymm4 + vpxor ymm1, ymm1, ymm4 + vpxor ymm2, ymm2, ymm4 + vpxor ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+16] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+32] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+48] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+64] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+80] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + 
vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+96] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+112] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+128] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+144] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r8d, 11 + vbroadcasti128 ymm4, [rax+160] + jl L_AES_GCM_encrypt_update_vaes_p2_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+176] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r8d, 13 + vbroadcasti128 ymm4, [rax+192] + jl L_AES_GCM_encrypt_update_vaes_p2_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+208] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+224] +L_AES_GCM_encrypt_update_vaes_p2_vaes_ctr8_last: + vaesenclast ymm0, ymm0, ymm4 + vaesenclast ymm1, ymm1, ymm4 + vaesenclast ymm2, ymm2, ymm4 + vaesenclast ymm3, ymm3, ymm4 + lea rcx, QWORD PTR [r11+rdi] + lea rdx, QWORD PTR [r10+rdi] + vmovdqu ymm5, YMMWORD PTR [rcx] + vpxor ymm0, ymm0, ymm5 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu ymm5, YMMWORD PTR [rcx+32] + vpxor ymm1, ymm1, ymm5 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vmovdqu ymm5, YMMWORD PTR [rcx+64] + vpxor ymm2, ymm2, ymm5 + vmovdqu YMMWORD PTR [rdx+64], ymm2 + vmovdqu ymm5, YMMWORD PTR [rcx+96] + vpxor ymm3, ymm3, ymm5 + vmovdqu YMMWORD PTR [rdx+96], ymm3 + add edi, 128 + vbroadcasti128 ymm6, 
ptr_L_vaes_aes_gcm_bswap_mask + vpxor ymm4, ymm4, ymm4 + vinserti128 ymm4, ymm4, xmm15, 0 + vmovdqu ymm7, YMMWORD PTR [rsp+256] + vmovdqu ymm8, YMMWORD PTR [rsp+288] + vmovdqu ymm9, YMMWORD PTR [rsp+320] + vmovdqu ymm10, YMMWORD PTR [rsp+352] + vmovdqu ymm5, YMMWORD PTR [rsi] + vpshufb ymm5, ymm5, ymm6 + vpxor ymm5, ymm5, ymm4 + vpclmulqdq ymm0, ymm5, ymm7, 0 + vpclmulqdq ymm1, ymm5, ymm7, 1 + vpclmulqdq ymm2, ymm5, ymm7, 16 + vpclmulqdq ymm3, ymm5, ymm7, 17 + vmovdqa ymm11, ymm0 + vpxor ymm12, ymm2, ymm1 + vmovdqa ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rsi+32] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm8, 0 + vpclmulqdq ymm1, ymm5, ymm8, 1 + vpclmulqdq ymm2, ymm5, ymm8, 16 + vpclmulqdq ymm3, ymm5, ymm8, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rsi+64] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm9, 0 + vpclmulqdq ymm1, ymm5, ymm9, 1 + vpclmulqdq ymm2, ymm5, ymm9, 16 + vpclmulqdq ymm3, ymm5, ymm9, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rsi+96] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm10, 0 + vpclmulqdq ymm1, ymm5, ymm10, 1 + vpclmulqdq ymm2, ymm5, ymm10, 16 + vpclmulqdq ymm3, ymm5, ymm10, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm7, YMMWORD PTR [rsp+384] + vmovdqu ymm8, YMMWORD PTR [rsp+416] + vmovdqu ymm9, YMMWORD PTR [rsp+448] + vmovdqu ymm10, YMMWORD PTR [rsp+480] + vmovdqu ymm5, YMMWORD PTR [rsi+128] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm7, 0 + vpclmulqdq ymm1, ymm5, ymm7, 1 + vpclmulqdq ymm2, ymm5, ymm7, 16 + vpclmulqdq ymm3, ymm5, ymm7, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rsi+160] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, 
ymm5, ymm8, 0 + vpclmulqdq ymm1, ymm5, ymm8, 1 + vpclmulqdq ymm2, ymm5, ymm8, 16 + vpclmulqdq ymm3, ymm5, ymm8, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rsi+192] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm9, 0 + vpclmulqdq ymm1, ymm5, ymm9, 1 + vpclmulqdq ymm2, ymm5, ymm9, 16 + vpclmulqdq ymm3, ymm5, ymm9, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rsi+224] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm10, 0 + vpclmulqdq ymm1, ymm5, ymm10, 1 + vpclmulqdq ymm2, ymm5, ymm10, 16 + vpclmulqdq ymm3, ymm5, ymm10, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vpclmulqdq ymm5, ymm14, ymm11, 1 + vpshufd ymm11, ymm11, 78 + vpxor ymm12, ymm12, ymm5 + vpxor ymm12, ymm12, ymm11 + vpclmulqdq ymm5, ymm14, ymm12, 1 + vpshufd ymm12, ymm12, 78 + vpxor ymm13, ymm13, ymm5 + vpxor ymm13, ymm13, ymm12 + vextracti128 xmm0, ymm13, 1 + vpxor xmm15, xmm13, xmm0 + cmp edi, r13d + jl L_AES_GCM_encrypt_update_vaes_loop_256 +L_AES_GCM_encrypt_update_vaes_after_256: + mov r13d, r9d + and r13d, 4294967168 + cmp edi, r13d + jge L_AES_GCM_encrypt_update_vaes_after_128 + ; 128 bytes of input + lea rsi, QWORD PTR [r10+rdi] + vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64 + vbroadcasti128 ymm4, [r15] + vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0 + vpshufb ymm0, ymm0, ymm6 + vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1 + vpshufb ymm1, ymm1, ymm6 + vpaddd ymm2, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y2 + vpshufb ymm2, ymm2, ymm6 + vpaddd ymm3, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y3 + vpshufb ymm3, ymm3, ymm6 + vmovdqu xmm7, OWORD PTR [r15] + vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight + vmovdqu OWORD PTR [r15], xmm7 + vbroadcasti128 ymm4, [rax] + vpxor ymm0, ymm0, ymm4 + vpxor ymm1, ymm1, ymm4 + 
vpxor ymm2, ymm2, ymm4 + vpxor ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+16] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+32] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+48] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+64] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+80] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+96] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+112] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+128] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+144] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r8d, 11 + vbroadcasti128 ymm4, [rax+160] + jl L_AES_GCM_encrypt_update_vaes_8_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+176] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r8d, 13 + vbroadcasti128 ymm4, [rax+192] + jl L_AES_GCM_encrypt_update_vaes_8_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+208] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, 
ymm3, ymm4 + vbroadcasti128 ymm4, [rax+224] +L_AES_GCM_encrypt_update_vaes_8_vaes_ctr8_last: + vaesenclast ymm0, ymm0, ymm4 + vaesenclast ymm1, ymm1, ymm4 + vaesenclast ymm2, ymm2, ymm4 + vaesenclast ymm3, ymm3, ymm4 + lea rcx, QWORD PTR [r11+rdi] + lea rdx, QWORD PTR [r10+rdi] + vmovdqu ymm5, YMMWORD PTR [rcx] + vpxor ymm0, ymm0, ymm5 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu ymm5, YMMWORD PTR [rcx+32] + vpxor ymm1, ymm1, ymm5 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vmovdqu ymm5, YMMWORD PTR [rcx+64] + vpxor ymm2, ymm2, ymm5 + vmovdqu YMMWORD PTR [rdx+64], ymm2 + vmovdqu ymm5, YMMWORD PTR [rcx+96] + vpxor ymm3, ymm3, ymm5 + vmovdqu YMMWORD PTR [rdx+96], ymm3 + add edi, 128 + vmovdqu ymm7, YMMWORD PTR [rsp+96] + vpermq ymm7, ymm7, 78 + vmovdqu ymm8, YMMWORD PTR [rsp+64] + vpermq ymm8, ymm8, 78 + vmovdqu ymm9, YMMWORD PTR [rsp+32] + vpermq ymm9, ymm9, 78 + vmovdqu ymm10, YMMWORD PTR [rsp] + vpermq ymm10, ymm10, 78 + vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_mask + vpxor ymm4, ymm4, ymm4 + vinserti128 ymm4, ymm4, xmm15, 0 + vmovdqu ymm5, YMMWORD PTR [rsi] + vpshufb ymm5, ymm5, ymm6 + vpxor ymm5, ymm5, ymm4 + vpclmulqdq ymm0, ymm5, ymm7, 0 + vpclmulqdq ymm1, ymm5, ymm7, 1 + vpclmulqdq ymm2, ymm5, ymm7, 16 + vpclmulqdq ymm3, ymm5, ymm7, 17 + vmovdqa ymm11, ymm0 + vpxor ymm12, ymm2, ymm1 + vmovdqa ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rsi+32] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm8, 0 + vpclmulqdq ymm1, ymm5, ymm8, 1 + vpclmulqdq ymm2, ymm5, ymm8, 16 + vpclmulqdq ymm3, ymm5, ymm8, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rsi+64] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm9, 0 + vpclmulqdq ymm1, ymm5, ymm9, 1 + vpclmulqdq ymm2, ymm5, ymm9, 16 + vpclmulqdq ymm3, ymm5, ymm9, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rsi+96] + vpshufb ymm5, 
ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm10, 0 + vpclmulqdq ymm1, ymm5, ymm10, 1 + vpclmulqdq ymm2, ymm5, ymm10, 16 + vpclmulqdq ymm3, ymm5, ymm10, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vpclmulqdq ymm5, ymm14, ymm11, 1 + vpshufd ymm11, ymm11, 78 + vpxor ymm12, ymm12, ymm5 + vpxor ymm12, ymm12, ymm11 + vpclmulqdq ymm5, ymm14, ymm12, 1 + vpshufd ymm12, ymm12, 78 + vpxor ymm13, ymm13, ymm5 + vpxor ymm13, ymm13, ymm12 + vextracti128 xmm0, ymm13, 1 + vpxor xmm15, xmm13, xmm0 +L_AES_GCM_encrypt_update_vaes_after_128: + vmovdqu xmm6, OWORD PTR [rsp] +L_AES_GCM_encrypt_update_vaes_done_128: + mov edx, r9d + cmp edi, edx + jge L_AES_GCM_encrypt_update_vaes_done_enc + mov r13d, r9d + and r13d, 4294967280 + cmp edi, r13d + jge L_AES_GCM_encrypt_update_vaes_last_block_done + vmovdqu xmm8, OWORD PTR [r15] + vpshufb xmm7, xmm8, OWORD PTR L_vaes_aes_gcm_bswap_epi64 + vpaddd xmm8, xmm8, OWORD PTR L_vaes_aes_gcm_one + vmovdqu OWORD PTR [r15], xmm8 + vpxor xmm7, xmm7, [rax] + vaesenc xmm7, xmm7, [rax+16] + vaesenc xmm7, xmm7, [rax+32] + vaesenc xmm7, xmm7, [rax+48] + vaesenc xmm7, xmm7, [rax+64] + vaesenc xmm7, xmm7, [rax+80] + vaesenc xmm7, xmm7, [rax+96] + vaesenc xmm7, xmm7, [rax+112] + vaesenc xmm7, xmm7, [rax+128] + vaesenc xmm7, xmm7, [rax+144] + cmp r8d, 11 + vmovdqa xmm8, OWORD PTR [rax+160] + jl L_AES_GCM_encrypt_update_vaes_aesenc_block_last + vaesenc xmm7, xmm7, xmm8 + vaesenc xmm7, xmm7, [rax+176] + cmp r8d, 13 + vmovdqa xmm8, OWORD PTR [rax+192] + jl L_AES_GCM_encrypt_update_vaes_aesenc_block_last + vaesenc xmm7, xmm7, xmm8 + vaesenc xmm7, xmm7, [rax+208] + vmovdqa xmm8, OWORD PTR [rax+224] +L_AES_GCM_encrypt_update_vaes_aesenc_block_last: + vaesenclast xmm7, xmm7, xmm8 + vmovdqu xmm8, OWORD PTR [r11+rdi] + vpxor xmm7, xmm7, xmm8 + vmovdqu OWORD PTR [r10+rdi], xmm7 + vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask + vpxor xmm15, xmm15, xmm7 + add edi, 16 + cmp edi, r13d + jge 
L_AES_GCM_encrypt_update_vaes_last_block_ghash +L_AES_GCM_encrypt_update_vaes_last_block_start: + vmovdqu xmm12, OWORD PTR [r11+rdi] + vmovdqu xmm8, OWORD PTR [r15] + vpshufb xmm7, xmm8, OWORD PTR L_vaes_aes_gcm_bswap_epi64 + vpaddd xmm8, xmm8, OWORD PTR L_vaes_aes_gcm_one + vmovdqu OWORD PTR [r15], xmm8 + vpxor xmm7, xmm7, [rax] + vpclmulqdq xmm9, xmm15, xmm6, 16 + vaesenc xmm7, xmm7, [rax+16] + vaesenc xmm7, xmm7, [rax+32] + vpclmulqdq xmm10, xmm15, xmm6, 1 + vaesenc xmm7, xmm7, [rax+48] + vaesenc xmm7, xmm7, [rax+64] + vpclmulqdq xmm11, xmm15, xmm6, 0 + vaesenc xmm7, xmm7, [rax+80] + vpclmulqdq xmm1, xmm15, xmm6, 17 + vaesenc xmm7, xmm7, [rax+96] + vpxor xmm9, xmm9, xmm10 + vpslldq xmm2, xmm9, 8 + vpsrldq xmm9, xmm9, 8 + vaesenc xmm7, xmm7, [rax+112] + vpxor xmm2, xmm2, xmm11 + vpxor xmm3, xmm1, xmm9 + vmovdqa xmm0, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm10, xmm2, xmm0, 16 + vaesenc xmm7, xmm7, [rax+128] + vpshufd xmm9, xmm2, 78 + vpxor xmm9, xmm9, xmm10 + vpclmulqdq xmm10, xmm9, xmm0, 16 + vaesenc xmm7, xmm7, [rax+144] + vpshufd xmm9, xmm9, 78 + vpxor xmm9, xmm9, xmm10 + vpxor xmm15, xmm9, xmm3 + cmp r8d, 11 + vmovdqa xmm8, OWORD PTR [rax+160] + jl L_AES_GCM_encrypt_update_vaes_aesenc_gfmul_last + vaesenc xmm7, xmm7, xmm8 + vaesenc xmm7, xmm7, [rax+176] + cmp r8d, 13 + vmovdqa xmm8, OWORD PTR [rax+192] + jl L_AES_GCM_encrypt_update_vaes_aesenc_gfmul_last + vaesenc xmm7, xmm7, xmm8 + vaesenc xmm7, xmm7, [rax+208] + vmovdqa xmm8, OWORD PTR [rax+224] +L_AES_GCM_encrypt_update_vaes_aesenc_gfmul_last: + vaesenclast xmm7, xmm7, xmm8 + vmovdqa xmm0, xmm12 + vpxor xmm7, xmm7, xmm0 + vmovdqu OWORD PTR [r10+rdi], xmm7 + vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask + add edi, 16 + vpxor xmm15, xmm15, xmm7 + cmp edi, r13d + jl L_AES_GCM_encrypt_update_vaes_last_block_start +L_AES_GCM_encrypt_update_vaes_last_block_ghash: + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm15, xmm6, 0 + vpclmulqdq xmm8, xmm15, xmm6, 1 + vpclmulqdq xmm9, xmm15, xmm6, 16 + 
vpclmulqdq xmm10, xmm15, xmm6, 17
+        vpxor xmm8, xmm8, xmm9
+        vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+        vpclmulqdq xmm11, xmm9, xmm7, 1
+        vpshufd xmm7, xmm7, 78
+        vpxor xmm8, xmm8, xmm11
+        vpxor xmm8, xmm8, xmm7
+        vpclmulqdq xmm11, xmm9, xmm8, 1
+        vpshufd xmm8, xmm8, 78
+        vpxor xmm10, xmm10, xmm11
+        vpxor xmm10, xmm10, xmm8
+        vmovdqa xmm15, xmm10
+L_AES_GCM_encrypt_update_vaes_last_block_done:
+L_AES_GCM_encrypt_update_vaes_done_enc:
+        vmovdqa OWORD PTR [r12], xmm15
+        vzeroupper
+        vmovdqu xmm6, OWORD PTR [rsp+528]
+        vmovdqu xmm7, OWORD PTR [rsp+544]
+        vmovdqu xmm8, OWORD PTR [rsp+560]
+        vmovdqu xmm9, OWORD PTR [rsp+576]
+        vmovdqu xmm10, OWORD PTR [rsp+592]
+        vmovdqu xmm11, OWORD PTR [rsp+608]
+        vmovdqu xmm12, OWORD PTR [rsp+624]
+        vmovdqu xmm13, OWORD PTR [rsp+640]
+        vmovdqu xmm14, OWORD PTR [rsp+656]
+        vmovdqu xmm15, OWORD PTR [rsp+672]
+        add rsp, 688
+        pop rbx
+        pop rsi
+        pop rdi
+        pop r15
+        pop r14
+        pop r12
+        pop r13
+        ret
+AES_GCM_encrypt_update_vaes ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_encrypt_final_vaes PROC ; Final step: XOR two bit-lengths into the 16-byte state at [rcx], multiply it by the (shifted) block at [6th arg], byte-shuffle, XOR with the block at [7th arg], and write r8d bytes of the result to [rdx].
+        push r13 ; r13, r12, r14 are used as scratch below and popped again before ret
+        push r12
+        push r14
+        mov rax, rcx ; rax = 1st argument: pointer to the 16-byte running state (NOTE(review): presumably the GHASH accumulator - confirm against the C prototype)
+        mov r10d, r9d ; r10d = 4th argument: a 32-bit byte count (converted to bits below)
+        mov r9, rdx ; r9 = 2nd argument: pointer the result bytes are stored to
+        mov r11d, DWORD PTR [rsp+64] ; 5th argument, a second 32-bit byte count; NOTE(review): offset 64 = 24 (three pushes) + 8 (return address) + 32, which matches the Microsoft x64 shadow-space layout - confirm
+        mov r12, QWORD PTR [rsp+72] ; 6th argument: pointer to the 16-byte multiplier block (NOTE(review): presumably the hash key H - confirm)
+        mov r14, QWORD PTR [rsp+80] ; 7th argument: pointer to the 16-byte block XORed into the result at the end (NOTE(review): presumably the encrypted initial counter - confirm)
+        sub rsp, 144 ; 16 bytes of scratch at [rsp] plus 8 x 16 bytes at [rsp+16..rsp+143] for the xmm6-xmm13 saves
+        vmovdqu OWORD PTR [rsp+16], xmm6 ; save xmm6-xmm13; they are reloaded from the same slots before ret
+        vmovdqu OWORD PTR [rsp+32], xmm7
+        vmovdqu OWORD PTR [rsp+48], xmm8
+        vmovdqu OWORD PTR [rsp+64], xmm9
+        vmovdqu OWORD PTR [rsp+80], xmm10
+        vmovdqu OWORD PTR [rsp+96], xmm11
+        vmovdqu OWORD PTR [rsp+112], xmm12
+        vmovdqu OWORD PTR [rsp+128], xmm13
+        vmovdqa xmm4, OWORD PTR [rax] ; xmm4 = running state; vmovdqa is an aligned load, so the 1st, 6th and 7th argument pointers must be 16-byte aligned
+        vmovdqa xmm5, OWORD PTR [r12] ; xmm5 = multiplier block
+        vmovdqa xmm6, OWORD PTR [r14] ; xmm6 = block XORed in after the multiply
+        vpsrlq xmm8, xmm5, 63 ; next four instructions: 128-bit left shift of xmm5 by one bit - isolate the top bit of each qword ...
+        vpsllq xmm7, xmm5, 1 ; ... shift each qword left by one ...
+        vpslldq xmm8, xmm8, 8 ; ... move the low qword's carry up into the high qword (the high qword's carry is dropped here) ...
+        vpor xmm7, xmm7, xmm8 ; ... xmm7 = xmm5 << 1 across 128 bits
+        vpshufd xmm5, xmm5, 255 ; broadcast the top dword of the original value to all four lanes
+        vpsrad xmm5, xmm5, 31 ; arithmetic shift: all-ones if bit 127 was set, else zero
+        vpand xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_mod2_128 ; keep the L_vaes_aes_gcm_mod2_128 constant only when bit 127 was set
+        vpxor xmm5, xmm5, xmm7 ; xmm5 = shifted multiplier, conditionally XORed with the constant
+        mov edx, r10d ; 32-bit moves zero-extend into rdx/rcx, so the 64-bit shifts below cannot overflow
+        mov ecx, r11d
+        shl rdx, 3 ; 4th argument * 8 (byte count -> bit count)
+        shl rcx, 3 ; 5th argument * 8 (byte count -> bit count)
+        vmovq xmm0, rdx
+        vmovq xmm1, rcx
+        vpunpcklqdq xmm0, xmm0, xmm1 ; xmm0 = { low qword: 4th-argument bits, high qword: 5th-argument bits }
+        vpxor xmm4, xmm4, xmm0 ; fold the two lengths into the state before the last multiply
+        ; ghash_gfmul_red_avx
+        vpclmulqdq xmm7, xmm4, xmm5, 0 ; carry-less product of the low qwords
+        vpclmulqdq xmm8, xmm4, xmm5, 1 ; cross product (one high qword, one low qword)
+        vpclmulqdq xmm9, xmm4, xmm5, 16 ; the other cross product
+        vpclmulqdq xmm10, xmm4, xmm5, 17 ; carry-less product of the high qwords
+        vpxor xmm8, xmm8, xmm9 ; xmm8 = sum of the two cross products
+        vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 ; constant used by both folding steps
+        vpclmulqdq xmm11, xmm9, xmm7, 1 ; first fold: multiply the constant by part of the low product ...
+        vpshufd xmm7, xmm7, 78 ; ... swap the 64-bit halves of the low product ...
+        vpxor xmm8, xmm8, xmm11 ; ... and accumulate both into the middle term
+        vpxor xmm8, xmm8, xmm7
+        vpclmulqdq xmm11, xmm9, xmm8, 1 ; second fold: same pattern, middle term into the high product
+        vpshufd xmm8, xmm8, 78
+        vpxor xmm10, xmm10, xmm11
+        vpxor xmm10, xmm10, xmm8
+        vmovdqa xmm4, xmm10 ; xmm4 = 128-bit result of the multiply-and-fold sequence
+        vpshufb xmm4, xmm4, OWORD PTR L_vaes_aes_gcm_bswap_mask ; reorder bytes with the L_vaes_aes_gcm_bswap_mask shuffle constant
+        vpxor xmm0, xmm4, xmm6 ; xmm0 = final 16-byte result (XOR with the 7th-argument block)
+        cmp r8d, 16 ; r8d = 3rd argument: number of result bytes to write
+        je L_AES_GCM_encrypt_final_vaes_store_tag_16 ; exactly 16 bytes: single unaligned store
+        xor rcx, rcx ; rcx = byte index for the partial copy
+        vmovdqu OWORD PTR [rsp], xmm0 ; spill the result to the 16-byte scratch slot so it can be read bytewise
+L_AES_GCM_encrypt_final_vaes_store_tag_loop:
+        movzx r13d, BYTE PTR [rsp+rcx]
+        mov BYTE PTR [r9+rcx], r13b
+        inc ecx
+        cmp ecx, r8d ; tested after the copy, so at least one byte is always written: r8d == 0 is not handled, and r8d > 16 would read past the scratch slot (NOTE(review): callers are presumably expected to pass 1..16 - confirm)
+        jne L_AES_GCM_encrypt_final_vaes_store_tag_loop
+        jmp L_AES_GCM_encrypt_final_vaes_store_tag_done
+L_AES_GCM_encrypt_final_vaes_store_tag_16:
+        vmovdqu OWORD PTR [r9], xmm0 ; unaligned store: the output pointer needs no particular alignment
+L_AES_GCM_encrypt_final_vaes_store_tag_done:
+        vzeroupper ; clear the upper halves of the ymm registers before returning
+        vmovdqu xmm6, OWORD PTR [rsp+16] ; restore xmm6-xmm13 from the slots written in the prologue
+        vmovdqu xmm7, OWORD PTR [rsp+32]
+        vmovdqu xmm8, OWORD PTR [rsp+48]
+        vmovdqu xmm9, OWORD PTR [rsp+64]
+        vmovdqu xmm10, OWORD PTR [rsp+80]
+        vmovdqu xmm11, OWORD PTR [rsp+96]
+        vmovdqu xmm12, OWORD PTR [rsp+112]
+        vmovdqu xmm13, OWORD PTR [rsp+128]
+        add rsp, 144 ; release the scratch + save area
+        pop r14 ; pops mirror the pushes at entry, in reverse order
+        pop r12
+        pop r13
+        ret
+AES_GCM_encrypt_final_vaes ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_decrypt_update_vaes PROC
+        push r13
+        push r12
+        push r14
+        push r15
+        push rdi
+        push rsi
+        push rbx
+        mov rax, rcx
+        mov r10, r8
+        mov r8d, edx
+        mov r11, r9
+        mov r9d, DWORD PTR [rsp+96]
+        mov r12, QWORD PTR [rsp+104]
+        mov r14, QWORD PTR [rsp+112]
+        mov r15, QWORD PTR [rsp+120]
+        sub rsp, 688
+        vmovdqu OWORD PTR [rsp+528], xmm6
+        vmovdqu OWORD PTR [rsp+544], xmm7
+        vmovdqu OWORD PTR [rsp+560], xmm8
+        vmovdqu OWORD PTR [rsp+576], xmm9
+        vmovdqu OWORD PTR [rsp+592], xmm10
+        vmovdqu OWORD PTR [rsp+608], xmm11
+        vmovdqu OWORD PTR
[rsp+624], xmm12 + vmovdqu OWORD PTR [rsp+640], xmm13 + vmovdqu OWORD PTR [rsp+656], xmm14 + vmovdqu OWORD PTR [rsp+672], xmm15 + vmovdqa xmm15, OWORD PTR [r12] + vmovdqa xmm6, OWORD PTR [r14] + vpsrlq xmm8, xmm6, 63 + vpsllq xmm7, xmm6, 1 + vpslldq xmm8, xmm8, 8 + vpor xmm7, xmm7, xmm8 + vpshufd xmm6, xmm6, 255 + vpsrad xmm6, xmm6, 31 + vpand xmm6, xmm6, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpxor xmm6, xmm6, xmm7 + xor edi, edi + cmp r9d, 128 + jl L_AES_GCM_decrypt_update_vaes_done_128 + vmovdqa xmm2, xmm15 + ; H ^ 1 + vmovdqu OWORD PTR [rsp], xmm6 + ; H ^ 2 + vpclmulqdq xmm7, xmm6, xmm6, 0 + vpclmulqdq xmm10, xmm6, xmm6, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm0, xmm10 + vmovdqu OWORD PTR [rsp+16], xmm0 + ; H ^ 3 + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm0, xmm6, 0 + vpclmulqdq xmm8, xmm0, xmm6, 1 + vpclmulqdq xmm9, xmm0, xmm6, 16 + vpclmulqdq xmm10, xmm0, xmm6, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm1, xmm10 + vmovdqu OWORD PTR [rsp+32], xmm1 + ; H ^ 4 + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm3, xmm10 + vmovdqu OWORD PTR [rsp+48], xmm3 + ; H ^ 5 + ; ghash_gfmul_red_avx + vpclmulqdq 
xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+64], xmm4 + ; H ^ 6 + vpclmulqdq xmm7, xmm1, xmm1, 0 + vpclmulqdq xmm10, xmm1, xmm1, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+80], xmm4 + ; H ^ 7 + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm3, xmm1, 0 + vpclmulqdq xmm8, xmm3, xmm1, 1 + vpclmulqdq xmm9, xmm3, xmm1, 16 + vpclmulqdq xmm10, xmm3, xmm1, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+96], xmm4 + ; H ^ 8 + vpclmulqdq xmm7, xmm3, xmm3, 0 + vpclmulqdq xmm10, xmm3, xmm3, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+112], xmm4 + cmp r9d, 256 + jl L_AES_GCM_decrypt_update_vaes_no_ext + ; H ^ 9 + vmovdqu xmm0, OWORD PTR [rsp+48] + vmovdqu xmm1, OWORD PTR 
[rsp+64] + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+128], xmm4 + ; H ^ 10 + vmovdqu xmm0, OWORD PTR [rsp+64] + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+144], xmm4 + ; H ^ 11 + vmovdqu xmm0, OWORD PTR [rsp+64] + vmovdqu xmm1, OWORD PTR [rsp+80] + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+160], xmm4 + ; H ^ 12 + vmovdqu xmm0, OWORD PTR [rsp+80] + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, 
xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+176], xmm4 + ; H ^ 13 + vmovdqu xmm0, OWORD PTR [rsp+80] + vmovdqu xmm1, OWORD PTR [rsp+96] + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+192], xmm4 + ; H ^ 14 + vmovdqu xmm0, OWORD PTR [rsp+96] + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+208], xmm4 + ; H ^ 15 + vmovdqu xmm0, OWORD PTR [rsp+96] + vmovdqu xmm1, OWORD PTR [rsp+112] + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+224], xmm4 + ; H ^ 16 + vmovdqu xmm0, OWORD PTR [rsp+112] + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, 
xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+240], xmm4 + vmovdqu ymm7, YMMWORD PTR [rsp+224] + vpermq ymm7, ymm7, 78 + vmovdqu ymm8, YMMWORD PTR [rsp+192] + vpermq ymm8, ymm8, 78 + vmovdqu ymm9, YMMWORD PTR [rsp+160] + vpermq ymm9, ymm9, 78 + vmovdqu ymm10, YMMWORD PTR [rsp+128] + vpermq ymm10, ymm10, 78 + vmovdqu YMMWORD PTR [rsp+256], ymm7 + vmovdqu YMMWORD PTR [rsp+288], ymm8 + vmovdqu YMMWORD PTR [rsp+320], ymm9 + vmovdqu YMMWORD PTR [rsp+352], ymm10 + vmovdqu ymm7, YMMWORD PTR [rsp+96] + vpermq ymm7, ymm7, 78 + vmovdqu ymm8, YMMWORD PTR [rsp+64] + vpermq ymm8, ymm8, 78 + vmovdqu ymm9, YMMWORD PTR [rsp+32] + vpermq ymm9, ymm9, 78 + vmovdqu ymm10, YMMWORD PTR [rsp] + vpermq ymm10, ymm10, 78 + vmovdqu YMMWORD PTR [rsp+384], ymm7 + vmovdqu YMMWORD PTR [rsp+416], ymm8 + vmovdqu YMMWORD PTR [rsp+448], ymm9 + vmovdqu YMMWORD PTR [rsp+480], ymm10 +L_AES_GCM_decrypt_update_vaes_no_ext: + vbroadcasti128 ymm14, ptr_L_vaes_aes_gcm_mod2_128 + cmp r9d, 256 + jl L_AES_GCM_decrypt_update_vaes_after_256 + mov r13d, r9d + and r13d, 4294967040 +L_AES_GCM_decrypt_update_vaes_loop_256: + ; 256 bytes of input + lea rbx, QWORD PTR [r11+rdi] + vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_mask + vpxor ymm4, ymm4, ymm4 + vinserti128 ymm4, ymm4, xmm15, 0 + vmovdqu ymm7, YMMWORD PTR [rsp+256] + vmovdqu ymm8, YMMWORD PTR [rsp+288] + vmovdqu ymm9, YMMWORD PTR [rsp+320] + vmovdqu ymm10, YMMWORD PTR [rsp+352] + vmovdqu ymm5, YMMWORD PTR [rbx] + vpshufb ymm5, ymm5, ymm6 + vpxor ymm5, ymm5, ymm4 + vpclmulqdq ymm0, ymm5, ymm7, 0 + vpclmulqdq ymm1, ymm5, ymm7, 1 + vpclmulqdq ymm2, ymm5, ymm7, 16 + vpclmulqdq ymm3, ymm5, ymm7, 17 + vmovdqa ymm11, ymm0 + vpxor ymm12, ymm2, ymm1 + vmovdqa ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rbx+32] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm8, 0 + vpclmulqdq ymm1, ymm5, ymm8, 1 + vpclmulqdq ymm2, 
ymm5, ymm8, 16 + vpclmulqdq ymm3, ymm5, ymm8, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rbx+64] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm9, 0 + vpclmulqdq ymm1, ymm5, ymm9, 1 + vpclmulqdq ymm2, ymm5, ymm9, 16 + vpclmulqdq ymm3, ymm5, ymm9, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rbx+96] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm10, 0 + vpclmulqdq ymm1, ymm5, ymm10, 1 + vpclmulqdq ymm2, ymm5, ymm10, 16 + vpclmulqdq ymm3, ymm5, ymm10, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm7, YMMWORD PTR [rsp+384] + vmovdqu ymm8, YMMWORD PTR [rsp+416] + vmovdqu ymm9, YMMWORD PTR [rsp+448] + vmovdqu ymm10, YMMWORD PTR [rsp+480] + vmovdqu ymm5, YMMWORD PTR [rbx+128] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm7, 0 + vpclmulqdq ymm1, ymm5, ymm7, 1 + vpclmulqdq ymm2, ymm5, ymm7, 16 + vpclmulqdq ymm3, ymm5, ymm7, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rbx+160] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm8, 0 + vpclmulqdq ymm1, ymm5, ymm8, 1 + vpclmulqdq ymm2, ymm5, ymm8, 16 + vpclmulqdq ymm3, ymm5, ymm8, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rbx+192] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm9, 0 + vpclmulqdq ymm1, ymm5, ymm9, 1 + vpclmulqdq ymm2, ymm5, ymm9, 16 + vpclmulqdq ymm3, ymm5, ymm9, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rbx+224] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm10, 0 + vpclmulqdq ymm1, ymm5, ymm10, 1 + vpclmulqdq ymm2, 
ymm5, ymm10, 16 + vpclmulqdq ymm3, ymm5, ymm10, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vpclmulqdq ymm5, ymm14, ymm11, 1 + vpshufd ymm11, ymm11, 78 + vpxor ymm12, ymm12, ymm5 + vpxor ymm12, ymm12, ymm11 + vpclmulqdq ymm5, ymm14, ymm12, 1 + vpshufd ymm12, ymm12, 78 + vpxor ymm13, ymm13, ymm5 + vpxor ymm13, ymm13, ymm12 + vextracti128 xmm0, ymm13, 1 + vpxor xmm15, xmm13, xmm0 + vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64 + vbroadcasti128 ymm4, [r15] + vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0 + vpshufb ymm0, ymm0, ymm6 + vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1 + vpshufb ymm1, ymm1, ymm6 + vpaddd ymm2, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y2 + vpshufb ymm2, ymm2, ymm6 + vpaddd ymm3, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y3 + vpshufb ymm3, ymm3, ymm6 + vmovdqu xmm7, OWORD PTR [r15] + vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight + vmovdqu OWORD PTR [r15], xmm7 + vbroadcasti128 ymm4, [rax] + vpxor ymm0, ymm0, ymm4 + vpxor ymm1, ymm1, ymm4 + vpxor ymm2, ymm2, ymm4 + vpxor ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+16] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+32] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+48] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+64] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+80] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+96] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+112] + vaesenc ymm0, 
ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+128] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+144] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r8d, 11 + vbroadcasti128 ymm4, [rax+160] + jl L_AES_GCM_decrypt_update_vaes_l1_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+176] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r8d, 13 + vbroadcasti128 ymm4, [rax+192] + jl L_AES_GCM_decrypt_update_vaes_l1_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+208] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+224] +L_AES_GCM_decrypt_update_vaes_l1_vaes_ctr8_last: + vaesenclast ymm0, ymm0, ymm4 + vaesenclast ymm1, ymm1, ymm4 + vaesenclast ymm2, ymm2, ymm4 + vaesenclast ymm3, ymm3, ymm4 + lea rcx, QWORD PTR [r11+rdi] + lea rdx, QWORD PTR [r10+rdi] + vmovdqu ymm5, YMMWORD PTR [rcx] + vpxor ymm0, ymm0, ymm5 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu ymm5, YMMWORD PTR [rcx+32] + vpxor ymm1, ymm1, ymm5 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vmovdqu ymm5, YMMWORD PTR [rcx+64] + vpxor ymm2, ymm2, ymm5 + vmovdqu YMMWORD PTR [rdx+64], ymm2 + vmovdqu ymm5, YMMWORD PTR [rcx+96] + vpxor ymm3, ymm3, ymm5 + vmovdqu YMMWORD PTR [rdx+96], ymm3 + add edi, 128 + vbroadcasti128 ymm4, [r15] + vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0 + vpshufb ymm0, ymm0, ymm6 + vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1 + vpshufb ymm1, ymm1, ymm6 + vpaddd ymm2, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y2 + 
vpshufb ymm2, ymm2, ymm6 + vpaddd ymm3, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y3 + vpshufb ymm3, ymm3, ymm6 + vmovdqu xmm7, OWORD PTR [r15] + vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight + vmovdqu OWORD PTR [r15], xmm7 + vbroadcasti128 ymm4, [rax] + vpxor ymm0, ymm0, ymm4 + vpxor ymm1, ymm1, ymm4 + vpxor ymm2, ymm2, ymm4 + vpxor ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+16] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+32] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+48] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+64] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+80] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+96] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+112] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+128] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+144] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r8d, 11 + vbroadcasti128 ymm4, [rax+160] + jl L_AES_GCM_decrypt_update_vaes_l2_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+176] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r8d, 13 + vbroadcasti128 
ymm4, [rax+192] + jl L_AES_GCM_decrypt_update_vaes_l2_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+208] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+224] +L_AES_GCM_decrypt_update_vaes_l2_vaes_ctr8_last: + vaesenclast ymm0, ymm0, ymm4 + vaesenclast ymm1, ymm1, ymm4 + vaesenclast ymm2, ymm2, ymm4 + vaesenclast ymm3, ymm3, ymm4 + lea rcx, QWORD PTR [r11+rdi] + lea rdx, QWORD PTR [r10+rdi] + vmovdqu ymm5, YMMWORD PTR [rcx] + vpxor ymm0, ymm0, ymm5 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu ymm5, YMMWORD PTR [rcx+32] + vpxor ymm1, ymm1, ymm5 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vmovdqu ymm5, YMMWORD PTR [rcx+64] + vpxor ymm2, ymm2, ymm5 + vmovdqu YMMWORD PTR [rdx+64], ymm2 + vmovdqu ymm5, YMMWORD PTR [rcx+96] + vpxor ymm3, ymm3, ymm5 + vmovdqu YMMWORD PTR [rdx+96], ymm3 + add edi, 128 + cmp edi, r13d + jl L_AES_GCM_decrypt_update_vaes_loop_256 +L_AES_GCM_decrypt_update_vaes_after_256: + vmovdqu ymm7, YMMWORD PTR [rsp+96] + vpermq ymm7, ymm7, 78 + vmovdqu ymm8, YMMWORD PTR [rsp+64] + vpermq ymm8, ymm8, 78 + vmovdqu ymm9, YMMWORD PTR [rsp+32] + vpermq ymm9, ymm9, 78 + vmovdqu ymm10, YMMWORD PTR [rsp] + vpermq ymm10, ymm10, 78 + mov r13d, r9d + and r13d, 4294967168 + cmp edi, r13d + jge L_AES_GCM_decrypt_update_vaes_after_128 + ; 128 bytes of input + lea rbx, QWORD PTR [r11+rdi] + vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_mask + vpxor ymm4, ymm4, ymm4 + vinserti128 ymm4, ymm4, xmm15, 0 + vmovdqu ymm5, YMMWORD PTR [rbx] + vpshufb ymm5, ymm5, ymm6 + vpxor ymm5, ymm5, ymm4 + vpclmulqdq ymm0, ymm5, ymm7, 0 + vpclmulqdq ymm1, ymm5, ymm7, 1 + vpclmulqdq ymm2, ymm5, ymm7, 16 + vpclmulqdq ymm3, ymm5, ymm7, 17 + vmovdqa ymm11, ymm0 + vpxor ymm12, ymm2, ymm1 + vmovdqa ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rbx+32] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm8, 0 + 
vpclmulqdq ymm1, ymm5, ymm8, 1 + vpclmulqdq ymm2, ymm5, ymm8, 16 + vpclmulqdq ymm3, ymm5, ymm8, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rbx+64] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm9, 0 + vpclmulqdq ymm1, ymm5, ymm9, 1 + vpclmulqdq ymm2, ymm5, ymm9, 16 + vpclmulqdq ymm3, ymm5, ymm9, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rbx+96] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm10, 0 + vpclmulqdq ymm1, ymm5, ymm10, 1 + vpclmulqdq ymm2, ymm5, ymm10, 16 + vpclmulqdq ymm3, ymm5, ymm10, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vpclmulqdq ymm5, ymm14, ymm11, 1 + vpshufd ymm11, ymm11, 78 + vpxor ymm12, ymm12, ymm5 + vpxor ymm12, ymm12, ymm11 + vpclmulqdq ymm5, ymm14, ymm12, 1 + vpshufd ymm12, ymm12, 78 + vpxor ymm13, ymm13, ymm5 + vpxor ymm13, ymm13, ymm12 + vextracti128 xmm0, ymm13, 1 + vpxor xmm15, xmm13, xmm0 + vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64 + vbroadcasti128 ymm4, [r15] + vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0 + vpshufb ymm0, ymm0, ymm6 + vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1 + vpshufb ymm1, ymm1, ymm6 + vpaddd ymm2, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y2 + vpshufb ymm2, ymm2, ymm6 + vpaddd ymm3, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y3 + vpshufb ymm3, ymm3, ymm6 + vmovdqu xmm7, OWORD PTR [r15] + vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight + vmovdqu OWORD PTR [r15], xmm7 + vbroadcasti128 ymm4, [rax] + vpxor ymm0, ymm0, ymm4 + vpxor ymm1, ymm1, ymm4 + vpxor ymm2, ymm2, ymm4 + vpxor ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+16] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+32] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + 
vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+48] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+64] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+80] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+96] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+112] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+128] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+144] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r8d, 11 + vbroadcasti128 ymm4, [rax+160] + jl L_AES_GCM_decrypt_update_vaes_t_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+176] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r8d, 13 + vbroadcasti128 ymm4, [rax+192] + jl L_AES_GCM_decrypt_update_vaes_t_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+208] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+224] +L_AES_GCM_decrypt_update_vaes_t_vaes_ctr8_last: + vaesenclast ymm0, ymm0, ymm4 + vaesenclast ymm1, ymm1, ymm4 + vaesenclast ymm2, ymm2, ymm4 + vaesenclast ymm3, ymm3, ymm4 + lea rcx, QWORD PTR [r11+rdi] + lea rdx, QWORD PTR [r10+rdi] 
+ vmovdqu ymm5, YMMWORD PTR [rcx] + vpxor ymm0, ymm0, ymm5 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu ymm5, YMMWORD PTR [rcx+32] + vpxor ymm1, ymm1, ymm5 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vmovdqu ymm5, YMMWORD PTR [rcx+64] + vpxor ymm2, ymm2, ymm5 + vmovdqu YMMWORD PTR [rdx+64], ymm2 + vmovdqu ymm5, YMMWORD PTR [rcx+96] + vpxor ymm3, ymm3, ymm5 + vmovdqu YMMWORD PTR [rdx+96], ymm3 + add edi, 128 +L_AES_GCM_decrypt_update_vaes_after_128: + vmovdqu xmm6, OWORD PTR [rsp] +L_AES_GCM_decrypt_update_vaes_done_128: + mov edx, r9d + cmp edi, edx + jge L_AES_GCM_decrypt_update_vaes_done_dec + mov r13d, r9d + and r13d, 4294967280 + cmp edi, r13d + jge L_AES_GCM_decrypt_update_vaes_last_block_done +L_AES_GCM_decrypt_update_vaes_last_block_start: + vmovdqu xmm12, OWORD PTR [r11+rdi] + vmovdqa xmm0, xmm6 + vpshufb xmm1, xmm12, OWORD PTR L_vaes_aes_gcm_bswap_mask + vpxor xmm1, xmm1, xmm15 + vmovdqu xmm8, OWORD PTR [r15] + vpshufb xmm7, xmm8, OWORD PTR L_vaes_aes_gcm_bswap_epi64 + vpaddd xmm8, xmm8, OWORD PTR L_vaes_aes_gcm_one + vmovdqu OWORD PTR [r15], xmm8 + vpxor xmm7, xmm7, [rax] + vpclmulqdq xmm9, xmm1, xmm0, 16 + vaesenc xmm7, xmm7, [rax+16] + vaesenc xmm7, xmm7, [rax+32] + vpclmulqdq xmm10, xmm1, xmm0, 1 + vaesenc xmm7, xmm7, [rax+48] + vaesenc xmm7, xmm7, [rax+64] + vpclmulqdq xmm11, xmm1, xmm0, 0 + vaesenc xmm7, xmm7, [rax+80] + vpclmulqdq xmm1, xmm1, xmm0, 17 + vaesenc xmm7, xmm7, [rax+96] + vpxor xmm9, xmm9, xmm10 + vpslldq xmm2, xmm9, 8 + vpsrldq xmm9, xmm9, 8 + vaesenc xmm7, xmm7, [rax+112] + vpxor xmm2, xmm2, xmm11 + vpxor xmm3, xmm1, xmm9 + vmovdqa xmm0, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm10, xmm2, xmm0, 16 + vaesenc xmm7, xmm7, [rax+128] + vpshufd xmm9, xmm2, 78 + vpxor xmm9, xmm9, xmm10 + vpclmulqdq xmm10, xmm9, xmm0, 16 + vaesenc xmm7, xmm7, [rax+144] + vpshufd xmm9, xmm9, 78 + vpxor xmm9, xmm9, xmm10 + vpxor xmm15, xmm9, xmm3 + cmp r8d, 11 + vmovdqa xmm8, OWORD PTR [rax+160] + jl L_AES_GCM_decrypt_update_vaes_aesenc_gfmul_last + vaesenc 
xmm7, xmm7, xmm8
+        vaesenc xmm7, xmm7, [rax+176]
+        cmp r8d, 13
+        vmovdqa xmm8, OWORD PTR [rax+192]
+        jl L_AES_GCM_decrypt_update_vaes_aesenc_gfmul_last
+        vaesenc xmm7, xmm7, xmm8
+        vaesenc xmm7, xmm7, [rax+208]
+        vmovdqa xmm8, OWORD PTR [rax+224]
+L_AES_GCM_decrypt_update_vaes_aesenc_gfmul_last:
+        vaesenclast xmm7, xmm7, xmm8
+        vmovdqa xmm0, xmm12
+        vpxor xmm7, xmm7, xmm0 ; XOR the encrypted counter block with the input block held in xmm12
+        vmovdqu OWORD PTR [r10+rdi], xmm7 ; store the 16 output bytes at offset rdi
+        add edi, 16
+        cmp edi, r13d
+        jl L_AES_GCM_decrypt_update_vaes_last_block_start
+L_AES_GCM_decrypt_update_vaes_last_block_done:
+L_AES_GCM_decrypt_update_vaes_done_dec:
+        vmovdqa OWORD PTR [r12], xmm15 ; write xmm15 back through r12 (aligned store)
+        vzeroupper
+        vmovdqu xmm6, OWORD PTR [rsp+528] ; reload xmm6-xmm15 from the stack spill area
+        vmovdqu xmm7, OWORD PTR [rsp+544]
+        vmovdqu xmm8, OWORD PTR [rsp+560]
+        vmovdqu xmm9, OWORD PTR [rsp+576]
+        vmovdqu xmm10, OWORD PTR [rsp+592]
+        vmovdqu xmm11, OWORD PTR [rsp+608]
+        vmovdqu xmm12, OWORD PTR [rsp+624]
+        vmovdqu xmm13, OWORD PTR [rsp+640]
+        vmovdqu xmm14, OWORD PTR [rsp+656]
+        vmovdqu xmm15, OWORD PTR [rsp+672]
+        add rsp, 688
+        pop rbx
+        pop rsi
+        pop rdi
+        pop r15
+        pop r14
+        pop r12
+        pop r13
+        ret
+AES_GCM_decrypt_update_vaes ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_decrypt_final_vaes PROC ; Fold the bit lengths into the 16-byte state, multiply by the key value, XOR with a final block, compare with the caller's tag and store 1 (match) or 0 through arg8
+        push r13 ; five GPRs are saved here and popped in reverse order before ret
+        push r12
+        push r14
+        push rbp
+        push r15
+        mov rax, rcx ; arg1: pointer to the 16-byte block loaded into xmm6 below (presumably the running GHASH state - confirm with caller)
+        mov r10d, r9d ; arg4: 32-bit byte count; becomes the low qword of the length block
+        mov r9, rdx ; arg2: pointer to the tag bytes to compare against
+        mov r11d, DWORD PTR [rsp+80] ; arg5: 32-bit byte count; becomes the high qword of the length block (80 = 5 pushes + return address + 32-byte shadow space)
+        mov r12, QWORD PTR [rsp+88] ; arg6: pointer to the 16-byte multiplier loaded into xmm5 (presumably the hash key H - confirm with caller)
+        mov r14, QWORD PTR [rsp+96] ; arg7: pointer to the 16-byte block XORed into the result (presumably the encrypted initial counter - confirm with caller)
+        mov rbp, QWORD PTR [rsp+104] ; arg8: pointer to the DWORD that receives the comparison result
+        sub rsp, 160 ; spill area: xmm6-xmm13 and xmm15 at offsets 16..144 (xmm14 is not touched by this routine)
+        vmovdqu OWORD PTR [rsp+16], xmm6
+        vmovdqu OWORD PTR [rsp+32], xmm7
+        vmovdqu OWORD PTR [rsp+48], xmm8
+        vmovdqu OWORD PTR [rsp+64], xmm9
+        vmovdqu OWORD PTR [rsp+80], xmm10
+        vmovdqu OWORD PTR [rsp+96], xmm11
+        vmovdqu OWORD PTR [rsp+112], xmm12
+        vmovdqu OWORD PTR [rsp+128], xmm13
+        vmovdqu OWORD PTR [rsp+144], xmm15
+        vmovdqa xmm6, OWORD PTR [rax] ; aligned loads: arg1, arg6 and arg7 must point to 16-byte aligned memory
+        vmovdqa xmm5, OWORD PTR [r12]
+        vmovdqa xmm15, OWORD PTR [r14]
+        vpsrlq xmm8, xmm5, 63 ; top bit of each qword
+        vpsllq xmm7, xmm5, 1 ; each qword shifted left by one
+        vpslldq xmm8, xmm8, 8 ; move the low qword's carry into bit 0 of the high qword
+        vpor xmm7, xmm7, xmm8 ; xmm7 = xmm5 << 1 as a 128-bit value
+        vpshufd xmm5, xmm5, 255 ; broadcast the top dword to all four lanes
+        vpsrad xmm5, xmm5, 31 ; all-ones when bit 127 was set, otherwise zero
+        vpand xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_mod2_128 ; keep the mod2_128 constant only when bit 127 was set
+        vpxor xmm5, xmm5, xmm7 ; xmm5 = (xmm5 << 1) XOR the conditionally selected constant
+        mov edx, r10d
+        mov ecx, r11d
+        shl rdx, 3 ; byte counts -> bit counts
+        shl rcx, 3
+        vmovq xmm0, rdx
+        vmovq xmm1, rcx
+        vpunpcklqdq xmm0, xmm0, xmm1 ; xmm0 = (low qword: arg4*8, high qword: arg5*8)
+        vpxor xmm6, xmm6, xmm0 ; fold the length block into the state
+        ; ghash_gfmul_red_avx
+        vpclmulqdq xmm7, xmm6, xmm5, 0 ; low qword x low qword
+        vpclmulqdq xmm8, xmm6, xmm5, 1 ; cross product
+        vpclmulqdq xmm9, xmm6, xmm5, 16 ; cross product
+        vpclmulqdq xmm10, xmm6, xmm5, 17 ; high qword x high qword
+        vpxor xmm8, xmm8, xmm9 ; xmm8 = sum of the two cross products
+        vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+        vpclmulqdq xmm11, xmm9, xmm7, 1 ; first fold: constant x low qword of the low product
+        vpshufd xmm7, xmm7, 78 ; swap the qwords of the low product
+        vpxor xmm8, xmm8, xmm11
+        vpxor xmm8, xmm8, xmm7
+        vpclmulqdq xmm11, xmm9, xmm8, 1 ; second fold, same pattern applied to the middle term
+        vpshufd xmm8, xmm8, 78
+        vpxor xmm10, xmm10, xmm11
+        vpxor xmm10, xmm10, xmm8 ; xmm10 = reduced 128-bit product
+        vmovdqa xmm6, xmm10
+        vpshufb xmm6, xmm6, OWORD PTR L_vaes_aes_gcm_bswap_mask ; byte shuffle with the bswap_mask constant (defined outside this view)
+        vpxor xmm0, xmm6, xmm15 ; xmm0 = computed tag = shuffled product XOR the arg7 block
+        cmp r8d, 16 ; arg3: number of tag bytes to compare
+        je L_AES_GCM_decrypt_final_vaes_cmp_tag_16
+        sub rsp, 16 ; temporary 16-byte buffer so the computed tag can be read byte by byte
+        xor rcx, rcx ; byte index
+        xor r15, r15 ; accumulates the OR of all byte differences
+        vmovdqu OWORD PTR [rsp], xmm0
+L_AES_GCM_decrypt_final_vaes_cmp_tag_loop:
+        movzx r13d, BYTE PTR [rsp+rcx]
+        xor r13b, BYTE PTR [r9+rcx] ; difference between computed and supplied tag byte
+        or r15b, r13b ; no early exit: every one of the r8d bytes is always examined
+        inc ecx
+        cmp ecx, r8d
+        jne L_AES_GCM_decrypt_final_vaes_cmp_tag_loop ; NOTE(review): exits only when ecx == r8d, so r8d == 0 would not stop after zero bytes - confirm callers never pass 0
+        cmp r15b, 0
+        sete r15b ; r15 = 1 when no byte differed, else 0 (upper bits were cleared above)
+        add rsp, 16
+        xor rcx, rcx
+        jmp L_AES_GCM_decrypt_final_vaes_cmp_tag_done
+L_AES_GCM_decrypt_final_vaes_cmp_tag_16:
+        vmovdqu xmm1, OWORD PTR [r9] ; full 16-byte tag: compare all bytes at once
+        vpcmpeqb xmm0, xmm0, xmm1 ; 0xFF in every byte lane that matches
+        vpmovmskb rdx, xmm0 ; one mask bit per byte lane
+        ; %%edx == 0xFFFF then return 1 else => return 0
+        xor r15d, r15d
+        cmp edx, 65535
+        sete r15b
+L_AES_GCM_decrypt_final_vaes_cmp_tag_done:
+        mov DWORD PTR [rbp], r15d ; store the 1/0 result through arg8
+        vzeroupper
+        vmovdqu xmm6, OWORD PTR [rsp+16] ; reload the xmm registers saved in the prologue
+        vmovdqu xmm7, OWORD PTR [rsp+32]
+        vmovdqu xmm8, OWORD PTR [rsp+48]
+        vmovdqu xmm9, OWORD PTR [rsp+64]
+        vmovdqu xmm10, OWORD PTR [rsp+80]
+        vmovdqu xmm11, OWORD PTR [rsp+96]
+        vmovdqu xmm12, OWORD PTR [rsp+112]
+        vmovdqu xmm13, OWORD PTR [rsp+128]
+        vmovdqu xmm15, OWORD PTR [rsp+144]
+        add rsp, 160
+        pop r15
+        pop rbp
+        pop r14
+        pop r12
+        pop r13
+        ret
+AES_GCM_decrypt_final_vaes ENDP
+_TEXT ENDS
+ENDIF
+IFDEF
HAVE_INTEL_AVX512
+_DATA SEGMENT
+ALIGN 16 ; inc_z0: four 128-bit lanes, each (low qword 0, high qword k) for k = 0..3
+L_avx512_aes_gcm_inc_z0 QWORD \
+        0000000000000000h, 0000000000000000h,
+        0000000000000000h, 0000000000000001h,
+        0000000000000000h, 0000000000000002h,
+        0000000000000000h, 0000000000000003h
+ptr_L_avx512_aes_gcm_inc_z0 QWORD L_avx512_aes_gcm_inc_z0
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16 ; inc_z1: same layout, k = 4..7
+L_avx512_aes_gcm_inc_z1 QWORD \
+        0000000000000000h, 0000000000000004h,
+        0000000000000000h, 0000000000000005h,
+        0000000000000000h, 0000000000000006h,
+        0000000000000000h, 0000000000000007h
+ptr_L_avx512_aes_gcm_inc_z1 QWORD L_avx512_aes_gcm_inc_z1
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16 ; inc_z2: same layout, k = 8..11
+L_avx512_aes_gcm_inc_z2 QWORD \
+        0000000000000000h, 0000000000000008h,
+        0000000000000000h, 0000000000000009h,
+        0000000000000000h, 000000000000000ah,
+        0000000000000000h, 000000000000000bh
+ptr_L_avx512_aes_gcm_inc_z2 QWORD L_avx512_aes_gcm_inc_z2
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16 ; inc_z3: same layout, k = 12..15
+L_avx512_aes_gcm_inc_z3 QWORD \
+        0000000000000000h, 000000000000000ch,
+        0000000000000000h, 000000000000000dh,
+        0000000000000000h, 000000000000000eh,
+        0000000000000000h, 000000000000000fh
+ptr_L_avx512_aes_gcm_inc_z3 QWORD L_avx512_aes_gcm_inc_z3
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16 ; sixteen: single 128-bit value (low qword 0, high qword 16)
+L_avx512_aes_gcm_sixteen QWORD \
+        0000000000000000h, 0000000000000010h
+ptr_L_avx512_aes_gcm_sixteen QWORD L_avx512_aes_gcm_sixteen
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16 ; rev8: vpshufb control that reverses the order of all 16 bytes
+L_GCM_generate_m0_avx512_rev8 QWORD \
+        08090a0b0c0d0e0fh, 0001020304050607h
+ptr_L_GCM_generate_m0_avx512_rev8 QWORD L_GCM_generate_m0_avx512_rev8
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16 ; mod2_128: value XORed in when a bit is shifted out below (0xe1 in the top byte, rest zero)
+L_GCM_generate_m0_avx512_mod2_128 QWORD \
+        0000000000000000h, 0e100000000000000h
+ptr_L_GCM_generate_m0_avx512_mod2_128 QWORD L_GCM_generate_m0_avx512_mod2_128
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+GCM_generate_m0_avx512 PROC ; Build a 512-byte table at [rdx] from the 16-byte value at [rcx]: 16 entries of XOR combinations, then the same 16 entries shifted right by 4 bits
+        sub rsp, 80 ; spill area for xmm6-xmm10, which are overwritten below
+        vmovdqu OWORD PTR [rsp], xmm6
+        vmovdqu OWORD PTR [rsp+16], xmm7
+        vmovdqu OWORD PTR [rsp+32], xmm8
+        vmovdqu OWORD PTR [rsp+48], xmm9
+        vmovdqu OWORD PTR [rsp+64], xmm10
+        vmovdqu xmm9, OWORD PTR L_GCM_generate_m0_avx512_rev8 ; xmm9 = byte-reversal shuffle mask
+        vmovdqu xmm10, OWORD PTR L_GCM_generate_m0_avx512_mod2_128 ; xmm10 = reduction constant
+        vpxor xmm8, xmm8, xmm8
+        vmovdqu xmm0, OWORD PTR [rcx] ; xmm0 = input value (arg1)
+        vmovdqu OWORD PTR [rdx], xmm8 ; entry 0 = all zero
+        vmovdqu xmm8, xmm0
+        vpshufb xmm0, xmm0, xmm9 ; work on the byte-reversed value
+        vpsllq xmm5, xmm0, 63 ; bit 0 of each qword moved to bit 63
+        vpsrlq xmm4, xmm0, 1 ; each qword shifted right by one
+        vpslldq xmm1, xmm5, 8 ; high qword of xmm1 = bit 0 of the 128-bit value, at bit 63
+        vpsrldq xmm5, xmm5, 8 ; bit 0 of the high qword becomes bit 63 of the low qword
+        vpshufd xmm1, xmm1, 255 ; broadcast the top dword
+        vpor xmm4, xmm4, xmm5 ; xmm4 = xmm0 >> 1 as a 128-bit value
+        vpsrad xmm1, xmm1, 31 ; all-ones when the shifted-out bit was set
+        vpand xmm1, xmm1, xmm10 ; select the reduction constant
+        vpxor xmm1, xmm1, xmm4 ; xmm1 = (xmm0 >> 1) XOR conditional constant
+        vpsllq xmm5, xmm1, 63 ; same step again: xmm2 derived from xmm1
+        vpsrlq xmm4, xmm1, 1
+        vpslldq xmm2, xmm5, 8
+        vpsrldq xmm5, xmm5, 8
+        vpshufd xmm2, xmm2, 255
+        vpor xmm4, xmm4, xmm5
+        vpsrad xmm2, xmm2, 31
+        vpand xmm2, xmm2, xmm10
+        vpxor xmm2, xmm2, xmm4
+        vpsllq xmm5, xmm2, 63 ; same step again: xmm3 derived from xmm2
+        vpsrlq xmm4, xmm2, 1
+        vpslldq xmm3, xmm5, 8
+        vpsrldq xmm5, xmm5, 8
+        vpshufd xmm3, xmm3, 255
+        vpor xmm4, xmm4, xmm5
+        vpsrad xmm3, xmm3, 31
+        vpand xmm3, xmm3, xmm10
+        vpxor xmm3, xmm3, xmm4
+        vpshufb xmm3, xmm3, xmm9 ; return all four base values to the input byte order
+        vpshufb xmm2, xmm2, xmm9
+        vpshufb xmm1, xmm1, xmm9
+        vpshufb xmm0, xmm0, xmm9
+        vpxor xmm8, xmm3, xmm2 ; xmm8 = entry 1 XOR entry 2
+        vmovdqu OWORD PTR [rdx+16], xmm3 ; entry 1 = value after three shift steps
+        vmovdqu OWORD PTR [rdx+32], xmm2 ; entry 2 = value after two shift steps
+        vmovdqu OWORD PTR [rdx+48], xmm8 ; entry 3 = entry 1 XOR entry 2
+        vmovdqu OWORD PTR [rdx+64], xmm1 ; entry 4 = value after one shift step
+        vpxor xmm4, xmm3, xmm1
+        vpxor xmm5, xmm2, xmm1
+        vpxor xmm6, xmm8, xmm1
+        vmovdqu OWORD PTR [rdx+80], xmm4 ; entries 5..7 = entry 4 XOR entries 1..3
+        vmovdqu OWORD PTR [rdx+96], xmm5
+        vmovdqu OWORD PTR [rdx+112], xmm6
+        vmovdqu OWORD PTR [rdx+128], xmm0 ; entry 8 = the input value itself
+        vpxor xmm1, xmm1, xmm0 ; xmm1 = entry 4 XOR entry 8 (stored as entry 12 below)
+        vpxor xmm4, xmm3, xmm0
+        vpxor xmm6, xmm2, xmm0
+        vmovdqu OWORD PTR [rdx+144], xmm4 ; entry 9 = entry 8 XOR entry 1
+        vmovdqu OWORD PTR [rdx+160], xmm6 ; entry 10 = entry 8 XOR entry 2
+        vpxor xmm6, xmm3, xmm6
+        vmovdqu OWORD PTR [rdx+176], xmm6 ; entry 11 = entry 10 XOR entry 1
+        vmovdqu OWORD PTR [rdx+192], xmm1 ; entry 12
+        vpxor xmm4, xmm3, xmm1
+        vpxor xmm5, xmm2, xmm1
+        vpxor xmm6, xmm8, xmm1
+        vmovdqu OWORD PTR [rdx+208], xmm4 ; entries 13..15 = entry 12 XOR entries 1..3
+        vmovdqu OWORD PTR [rdx+224], xmm5
+        vmovdqu OWORD PTR [rdx+240], xmm6
+        vmovdqu xmm0, OWORD PTR [rdx] ; second half: reload entries 0..3
+        vmovdqu xmm1, OWORD PTR [rdx+16]
+        vmovdqu xmm2, OWORD PTR [rdx+32]
+        vmovdqu xmm3, OWORD PTR [rdx+48]
+        vpshufb xmm0, xmm0, xmm9 ; byte-reverse before shifting
+        vpshufb xmm1, xmm1, xmm9
+        vpshufb xmm2, xmm2, xmm9
+        vpshufb xmm3, xmm3, xmm9
+        vpsllq xmm4, xmm0, 60 ; low 4 bits of each qword moved to the top
+        vpsllq xmm5, xmm1, 60
+        vpsllq xmm6, xmm2, 60
+        vpsllq xmm7, xmm3, 60
+        vpsrlq xmm0, xmm0, 4 ; each qword shifted right by 4
+        vpsrlq xmm1, xmm1, 4
+        vpsrlq xmm2, xmm2, 4
+        vpsrlq xmm3, xmm3, 4
+        vpsrldq xmm4, xmm4, 8 ; carry the high qword's low 4 bits into the low qword; the low qword's own shifted-out bits are dropped
+        vpsrldq xmm5, xmm5, 8
+        vpsrldq xmm6, xmm6, 8
+        vpsrldq xmm7, xmm7, 8
+        vpor xmm0, xmm0, xmm4 ; 128-bit value >> 4
+        vpor xmm1, xmm1, xmm5
+        vpor xmm2, xmm2, xmm6
+        vpor xmm3, xmm3, xmm7
+        vpshufb xmm0, xmm0, xmm9 ; back to the stored byte order
+        vpshufb xmm1, xmm1, xmm9
+        vpshufb xmm2, xmm2, xmm9
+        vpshufb xmm3, xmm3, xmm9
+        vmovdqu OWORD PTR [rdx+256], xmm0 ; entries 16..19 = entries 0..3 shifted right by 4
+        vmovdqu OWORD PTR [rdx+272], xmm1
+        vmovdqu OWORD PTR [rdx+288], xmm2
+        vmovdqu OWORD PTR [rdx+304], xmm3
+        vmovdqu xmm0, OWORD PTR [rdx+64] ; same shift-by-4 sequence for entries 4..7
+        vmovdqu xmm1, OWORD PTR [rdx+80]
+        vmovdqu xmm2, OWORD PTR [rdx+96]
+        vmovdqu xmm3, OWORD PTR [rdx+112]
+        vpshufb xmm0, xmm0, xmm9
+        vpshufb xmm1, xmm1, xmm9
+        vpshufb xmm2, xmm2, xmm9
+        vpshufb xmm3, xmm3, xmm9
+        vpsllq xmm4, xmm0, 60
+        vpsllq xmm5, xmm1, 60
+        vpsllq xmm6, xmm2, 60
+        vpsllq xmm7, xmm3, 60
+        vpsrlq xmm0, xmm0, 4
+        vpsrlq xmm1, xmm1, 4
+        vpsrlq xmm2, xmm2, 4
+        vpsrlq xmm3, xmm3, 4
+        vpsrldq xmm4, xmm4, 8
+        vpsrldq xmm5, xmm5, 8
+        vpsrldq xmm6, xmm6, 8
+        vpsrldq xmm7, xmm7, 8
+        vpor xmm0, xmm0, xmm4
+        vpor xmm1, xmm1, xmm5
+        vpor xmm2, xmm2, xmm6
+        vpor xmm3, xmm3, xmm7
+        vpshufb xmm0, xmm0, xmm9
+        vpshufb xmm1, xmm1, xmm9
+        vpshufb xmm2, xmm2, xmm9
+        vpshufb xmm3, xmm3, xmm9
+        vmovdqu OWORD PTR [rdx+320], xmm0 ; entries 20..23
+        vmovdqu OWORD PTR [rdx+336], xmm1
+        vmovdqu OWORD PTR [rdx+352], xmm2
+        vmovdqu OWORD PTR [rdx+368], xmm3
+        vmovdqu xmm0, OWORD PTR [rdx+128] ; same shift-by-4 sequence for entries 8..11
+        vmovdqu xmm1, OWORD PTR [rdx+144]
+        vmovdqu xmm2, OWORD PTR [rdx+160]
+        vmovdqu xmm3, OWORD PTR [rdx+176]
+        vpshufb xmm0, xmm0, xmm9
+        vpshufb xmm1, xmm1, xmm9
+        vpshufb xmm2, xmm2, xmm9
+        vpshufb xmm3, xmm3, xmm9
+        vpsllq xmm4, xmm0, 60
+        vpsllq xmm5, xmm1, 60
+        vpsllq xmm6, xmm2, 60
+        vpsllq xmm7, xmm3, 60
+        vpsrlq xmm0, xmm0, 4
+        vpsrlq xmm1, xmm1, 4
+        vpsrlq xmm2, xmm2, 4
+        vpsrlq xmm3, xmm3, 4
+        vpsrldq xmm4, xmm4, 8
+        vpsrldq xmm5, xmm5, 8
+        vpsrldq xmm6, xmm6, 8
+        vpsrldq xmm7, xmm7, 8
+        vpor xmm0, xmm0, xmm4
+        vpor xmm1, xmm1, xmm5
+        vpor xmm2, xmm2, xmm6
+        vpor xmm3, xmm3, xmm7
+        vpshufb xmm0, xmm0, xmm9
+        vpshufb xmm1, xmm1, xmm9
+        vpshufb xmm2, xmm2, xmm9
+        vpshufb xmm3, xmm3, xmm9
+        vmovdqu OWORD PTR [rdx+384], xmm0 ; entries 24..27
+        vmovdqu OWORD PTR [rdx+400], xmm1
+        vmovdqu OWORD PTR [rdx+416], xmm2
+        vmovdqu OWORD PTR [rdx+432], xmm3
+        vmovdqu xmm0, OWORD PTR [rdx+192] ; same shift-by-4 sequence for entries 12..15
+        vmovdqu xmm1, OWORD PTR [rdx+208]
+        vmovdqu xmm2, OWORD PTR [rdx+224]
+        vmovdqu xmm3, OWORD PTR [rdx+240]
+        vpshufb xmm0, xmm0, xmm9
+        vpshufb xmm1, xmm1, xmm9
+        vpshufb xmm2, xmm2, xmm9
+        vpshufb xmm3, xmm3, xmm9
+        vpsllq xmm4, xmm0, 60
+        vpsllq xmm5, xmm1, 60
+        vpsllq xmm6, xmm2, 60
+        vpsllq xmm7, xmm3, 60
+        vpsrlq xmm0, xmm0, 4
+        vpsrlq xmm1, xmm1, 4
+        vpsrlq xmm2, xmm2, 4
+        vpsrlq xmm3, xmm3, 4
+        vpsrldq xmm4, xmm4, 8
+        vpsrldq xmm5, xmm5, 8
+        vpsrldq xmm6, xmm6, 8
+        vpsrldq xmm7, xmm7, 8
+        vpor xmm0, xmm0, xmm4
+        vpor xmm1, xmm1, xmm5
+        vpor xmm2, xmm2, xmm6
+        vpor xmm3, xmm3, xmm7
+        vpshufb xmm0, xmm0, xmm9
+        vpshufb xmm1, xmm1, xmm9
+        vpshufb xmm2, xmm2, xmm9
+        vpshufb xmm3, xmm3, xmm9
+        vmovdqu OWORD PTR [rdx+448], xmm0 ; entries 28..31; the last store ends at byte 512 of the table
+        vmovdqu OWORD PTR [rdx+464], xmm1
+        vmovdqu OWORD PTR [rdx+480], xmm2
+        vmovdqu OWORD PTR [rdx+496], xmm3
+        vmovdqu xmm6, OWORD PTR [rsp] ; reload xmm6-xmm10 saved in the prologue
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        vmovdqu xmm8, OWORD PTR [rsp+32]
+        vmovdqu xmm9, OWORD PTR [rsp+48]
+        vmovdqu xmm10, OWORD PTR [rsp+64]
+        add rsp, 80
+        ret
+GCM_generate_m0_avx512 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16 ; one / two / three: 128-bit values (low qword 0, high qword n)
+L_avx512_aes_gcm_one QWORD \
+        0000000000000000h, 0000000000000001h
+ptr_L_avx512_aes_gcm_one QWORD L_avx512_aes_gcm_one
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_avx512_aes_gcm_two QWORD \
+        0000000000000000h, 0000000000000002h
+ptr_L_avx512_aes_gcm_two QWORD L_avx512_aes_gcm_two
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_avx512_aes_gcm_three QWORD \
+        0000000000000000h, 0000000000000003h
+ptr_L_avx512_aes_gcm_three QWORD L_avx512_aes_gcm_three +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx512_aes_gcm_four QWORD \ + 0000000000000000h, 0000000000000004h +ptr_L_avx512_aes_gcm_four QWORD L_avx512_aes_gcm_four +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx512_aes_gcm_five QWORD \ + 0000000000000000h, 0000000000000005h +ptr_L_avx512_aes_gcm_five QWORD L_avx512_aes_gcm_five +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx512_aes_gcm_six QWORD \ + 0000000000000000h, 0000000000000006h +ptr_L_avx512_aes_gcm_six QWORD L_avx512_aes_gcm_six +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx512_aes_gcm_seven QWORD \ + 0000000000000000h, 0000000000000007h +ptr_L_avx512_aes_gcm_seven QWORD L_avx512_aes_gcm_seven +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx512_aes_gcm_eight QWORD \ + 0000000000000000h, 0000000000000008h +ptr_L_avx512_aes_gcm_eight QWORD L_avx512_aes_gcm_eight +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx512_aes_gcm_bswap_epi64 QWORD \ + 0001020304050607h, 08090a0b0c0d0e0fh +ptr_L_avx512_aes_gcm_bswap_epi64 QWORD L_avx512_aes_gcm_bswap_epi64 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx512_aes_gcm_bswap_mask QWORD \ + 08090a0b0c0d0e0fh, 0001020304050607h +ptr_L_avx512_aes_gcm_bswap_mask QWORD L_avx512_aes_gcm_bswap_mask +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx512_aes_gcm_mod2_128 QWORD \ + 0000000000000001h, 0c200000000000000h +ptr_L_avx512_aes_gcm_mod2_128 QWORD L_avx512_aes_gcm_mod2_128 +_DATA ENDS +_TEXT SEGMENT READONLY PARA +AES_GCM_encrypt_avx512 PROC + push r13 + push rdi + push rsi + push r12 + push rbx + push r14 + push r15 + mov rdi, rcx + mov rsi, rdx + mov r12, r8 + mov rax, r9 + mov r8, QWORD PTR [rsp+96] + mov r9d, DWORD PTR [rsp+104] + mov r11d, DWORD PTR [rsp+112] + mov ebx, DWORD PTR [rsp+120] + mov r14d, DWORD PTR [rsp+128] + mov r15, QWORD PTR [rsp+136] + mov r10d, DWORD PTR [rsp+144] + sub rsp, 1248 + vmovdqu OWORD PTR [rsp+1088], xmm6 + vmovdqu OWORD PTR [rsp+1104], xmm7 + vmovdqu OWORD PTR [rsp+1120], xmm8 + vmovdqu OWORD PTR [rsp+1136], xmm9 + 
vmovdqu OWORD PTR [rsp+1152], xmm10 + vmovdqu OWORD PTR [rsp+1168], xmm11 + vmovdqu OWORD PTR [rsp+1184], xmm12 + vmovdqu OWORD PTR [rsp+1200], xmm13 + vmovdqu OWORD PTR [rsp+1216], xmm14 + vmovdqu OWORD PTR [rsp+1232], xmm15 + vpxor xmm4, xmm4, xmm4 + vpxor xmm6, xmm6, xmm6 + mov edx, ebx + cmp edx, 12 + jne L_AES_GCM_encrypt_avx512_iv_not_12 + ; # Calculate values when IV is 12 bytes + ; Set counter based on IV + mov ecx, 16777216 + vmovq xmm4, QWORD PTR [rax] + vpinsrd xmm4, xmm4, DWORD PTR [rax+8], 2 + vpinsrd xmm4, xmm4, ecx, 3 + ; H = Encrypt X(=0) and T = Encrypt counter + vmovdqa xmm5, OWORD PTR [r15] + vpxor xmm1, xmm4, xmm5 + vmovdqa xmm7, OWORD PTR [r15+16] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+32] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+48] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+64] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+80] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+96] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+112] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+128] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+144] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + cmp r10d, 11 + vmovdqa xmm7, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_avx512_calc_iv_12_last + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+176] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + cmp r10d, 13 + vmovdqa xmm7, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_avx512_calc_iv_12_last + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+208] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+224] 
+L_AES_GCM_encrypt_avx512_calc_iv_12_last: + vaesenclast xmm5, xmm5, xmm7 + vaesenclast xmm1, xmm1, xmm7 + vpshufb xmm5, xmm5, OWORD PTR L_avx512_aes_gcm_bswap_mask + vmovdqu OWORD PTR [rsp+1040], xmm1 + jmp L_AES_GCM_encrypt_avx512_iv_done +L_AES_GCM_encrypt_avx512_iv_not_12: + ; Calculate values when IV is not 12 bytes + ; H = Encrypt X(=0) + vmovdqa xmm5, OWORD PTR [r15] + vaesenc xmm5, xmm5, [r15+16] + vaesenc xmm5, xmm5, [r15+32] + vaesenc xmm5, xmm5, [r15+48] + vaesenc xmm5, xmm5, [r15+64] + vaesenc xmm5, xmm5, [r15+80] + vaesenc xmm5, xmm5, [r15+96] + vaesenc xmm5, xmm5, [r15+112] + vaesenc xmm5, xmm5, [r15+128] + vaesenc xmm5, xmm5, [r15+144] + cmp r10d, 11 + vmovdqa xmm9, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_avx512_calc_iv_1_aesenc_avx_last + vaesenc xmm5, xmm5, xmm9 + vaesenc xmm5, xmm5, [r15+176] + cmp r10d, 13 + vmovdqa xmm9, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_avx512_calc_iv_1_aesenc_avx_last + vaesenc xmm5, xmm5, xmm9 + vaesenc xmm5, xmm5, [r15+208] + vmovdqa xmm9, OWORD PTR [r15+224] +L_AES_GCM_encrypt_avx512_calc_iv_1_aesenc_avx_last: + vaesenclast xmm5, xmm5, xmm9 + vpshufb xmm5, xmm5, OWORD PTR L_avx512_aes_gcm_bswap_mask + ; Calc counter + ; Initialization vector + cmp edx, 0 + mov rcx, 0 + je L_AES_GCM_encrypt_avx512_calc_iv_done + cmp edx, 16 + jl L_AES_GCM_encrypt_avx512_calc_iv_lt16 + and edx, 4294967280 +L_AES_GCM_encrypt_avx512_calc_iv_16_loop: + vmovdqu xmm8, OWORD PTR [rax+rcx] + vpshufb xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_bswap_mask + vpxor xmm4, xmm4, xmm8 + ; ghash_gfmul_avx + vpshufd xmm1, xmm4, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpxor xmm1, xmm1, xmm4 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm7, xmm0 + vmovdqa xmm4, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm7, xmm7, xmm2 + vpxor xmm4, xmm4, xmm1 + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm7, 
xmm7, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm4, xmm4, xmm1 + vpslld xmm0, xmm7, 31 + vpslld xmm1, xmm7, 30 + vpslld xmm2, xmm7, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm7, xmm7, xmm0 + vpsrld xmm2, xmm7, 1 + vpsrld xmm3, xmm7, 2 + vpsrld xmm0, xmm7, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm7 + vpxor xmm4, xmm4, xmm2 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_encrypt_avx512_calc_iv_16_loop + mov edx, ebx + cmp ecx, edx + je L_AES_GCM_encrypt_avx512_calc_iv_done +L_AES_GCM_encrypt_avx512_calc_iv_lt16: + sub rsp, 16 + vpxor xmm8, xmm8, xmm8 + xor ebx, ebx + vmovdqu OWORD PTR [rsp], xmm8 +L_AES_GCM_encrypt_avx512_calc_iv_loop: + movzx r13d, BYTE PTR [rax+rcx] + mov BYTE PTR [rsp+rbx], r13b + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_encrypt_avx512_calc_iv_loop + vmovdqu xmm8, OWORD PTR [rsp] + add rsp, 16 + vpshufb xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_bswap_mask + vpxor xmm4, xmm4, xmm8 + ; ghash_gfmul_avx + vpshufd xmm1, xmm4, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpxor xmm1, xmm1, xmm4 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm7, xmm0 + vmovdqa xmm4, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm7, xmm7, xmm2 + vpxor xmm4, xmm4, xmm1 + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm4, xmm4, xmm1 + vpslld xmm0, xmm7, 31 + vpslld xmm1, xmm7, 30 + vpslld xmm2, xmm7, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq 
xmm0, xmm0, 12 + vpxor xmm7, xmm7, xmm0 + vpsrld xmm2, xmm7, 1 + vpsrld xmm3, xmm7, 2 + vpsrld xmm0, xmm7, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm7 + vpxor xmm4, xmm4, xmm2 +L_AES_GCM_encrypt_avx512_calc_iv_done: + ; T = Encrypt counter + vpxor xmm0, xmm0, xmm0 + shl edx, 3 + vmovq xmm0, rdx + vpxor xmm4, xmm4, xmm0 + ; ghash_gfmul_avx + vpshufd xmm1, xmm4, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpxor xmm1, xmm1, xmm4 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm7, xmm0 + vmovdqa xmm4, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm7, xmm7, xmm2 + vpxor xmm4, xmm4, xmm1 + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm4, xmm4, xmm1 + vpslld xmm0, xmm7, 31 + vpslld xmm1, xmm7, 30 + vpslld xmm2, xmm7, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm7, xmm7, xmm0 + vpsrld xmm2, xmm7, 1 + vpsrld xmm3, xmm7, 2 + vpsrld xmm0, xmm7, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm7 + vpxor xmm4, xmm4, xmm2 + vpshufb xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_bswap_mask + ; Encrypt counter + vmovdqa xmm8, OWORD PTR [r15] + vpxor xmm8, xmm8, xmm4 + vaesenc xmm8, xmm8, [r15+16] + vaesenc xmm8, xmm8, [r15+32] + vaesenc xmm8, xmm8, [r15+48] + vaesenc xmm8, xmm8, [r15+64] + vaesenc xmm8, xmm8, [r15+80] + vaesenc xmm8, xmm8, [r15+96] + vaesenc xmm8, xmm8, [r15+112] + vaesenc xmm8, xmm8, [r15+128] + vaesenc xmm8, xmm8, [r15+144] + cmp r10d, 11 + vmovdqa xmm9, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_avx512_calc_iv_2_aesenc_avx_last + vaesenc xmm8, xmm8, xmm9 + 
vaesenc xmm8, xmm8, [r15+176] + cmp r10d, 13 + vmovdqa xmm9, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_avx512_calc_iv_2_aesenc_avx_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [r15+208] + vmovdqa xmm9, OWORD PTR [r15+224] +L_AES_GCM_encrypt_avx512_calc_iv_2_aesenc_avx_last: + vaesenclast xmm8, xmm8, xmm9 + vmovdqu OWORD PTR [rsp+1040], xmm8 +L_AES_GCM_encrypt_avx512_iv_done: + ; Additional authentication data + mov edx, r11d + cmp edx, 0 + je L_AES_GCM_encrypt_avx512_calc_aad_done + xor ecx, ecx + cmp edx, 16 + jl L_AES_GCM_encrypt_avx512_calc_aad_lt16 + and edx, 4294967280 +L_AES_GCM_encrypt_avx512_calc_aad_16_loop: + vmovdqu xmm8, OWORD PTR [r12+rcx] + vpshufb xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_bswap_mask + vpxor xmm6, xmm6, xmm8 + ; ghash_gfmul_avx + vpshufd xmm1, xmm6, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm6, 17 + vpclmulqdq xmm0, xmm5, xmm6, 0 + vpxor xmm1, xmm1, xmm6 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm7, xmm0 + vmovdqa xmm6, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm7, xmm7, xmm2 + vpxor xmm6, xmm6, xmm1 + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm6, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm6, xmm6, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm6, xmm6, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm6, xmm6, xmm1 + vpslld xmm0, xmm7, 31 + vpslld xmm1, xmm7, 30 + vpslld xmm2, xmm7, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm7, xmm7, xmm0 + vpsrld xmm2, xmm7, 1 + vpsrld xmm3, xmm7, 2 + vpsrld xmm0, xmm7, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm7 + vpxor xmm6, xmm6, xmm2 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_encrypt_avx512_calc_aad_16_loop + mov edx, r11d + cmp ecx, edx + je L_AES_GCM_encrypt_avx512_calc_aad_done 
+L_AES_GCM_encrypt_avx512_calc_aad_lt16: + sub rsp, 16 + vpxor xmm8, xmm8, xmm8 + xor ebx, ebx + vmovdqu OWORD PTR [rsp], xmm8 +L_AES_GCM_encrypt_avx512_calc_aad_loop: + movzx r13d, BYTE PTR [r12+rcx] + mov BYTE PTR [rsp+rbx], r13b + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_encrypt_avx512_calc_aad_loop + vmovdqu xmm8, OWORD PTR [rsp] + add rsp, 16 + vpshufb xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_bswap_mask + vpxor xmm6, xmm6, xmm8 + ; ghash_gfmul_avx + vpshufd xmm1, xmm6, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm6, 17 + vpclmulqdq xmm0, xmm5, xmm6, 0 + vpxor xmm1, xmm1, xmm6 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm7, xmm0 + vmovdqa xmm6, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm7, xmm7, xmm2 + vpxor xmm6, xmm6, xmm1 + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm6, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm6, xmm6, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm6, xmm6, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm6, xmm6, xmm1 + vpslld xmm0, xmm7, 31 + vpslld xmm1, xmm7, 30 + vpslld xmm2, xmm7, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm7, xmm7, xmm0 + vpsrld xmm2, xmm7, 1 + vpsrld xmm3, xmm7, 2 + vpsrld xmm0, xmm7, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm7 + vpxor xmm6, xmm6, xmm2 +L_AES_GCM_encrypt_avx512_calc_aad_done: + ; Calculate counter and H + vpsrlq xmm9, xmm5, 63 + vpsllq xmm8, xmm5, 1 + vpslldq xmm9, xmm9, 8 + vpor xmm8, xmm8, xmm9 + vpshufd xmm5, xmm5, 255 + vpsrad xmm5, xmm5, 31 + vpshufb xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_bswap_epi64 + vpand xmm5, xmm5, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpaddd xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_one + vpxor xmm5, xmm5, xmm8 + vmovdqu OWORD PTR [rsp+1024], xmm4 + xor ebx, ebx + cmp r9d, 256 + jl 
L_AES_GCM_encrypt_avx512_done_128 + vmovdqa xmm2, xmm6 + ; H ^ 1 + vmovdqu OWORD PTR [rsp], xmm5 + ; H ^ 2 + vpclmulqdq xmm8, xmm5, xmm5, 0 + vpclmulqdq xmm11, xmm5, xmm5, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm0, xmm11 + vmovdqu OWORD PTR [rsp+16], xmm0 + ; H ^ 3 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm5, 78 + vpxor xmm9, xmm9, xmm5 + vpshufd xmm10, xmm0, 78 + vpxor xmm10, xmm10, xmm0 + vpclmulqdq xmm8, xmm0, xmm5, 0 + vpclmulqdq xmm11, xmm0, xmm5, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm1, xmm11 + vmovdqu OWORD PTR [rsp+32], xmm1 + ; H ^ 4 + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm3, xmm11 + vmovdqu OWORD PTR [rsp+48], xmm3 + ; H ^ 5 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, 
xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+64], xmm7 + ; H ^ 6 + vpclmulqdq xmm8, xmm1, xmm1, 0 + vpclmulqdq xmm11, xmm1, xmm1, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+80], xmm7 + ; H ^ 7 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm1, 78 + vpxor xmm9, xmm9, xmm1 + vpshufd xmm10, xmm3, 78 + vpxor xmm10, xmm10, xmm3 + vpclmulqdq xmm8, xmm3, xmm1, 0 + vpclmulqdq xmm11, xmm3, xmm1, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+96], xmm7 + ; H ^ 8 + vpclmulqdq xmm8, xmm3, xmm3, 0 + vpclmulqdq xmm11, xmm3, xmm3, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+112], xmm7 + ; H ^ 9 + vmovdqu xmm0, OWORD PTR [rsp+48] + vmovdqu xmm1, OWORD PTR [rsp+64] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd 
xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+128], xmm7 + ; H ^ 10 + vmovdqu xmm0, OWORD PTR [rsp+64] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+144], xmm7 + ; H ^ 11 + vmovdqu xmm0, OWORD PTR [rsp+64] + vmovdqu xmm1, OWORD PTR [rsp+80] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+160], xmm7 + ; H ^ 12 + vmovdqu xmm0, OWORD PTR [rsp+80] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+176], xmm7 + ; H ^ 13 + vmovdqu xmm0, OWORD PTR [rsp+80] + vmovdqu xmm1, OWORD PTR [rsp+96] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + 
vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+192], xmm7 + ; H ^ 14 + vmovdqu xmm0, OWORD PTR [rsp+96] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+208], xmm7 + ; H ^ 15 + vmovdqu xmm0, OWORD PTR [rsp+96] + vmovdqu xmm1, OWORD PTR [rsp+112] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+224], xmm7 + ; H ^ 16 + vmovdqu xmm0, OWORD PTR [rsp+112] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+240], xmm7 + cmp r9d, 512 + jl L_AES_GCM_encrypt_avx512_no_ext + ; H ^ 17 + vmovdqu xmm0, OWORD PTR [rsp+112] + vmovdqu xmm1, OWORD PTR [rsp+128] + ; ghash_gfmul_red_avx 
+ vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+256], xmm7 + ; H ^ 18 + vmovdqu xmm0, OWORD PTR [rsp+128] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+272], xmm7 + ; H ^ 19 + vmovdqu xmm0, OWORD PTR [rsp+128] + vmovdqu xmm1, OWORD PTR [rsp+144] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+288], xmm7 + ; H ^ 20 + vmovdqu xmm0, OWORD PTR [rsp+144] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + 
vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+304], xmm7 + ; H ^ 21 + vmovdqu xmm0, OWORD PTR [rsp+144] + vmovdqu xmm1, OWORD PTR [rsp+160] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+320], xmm7 + ; H ^ 22 + vmovdqu xmm0, OWORD PTR [rsp+160] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+336], xmm7 + ; H ^ 23 + vmovdqu xmm0, OWORD PTR [rsp+160] + vmovdqu xmm1, OWORD PTR [rsp+176] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+352], xmm7 + ; H ^ 24 + vmovdqu xmm0, OWORD PTR [rsp+176] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, 
OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+368], xmm7 + ; H ^ 25 + vmovdqu xmm0, OWORD PTR [rsp+176] + vmovdqu xmm1, OWORD PTR [rsp+192] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+384], xmm7 + ; H ^ 26 + vmovdqu xmm0, OWORD PTR [rsp+192] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+400], xmm7 + ; H ^ 27 + vmovdqu xmm0, OWORD PTR [rsp+192] + vmovdqu xmm1, OWORD PTR [rsp+208] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 
+ vmovdqu OWORD PTR [rsp+416], xmm7 + ; H ^ 28 + vmovdqu xmm0, OWORD PTR [rsp+208] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+432], xmm7 + ; H ^ 29 + vmovdqu xmm0, OWORD PTR [rsp+208] + vmovdqu xmm1, OWORD PTR [rsp+224] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+448], xmm7 + ; H ^ 30 + vmovdqu xmm0, OWORD PTR [rsp+224] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+464], xmm7 + ; H ^ 31 + vmovdqu xmm0, OWORD PTR [rsp+224] + vmovdqu xmm1, OWORD PTR [rsp+240] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, 
xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+480], xmm7 + ; H ^ 32 + vmovdqu xmm0, OWORD PTR [rsp+240] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+496], xmm7 +L_AES_GCM_encrypt_avx512_no_ext: + vbroadcasti32x4 zmm22, ptr_L_avx512_aes_gcm_bswap_epi64 + vbroadcasti32x4 zmm30, ptr_L_avx512_aes_gcm_bswap_mask + vbroadcasti32x4 zmm31, ptr_L_avx512_aes_gcm_mod2_128 + vbroadcasti32x4 zmm9, [r15] + vbroadcasti32x4 zmm10, [r15+16] + vbroadcasti32x4 zmm11, [r15+32] + vbroadcasti32x4 zmm12, [r15+48] + vbroadcasti32x4 zmm13, [r15+64] + vbroadcasti32x4 zmm14, [r15+80] + vbroadcasti32x4 zmm15, [r15+96] + vbroadcasti32x4 zmm1, [r15+112] + vbroadcasti32x4 zmm2, [r15+128] + vbroadcasti32x4 zmm3, [r15+144] + cmp r9d, 512 + jl L_AES_GCM_encrypt_avx512_no_windows + mov r13d, r9d + and r13d, 4294966784 + vmovdqu64 zmm23, [rsp+448] + vshufi64x2 zmm23, zmm23, zmm23, 27 + vmovdqu64 zmm24, [rsp+384] + vshufi64x2 zmm24, zmm24, zmm24, 27 + vmovdqu64 zmm25, [rsp+320] + vshufi64x2 zmm25, zmm25, zmm25, 27 + vmovdqu64 zmm26, [rsp+256] + vshufi64x2 zmm26, zmm26, zmm26, 27 + vmovdqu64 [rsp+512], zmm23 + vmovdqu64 [rsp+576], zmm24 + vmovdqu64 [rsp+640], zmm25 + vmovdqu64 [rsp+704], zmm26 + vmovdqu64 zmm23, [rsp+192] + vshufi64x2 zmm23, zmm23, zmm23, 27 + vmovdqu64 zmm24, [rsp+128] + vshufi64x2 zmm24, zmm24, zmm24, 27 + vmovdqu64 zmm25, [rsp+64] + vshufi64x2 zmm25, zmm25, zmm25, 27 + vmovdqu64 zmm26, [rsp] + vshufi64x2 zmm26, zmm26, zmm26, 27 + vmovdqu64 [rsp+768], zmm23 + vmovdqu64 
[rsp+832], zmm24 + vmovdqu64 [rsp+896], zmm25 + vmovdqu64 [rsp+960], zmm26 + ; 512 bytes of input + lea rcx, QWORD PTR [rsi+rbx] + mov QWORD PTR [rsp+1056], rcx + vbroadcasti32x4 zmm20, [rsp+1024] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [rsp+1024] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [rsp+1024], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r10d, 11 + vbroadcasti32x4 zmm20, [r15+160] + jl L_AES_GCM_encrypt_avx512_p1_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + 
vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r10d, 13 + vbroadcasti32x4 zmm20, [r15+192] + jl L_AES_GCM_encrypt_avx512_p1_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+224] +L_AES_GCM_encrypt_avx512_p1_avx512_ctr16_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [rdi+rbx] + lea rdx, QWORD PTR [rsi+rbx] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + add ebx, 256 + vbroadcasti32x4 zmm20, [rsp+1024] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [rsp+1024] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [rsp+1024], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, 
zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r10d, 11 + vbroadcasti32x4 zmm20, [r15+160] + jl L_AES_GCM_encrypt_avx512_p2_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r10d, 13 + vbroadcasti32x4 zmm20, [r15+192] + jl L_AES_GCM_encrypt_avx512_p2_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+224] +L_AES_GCM_encrypt_avx512_p2_avx512_ctr16_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [rdi+rbx] + lea rdx, QWORD PTR [rsi+rbx] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], 
zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + add ebx, 256 + cmp ebx, r13d + jge L_AES_GCM_encrypt_avx512_last_win +L_AES_GCM_encrypt_avx512_win_loop: + lea rcx, QWORD PTR [rsi+rbx] + mov QWORD PTR [rsp+1072], rcx + mov r12, QWORD PTR [rsp+1056] + vpxorq zmm21, zmm21, zmm21 + vinserti32x4 zmm21, zmm21, xmm6, 0 + vbroadcasti32x4 zmm20, [rsp+1024] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [rsp+1024] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [rsp+1024], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vmovdqu64 zmm31, [r12] + vpshufb zmm31, zmm31, zmm30 + vpxorq zmm31, zmm31, zmm21 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vpclmulqdq zmm23, zmm31, [rsp+512], 0 + vpclmulqdq zmm24, zmm31, [rsp+512], 1 + vpclmulqdq zmm25, zmm31, [rsp+512], 16 + vpclmulqdq zmm26, zmm31, [rsp+512], 17 + vmovdqa64 zmm27, zmm23 + vpxorq zmm28, zmm25, zmm24 + vmovdqa64 zmm29, zmm26 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vmovdqu64 zmm31, [r12+64] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vpclmulqdq zmm23, zmm31, [rsp+576], 0 + vpclmulqdq zmm24, zmm31, [rsp+576], 1 + vpclmulqdq zmm25, zmm31, [rsp+576], 16 
+ vpclmulqdq zmm26, zmm31, [rsp+576], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vmovdqu64 zmm31, [r12+128] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vpclmulqdq zmm23, zmm31, [rsp+640], 0 + vpclmulqdq zmm24, zmm31, [rsp+640], 1 + vpclmulqdq zmm25, zmm31, [rsp+640], 16 + vpclmulqdq zmm26, zmm31, [rsp+640], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vmovdqu64 zmm31, [r12+192] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vpclmulqdq zmm23, zmm31, [rsp+704], 0 + vpclmulqdq zmm24, zmm31, [rsp+704], 1 + vpclmulqdq zmm25, zmm31, [rsp+704], 16 + vpclmulqdq zmm26, zmm31, [rsp+704], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r10d, 11 + vbroadcasti32x4 zmm20, [r15+160] + jl L_AES_GCM_encrypt_avx512_a_il_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r10d, 13 + vbroadcasti32x4 zmm20, [r15+192] + jl L_AES_GCM_encrypt_avx512_a_il_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + 
vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+224] +L_AES_GCM_encrypt_avx512_a_il_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [rdi+rbx] + lea rdx, QWORD PTR [rsi+rbx] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + add ebx, 256 + vbroadcasti32x4 zmm20, [rsp+1024] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [rsp+1024] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [rsp+1024], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vmovdqu64 zmm31, [r12+256] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vpclmulqdq zmm23, zmm31, [rsp+768], 0 + vpclmulqdq zmm24, zmm31, [rsp+768], 1 + vpclmulqdq zmm25, zmm31, [rsp+768], 16 + vpclmulqdq zmm26, zmm31, [rsp+768], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + 
vmovdqu64 zmm31, [r12+320] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vpclmulqdq zmm23, zmm31, [rsp+832], 0 + vpclmulqdq zmm24, zmm31, [rsp+832], 1 + vpclmulqdq zmm25, zmm31, [rsp+832], 16 + vpclmulqdq zmm26, zmm31, [rsp+832], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vmovdqu64 zmm31, [r12+384] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vpclmulqdq zmm23, zmm31, [rsp+896], 0 + vpclmulqdq zmm24, zmm31, [rsp+896], 1 + vpclmulqdq zmm25, zmm31, [rsp+896], 16 + vpclmulqdq zmm26, zmm31, [rsp+896], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vmovdqu64 zmm31, [r12+448] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vpclmulqdq zmm23, zmm31, [rsp+960], 0 + vpclmulqdq zmm24, zmm31, [rsp+960], 1 + vpclmulqdq zmm25, zmm31, [rsp+960], 16 + vpclmulqdq zmm26, zmm31, [rsp+960], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r10d, 11 + vbroadcasti32x4 zmm20, [r15+160] + jl L_AES_GCM_encrypt_avx512_b_il_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + 
vbroadcasti32x4 zmm20, [r15+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r10d, 13 + vbroadcasti32x4 zmm20, [r15+192] + jl L_AES_GCM_encrypt_avx512_b_il_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+224] +L_AES_GCM_encrypt_avx512_b_il_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [rdi+rbx] + lea rdx, QWORD PTR [rsi+rbx] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + add ebx, 256 + vbroadcasti32x4 zmm31, ptr_L_avx512_aes_gcm_mod2_128 + vpclmulqdq zmm23, zmm31, zmm27, 1 + vpshufd zmm27, zmm27, 78 + vpternlogq zmm28, zmm27, zmm23, 150 + vpclmulqdq zmm23, zmm31, zmm28, 1 + vpshufd zmm28, zmm28, 78 + vpternlogq zmm29, zmm28, zmm23, 150 + vextracti32x4 xmm0, zmm29, 1 + vextracti32x4 xmm4, zmm29, 2 + vextracti32x4 xmm5, zmm29, 3 + vpxorq xmm6, xmm29, xmm0 + vpternlogq xmm6, xmm5, xmm4, 150 + mov rcx, QWORD PTR [rsp+1072] + mov QWORD PTR [rsp+1056], rcx + cmp ebx, r13d + jl L_AES_GCM_encrypt_avx512_win_loop +L_AES_GCM_encrypt_avx512_last_win: + mov rcx, QWORD PTR [rsp+1056] + vpxorq zmm20, zmm20, zmm20 + vinserti32x4 zmm20, zmm20, xmm6, 0 + vmovdqu64 zmm23, [rsp+512] + vmovdqu64 zmm24, [rsp+576] + vmovdqu64 zmm25, [rsp+640] + vmovdqu64 zmm26, [rsp+704] + vmovdqu64 zmm21, [rcx] + vpshufb zmm21, zmm21, zmm30 + vpxorq zmm21, zmm21, 
zmm20 + vpclmulqdq zmm16, zmm21, zmm23, 0 + vpclmulqdq zmm17, zmm21, zmm23, 1 + vpclmulqdq zmm18, zmm21, zmm23, 16 + vpclmulqdq zmm19, zmm21, zmm23, 17 + vmovdqa64 zmm27, zmm16 + vpxorq zmm28, zmm18, zmm17 + vmovdqa64 zmm29, zmm19 + vmovdqu64 zmm21, [rcx+64] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm24, 0 + vpclmulqdq zmm17, zmm21, zmm24, 1 + vpclmulqdq zmm18, zmm21, zmm24, 16 + vpclmulqdq zmm19, zmm21, zmm24, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rcx+128] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm25, 0 + vpclmulqdq zmm17, zmm21, zmm25, 1 + vpclmulqdq zmm18, zmm21, zmm25, 16 + vpclmulqdq zmm19, zmm21, zmm25, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rcx+192] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm26, 0 + vpclmulqdq zmm17, zmm21, zmm26, 1 + vpclmulqdq zmm18, zmm21, zmm26, 16 + vpclmulqdq zmm19, zmm21, zmm26, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm23, [rsp+768] + vmovdqu64 zmm24, [rsp+832] + vmovdqu64 zmm25, [rsp+896] + vmovdqu64 zmm26, [rsp+960] + vmovdqu64 zmm21, [rcx+256] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm23, 0 + vpclmulqdq zmm17, zmm21, zmm23, 1 + vpclmulqdq zmm18, zmm21, zmm23, 16 + vpclmulqdq zmm19, zmm21, zmm23, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rcx+320] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm24, 0 + vpclmulqdq zmm17, zmm21, zmm24, 1 + vpclmulqdq zmm18, zmm21, zmm24, 16 + vpclmulqdq zmm19, zmm21, zmm24, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rcx+384] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm25, 0 + vpclmulqdq zmm17, zmm21, zmm25, 1 + vpclmulqdq 
zmm18, zmm21, zmm25, 16 + vpclmulqdq zmm19, zmm21, zmm25, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rcx+448] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm26, 0 + vpclmulqdq zmm17, zmm21, zmm26, 1 + vpclmulqdq zmm18, zmm21, zmm26, 16 + vpclmulqdq zmm19, zmm21, zmm26, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vpclmulqdq zmm21, zmm31, zmm27, 1 + vpshufd zmm27, zmm27, 78 + vpternlogq zmm28, zmm27, zmm21, 150 + vpclmulqdq zmm21, zmm31, zmm28, 1 + vpshufd zmm28, zmm28, 78 + vpternlogq zmm29, zmm28, zmm21, 150 + vextracti32x4 xmm0, zmm29, 1 + vextracti32x4 xmm4, zmm29, 2 + vextracti32x4 xmm5, zmm29, 3 + vpxorq xmm6, xmm29, xmm0 + vpternlogq xmm6, xmm5, xmm4, 150 +L_AES_GCM_encrypt_avx512_no_windows: + vmovdqu64 zmm23, [rsp+192] + vshufi64x2 zmm23, zmm23, zmm23, 27 + vmovdqu64 zmm24, [rsp+128] + vshufi64x2 zmm24, zmm24, zmm24, 27 + vmovdqu64 zmm25, [rsp+64] + vshufi64x2 zmm25, zmm25, zmm25, 27 + vmovdqu64 zmm26, [rsp] + vshufi64x2 zmm26, zmm26, zmm26, 27 + mov r13d, r9d + and r13d, 4294967040 + cmp ebx, r13d + jge L_AES_GCM_encrypt_avx512_after_256 + ; 256 bytes of input + vbroadcasti32x4 zmm20, [rsp+1024] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [rsp+1024] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [rsp+1024], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, 
zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r10d, 11 + vbroadcasti32x4 zmm20, [r15+160] + jl L_AES_GCM_encrypt_avx512_pro_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r10d, 13 + vbroadcasti32x4 zmm20, [r15+192] + jl L_AES_GCM_encrypt_avx512_pro_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+224] +L_AES_GCM_encrypt_avx512_pro_avx512_ctr16_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [rdi+rbx] + lea rdx, QWORD PTR [rsi+rbx] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + 
vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + mov QWORD PTR [rsp+1056], rdx + add ebx, 256 + cmp ebx, r13d + jge L_AES_GCM_encrypt_avx512_last_ghash +L_AES_GCM_encrypt_avx512_ghash_128: + vbroadcasti32x4 zmm20, [rsp+1024] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [rsp+1024] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [rsp+1024], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, 
zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r10d, 11 + vbroadcasti32x4 zmm20, [r15+160] + jl L_AES_GCM_encrypt_avx512_pip_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r10d, 13 + vbroadcasti32x4 zmm20, [r15+192] + jl L_AES_GCM_encrypt_avx512_pip_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+224] +L_AES_GCM_encrypt_avx512_pip_avx512_ctr16_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [rdi+rbx] + lea rdx, QWORD PTR [rsi+rbx] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + mov rcx, QWORD PTR [rsp+1056] + vpxorq zmm20, zmm20, zmm20 + vinserti32x4 zmm20, zmm20, xmm6, 0 + vmovdqu64 zmm21, [rcx] + vpshufb zmm21, zmm21, zmm30 + vpxorq zmm21, zmm21, zmm20 + vpclmulqdq zmm16, zmm21, zmm23, 0 + vpclmulqdq zmm17, zmm21, zmm23, 1 + vpclmulqdq zmm18, zmm21, zmm23, 16 + vpclmulqdq zmm19, zmm21, zmm23, 17 + vmovdqa64 zmm27, zmm16 + vpxorq zmm28, zmm18, zmm17 + vmovdqa64 zmm29, zmm19 + vmovdqu64 zmm21, [rcx+64] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm24, 0 + vpclmulqdq zmm17, 
zmm21, zmm24, 1 + vpclmulqdq zmm18, zmm21, zmm24, 16 + vpclmulqdq zmm19, zmm21, zmm24, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rcx+128] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm25, 0 + vpclmulqdq zmm17, zmm21, zmm25, 1 + vpclmulqdq zmm18, zmm21, zmm25, 16 + vpclmulqdq zmm19, zmm21, zmm25, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rcx+192] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm26, 0 + vpclmulqdq zmm17, zmm21, zmm26, 1 + vpclmulqdq zmm18, zmm21, zmm26, 16 + vpclmulqdq zmm19, zmm21, zmm26, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vpclmulqdq zmm21, zmm31, zmm27, 1 + vpshufd zmm27, zmm27, 78 + vpternlogq zmm28, zmm27, zmm21, 150 + vpclmulqdq zmm21, zmm31, zmm28, 1 + vpshufd zmm28, zmm28, 78 + vpternlogq zmm29, zmm28, zmm21, 150 + vextracti32x4 xmm0, zmm29, 1 + vextracti32x4 xmm4, zmm29, 2 + vextracti32x4 xmm5, zmm29, 3 + vpxorq xmm6, xmm29, xmm0 + vpternlogq xmm6, xmm5, xmm4, 150 + mov QWORD PTR [rsp+1056], rdx + add ebx, 256 + cmp ebx, r13d + jl L_AES_GCM_encrypt_avx512_ghash_128 +L_AES_GCM_encrypt_avx512_last_ghash: + mov rcx, QWORD PTR [rsp+1056] + vpxorq zmm20, zmm20, zmm20 + vinserti32x4 zmm20, zmm20, xmm6, 0 + vmovdqu64 zmm21, [rcx] + vpshufb zmm21, zmm21, zmm30 + vpxorq zmm21, zmm21, zmm20 + vpclmulqdq zmm16, zmm21, zmm23, 0 + vpclmulqdq zmm17, zmm21, zmm23, 1 + vpclmulqdq zmm18, zmm21, zmm23, 16 + vpclmulqdq zmm19, zmm21, zmm23, 17 + vmovdqa64 zmm27, zmm16 + vpxorq zmm28, zmm18, zmm17 + vmovdqa64 zmm29, zmm19 + vmovdqu64 zmm21, [rcx+64] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm24, 0 + vpclmulqdq zmm17, zmm21, zmm24, 1 + vpclmulqdq zmm18, zmm21, zmm24, 16 + vpclmulqdq zmm19, zmm21, zmm24, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + 
vmovdqu64 zmm21, [rcx+128] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm25, 0 + vpclmulqdq zmm17, zmm21, zmm25, 1 + vpclmulqdq zmm18, zmm21, zmm25, 16 + vpclmulqdq zmm19, zmm21, zmm25, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rcx+192] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm26, 0 + vpclmulqdq zmm17, zmm21, zmm26, 1 + vpclmulqdq zmm18, zmm21, zmm26, 16 + vpclmulqdq zmm19, zmm21, zmm26, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vpclmulqdq zmm21, zmm31, zmm27, 1 + vpshufd zmm27, zmm27, 78 + vpternlogq zmm28, zmm27, zmm21, 150 + vpclmulqdq zmm21, zmm31, zmm28, 1 + vpshufd zmm28, zmm28, 78 + vpternlogq zmm29, zmm28, zmm21, 150 + vextracti32x4 xmm0, zmm29, 1 + vextracti32x4 xmm4, zmm29, 2 + vextracti32x4 xmm5, zmm29, 3 + vpxorq xmm6, xmm29, xmm0 + vpternlogq xmm6, xmm5, xmm4, 150 +L_AES_GCM_encrypt_avx512_after_256: + vmovdqu xmm5, OWORD PTR [rsp] +L_AES_GCM_encrypt_avx512_done_128: + mov edx, r9d + cmp ebx, edx + jge L_AES_GCM_encrypt_avx512_done_enc + mov r13d, r9d + and r13d, 4294967280 + cmp ebx, r13d + jge L_AES_GCM_encrypt_avx512_last_block_done + vmovdqu xmm9, OWORD PTR [rsp+1024] + vpshufb xmm8, xmm9, OWORD PTR L_avx512_aes_gcm_bswap_epi64 + vpaddd xmm9, xmm9, OWORD PTR L_avx512_aes_gcm_one + vmovdqu OWORD PTR [rsp+1024], xmm9 + vpxor xmm8, xmm8, [r15] + vaesenc xmm8, xmm8, [r15+16] + vaesenc xmm8, xmm8, [r15+32] + vaesenc xmm8, xmm8, [r15+48] + vaesenc xmm8, xmm8, [r15+64] + vaesenc xmm8, xmm8, [r15+80] + vaesenc xmm8, xmm8, [r15+96] + vaesenc xmm8, xmm8, [r15+112] + vaesenc xmm8, xmm8, [r15+128] + vaesenc xmm8, xmm8, [r15+144] + cmp r10d, 11 + vmovdqa xmm9, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_avx512_aesenc_block_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [r15+176] + cmp r10d, 13 + vmovdqa xmm9, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_avx512_aesenc_block_last + vaesenc 
xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [r15+208] + vmovdqa xmm9, OWORD PTR [r15+224] +L_AES_GCM_encrypt_avx512_aesenc_block_last: + vaesenclast xmm8, xmm8, xmm9 + vmovdqu xmm9, OWORD PTR [rdi+rbx] + vpxor xmm8, xmm8, xmm9 + vmovdqu OWORD PTR [rsi+rbx], xmm8 + vpshufb xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_bswap_mask + vpxor xmm6, xmm6, xmm8 + add ebx, 16 + cmp ebx, r13d + jge L_AES_GCM_encrypt_avx512_last_block_ghash +L_AES_GCM_encrypt_avx512_last_block_start: + vmovdqu xmm13, OWORD PTR [rdi+rbx] + vmovdqu xmm9, OWORD PTR [rsp+1024] + vpshufb xmm8, xmm9, OWORD PTR L_avx512_aes_gcm_bswap_epi64 + vpaddd xmm9, xmm9, OWORD PTR L_avx512_aes_gcm_one + vmovdqu OWORD PTR [rsp+1024], xmm9 + vpxor xmm8, xmm8, [r15] + vpclmulqdq xmm10, xmm6, xmm5, 16 + vaesenc xmm8, xmm8, [r15+16] + vaesenc xmm8, xmm8, [r15+32] + vpclmulqdq xmm11, xmm6, xmm5, 1 + vaesenc xmm8, xmm8, [r15+48] + vaesenc xmm8, xmm8, [r15+64] + vpclmulqdq xmm12, xmm6, xmm5, 0 + vaesenc xmm8, xmm8, [r15+80] + vpclmulqdq xmm1, xmm6, xmm5, 17 + vaesenc xmm8, xmm8, [r15+96] + vpxor xmm10, xmm10, xmm11 + vpslldq xmm2, xmm10, 8 + vpsrldq xmm10, xmm10, 8 + vaesenc xmm8, xmm8, [r15+112] + vpxor xmm2, xmm2, xmm12 + vpxor xmm3, xmm1, xmm10 + vmovdqa xmm0, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm2, xmm0, 16 + vaesenc xmm8, xmm8, [r15+128] + vpshufd xmm10, xmm2, 78 + vpxor xmm10, xmm10, xmm11 + vpclmulqdq xmm11, xmm10, xmm0, 16 + vaesenc xmm8, xmm8, [r15+144] + vpshufd xmm10, xmm10, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm6, xmm10, xmm3 + cmp r10d, 11 + vmovdqa xmm9, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_avx512_aesenc_gfmul_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [r15+176] + cmp r10d, 13 + vmovdqa xmm9, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_avx512_aesenc_gfmul_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [r15+208] + vmovdqa xmm9, OWORD PTR [r15+224] +L_AES_GCM_encrypt_avx512_aesenc_gfmul_last: + vaesenclast xmm8, xmm8, xmm9 + vmovdqa xmm0, xmm13 + vpxor xmm8, xmm8, xmm0 + 
vmovdqu OWORD PTR [rsi+rbx], xmm8 + vpshufb xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_bswap_mask + add ebx, 16 + vpxor xmm6, xmm6, xmm8 + cmp ebx, r13d + jl L_AES_GCM_encrypt_avx512_last_block_start +L_AES_GCM_encrypt_avx512_last_block_ghash: + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm5, 78 + vpxor xmm9, xmm9, xmm5 + vpshufd xmm10, xmm6, 78 + vpxor xmm10, xmm10, xmm6 + vpclmulqdq xmm8, xmm6, xmm5, 0 + vpclmulqdq xmm11, xmm6, xmm5, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm6, xmm11 +L_AES_GCM_encrypt_avx512_last_block_done: + mov ecx, r9d + mov edx, ecx + and ecx, 15 + jz L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_done + vmovdqu xmm4, OWORD PTR [rsp+1024] + vpshufb xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_bswap_epi64 + vpxor xmm4, xmm4, [r15] + vaesenc xmm4, xmm4, [r15+16] + vaesenc xmm4, xmm4, [r15+32] + vaesenc xmm4, xmm4, [r15+48] + vaesenc xmm4, xmm4, [r15+64] + vaesenc xmm4, xmm4, [r15+80] + vaesenc xmm4, xmm4, [r15+96] + vaesenc xmm4, xmm4, [r15+112] + vaesenc xmm4, xmm4, [r15+128] + vaesenc xmm4, xmm4, [r15+144] + cmp r10d, 11 + vmovdqa xmm9, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_aesenc_avx_last + vaesenc xmm4, xmm4, xmm9 + vaesenc xmm4, xmm4, [r15+176] + cmp r10d, 13 + vmovdqa xmm9, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_aesenc_avx_last + vaesenc xmm4, xmm4, xmm9 + vaesenc xmm4, xmm4, [r15+208] + vmovdqa xmm9, OWORD PTR [r15+224] +L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_aesenc_avx_last: + vaesenclast xmm4, xmm4, xmm9 + sub rsp, 16 + xor ecx, ecx + vmovdqu OWORD PTR [rsp], xmm4 +L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_loop: + movzx r13d, BYTE PTR [rdi+rbx] + xor r13b, BYTE PTR [rsp+rcx] + mov 
BYTE PTR [rsi+rbx], r13b + mov BYTE PTR [rsp+rcx], r13b + inc ebx + inc ecx + cmp ebx, edx + jl L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_loop + xor r13, r13 + cmp ecx, 16 + je L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_finish_enc +L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_byte_loop: + mov BYTE PTR [rsp+rcx], r13b + inc ecx + cmp ecx, 16 + jl L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_byte_loop +L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_finish_enc: + vmovdqu xmm4, OWORD PTR [rsp] + add rsp, 16 + vpshufb xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_bswap_mask + vpxor xmm6, xmm6, xmm4 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm5, 78 + vpxor xmm9, xmm9, xmm5 + vpshufd xmm10, xmm6, 78 + vpxor xmm10, xmm10, xmm6 + vpclmulqdq xmm8, xmm6, xmm5, 0 + vpclmulqdq xmm11, xmm6, xmm5, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm6, xmm11 +L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_done: +L_AES_GCM_encrypt_avx512_done_enc: + mov edx, r9d + mov ecx, r11d + shl rdx, 3 + shl rcx, 3 + vmovq xmm0, rdx + vmovq xmm1, rcx + vpunpcklqdq xmm0, xmm0, xmm1 + vpxor xmm6, xmm6, xmm0 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm5, 78 + vpxor xmm9, xmm9, xmm5 + vpshufd xmm10, xmm6, 78 + vpxor xmm10, xmm10, xmm6 + vpclmulqdq xmm8, xmm6, xmm5, 0 + vpclmulqdq xmm11, xmm6, xmm5, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm6, xmm11 + vpshufb xmm6, xmm6, OWORD PTR L_avx512_aes_gcm_bswap_mask + vmovdqu xmm0, OWORD PTR 
[rsp+1040] + vpxor xmm0, xmm0, xmm6 + cmp r14d, 16 + je L_AES_GCM_encrypt_avx512_store_tag_16 + xor rcx, rcx + vmovdqu OWORD PTR [rsp], xmm0 +L_AES_GCM_encrypt_avx512_store_tag_loop: + movzx r13d, BYTE PTR [rsp+rcx] + mov BYTE PTR [r8+rcx], r13b + inc ecx + cmp ecx, r14d + jne L_AES_GCM_encrypt_avx512_store_tag_loop + jmp L_AES_GCM_encrypt_avx512_store_tag_done +L_AES_GCM_encrypt_avx512_store_tag_16: + vmovdqu OWORD PTR [r8], xmm0 +L_AES_GCM_encrypt_avx512_store_tag_done: + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp+1088] + vmovdqu xmm7, OWORD PTR [rsp+1104] + vmovdqu xmm8, OWORD PTR [rsp+1120] + vmovdqu xmm9, OWORD PTR [rsp+1136] + vmovdqu xmm10, OWORD PTR [rsp+1152] + vmovdqu xmm11, OWORD PTR [rsp+1168] + vmovdqu xmm12, OWORD PTR [rsp+1184] + vmovdqu xmm13, OWORD PTR [rsp+1200] + vmovdqu xmm14, OWORD PTR [rsp+1216] + vmovdqu xmm15, OWORD PTR [rsp+1232] + add rsp, 1248 + pop r15 + pop r14 + pop rbx + pop r12 + pop rsi + pop rdi + pop r13 + ret +AES_GCM_encrypt_avx512 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +AES_GCM_decrypt_avx512 PROC + push r13 + push rdi + push rsi + push r12 + push rbx + push r14 + push r15 + push rbp + mov rdi, rcx + mov rsi, rdx + mov r12, r8 + mov rax, r9 + mov r8, QWORD PTR [rsp+104] + mov r9d, DWORD PTR [rsp+112] + mov r11d, DWORD PTR [rsp+120] + mov ebx, DWORD PTR [rsp+128] + mov r14d, DWORD PTR [rsp+136] + mov r15, QWORD PTR [rsp+144] + mov r10d, DWORD PTR [rsp+152] + mov rbp, QWORD PTR [rsp+160] + sub rsp, 1216 + vmovdqu OWORD PTR [rsp+1056], xmm6 + vmovdqu OWORD PTR [rsp+1072], xmm7 + vmovdqu OWORD PTR [rsp+1088], xmm8 + vmovdqu OWORD PTR [rsp+1104], xmm9 + vmovdqu OWORD PTR [rsp+1120], xmm10 + vmovdqu OWORD PTR [rsp+1136], xmm11 + vmovdqu OWORD PTR [rsp+1152], xmm12 + vmovdqu OWORD PTR [rsp+1168], xmm13 + vmovdqu OWORD PTR [rsp+1184], xmm14 + vmovdqu OWORD PTR [rsp+1200], xmm15 + vpxor xmm4, xmm4, xmm4 + vpxor xmm6, xmm6, xmm6 + cmp ebx, 12 + mov edx, ebx + jne L_AES_GCM_decrypt_avx512_iv_not_12 + ; # Calculate values when IV is 
12 bytes + ; Set counter based on IV + mov ecx, 16777216 + vmovq xmm4, QWORD PTR [rax] + vpinsrd xmm4, xmm4, DWORD PTR [rax+8], 2 + vpinsrd xmm4, xmm4, ecx, 3 + ; H = Encrypt X(=0) and T = Encrypt counter + vmovdqa xmm5, OWORD PTR [r15] + vpxor xmm1, xmm4, xmm5 + vmovdqa xmm7, OWORD PTR [r15+16] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+32] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+48] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+64] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+80] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+96] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+112] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+128] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+144] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + cmp r10d, 11 + vmovdqa xmm7, OWORD PTR [r15+160] + jl L_AES_GCM_decrypt_avx512_calc_iv_12_last + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+176] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + cmp r10d, 13 + vmovdqa xmm7, OWORD PTR [r15+192] + jl L_AES_GCM_decrypt_avx512_calc_iv_12_last + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+208] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+224] +L_AES_GCM_decrypt_avx512_calc_iv_12_last: + vaesenclast xmm5, xmm5, xmm7 + vaesenclast xmm1, xmm1, xmm7 + vpshufb xmm5, xmm5, OWORD PTR L_avx512_aes_gcm_bswap_mask + vmovdqu OWORD PTR [rsp+1040], xmm1 + jmp L_AES_GCM_decrypt_avx512_iv_done +L_AES_GCM_decrypt_avx512_iv_not_12: + ; Calculate values when IV is not 12 bytes + ; H = Encrypt X(=0) + vmovdqa xmm5, OWORD PTR [r15] + vaesenc xmm5, xmm5, [r15+16] + 
vaesenc xmm5, xmm5, [r15+32] + vaesenc xmm5, xmm5, [r15+48] + vaesenc xmm5, xmm5, [r15+64] + vaesenc xmm5, xmm5, [r15+80] + vaesenc xmm5, xmm5, [r15+96] + vaesenc xmm5, xmm5, [r15+112] + vaesenc xmm5, xmm5, [r15+128] + vaesenc xmm5, xmm5, [r15+144] + cmp r10d, 11 + vmovdqa xmm9, OWORD PTR [r15+160] + jl L_AES_GCM_decrypt_avx512_calc_iv_1_aesenc_avx_last + vaesenc xmm5, xmm5, xmm9 + vaesenc xmm5, xmm5, [r15+176] + cmp r10d, 13 + vmovdqa xmm9, OWORD PTR [r15+192] + jl L_AES_GCM_decrypt_avx512_calc_iv_1_aesenc_avx_last + vaesenc xmm5, xmm5, xmm9 + vaesenc xmm5, xmm5, [r15+208] + vmovdqa xmm9, OWORD PTR [r15+224] +L_AES_GCM_decrypt_avx512_calc_iv_1_aesenc_avx_last: + vaesenclast xmm5, xmm5, xmm9 + vpshufb xmm5, xmm5, OWORD PTR L_avx512_aes_gcm_bswap_mask + ; Calc counter + ; Initialization vector + cmp edx, 0 + mov rcx, 0 + je L_AES_GCM_decrypt_avx512_calc_iv_done + cmp edx, 16 + jl L_AES_GCM_decrypt_avx512_calc_iv_lt16 + and edx, 4294967280 +L_AES_GCM_decrypt_avx512_calc_iv_16_loop: + vmovdqu xmm8, OWORD PTR [rax+rcx] + vpshufb xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_bswap_mask + vpxor xmm4, xmm4, xmm8 + ; ghash_gfmul_avx + vpshufd xmm1, xmm4, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpxor xmm1, xmm1, xmm4 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm7, xmm0 + vmovdqa xmm4, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm7, xmm7, xmm2 + vpxor xmm4, xmm4, xmm1 + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm4, xmm4, xmm1 + vpslld xmm0, xmm7, 31 + vpslld xmm1, xmm7, 30 + vpslld xmm2, xmm7, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm7, xmm7, xmm0 + vpsrld xmm2, 
xmm7, 1 + vpsrld xmm3, xmm7, 2 + vpsrld xmm0, xmm7, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm7 + vpxor xmm4, xmm4, xmm2 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_decrypt_avx512_calc_iv_16_loop + mov edx, ebx + cmp ecx, edx + je L_AES_GCM_decrypt_avx512_calc_iv_done +L_AES_GCM_decrypt_avx512_calc_iv_lt16: + sub rsp, 16 + vpxor xmm8, xmm8, xmm8 + xor ebx, ebx + vmovdqu OWORD PTR [rsp], xmm8 +L_AES_GCM_decrypt_avx512_calc_iv_loop: + movzx r13d, BYTE PTR [rax+rcx] + mov BYTE PTR [rsp+rbx], r13b + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_decrypt_avx512_calc_iv_loop + vmovdqu xmm8, OWORD PTR [rsp] + add rsp, 16 + vpshufb xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_bswap_mask + vpxor xmm4, xmm4, xmm8 + ; ghash_gfmul_avx + vpshufd xmm1, xmm4, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpxor xmm1, xmm1, xmm4 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm7, xmm0 + vmovdqa xmm4, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm7, xmm7, xmm2 + vpxor xmm4, xmm4, xmm1 + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm4, xmm4, xmm1 + vpslld xmm0, xmm7, 31 + vpslld xmm1, xmm7, 30 + vpslld xmm2, xmm7, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm7, xmm7, xmm0 + vpsrld xmm2, xmm7, 1 + vpsrld xmm3, xmm7, 2 + vpsrld xmm0, xmm7, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm7 + vpxor xmm4, xmm4, xmm2 +L_AES_GCM_decrypt_avx512_calc_iv_done: + ; T = Encrypt counter + vpxor xmm0, xmm0, xmm0 + shl edx, 3 + vmovq xmm0, rdx + vpxor xmm4, xmm4, xmm0 + ; ghash_gfmul_avx + 
vpshufd xmm1, xmm4, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpxor xmm1, xmm1, xmm4 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm7, xmm0 + vmovdqa xmm4, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm7, xmm7, xmm2 + vpxor xmm4, xmm4, xmm1 + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm4, xmm4, xmm1 + vpslld xmm0, xmm7, 31 + vpslld xmm1, xmm7, 30 + vpslld xmm2, xmm7, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm7, xmm7, xmm0 + vpsrld xmm2, xmm7, 1 + vpsrld xmm3, xmm7, 2 + vpsrld xmm0, xmm7, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm7 + vpxor xmm4, xmm4, xmm2 + vpshufb xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_bswap_mask + ; Encrypt counter + vmovdqa xmm8, OWORD PTR [r15] + vpxor xmm8, xmm8, xmm4 + vaesenc xmm8, xmm8, [r15+16] + vaesenc xmm8, xmm8, [r15+32] + vaesenc xmm8, xmm8, [r15+48] + vaesenc xmm8, xmm8, [r15+64] + vaesenc xmm8, xmm8, [r15+80] + vaesenc xmm8, xmm8, [r15+96] + vaesenc xmm8, xmm8, [r15+112] + vaesenc xmm8, xmm8, [r15+128] + vaesenc xmm8, xmm8, [r15+144] + cmp r10d, 11 + vmovdqa xmm9, OWORD PTR [r15+160] + jl L_AES_GCM_decrypt_avx512_calc_iv_2_aesenc_avx_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [r15+176] + cmp r10d, 13 + vmovdqa xmm9, OWORD PTR [r15+192] + jl L_AES_GCM_decrypt_avx512_calc_iv_2_aesenc_avx_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [r15+208] + vmovdqa xmm9, OWORD PTR [r15+224] +L_AES_GCM_decrypt_avx512_calc_iv_2_aesenc_avx_last: + vaesenclast xmm8, xmm8, xmm9 + vmovdqu OWORD PTR [rsp+1040], xmm8 +L_AES_GCM_decrypt_avx512_iv_done: + ; Additional 
authentication data + mov edx, r11d + cmp edx, 0 + je L_AES_GCM_decrypt_avx512_calc_aad_done + xor ecx, ecx + cmp edx, 16 + jl L_AES_GCM_decrypt_avx512_calc_aad_lt16 + and edx, 4294967280 +L_AES_GCM_decrypt_avx512_calc_aad_16_loop: + vmovdqu xmm8, OWORD PTR [r12+rcx] + vpshufb xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_bswap_mask + vpxor xmm6, xmm6, xmm8 + ; ghash_gfmul_avx + vpshufd xmm1, xmm6, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm6, 17 + vpclmulqdq xmm0, xmm5, xmm6, 0 + vpxor xmm1, xmm1, xmm6 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm7, xmm0 + vmovdqa xmm6, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm7, xmm7, xmm2 + vpxor xmm6, xmm6, xmm1 + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm6, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm6, xmm6, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm6, xmm6, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm6, xmm6, xmm1 + vpslld xmm0, xmm7, 31 + vpslld xmm1, xmm7, 30 + vpslld xmm2, xmm7, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm7, xmm7, xmm0 + vpsrld xmm2, xmm7, 1 + vpsrld xmm3, xmm7, 2 + vpsrld xmm0, xmm7, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm7 + vpxor xmm6, xmm6, xmm2 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_decrypt_avx512_calc_aad_16_loop + mov edx, r11d + cmp ecx, edx + je L_AES_GCM_decrypt_avx512_calc_aad_done +L_AES_GCM_decrypt_avx512_calc_aad_lt16: + sub rsp, 16 + vpxor xmm8, xmm8, xmm8 + xor ebx, ebx + vmovdqu OWORD PTR [rsp], xmm8 +L_AES_GCM_decrypt_avx512_calc_aad_loop: + movzx r13d, BYTE PTR [r12+rcx] + mov BYTE PTR [rsp+rbx], r13b + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_decrypt_avx512_calc_aad_loop + vmovdqu xmm8, OWORD PTR [rsp] + add rsp, 16 + vpshufb xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_bswap_mask + vpxor xmm6, 
xmm6, xmm8 + ; ghash_gfmul_avx + vpshufd xmm1, xmm6, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm6, 17 + vpclmulqdq xmm0, xmm5, xmm6, 0 + vpxor xmm1, xmm1, xmm6 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm7, xmm0 + vmovdqa xmm6, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm7, xmm7, xmm2 + vpxor xmm6, xmm6, xmm1 + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm6, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm6, xmm6, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm6, xmm6, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm6, xmm6, xmm1 + vpslld xmm0, xmm7, 31 + vpslld xmm1, xmm7, 30 + vpslld xmm2, xmm7, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm7, xmm7, xmm0 + vpsrld xmm2, xmm7, 1 + vpsrld xmm3, xmm7, 2 + vpsrld xmm0, xmm7, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm7 + vpxor xmm6, xmm6, xmm2 +L_AES_GCM_decrypt_avx512_calc_aad_done: + ; Calculate counter and H + vpsrlq xmm9, xmm5, 63 + vpsllq xmm8, xmm5, 1 + vpslldq xmm9, xmm9, 8 + vpor xmm8, xmm8, xmm9 + vpshufd xmm5, xmm5, 255 + vpsrad xmm5, xmm5, 31 + vpshufb xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_bswap_epi64 + vpand xmm5, xmm5, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpaddd xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_one + vpxor xmm5, xmm5, xmm8 + vmovdqu OWORD PTR [rsp+1024], xmm4 + xor ebx, ebx + cmp r9d, 256 + jl L_AES_GCM_decrypt_avx512_done_128 + vmovdqa xmm2, xmm6 + ; H ^ 1 + vmovdqu OWORD PTR [rsp], xmm5 + ; H ^ 2 + vpclmulqdq xmm8, xmm5, xmm5, 0 + vpclmulqdq xmm11, xmm5, xmm5, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + 
vmovdqa xmm0, xmm11 + vmovdqu OWORD PTR [rsp+16], xmm0 + ; H ^ 3 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm5, 78 + vpxor xmm9, xmm9, xmm5 + vpshufd xmm10, xmm0, 78 + vpxor xmm10, xmm10, xmm0 + vpclmulqdq xmm8, xmm0, xmm5, 0 + vpclmulqdq xmm11, xmm0, xmm5, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm1, xmm11 + vmovdqu OWORD PTR [rsp+32], xmm1 + ; H ^ 4 + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm3, xmm11 + vmovdqu OWORD PTR [rsp+48], xmm3 + ; H ^ 5 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+64], xmm7 + ; H ^ 6 + vpclmulqdq xmm8, xmm1, xmm1, 0 + vpclmulqdq xmm11, xmm1, xmm1, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu 
OWORD PTR [rsp+80], xmm7 + ; H ^ 7 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm1, 78 + vpxor xmm9, xmm9, xmm1 + vpshufd xmm10, xmm3, 78 + vpxor xmm10, xmm10, xmm3 + vpclmulqdq xmm8, xmm3, xmm1, 0 + vpclmulqdq xmm11, xmm3, xmm1, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+96], xmm7 + ; H ^ 8 + vpclmulqdq xmm8, xmm3, xmm3, 0 + vpclmulqdq xmm11, xmm3, xmm3, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+112], xmm7 + ; H ^ 9 + vmovdqu xmm0, OWORD PTR [rsp+48] + vmovdqu xmm1, OWORD PTR [rsp+64] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+128], xmm7 + ; H ^ 10 + vmovdqu xmm0, OWORD PTR [rsp+64] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, 
xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+144], xmm7 + ; H ^ 11 + vmovdqu xmm0, OWORD PTR [rsp+64] + vmovdqu xmm1, OWORD PTR [rsp+80] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+160], xmm7 + ; H ^ 12 + vmovdqu xmm0, OWORD PTR [rsp+80] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+176], xmm7 + ; H ^ 13 + vmovdqu xmm0, OWORD PTR [rsp+80] + vmovdqu xmm1, OWORD PTR [rsp+96] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+192], xmm7 + ; H ^ 14 + vmovdqu xmm0, OWORD PTR [rsp+96] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa 
xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+208], xmm7 + ; H ^ 15 + vmovdqu xmm0, OWORD PTR [rsp+96] + vmovdqu xmm1, OWORD PTR [rsp+112] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+224], xmm7 + ; H ^ 16 + vmovdqu xmm0, OWORD PTR [rsp+112] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+240], xmm7 + cmp r9d, 512 + jl L_AES_GCM_decrypt_avx512_no_ext + ; H ^ 17 + vmovdqu xmm0, OWORD PTR [rsp+112] + vmovdqu xmm1, OWORD PTR [rsp+128] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + 
vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+256], xmm7 + ; H ^ 18 + vmovdqu xmm0, OWORD PTR [rsp+128] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+272], xmm7 + ; H ^ 19 + vmovdqu xmm0, OWORD PTR [rsp+128] + vmovdqu xmm1, OWORD PTR [rsp+144] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+288], xmm7 + ; H ^ 20 + vmovdqu xmm0, OWORD PTR [rsp+144] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+304], xmm7 + ; H ^ 21 + vmovdqu xmm0, OWORD PTR [rsp+144] + vmovdqu xmm1, OWORD PTR [rsp+160] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, 
OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+320], xmm7 + ; H ^ 22 + vmovdqu xmm0, OWORD PTR [rsp+160] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+336], xmm7 + ; H ^ 23 + vmovdqu xmm0, OWORD PTR [rsp+160] + vmovdqu xmm1, OWORD PTR [rsp+176] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+352], xmm7 + ; H ^ 24 + vmovdqu xmm0, OWORD PTR [rsp+176] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+368], xmm7 + ; H ^ 25 + vmovdqu xmm0, OWORD PTR [rsp+176] + vmovdqu xmm1, OWORD PTR [rsp+192] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd 
xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+384], xmm7 + ; H ^ 26 + vmovdqu xmm0, OWORD PTR [rsp+192] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+400], xmm7 + ; H ^ 27 + vmovdqu xmm0, OWORD PTR [rsp+192] + vmovdqu xmm1, OWORD PTR [rsp+208] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+416], xmm7 + ; H ^ 28 + vmovdqu xmm0, OWORD PTR [rsp+208] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu 
OWORD PTR [rsp+432], xmm7 + ; H ^ 29 + vmovdqu xmm0, OWORD PTR [rsp+208] + vmovdqu xmm1, OWORD PTR [rsp+224] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+448], xmm7 + ; H ^ 30 + vmovdqu xmm0, OWORD PTR [rsp+224] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+464], xmm7 + ; H ^ 31 + vmovdqu xmm0, OWORD PTR [rsp+224] + vmovdqu xmm1, OWORD PTR [rsp+240] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+480], xmm7 + ; H ^ 32 + vmovdqu xmm0, OWORD PTR [rsp+240] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 
1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+496], xmm7 +L_AES_GCM_decrypt_avx512_no_ext: + vbroadcasti32x4 zmm22, ptr_L_avx512_aes_gcm_bswap_epi64 + vbroadcasti32x4 zmm30, ptr_L_avx512_aes_gcm_bswap_mask + vbroadcasti32x4 zmm31, ptr_L_avx512_aes_gcm_mod2_128 + vbroadcasti32x4 zmm9, [r15] + vbroadcasti32x4 zmm10, [r15+16] + vbroadcasti32x4 zmm11, [r15+32] + vbroadcasti32x4 zmm12, [r15+48] + vbroadcasti32x4 zmm13, [r15+64] + vbroadcasti32x4 zmm14, [r15+80] + vbroadcasti32x4 zmm15, [r15+96] + vbroadcasti32x4 zmm1, [r15+112] + vbroadcasti32x4 zmm2, [r15+128] + vbroadcasti32x4 zmm3, [r15+144] + cmp r9d, 512 + jl L_AES_GCM_decrypt_avx512_no_windows + mov r13d, r9d + and r13d, 4294966784 + vmovdqu64 zmm23, [rsp+448] + vshufi64x2 zmm23, zmm23, zmm23, 27 + vmovdqu64 zmm24, [rsp+384] + vshufi64x2 zmm24, zmm24, zmm24, 27 + vmovdqu64 zmm25, [rsp+320] + vshufi64x2 zmm25, zmm25, zmm25, 27 + vmovdqu64 zmm26, [rsp+256] + vshufi64x2 zmm26, zmm26, zmm26, 27 + vmovdqu64 [rsp+512], zmm23 + vmovdqu64 [rsp+576], zmm24 + vmovdqu64 [rsp+640], zmm25 + vmovdqu64 [rsp+704], zmm26 + vmovdqu64 zmm23, [rsp+192] + vshufi64x2 zmm23, zmm23, zmm23, 27 + vmovdqu64 zmm24, [rsp+128] + vshufi64x2 zmm24, zmm24, zmm24, 27 + vmovdqu64 zmm25, [rsp+64] + vshufi64x2 zmm25, zmm25, zmm25, 27 + vmovdqu64 zmm26, [rsp] + vshufi64x2 zmm26, zmm26, zmm26, 27 + vmovdqu64 [rsp+768], zmm23 + vmovdqu64 [rsp+832], zmm24 + vmovdqu64 [rsp+896], zmm25 + vmovdqu64 [rsp+960], zmm26 + ; 512 bytes of input + xor r12d, r12d + lea rax, QWORD PTR [rdi+rbx] + vpxorq zmm20, zmm20, zmm20 + vinserti32x4 zmm20, zmm20, xmm6, 0 + vmovdqu64 zmm23, [rsp+512] + vmovdqu64 zmm24, [rsp+576] + vmovdqu64 zmm25, [rsp+640] + vmovdqu64 zmm26, [rsp+704] + vmovdqu64 zmm21, [rax] + vpshufb zmm21, zmm21, zmm30 + vpxorq zmm21, zmm21, zmm20 + vpclmulqdq zmm16, zmm21, zmm23, 0 + vpclmulqdq 
zmm17, zmm21, zmm23, 1 + vpclmulqdq zmm18, zmm21, zmm23, 16 + vpclmulqdq zmm19, zmm21, zmm23, 17 + vmovdqa64 zmm27, zmm16 + vpxorq zmm28, zmm18, zmm17 + vmovdqa64 zmm29, zmm19 + vmovdqu64 zmm21, [rax+64] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm24, 0 + vpclmulqdq zmm17, zmm21, zmm24, 1 + vpclmulqdq zmm18, zmm21, zmm24, 16 + vpclmulqdq zmm19, zmm21, zmm24, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rax+128] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm25, 0 + vpclmulqdq zmm17, zmm21, zmm25, 1 + vpclmulqdq zmm18, zmm21, zmm25, 16 + vpclmulqdq zmm19, zmm21, zmm25, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rax+192] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm26, 0 + vpclmulqdq zmm17, zmm21, zmm26, 1 + vpclmulqdq zmm18, zmm21, zmm26, 16 + vpclmulqdq zmm19, zmm21, zmm26, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm23, [rsp+768] + vmovdqu64 zmm24, [rsp+832] + vmovdqu64 zmm25, [rsp+896] + vmovdqu64 zmm26, [rsp+960] + vmovdqu64 zmm21, [rax+256] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm23, 0 + vpclmulqdq zmm17, zmm21, zmm23, 1 + vpclmulqdq zmm18, zmm21, zmm23, 16 + vpclmulqdq zmm19, zmm21, zmm23, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rax+320] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm24, 0 + vpclmulqdq zmm17, zmm21, zmm24, 1 + vpclmulqdq zmm18, zmm21, zmm24, 16 + vpclmulqdq zmm19, zmm21, zmm24, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rax+384] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm25, 0 + vpclmulqdq zmm17, zmm21, zmm25, 1 + vpclmulqdq zmm18, zmm21, zmm25, 16 + vpclmulqdq zmm19, zmm21, zmm25, 
17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rax+448] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm26, 0 + vpclmulqdq zmm17, zmm21, zmm26, 1 + vpclmulqdq zmm18, zmm21, zmm26, 16 + vpclmulqdq zmm19, zmm21, zmm26, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vpclmulqdq zmm21, zmm31, zmm27, 1 + vpshufd zmm27, zmm27, 78 + vpternlogq zmm28, zmm27, zmm21, 150 + vpclmulqdq zmm21, zmm31, zmm28, 1 + vpshufd zmm28, zmm28, 78 + vpternlogq zmm29, zmm28, zmm21, 150 + vextracti32x4 xmm0, zmm29, 1 + vextracti32x4 xmm4, zmm29, 2 + vextracti32x4 xmm5, zmm29, 3 + vpxorq xmm6, xmm29, xmm0 + vpternlogq xmm6, xmm5, xmm4, 150 + add ebx, 512 + cmp ebx, r13d + jge L_AES_GCM_decrypt_avx512_last_aes +L_AES_GCM_decrypt_avx512_win_loop: + lea rax, QWORD PTR [rdi+rbx] + vpxorq zmm21, zmm21, zmm21 + vinserti32x4 zmm21, zmm21, xmm6, 0 + vbroadcasti32x4 zmm20, [rsp+1024] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [rsp+1024] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [rsp+1024], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vmovdqu64 zmm31, [rax] + vpshufb zmm31, zmm31, zmm30 + vpxorq zmm31, zmm31, zmm21 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vpclmulqdq zmm23, zmm31, [rsp+512], 0 + vpclmulqdq zmm24, zmm31, [rsp+512], 1 + vpclmulqdq zmm25, zmm31, [rsp+512], 16 + vpclmulqdq zmm26, zmm31, [rsp+512], 17 + vmovdqa64 zmm27, zmm23 + vpxorq zmm28, zmm25, zmm24 + vmovdqa64 
zmm29, zmm26 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vmovdqu64 zmm31, [rax+64] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vpclmulqdq zmm23, zmm31, [rsp+576], 0 + vpclmulqdq zmm24, zmm31, [rsp+576], 1 + vpclmulqdq zmm25, zmm31, [rsp+576], 16 + vpclmulqdq zmm26, zmm31, [rsp+576], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vmovdqu64 zmm31, [rax+128] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vpclmulqdq zmm23, zmm31, [rsp+640], 0 + vpclmulqdq zmm24, zmm31, [rsp+640], 1 + vpclmulqdq zmm25, zmm31, [rsp+640], 16 + vpclmulqdq zmm26, zmm31, [rsp+640], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vmovdqu64 zmm31, [rax+192] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vpclmulqdq zmm23, zmm31, [rsp+704], 0 + vpclmulqdq zmm24, zmm31, [rsp+704], 1 + vpclmulqdq zmm25, zmm31, [rsp+704], 16 + vpclmulqdq zmm26, zmm31, [rsp+704], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r10d, 11 + vbroadcasti32x4 zmm20, [r15+160] + jl 
L_AES_GCM_decrypt_avx512_a_il_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r10d, 13 + vbroadcasti32x4 zmm20, [r15+192] + jl L_AES_GCM_decrypt_avx512_a_il_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+224] +L_AES_GCM_decrypt_avx512_a_il_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [rdi+r12] + lea rdx, QWORD PTR [rsi+r12] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + add r12d, 256 + vbroadcasti32x4 zmm20, [rsp+1024] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [rsp+1024] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [rsp+1024], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vmovdqu64 zmm31, [rax+256] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm10 + vaesenc 
zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vpclmulqdq zmm23, zmm31, [rsp+768], 0 + vpclmulqdq zmm24, zmm31, [rsp+768], 1 + vpclmulqdq zmm25, zmm31, [rsp+768], 16 + vpclmulqdq zmm26, zmm31, [rsp+768], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vmovdqu64 zmm31, [rax+320] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vpclmulqdq zmm23, zmm31, [rsp+832], 0 + vpclmulqdq zmm24, zmm31, [rsp+832], 1 + vpclmulqdq zmm25, zmm31, [rsp+832], 16 + vpclmulqdq zmm26, zmm31, [rsp+832], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vmovdqu64 zmm31, [rax+384] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vpclmulqdq zmm23, zmm31, [rsp+896], 0 + vpclmulqdq zmm24, zmm31, [rsp+896], 1 + vpclmulqdq zmm25, zmm31, [rsp+896], 16 + vpclmulqdq zmm26, zmm31, [rsp+896], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vmovdqu64 zmm31, [rax+448] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vpclmulqdq zmm23, zmm31, [rsp+960], 0 + vpclmulqdq zmm24, zmm31, [rsp+960], 1 + vpclmulqdq zmm25, zmm31, [rsp+960], 16 + vpclmulqdq zmm26, zmm31, [rsp+960], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, 
zmm26 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r10d, 11 + vbroadcasti32x4 zmm20, [r15+160] + jl L_AES_GCM_decrypt_avx512_b_il_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r10d, 13 + vbroadcasti32x4 zmm20, [r15+192] + jl L_AES_GCM_decrypt_avx512_b_il_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+224] +L_AES_GCM_decrypt_avx512_b_il_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [rdi+r12] + lea rdx, QWORD PTR [rsi+r12] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + add r12d, 256 + vbroadcasti32x4 zmm31, ptr_L_avx512_aes_gcm_mod2_128 + vpclmulqdq zmm23, zmm31, zmm27, 1 + vpshufd zmm27, zmm27, 78 + vpternlogq zmm28, zmm27, zmm23, 150 + vpclmulqdq zmm23, zmm31, zmm28, 1 + vpshufd zmm28, zmm28, 78 + vpternlogq zmm29, zmm28, zmm23, 150 + vextracti32x4 xmm0, zmm29, 1 + vextracti32x4 xmm4, zmm29, 2 + vextracti32x4 xmm5, zmm29, 3 + vpxorq xmm6, xmm29, xmm0 + vpternlogq xmm6, xmm5, xmm4, 
150 + add ebx, 512 + cmp ebx, r13d + jl L_AES_GCM_decrypt_avx512_win_loop +L_AES_GCM_decrypt_avx512_last_aes: + vbroadcasti32x4 zmm20, [rsp+1024] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [rsp+1024] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [rsp+1024], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r10d, 11 + vbroadcasti32x4 zmm20, [r15+160] + jl L_AES_GCM_decrypt_avx512_l1_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, 
zmm20 + vbroadcasti32x4 zmm20, [r15+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r10d, 13 + vbroadcasti32x4 zmm20, [r15+192] + jl L_AES_GCM_decrypt_avx512_l1_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+224] +L_AES_GCM_decrypt_avx512_l1_avx512_ctr16_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [rdi+r12] + lea rdx, QWORD PTR [rsi+r12] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + add r12d, 256 + vbroadcasti32x4 zmm20, [rsp+1024] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [rsp+1024] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [rsp+1024], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc 
zmm19, zmm19, zmm11 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r10d, 11 + vbroadcasti32x4 zmm20, [r15+160] + jl L_AES_GCM_decrypt_avx512_l2_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r10d, 13 + vbroadcasti32x4 zmm20, [r15+192] + jl L_AES_GCM_decrypt_avx512_l2_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+224] +L_AES_GCM_decrypt_avx512_l2_avx512_ctr16_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [rdi+r12] + lea rdx, QWORD PTR [rsi+r12] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq 
zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + add r12d, 256 +L_AES_GCM_decrypt_avx512_no_windows: + vmovdqu64 zmm23, [rsp+192] + vshufi64x2 zmm23, zmm23, zmm23, 27 + vmovdqu64 zmm24, [rsp+128] + vshufi64x2 zmm24, zmm24, zmm24, 27 + vmovdqu64 zmm25, [rsp+64] + vshufi64x2 zmm25, zmm25, zmm25, 27 + vmovdqu64 zmm26, [rsp] + vshufi64x2 zmm26, zmm26, zmm26, 27 + mov r13d, r9d + and r13d, 4294967040 + cmp ebx, r13d + jge L_AES_GCM_decrypt_avx512_after_256 + ; 256 bytes of input + lea rax, QWORD PTR [rdi+rbx] + vpxorq zmm20, zmm20, zmm20 + vinserti32x4 zmm20, zmm20, xmm6, 0 + vmovdqu64 zmm21, [rax] + vpshufb zmm21, zmm21, zmm30 + vpxorq zmm21, zmm21, zmm20 + vpclmulqdq zmm16, zmm21, zmm23, 0 + vpclmulqdq zmm17, zmm21, zmm23, 1 + vpclmulqdq zmm18, zmm21, zmm23, 16 + vpclmulqdq zmm19, zmm21, zmm23, 17 + vmovdqa64 zmm27, zmm16 + vpxorq zmm28, zmm18, zmm17 + vmovdqa64 zmm29, zmm19 + vmovdqu64 zmm21, [rax+64] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm24, 0 + vpclmulqdq zmm17, zmm21, zmm24, 1 + vpclmulqdq zmm18, zmm21, zmm24, 16 + vpclmulqdq zmm19, zmm21, zmm24, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rax+128] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm25, 0 + vpclmulqdq zmm17, zmm21, zmm25, 1 + vpclmulqdq zmm18, zmm21, zmm25, 16 + vpclmulqdq zmm19, zmm21, zmm25, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rax+192] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm26, 0 + vpclmulqdq zmm17, zmm21, zmm26, 1 + vpclmulqdq zmm18, zmm21, zmm26, 16 + vpclmulqdq zmm19, zmm21, zmm26, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vpclmulqdq zmm21, zmm31, zmm27, 1 + 
vpshufd zmm27, zmm27, 78 + vpternlogq zmm28, zmm27, zmm21, 150 + vpclmulqdq zmm21, zmm31, zmm28, 1 + vpshufd zmm28, zmm28, 78 + vpternlogq zmm29, zmm28, zmm21, 150 + vextracti32x4 xmm0, zmm29, 1 + vextracti32x4 xmm4, zmm29, 2 + vextracti32x4 xmm5, zmm29, 3 + vpxorq xmm6, xmm29, xmm0 + vpternlogq xmm6, xmm5, xmm4, 150 + vbroadcasti32x4 zmm20, [rsp+1024] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [rsp+1024] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [rsp+1024], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp 
r10d, 11 + vbroadcasti32x4 zmm20, [r15+160] + jl L_AES_GCM_decrypt_avx512_t_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r10d, 13 + vbroadcasti32x4 zmm20, [r15+192] + jl L_AES_GCM_decrypt_avx512_t_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+224] +L_AES_GCM_decrypt_avx512_t_avx512_ctr16_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [rdi+rbx] + lea rdx, QWORD PTR [rsi+rbx] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + add ebx, 256 +L_AES_GCM_decrypt_avx512_after_256: + vmovdqu xmm5, OWORD PTR [rsp] +L_AES_GCM_decrypt_avx512_done_128: + mov edx, r9d + cmp ebx, edx + jge L_AES_GCM_decrypt_avx512_done_dec + mov r13d, r9d + and r13d, 4294967280 + cmp ebx, r13d + jge L_AES_GCM_decrypt_avx512_last_block_done +L_AES_GCM_decrypt_avx512_last_block_start: + vmovdqu xmm13, OWORD PTR [rdi+rbx] + vmovdqa xmm0, xmm5 + vpshufb xmm1, xmm13, OWORD PTR L_avx512_aes_gcm_bswap_mask + vpxor xmm1, xmm1, xmm6 + vmovdqu xmm9, OWORD PTR [rsp+1024] + vpshufb xmm8, xmm9, OWORD PTR L_avx512_aes_gcm_bswap_epi64 + vpaddd xmm9, xmm9, OWORD PTR L_avx512_aes_gcm_one + 
vmovdqu OWORD PTR [rsp+1024], xmm9 + vpxor xmm8, xmm8, [r15] + vpclmulqdq xmm10, xmm1, xmm0, 16 + vaesenc xmm8, xmm8, [r15+16] + vaesenc xmm8, xmm8, [r15+32] + vpclmulqdq xmm11, xmm1, xmm0, 1 + vaesenc xmm8, xmm8, [r15+48] + vaesenc xmm8, xmm8, [r15+64] + vpclmulqdq xmm12, xmm1, xmm0, 0 + vaesenc xmm8, xmm8, [r15+80] + vpclmulqdq xmm1, xmm1, xmm0, 17 + vaesenc xmm8, xmm8, [r15+96] + vpxor xmm10, xmm10, xmm11 + vpslldq xmm2, xmm10, 8 + vpsrldq xmm10, xmm10, 8 + vaesenc xmm8, xmm8, [r15+112] + vpxor xmm2, xmm2, xmm12 + vpxor xmm3, xmm1, xmm10 + vmovdqa xmm0, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm2, xmm0, 16 + vaesenc xmm8, xmm8, [r15+128] + vpshufd xmm10, xmm2, 78 + vpxor xmm10, xmm10, xmm11 + vpclmulqdq xmm11, xmm10, xmm0, 16 + vaesenc xmm8, xmm8, [r15+144] + vpshufd xmm10, xmm10, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm6, xmm10, xmm3 + cmp r10d, 11 + vmovdqa xmm9, OWORD PTR [r15+160] + jl L_AES_GCM_decrypt_avx512_aesenc_gfmul_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [r15+176] + cmp r10d, 13 + vmovdqa xmm9, OWORD PTR [r15+192] + jl L_AES_GCM_decrypt_avx512_aesenc_gfmul_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [r15+208] + vmovdqa xmm9, OWORD PTR [r15+224] +L_AES_GCM_decrypt_avx512_aesenc_gfmul_last: + vaesenclast xmm8, xmm8, xmm9 + vmovdqa xmm0, xmm13 + vpxor xmm8, xmm8, xmm0 + vmovdqu OWORD PTR [rsi+rbx], xmm8 + add ebx, 16 + cmp ebx, r13d + jl L_AES_GCM_decrypt_avx512_last_block_start +L_AES_GCM_decrypt_avx512_last_block_done: + mov ecx, r9d + mov edx, ecx + and ecx, 15 + jz L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_done + vmovdqu xmm4, OWORD PTR [rsp+1024] + vpshufb xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_bswap_epi64 + vpxor xmm4, xmm4, [r15] + vaesenc xmm4, xmm4, [r15+16] + vaesenc xmm4, xmm4, [r15+32] + vaesenc xmm4, xmm4, [r15+48] + vaesenc xmm4, xmm4, [r15+64] + vaesenc xmm4, xmm4, [r15+80] + vaesenc xmm4, xmm4, [r15+96] + vaesenc xmm4, xmm4, [r15+112] + vaesenc xmm4, xmm4, [r15+128] + vaesenc xmm4, xmm4, 
[r15+144] + cmp r10d, 11 + vmovdqa xmm9, OWORD PTR [r15+160] + jl L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_aesenc_avx_last + vaesenc xmm4, xmm4, xmm9 + vaesenc xmm4, xmm4, [r15+176] + cmp r10d, 13 + vmovdqa xmm9, OWORD PTR [r15+192] + jl L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_aesenc_avx_last + vaesenc xmm4, xmm4, xmm9 + vaesenc xmm4, xmm4, [r15+208] + vmovdqa xmm9, OWORD PTR [r15+224] +L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_aesenc_avx_last: + vaesenclast xmm4, xmm4, xmm9 + sub rsp, 32 + xor ecx, ecx + vmovdqu OWORD PTR [rsp], xmm4 + vpxor xmm0, xmm0, xmm0 + vmovdqu OWORD PTR [rsp+16], xmm0 +L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_loop: + movzx r13d, BYTE PTR [rdi+rbx] + mov BYTE PTR [rsp+rcx+16], r13b + xor r13b, BYTE PTR [rsp+rcx] + mov BYTE PTR [rsi+rbx], r13b + inc ebx + inc ecx + cmp ebx, edx + jl L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_loop + vmovdqu xmm4, OWORD PTR [rsp+16] + add rsp, 32 + vpshufb xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_bswap_mask + vpxor xmm6, xmm6, xmm4 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm5, 78 + vpxor xmm9, xmm9, xmm5 + vpshufd xmm10, xmm6, 78 + vpxor xmm10, xmm10, xmm6 + vpclmulqdq xmm8, xmm6, xmm5, 0 + vpclmulqdq xmm11, xmm6, xmm5, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm6, xmm11 +L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_done: +L_AES_GCM_decrypt_avx512_done_dec: + mov edx, r9d + mov ecx, r11d + shl rdx, 3 + shl rcx, 3 + vmovq xmm0, rdx + vmovq xmm1, rcx + vpunpcklqdq xmm0, xmm0, xmm1 + vpxor xmm6, xmm6, xmm0 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm5, 78 + vpxor xmm9, xmm9, xmm5 + vpshufd xmm10, xmm6, 78 + vpxor xmm10, xmm10, xmm6 + vpclmulqdq xmm8, xmm6, xmm5, 0 + vpclmulqdq xmm11, xmm6, xmm5, 17 + 
vpclmulqdq xmm9, xmm9, xmm10, 0
+        vpternlogq xmm9, xmm11, xmm8, 150
+        vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+        vpclmulqdq xmm12, xmm10, xmm8, 1
+        vpshufd xmm8, xmm8, 78
+        vpternlogq xmm9, xmm8, xmm12, 150
+        vpclmulqdq xmm12, xmm10, xmm9, 1
+        vpshufd xmm9, xmm9, 78
+        vpternlogq xmm11, xmm9, xmm12, 150
+        vmovdqa xmm6, xmm11
+        vpshufb xmm6, xmm6, OWORD PTR L_avx512_aes_gcm_bswap_mask
+        vmovdqu xmm0, OWORD PTR [rsp+1040]
+        vpxor xmm0, xmm0, xmm6
+        cmp r14d, 16
+        je L_AES_GCM_decrypt_avx512_cmp_tag_16
+        sub rsp, 16
+        xor rcx, rcx
+        xor rbx, rbx
+        vmovdqu OWORD PTR [rsp], xmm0
+L_AES_GCM_decrypt_avx512_cmp_tag_loop:
+        movzx r13d, BYTE PTR [rsp+rcx]
+        xor r13b, BYTE PTR [r8+rcx]
+        or bl, r13b
+        inc ecx
+        cmp ecx, r14d
+        jne L_AES_GCM_decrypt_avx512_cmp_tag_loop
+        cmp bl, 0
+        sete bl
+        add rsp, 16
+        xor rcx, rcx
+        jmp L_AES_GCM_decrypt_avx512_cmp_tag_done
+L_AES_GCM_decrypt_avx512_cmp_tag_16:
+        vmovdqu xmm1, OWORD PTR [r8]
+        vpcmpeqb xmm0, xmm0, xmm1
+        vpmovmskb rdx, xmm0
+        ; %%edx == 0xFFFF then return 1 else => return 0
+        xor ebx, ebx
+        cmp edx, 65535
+        sete bl
+L_AES_GCM_decrypt_avx512_cmp_tag_done:
+        mov DWORD PTR [rbp], ebx
+        vzeroupper
+        vmovdqu xmm6, OWORD PTR [rsp+1056]
+        vmovdqu xmm7, OWORD PTR [rsp+1072]
+        vmovdqu xmm8, OWORD PTR [rsp+1088]
+        vmovdqu xmm9, OWORD PTR [rsp+1104]
+        vmovdqu xmm10, OWORD PTR [rsp+1120]
+        vmovdqu xmm11, OWORD PTR [rsp+1136]
+        vmovdqu xmm12, OWORD PTR [rsp+1152]
+        vmovdqu xmm13, OWORD PTR [rsp+1168]
+        vmovdqu xmm14, OWORD PTR [rsp+1184]
+        vmovdqu xmm15, OWORD PTR [rsp+1200]
+        add rsp, 1216
+        pop rbp
+        pop r15
+        pop r14
+        pop rbx
+        pop r12
+        pop rsi
+        pop rdi
+        pop r13
+        ret
+AES_GCM_decrypt_avx512 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+; AES_GCM_init_avx512
+; Derives the AES-GCM starting values from the round keys and the IV: the
+; hash key H, the counter block for the data, and T (the encrypted initial
+; counter block).
+;
+; Arguments, in the order the code reads them (rcx, rdx, r8, r9, then stack):
+;   rcx        base of the AES round keys; read 16 bytes apart with vmovdqa,
+;              so the table must be 16-byte aligned
+;   edx        round count; compared with 11 and 13 below, selecting a
+;              10-, 12- or 14-round encryption
+;   r8         IV bytes
+;   r9d        IV length in bytes
+;   5th (stk)  destination for H: the round-key encryption of an all-zero
+;              block, shuffled with L_avx512_aes_gcm_bswap_mask
+;   6th (stk)  destination for the counter block after it has been shuffled
+;              with L_avx512_aes_gcm_bswap_epi64 and had L_avx512_aes_gcm_one
+;              added (presumably "initial counter + 1" in the layout the block
+;              routines expect; the constants are defined outside this view)
+;   7th (stk)  destination for T
+; All three destinations are written with vmovdqa (16-byte aligned).
+; xmm6, xmm7, xmm8 and xmm15 are saved on entry and restored before ret.
+AES_GCM_init_avx512 PROC
+        push rdi
+        push rsi
+        push r12
+        push r13
+        ; Move the register arguments so rcx/edx are free as scratch below.
+        mov rdi, rcx
+        mov rsi, rdx
+        mov r10, r8
+        mov r11d, r9d
+        ; Stack arguments: 72 = 4 pushes (32) + return address (8) + 32 bytes
+        ; that precede the fifth argument in the caller's frame.
+        mov rax, QWORD PTR [rsp+72]
+        mov r8, QWORD PTR [rsp+80]
+        mov r9, QWORD PTR [rsp+88]
+        sub rsp, 80
+        vmovdqu OWORD PTR [rsp+16], xmm6
+        vmovdqu OWORD PTR [rsp+32], xmm7
+        vmovdqu OWORD PTR [rsp+48], xmm8
+        vmovdqu OWORD PTR [rsp+64], xmm15
+        ; xmm4 is the counter/hash accumulator; it starts at zero.
+        vpxor xmm4, xmm4, xmm4
+        mov edx, r11d
+        cmp edx, 12
+        jne L_AES_GCM_init_avx512_iv_not_12
+        ; # Calculate values when IV is 12 bytes
+        ; Set counter based on IV
+        ; 16777216 = 0x01000000: placed in dword 3 it is stored in memory as
+        ; the bytes 00 00 00 01, following the 12 IV bytes.
+        mov ecx, 16777216
+        vmovq xmm4, QWORD PTR [r10]
+        vpinsrd xmm4, xmm4, DWORD PTR [r10+8], 2
+        vpinsrd xmm4, xmm4, ecx, 3
+        ; H = Encrypt X(=0) and T = Encrypt counter
+        ; Two encryptions run side by side: xmm5 starts as round key 0 (a
+        ; zero block XOR key) and xmm1 as counter XOR round key 0.
+        vmovdqa xmm5, OWORD PTR [rdi]
+        vpxor xmm1, xmm4, xmm5
+        vmovdqa xmm6, OWORD PTR [rdi+16]
+        vaesenc xmm5, xmm5, xmm6
+        vaesenc xmm1, xmm1, xmm6
+        vmovdqa xmm6, OWORD PTR [rdi+32]
+        vaesenc xmm5, xmm5, xmm6
+        vaesenc xmm1, xmm1, xmm6
+        vmovdqa xmm6, OWORD PTR [rdi+48]
+        vaesenc xmm5, xmm5, xmm6
+        vaesenc xmm1, xmm1, xmm6
+        vmovdqa xmm6, OWORD PTR [rdi+64]
+        vaesenc xmm5, xmm5, xmm6
+        vaesenc xmm1, xmm1, xmm6
+        vmovdqa xmm6, OWORD PTR [rdi+80]
+        vaesenc xmm5, xmm5, xmm6
+        vaesenc xmm1, xmm1, xmm6
+        vmovdqa xmm6, OWORD PTR [rdi+96]
+        vaesenc xmm5, xmm5, xmm6
+        vaesenc xmm1, xmm1, xmm6
+        vmovdqa xmm6, OWORD PTR [rdi+112]
+        vaesenc xmm5, xmm5, xmm6
+        vaesenc xmm1, xmm1, xmm6
+        vmovdqa xmm6, OWORD PTR [rdi+128]
+        vaesenc xmm5, xmm5, xmm6
+        vaesenc xmm1, xmm1, xmm6
+        vmovdqa xmm6, OWORD PTR [rdi+144]
+        vaesenc xmm5, xmm5, xmm6
+        vaesenc xmm1, xmm1, xmm6
+        ; The next key is loaded before the branch so that whichever path is
+        ; taken, xmm6 holds the key for the final vaesenclast.
+        cmp esi, 11
+        vmovdqa xmm6, OWORD PTR [rdi+160]
+        jl L_AES_GCM_init_avx512_calc_iv_12_last
+        vaesenc xmm5, xmm5, xmm6
+        vaesenc xmm1, xmm1, xmm6
+        vmovdqa xmm6, OWORD PTR [rdi+176]
+        vaesenc xmm5, xmm5, xmm6
+        vaesenc xmm1, xmm1, xmm6
+        cmp esi, 13
+        vmovdqa xmm6, OWORD PTR [rdi+192]
+        jl L_AES_GCM_init_avx512_calc_iv_12_last
+        vaesenc xmm5, xmm5, xmm6
+        vaesenc xmm1, xmm1, xmm6
+        vmovdqa xmm6, OWORD PTR [rdi+208]
+        vaesenc xmm5, xmm5, xmm6
+        vaesenc xmm1, xmm1, xmm6
+        vmovdqa xmm6, OWORD PTR [rdi+224]
+L_AES_GCM_init_avx512_calc_iv_12_last:
+        vaesenclast xmm5, xmm5, xmm6
+        vaesenclast xmm1, xmm1, xmm6
+        vpshufb xmm5, xmm5, OWORD PTR L_avx512_aes_gcm_bswap_mask
+        ; T is carried in xmm15 to the common exit.
+        vmovdqu xmm15, xmm1
+        jmp L_AES_GCM_init_avx512_iv_done
+L_AES_GCM_init_avx512_iv_not_12:
+        ; Calculate values when IV is not 12 bytes
+        ; H = Encrypt X(=0)
+        vmovdqa xmm5, OWORD PTR [rdi]
+        vaesenc xmm5, xmm5, [rdi+16]
+        vaesenc xmm5, xmm5, [rdi+32]
+        vaesenc xmm5, xmm5, [rdi+48]
+        vaesenc xmm5, xmm5, [rdi+64]
+        vaesenc xmm5, xmm5, [rdi+80]
+        vaesenc xmm5, xmm5, [rdi+96]
+        vaesenc xmm5, xmm5, [rdi+112]
+        vaesenc xmm5, xmm5, [rdi+128]
+        vaesenc xmm5, xmm5, [rdi+144]
+        cmp esi, 11
+        vmovdqa xmm8, OWORD PTR [rdi+160]
+        jl L_AES_GCM_init_avx512_calc_iv_1_aesenc_avx_last
+        vaesenc xmm5, xmm5, xmm8
+        vaesenc xmm5, xmm5, [rdi+176]
+        cmp esi, 13
+        vmovdqa xmm8, OWORD PTR [rdi+192]
+        jl L_AES_GCM_init_avx512_calc_iv_1_aesenc_avx_last
+        vaesenc xmm5, xmm5, xmm8
+        vaesenc xmm5, xmm5, [rdi+208]
+        vmovdqa xmm8, OWORD PTR [rdi+224]
+L_AES_GCM_init_avx512_calc_iv_1_aesenc_avx_last:
+        vaesenclast xmm5, xmm5, xmm8
+        vpshufb xmm5, xmm5, OWORD PTR L_avx512_aes_gcm_bswap_mask
+        ; Calc counter
+        ; Initialization vector
+        ; The mov between cmp and je does not modify flags, so the je still
+        ; tests "IV length == 0"; rcx becomes the byte offset into the IV.
+        cmp edx, 0
+        mov rcx, 0
+        je L_AES_GCM_init_avx512_calc_iv_done
+        cmp edx, 16
+        jl L_AES_GCM_init_avx512_calc_iv_lt16
+        ; Round the length down to whole 16-byte blocks for the loop below;
+        ; the full length is reloaded from r11d once the loop finishes.
+        and edx, 4294967280
+L_AES_GCM_init_avx512_calc_iv_16_loop:
+        vmovdqu xmm7, OWORD PTR [r10+rcx]
+        vpshufb xmm7, xmm7, OWORD PTR L_avx512_aes_gcm_bswap_mask
+        vpxor xmm4, xmm4, xmm7
+        ; ghash_gfmul_avx
+        ; xmm4 = xmm4 * xmm5 (H), in three steps:
+        ;  1. Karatsuba carry-less multiply: high*high -> xmm3, low*low ->
+        ;     xmm0, (hi^lo)*(hi^lo) -> xmm1; combined into a 256-bit product
+        ;     held as xmm4 (upper half) : xmm6 (lower half).
+        ;  2. Shift that 256-bit value left by one bit (vpsrld 31 collects the
+        ;     bits that cross dword boundaries, vpslld 1 shifts the rest).
+        ;  3. Fold the lower half into the upper half using left shifts by
+        ;     31/30/25 and right shifts by 1/2/7.
+        vpshufd xmm1, xmm4, 78
+        vpshufd xmm2, xmm5, 78
+        vpclmulqdq xmm3, xmm5, xmm4, 17
+        vpclmulqdq xmm0, xmm5, xmm4, 0
+        vpxor xmm1, xmm1, xmm4
+        vpxor xmm2, xmm2, xmm5
+        vpclmulqdq xmm1, xmm1, xmm2, 0
+        vpxor xmm1, xmm1, xmm0
+        vpxor xmm1, xmm1, xmm3
+        vmovdqa xmm6, xmm0
+        vmovdqa xmm4, xmm3
+        vpslldq xmm2, xmm1, 8
+        vpsrldq xmm1, xmm1, 8
+        vpxor xmm6, xmm6, xmm2
+        vpxor xmm4, xmm4, xmm1
+        vpsrld xmm0, xmm6, 31
+        vpsrld xmm1, xmm4, 31
+        vpslld xmm6, xmm6, 1
+        vpslld xmm4, xmm4, 1
+        vpsrldq xmm2, xmm0, 12
+        vpslldq xmm0, xmm0, 4
+        vpslldq xmm1, xmm1, 4
+        vpor xmm4, xmm4, xmm2
+        vpor xmm6, xmm6, xmm0
+        vpor xmm4, xmm4, xmm1
+        vpslld xmm0, xmm6, 31
+        vpslld xmm1, xmm6, 30
+        vpslld xmm2, xmm6, 25
+        vpxor xmm0, xmm0, xmm1
+        vpxor xmm0, xmm0, xmm2
+        vmovdqa xmm1, xmm0
+        vpsrldq xmm1, xmm1, 4
+        vpslldq xmm0, xmm0, 12
+        vpxor xmm6, xmm6, xmm0
+        vpsrld xmm2, xmm6, 1
+        vpsrld xmm3, xmm6, 2
+        vpsrld xmm0, xmm6, 7
+        vpxor xmm2, xmm2, xmm3
+        vpxor xmm2, xmm2, xmm0
+        vpxor xmm2, xmm2, xmm1
+        vpxor xmm2, xmm2, xmm6
+        vpxor xmm4, xmm4, xmm2
+        add ecx, 16
+        cmp ecx, edx
+        jl L_AES_GCM_init_avx512_calc_iv_16_loop
+        ; Restore the full IV length; finished if it was a multiple of 16.
+        mov edx, r11d
+        cmp ecx, edx
+        je L_AES_GCM_init_avx512_calc_iv_done
+L_AES_GCM_init_avx512_calc_iv_lt16:
+        ; Copy the remaining IV bytes into a zeroed 16-byte stack buffer so
+        ; the final partial block is hashed with zero padding.
+        sub rsp, 16
+        vpxor xmm7, xmm7, xmm7
+        xor r13d, r13d
+        vmovdqu OWORD PTR [rsp], xmm7
+L_AES_GCM_init_avx512_calc_iv_loop:
+        movzx r12d, BYTE PTR [r10+rcx]
+        mov BYTE PTR [rsp+r13], r12b
+        inc ecx
+        inc r13d
+        cmp ecx, edx
+        jl L_AES_GCM_init_avx512_calc_iv_loop
+        vmovdqu xmm7, OWORD PTR [rsp]
+        add rsp, 16
+        vpshufb xmm7, xmm7, OWORD PTR L_avx512_aes_gcm_bswap_mask
+        vpxor xmm4, xmm4, xmm7
+        ; ghash_gfmul_avx
+        vpshufd xmm1, xmm4, 78
+        vpshufd xmm2, xmm5, 78
+        vpclmulqdq xmm3, xmm5, xmm4, 17
+        vpclmulqdq xmm0, xmm5, xmm4, 0
+        vpxor xmm1, xmm1, xmm4
+        vpxor xmm2, xmm2, xmm5
+        vpclmulqdq xmm1, xmm1, xmm2, 0
+        vpxor xmm1, xmm1, xmm0
+        vpxor xmm1, xmm1, xmm3
+        vmovdqa xmm6, xmm0
+        vmovdqa xmm4, xmm3
+        vpslldq xmm2, xmm1, 8
+        vpsrldq xmm1, xmm1, 8
+        vpxor xmm6, xmm6, xmm2
+        vpxor xmm4, xmm4, xmm1
+        vpsrld xmm0, xmm6, 31
+        vpsrld xmm1, xmm4, 31
+        vpslld xmm6, xmm6, 1
+        vpslld xmm4, xmm4, 1
+        vpsrldq xmm2, xmm0, 12
+        vpslldq xmm0, xmm0, 4
+        vpslldq xmm1, xmm1, 4
+        vpor xmm4, xmm4, xmm2
+        vpor xmm6, xmm6, xmm0
+        vpor xmm4, xmm4, xmm1
+        vpslld xmm0, xmm6, 31
+        vpslld xmm1, xmm6, 30
+        vpslld xmm2, xmm6, 25
+        vpxor xmm0, xmm0, xmm1
+        vpxor xmm0, xmm0, xmm2
+        vmovdqa xmm1, xmm0
+        vpsrldq xmm1, xmm1, 4
+        vpslldq xmm0, xmm0, 12
+        vpxor xmm6, xmm6, xmm0
+        vpsrld xmm2, xmm6, 1
+        vpsrld xmm3, xmm6, 2
+        vpsrld xmm0, xmm6, 7
+        vpxor xmm2, xmm2, xmm3
+        vpxor xmm2, xmm2, xmm0
+        vpxor xmm2, xmm2, xmm1
+        vpxor xmm2, xmm2, xmm6
+        vpxor xmm4, xmm4, xmm2
+L_AES_GCM_init_avx512_calc_iv_done:
+        ; T = Encrypt counter
+        ; edx * 8 is the IV length in bits (computed in 32 bits); it is XORed
+        ; into the low qword of the hash before one last multiply by H.
+        vpxor xmm0, xmm0, xmm0
+        shl edx, 3
+        vmovq xmm0, rdx
+        vpxor xmm4, xmm4, xmm0
+        ; ghash_gfmul_avx
+        vpshufd xmm1, xmm4, 78
+        vpshufd xmm2, xmm5, 78
+        vpclmulqdq xmm3, xmm5, xmm4, 17
+        vpclmulqdq xmm0, xmm5, xmm4, 0
+        vpxor xmm1, xmm1, xmm4
+        vpxor xmm2, xmm2, xmm5
+        vpclmulqdq xmm1, xmm1, xmm2, 0
+        vpxor xmm1, xmm1, xmm0
+        vpxor xmm1, xmm1, xmm3
+        vmovdqa xmm6, xmm0
+        vmovdqa xmm4, xmm3
+        vpslldq xmm2, xmm1, 8
+        vpsrldq xmm1, xmm1, 8
+        vpxor xmm6, xmm6, xmm2
+        vpxor xmm4, xmm4, xmm1
+        vpsrld xmm0, xmm6, 31
+        vpsrld xmm1, xmm4, 31
+        vpslld xmm6, xmm6, 1
+        vpslld xmm4, xmm4, 1
+        vpsrldq xmm2, xmm0, 12
+        vpslldq xmm0, xmm0, 4
+        vpslldq xmm1, xmm1, 4
+        vpor xmm4, xmm4, xmm2
+        vpor xmm6, xmm6, xmm0
+        vpor xmm4, xmm4, xmm1
+        vpslld xmm0, xmm6, 31
+        vpslld xmm1, xmm6, 30
+        vpslld xmm2, xmm6, 25
+        vpxor xmm0, xmm0, xmm1
+        vpxor xmm0, xmm0, xmm2
+        vmovdqa xmm1, xmm0
+        vpsrldq xmm1, xmm1, 4
+        vpslldq xmm0, xmm0, 12
+        vpxor xmm6, xmm6, xmm0
+        vpsrld xmm2, xmm6, 1
+        vpsrld xmm3, xmm6, 2
+        vpsrld xmm0, xmm6, 7
+        vpxor xmm2, xmm2, xmm3
+        vpxor xmm2, xmm2, xmm0
+        vpxor xmm2, xmm2, xmm1
+        vpxor xmm2, xmm2, xmm6
+        vpxor xmm4, xmm4, xmm2
+        ; The hashed IV, shuffled back, is the initial counter block.
+        vpshufb xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_bswap_mask
+        ; Encrypt counter
+        vmovdqa xmm7, OWORD PTR [rdi]
+        vpxor xmm7, xmm7, xmm4
+        vaesenc xmm7, xmm7, [rdi+16]
+        vaesenc xmm7, xmm7, [rdi+32]
+        vaesenc xmm7, xmm7, [rdi+48]
+        vaesenc xmm7, xmm7, [rdi+64]
+        vaesenc xmm7, xmm7, [rdi+80]
+        vaesenc xmm7, xmm7, [rdi+96]
+        vaesenc xmm7, xmm7, [rdi+112]
+        vaesenc xmm7, xmm7, [rdi+128]
+        vaesenc xmm7, xmm7, [rdi+144]
+        cmp esi, 11
+        vmovdqa xmm8, OWORD PTR [rdi+160]
+        jl L_AES_GCM_init_avx512_calc_iv_2_aesenc_avx_last
+        vaesenc xmm7, xmm7, xmm8
+        vaesenc xmm7, xmm7, [rdi+176]
+        cmp esi, 13
+        vmovdqa xmm8, OWORD PTR [rdi+192]
+        jl L_AES_GCM_init_avx512_calc_iv_2_aesenc_avx_last
+        vaesenc xmm7, xmm7, xmm8
+        vaesenc xmm7, xmm7, [rdi+208]
+        vmovdqa xmm8, OWORD PTR [rdi+224]
+L_AES_GCM_init_avx512_calc_iv_2_aesenc_avx_last:
+        vaesenclast xmm7, xmm7, xmm8
+        vmovdqu xmm15, xmm7
+L_AES_GCM_init_avx512_iv_done:
+        ; Common exit for both IV paths: xmm15 = T, xmm5 = H, xmm4 = counter.
+        vmovdqa OWORD PTR [r9], xmm15
+        vpshufb xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_bswap_epi64
+        vpaddd xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_one
+        vmovdqa OWORD PTR [rax], xmm5
+        vmovdqa OWORD PTR [r8], xmm4
+        vmovdqu xmm6, OWORD PTR [rsp+16]
+        vmovdqu xmm7, OWORD PTR [rsp+32]
+        vmovdqu xmm8, OWORD PTR [rsp+48]
+        vmovdqu xmm15, OWORD PTR [rsp+64]
+        add rsp, 80
+        pop r13
+        pop r12
+        pop rsi
+        pop rdi
+        ret
+AES_GCM_init_avx512 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+; AES_GCM_aad_update_avx512
+; Folds a run of 16-byte blocks into a running GHASH value.
+;   rcx   input bytes, read 16 at a time with unaligned loads
+;   edx   byte count; the loop advances its offset by 16 until it reaches edx
+;   r8    running hash value; loaded and stored back with vmovdqa
+;   r9    hash key H; loaded with vmovdqa
+; The count is compared only after a block has been processed, so the first
+; block is always consumed and a count that is not a multiple of 16 reads a
+; further whole block. NOTE(review): callers presumably pass a non-zero
+; multiple of 16 - confirm against the C caller.
+; xmm6 and xmm7 are saved on entry and restored before ret.
+AES_GCM_aad_update_avx512 PROC
+        ; rcx is reused as the byte offset, so the data pointer moves to rax.
+        mov rax, rcx
+        sub rsp, 32
+        vmovdqu OWORD PTR [rsp], xmm6
+        vmovdqu OWORD PTR [rsp+16], xmm7
+        vmovdqa xmm5, OWORD PTR [r8]
+        vmovdqa xmm6, OWORD PTR [r9]
+        xor ecx, ecx
+L_AES_GCM_aad_update_avx512_16_loop:
+        vmovdqu xmm7, OWORD PTR [rax+rcx]
+        vpshufb xmm7, xmm7, OWORD PTR L_avx512_aes_gcm_bswap_mask
+        vpxor xmm5, xmm5, xmm7
+        ; ghash_gfmul_avx
+        ; xmm5 = xmm5 * xmm6 (H): Karatsuba carry-less multiply into
+        ; xmm5 (upper half) : xmm4 (lower half), shift left one bit, then
+        ; fold the lower half into the upper half.
+        vpshufd xmm1, xmm5, 78
+        vpshufd xmm2, xmm6, 78
+        vpclmulqdq xmm3, xmm6, xmm5, 17
+        vpclmulqdq xmm0, xmm6, xmm5, 0
+        vpxor xmm1, xmm1, xmm5
+        vpxor xmm2, xmm2, xmm6
+        vpclmulqdq xmm1, xmm1, xmm2, 0
+        vpxor xmm1, xmm1, xmm0
+        vpxor xmm1, xmm1, xmm3
+        vmovdqa xmm4, xmm0
+        vmovdqa xmm5, xmm3
+        vpslldq xmm2, xmm1, 8
+        vpsrldq xmm1, xmm1, 8
+        vpxor xmm4, xmm4, xmm2
+        vpxor xmm5, xmm5, xmm1
+        vpsrld xmm0, xmm4, 31
+        vpsrld xmm1, xmm5, 31
+        vpslld xmm4, xmm4, 1
+        vpslld xmm5, xmm5, 1
+        vpsrldq xmm2, xmm0, 12
+        vpslldq xmm0, xmm0, 4
+        vpslldq xmm1, xmm1, 4
+        vpor xmm5, xmm5, xmm2
+        vpor xmm4, xmm4, xmm0
+        vpor xmm5, xmm5, xmm1
+        vpslld xmm0, xmm4, 31
+        vpslld xmm1, xmm4, 30
+        vpslld xmm2, xmm4, 25
+        vpxor xmm0, xmm0, xmm1
+        vpxor xmm0, xmm0, xmm2
+        vmovdqa xmm1, xmm0
+        vpsrldq xmm1, xmm1, 4
+        vpslldq xmm0, xmm0, 12
+        vpxor xmm4, xmm4, xmm0
+        vpsrld xmm2, xmm4, 1
+        vpsrld xmm3, xmm4, 2
+        vpsrld xmm0, xmm4, 7
+        vpxor xmm2, xmm2, xmm3
+        vpxor xmm2, xmm2, xmm0
+        vpxor xmm2, xmm2, xmm1
+        vpxor xmm2,
xmm2, xmm4
+        vpxor xmm5, xmm5, xmm2
+        ; Advance to the next 16-byte block; loop while offset < edx.
+        add ecx, 16
+        cmp ecx, edx
+        jl L_AES_GCM_aad_update_avx512_16_loop
+        ; Write the updated hash value back through r8.
+        vmovdqa OWORD PTR [r8], xmm5
+        vmovdqu xmm6, OWORD PTR [rsp]
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        add rsp, 32
+        ret
+AES_GCM_aad_update_avx512 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+; AES_GCM_encrypt_block_avx512
+; Produces one 16-byte output block: encrypts the current counter block with
+; the round keys and XORs the result with 16 input bytes.
+;   rcx        base of the AES round keys; keys from offset 160 upward are
+;              loaded with vmovdqa, the earlier ones are used as memory
+;              operands
+;   edx        round count; compared with 11 and 13 to select 10, 12 or 14
+;              rounds
+;   r8         output, 16 bytes, unaligned store
+;   r9         input, 16 bytes, unaligned load
+;   5th (stk)  pointer to the counter block; it is read, has
+;              L_avx512_aes_gcm_one added, and is written back
+; Only xmm0 and xmm1 are modified, so no XMM registers are saved. On return
+; xmm0 holds the output block shuffled with L_avx512_aes_gcm_bswap_mask;
+; nothing in this procedure stores that value.
+AES_GCM_encrypt_block_avx512 PROC
+        mov r10, r8
+        mov r11, r9
+        ; Fifth argument: nothing has been pushed, so it is at [rsp+40].
+        mov rax, QWORD PTR [rsp+40]
+        ; xmm0 = counter shuffled for encryption; the stored counter is
+        ; incremented in its unshuffled form.
+        vmovdqu xmm1, OWORD PTR [rax]
+        vpshufb xmm0, xmm1, OWORD PTR L_avx512_aes_gcm_bswap_epi64
+        vpaddd xmm1, xmm1, OWORD PTR L_avx512_aes_gcm_one
+        vmovdqu OWORD PTR [rax], xmm1
+        vpxor xmm0, xmm0, [rcx]
+        vaesenc xmm0, xmm0, [rcx+16]
+        vaesenc xmm0, xmm0, [rcx+32]
+        vaesenc xmm0, xmm0, [rcx+48]
+        vaesenc xmm0, xmm0, [rcx+64]
+        vaesenc xmm0, xmm0, [rcx+80]
+        vaesenc xmm0, xmm0, [rcx+96]
+        vaesenc xmm0, xmm0, [rcx+112]
+        vaesenc xmm0, xmm0, [rcx+128]
+        vaesenc xmm0, xmm0, [rcx+144]
+        ; The next key is loaded before each branch so xmm1 always holds the
+        ; key for vaesenclast when the label is reached.
+        cmp edx, 11
+        vmovdqa xmm1, OWORD PTR [rcx+160]
+        jl L_AES_GCM_encrypt_block_avx512_aesenc_block_last
+        vaesenc xmm0, xmm0, xmm1
+        vaesenc xmm0, xmm0, [rcx+176]
+        cmp edx, 13
+        vmovdqa xmm1, OWORD PTR [rcx+192]
+        jl L_AES_GCM_encrypt_block_avx512_aesenc_block_last
+        vaesenc xmm0, xmm0, xmm1
+        vaesenc xmm0, xmm0, [rcx+208]
+        vmovdqa xmm1, OWORD PTR [rcx+224]
+L_AES_GCM_encrypt_block_avx512_aesenc_block_last:
+        vaesenclast xmm0, xmm0, xmm1
+        ; XOR the encrypted counter with the input block and store it.
+        vmovdqu xmm1, OWORD PTR [r11]
+        vpxor xmm0, xmm0, xmm1
+        vmovdqu OWORD PTR [r10], xmm0
+        vpshufb xmm0, xmm0, OWORD PTR L_avx512_aes_gcm_bswap_mask
+        vzeroupper
+        ret
+AES_GCM_encrypt_block_avx512 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+; AES_GCM_ghash_block_avx512
+; Folds one 16-byte block into a running GHASH value:
+; hash = (hash XOR shuffled block) * H.
+;   rcx   data block, 16 bytes, unaligned load
+;   rdx   running hash value; loaded and stored back with vmovdqa
+;   r8    hash key H; loaded with vmovdqa
+; xmm6 and xmm7 are saved on entry and restored before ret.
+AES_GCM_ghash_block_avx512 PROC
+        sub rsp, 32
+        vmovdqu OWORD PTR [rsp], xmm6
+        vmovdqu OWORD PTR [rsp+16], xmm7
+        vmovdqa xmm4, OWORD PTR [rdx]
+        vmovdqa xmm5, OWORD PTR [r8]
+        vmovdqu xmm7, OWORD PTR [rcx]
+        vpshufb xmm7, xmm7, OWORD PTR L_avx512_aes_gcm_bswap_mask
+        vpxor xmm4, xmm4, xmm7
+        ; ghash_gfmul_avx
+        ; xmm4 = xmm4 * xmm5 (H): Karatsuba carry-less multiply into
+        ; xmm4 (upper half) : xmm6 (lower half), shift the 256-bit product
+        ; left one bit, then fold the lower half into the upper half using
+        ; left shifts by 31/30/25 and right shifts by 1/2/7.
+        vpshufd xmm1, xmm4, 78
+        vpshufd xmm2, xmm5, 78
+        vpclmulqdq xmm3, xmm5, xmm4, 17
+        vpclmulqdq xmm0, xmm5, xmm4, 0
+        vpxor xmm1, xmm1, xmm4
+        vpxor xmm2, xmm2, xmm5
+        vpclmulqdq xmm1, xmm1, xmm2, 0
+        vpxor xmm1, xmm1, xmm0
+        vpxor xmm1, xmm1, xmm3
+        vmovdqa xmm6, xmm0
+        vmovdqa xmm4, xmm3
+        vpslldq xmm2, xmm1, 8
+        vpsrldq xmm1, xmm1, 8
+        vpxor xmm6, xmm6, xmm2
+        vpxor xmm4, xmm4, xmm1
+        vpsrld xmm0, xmm6, 31
+        vpsrld xmm1, xmm4, 31
+        vpslld xmm6, xmm6, 1
+        vpslld xmm4, xmm4, 1
+        vpsrldq xmm2, xmm0, 12
+        vpslldq xmm0, xmm0, 4
+        vpslldq xmm1, xmm1, 4
+        vpor xmm4, xmm4, xmm2
+        vpor xmm6, xmm6, xmm0
+        vpor xmm4, xmm4, xmm1
+        vpslld xmm0, xmm6, 31
+        vpslld xmm1, xmm6, 30
+        vpslld xmm2, xmm6, 25
+        vpxor xmm0, xmm0, xmm1
+        vpxor xmm0, xmm0, xmm2
+        vmovdqa xmm1, xmm0
+        vpsrldq xmm1, xmm1, 4
+        vpslldq xmm0, xmm0, 12
+        vpxor xmm6, xmm6, xmm0
+        vpsrld xmm2, xmm6, 1
+        vpsrld xmm3, xmm6, 2
+        vpsrld xmm0, xmm6, 7
+        vpxor xmm2, xmm2, xmm3
+        vpxor xmm2, xmm2, xmm0
+        vpxor xmm2, xmm2, xmm1
+        vpxor xmm2, xmm2, xmm6
+        vpxor xmm4, xmm4, xmm2
+        ; Write the updated hash value back through rdx.
+        vmovdqa OWORD PTR [rdx], xmm4
+        vzeroupper
+        vmovdqu xmm6, OWORD PTR [rsp]
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        add rsp, 32
+        ret
+AES_GCM_ghash_block_avx512 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_encrypt_update_avx512 PROC
+        push r13
+        push r12
+        push r14
+        push r15
+        push rdi
+        push rsi
+        push rbx
+        push rbp
+        mov rax, rcx
+        mov r10, r8
+        mov r8d, edx
+        mov r11, r9
+        mov r9d, DWORD PTR [rsp+104]
+        mov r12, QWORD PTR [rsp+112]
+        mov r14, QWORD PTR [rsp+120]
+        mov r15, QWORD PTR [rsp+128]
+        sub rsp, 1200
+        vmovdqu OWORD PTR [rsp+1040], xmm6
+        vmovdqu OWORD PTR [rsp+1056], xmm7
+        vmovdqu OWORD PTR [rsp+1072], xmm8
+        vmovdqu OWORD PTR [rsp+1088], xmm9
+        vmovdqu OWORD PTR [rsp+1104], xmm10
+        vmovdqu OWORD PTR [rsp+1120], xmm11
+        vmovdqu OWORD PTR [rsp+1136], xmm12
+        vmovdqu OWORD PTR [rsp+1152], xmm13
+        vmovdqu OWORD PTR [rsp+1168], xmm14
+        vmovdqu OWORD PTR [rsp+1184], xmm15
+        vmovdqa xmm6, OWORD PTR [r12]
+        vmovdqa xmm5, OWORD PTR [r14]
+        vpsrlq xmm9, xmm5, 63
+        vpsllq xmm8, xmm5, 1
+        vpslldq xmm9, xmm9, 8
+        vpor xmm8, xmm8, xmm9
+        vpshufd xmm5,
xmm5, 255 + vpsrad xmm5, xmm5, 31 + vpand xmm5, xmm5, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpxor xmm5, xmm5, xmm8 + xor edi, edi + cmp r9d, 256 + jl L_AES_GCM_encrypt_update_avx512_done_128 + vmovdqa xmm2, xmm6 + ; H ^ 1 + vmovdqu OWORD PTR [rsp], xmm5 + ; H ^ 2 + vpclmulqdq xmm8, xmm5, xmm5, 0 + vpclmulqdq xmm11, xmm5, xmm5, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm0, xmm11 + vmovdqu OWORD PTR [rsp+16], xmm0 + ; H ^ 3 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm5, 78 + vpxor xmm9, xmm9, xmm5 + vpshufd xmm10, xmm0, 78 + vpxor xmm10, xmm10, xmm0 + vpclmulqdq xmm8, xmm0, xmm5, 0 + vpclmulqdq xmm11, xmm0, xmm5, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm1, xmm11 + vmovdqu OWORD PTR [rsp+32], xmm1 + ; H ^ 4 + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm3, xmm11 + vmovdqu OWORD PTR [rsp+48], xmm3 + ; H ^ 5 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, 
xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+64], xmm7 + ; H ^ 6 + vpclmulqdq xmm8, xmm1, xmm1, 0 + vpclmulqdq xmm11, xmm1, xmm1, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+80], xmm7 + ; H ^ 7 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm1, 78 + vpxor xmm9, xmm9, xmm1 + vpshufd xmm10, xmm3, 78 + vpxor xmm10, xmm10, xmm3 + vpclmulqdq xmm8, xmm3, xmm1, 0 + vpclmulqdq xmm11, xmm3, xmm1, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+96], xmm7 + ; H ^ 8 + vpclmulqdq xmm8, xmm3, xmm3, 0 + vpclmulqdq xmm11, xmm3, xmm3, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+112], xmm7 + ; H ^ 9 + vmovdqu xmm0, OWORD PTR [rsp+48] + vmovdqu xmm1, OWORD PTR [rsp+64] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR 
L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+128], xmm7 + ; H ^ 10 + vmovdqu xmm0, OWORD PTR [rsp+64] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+144], xmm7 + ; H ^ 11 + vmovdqu xmm0, OWORD PTR [rsp+64] + vmovdqu xmm1, OWORD PTR [rsp+80] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+160], xmm7 + ; H ^ 12 + vmovdqu xmm0, OWORD PTR [rsp+80] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+176], xmm7 + ; H ^ 13 + vmovdqu xmm0, OWORD PTR [rsp+80] + vmovdqu xmm1, OWORD PTR [rsp+96] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + 
vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+192], xmm7 + ; H ^ 14 + vmovdqu xmm0, OWORD PTR [rsp+96] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+208], xmm7 + ; H ^ 15 + vmovdqu xmm0, OWORD PTR [rsp+96] + vmovdqu xmm1, OWORD PTR [rsp+112] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+224], xmm7 + ; H ^ 16 + vmovdqu xmm0, OWORD PTR [rsp+112] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+240], 
xmm7 + cmp r9d, 512 + jl L_AES_GCM_encrypt_update_avx512_no_ext + ; H ^ 17 + vmovdqu xmm0, OWORD PTR [rsp+112] + vmovdqu xmm1, OWORD PTR [rsp+128] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+256], xmm7 + ; H ^ 18 + vmovdqu xmm0, OWORD PTR [rsp+128] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+272], xmm7 + ; H ^ 19 + vmovdqu xmm0, OWORD PTR [rsp+128] + vmovdqu xmm1, OWORD PTR [rsp+144] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+288], xmm7 + ; H ^ 20 + vmovdqu xmm0, OWORD PTR [rsp+144] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR 
L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+304], xmm7 + ; H ^ 21 + vmovdqu xmm0, OWORD PTR [rsp+144] + vmovdqu xmm1, OWORD PTR [rsp+160] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+320], xmm7 + ; H ^ 22 + vmovdqu xmm0, OWORD PTR [rsp+160] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+336], xmm7 + ; H ^ 23 + vmovdqu xmm0, OWORD PTR [rsp+160] + vmovdqu xmm1, OWORD PTR [rsp+176] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu 
OWORD PTR [rsp+352], xmm7 + ; H ^ 24 + vmovdqu xmm0, OWORD PTR [rsp+176] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+368], xmm7 + ; H ^ 25 + vmovdqu xmm0, OWORD PTR [rsp+176] + vmovdqu xmm1, OWORD PTR [rsp+192] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+384], xmm7 + ; H ^ 26 + vmovdqu xmm0, OWORD PTR [rsp+192] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+400], xmm7 + ; H ^ 27 + vmovdqu xmm0, OWORD PTR [rsp+192] + vmovdqu xmm1, OWORD PTR [rsp+208] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 
1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+416], xmm7 + ; H ^ 28 + vmovdqu xmm0, OWORD PTR [rsp+208] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+432], xmm7 + ; H ^ 29 + vmovdqu xmm0, OWORD PTR [rsp+208] + vmovdqu xmm1, OWORD PTR [rsp+224] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+448], xmm7 + ; H ^ 30 + vmovdqu xmm0, OWORD PTR [rsp+224] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+464], xmm7 + ; H ^ 31 + vmovdqu xmm0, OWORD PTR [rsp+224] + vmovdqu xmm1, OWORD PTR [rsp+240] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 
0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+480], xmm7 + ; H ^ 32 + vmovdqu xmm0, OWORD PTR [rsp+240] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+496], xmm7 +L_AES_GCM_encrypt_update_avx512_no_ext: + vbroadcasti32x4 zmm22, ptr_L_avx512_aes_gcm_bswap_epi64 + vbroadcasti32x4 zmm30, ptr_L_avx512_aes_gcm_bswap_mask + vbroadcasti32x4 zmm31, ptr_L_avx512_aes_gcm_mod2_128 + vbroadcasti32x4 zmm9, [rax] + vbroadcasti32x4 zmm10, [rax+16] + vbroadcasti32x4 zmm11, [rax+32] + vbroadcasti32x4 zmm12, [rax+48] + vbroadcasti32x4 zmm13, [rax+64] + vbroadcasti32x4 zmm14, [rax+80] + vbroadcasti32x4 zmm15, [rax+96] + vbroadcasti32x4 zmm1, [rax+112] + vbroadcasti32x4 zmm2, [rax+128] + vbroadcasti32x4 zmm3, [rax+144] + cmp r9d, 512 + jl L_AES_GCM_encrypt_update_avx512_no_windows + mov ebp, r9d + and ebp, 4294966784 + vmovdqu64 zmm23, [rsp+448] + vshufi64x2 zmm23, zmm23, zmm23, 27 + vmovdqu64 zmm24, [rsp+384] + vshufi64x2 zmm24, zmm24, zmm24, 27 + vmovdqu64 zmm25, [rsp+320] + vshufi64x2 zmm25, zmm25, zmm25, 27 + vmovdqu64 zmm26, [rsp+256] + vshufi64x2 zmm26, zmm26, zmm26, 27 + vmovdqu64 [rsp+512], zmm23 + vmovdqu64 [rsp+576], zmm24 + vmovdqu64 [rsp+640], zmm25 + vmovdqu64 [rsp+704], zmm26 + vmovdqu64 zmm23, [rsp+192] + vshufi64x2 zmm23, zmm23, zmm23, 27 + vmovdqu64 zmm24, [rsp+128] + vshufi64x2 zmm24, 
zmm24, zmm24, 27 + vmovdqu64 zmm25, [rsp+64] + vshufi64x2 zmm25, zmm25, zmm25, 27 + vmovdqu64 zmm26, [rsp] + vshufi64x2 zmm26, zmm26, zmm26, 27 + vmovdqu64 [rsp+768], zmm23 + vmovdqu64 [rsp+832], zmm24 + vmovdqu64 [rsp+896], zmm25 + vmovdqu64 [rsp+960], zmm26 + ; 512 bytes of input + lea rsi, QWORD PTR [r10+rdi] + vbroadcasti32x4 zmm20, [r15] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [r15] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [r15], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r8d, 11 + 
vbroadcasti32x4 zmm20, [rax+160] + jl L_AES_GCM_encrypt_update_avx512_p1_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r8d, 13 + vbroadcasti32x4 zmm20, [rax+192] + jl L_AES_GCM_encrypt_update_avx512_p1_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+224] +L_AES_GCM_encrypt_update_avx512_p1_avx512_ctr16_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [r11+rdi] + lea rdx, QWORD PTR [r10+rdi] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + add edi, 256 + vbroadcasti32x4 zmm20, [r15] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [r15] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [r15], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vaesenc zmm16, zmm16, 
zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r8d, 11 + vbroadcasti32x4 zmm20, [rax+160] + jl L_AES_GCM_encrypt_update_avx512_p2_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r8d, 13 + vbroadcasti32x4 zmm20, [rax+192] + jl L_AES_GCM_encrypt_update_avx512_p2_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+224] +L_AES_GCM_encrypt_update_avx512_p2_avx512_ctr16_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 
+ vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [r11+rdi] + lea rdx, QWORD PTR [r10+rdi] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + add edi, 256 + cmp edi, ebp + jge L_AES_GCM_encrypt_update_avx512_last_win +L_AES_GCM_encrypt_update_avx512_win_loop: + lea rbx, QWORD PTR [r10+rdi] + vpxorq zmm21, zmm21, zmm21 + vinserti32x4 zmm21, zmm21, xmm6, 0 + vbroadcasti32x4 zmm20, [r15] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [r15] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [r15], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vmovdqu64 zmm31, [rsi] + vpshufb zmm31, zmm31, zmm30 + vpxorq zmm31, zmm31, zmm21 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vpclmulqdq zmm23, zmm31, [rsp+512], 0 + vpclmulqdq zmm24, zmm31, [rsp+512], 1 + vpclmulqdq zmm25, zmm31, [rsp+512], 16 + vpclmulqdq zmm26, zmm31, [rsp+512], 17 + vmovdqa64 zmm27, zmm23 + vpxorq zmm28, zmm25, zmm24 + vmovdqa64 zmm29, zmm26 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vmovdqu64 zmm31, [rsi+64] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vpclmulqdq 
zmm23, zmm31, [rsp+576], 0 + vpclmulqdq zmm24, zmm31, [rsp+576], 1 + vpclmulqdq zmm25, zmm31, [rsp+576], 16 + vpclmulqdq zmm26, zmm31, [rsp+576], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vmovdqu64 zmm31, [rsi+128] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vpclmulqdq zmm23, zmm31, [rsp+640], 0 + vpclmulqdq zmm24, zmm31, [rsp+640], 1 + vpclmulqdq zmm25, zmm31, [rsp+640], 16 + vpclmulqdq zmm26, zmm31, [rsp+640], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vmovdqu64 zmm31, [rsi+192] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vpclmulqdq zmm23, zmm31, [rsp+704], 0 + vpclmulqdq zmm24, zmm31, [rsp+704], 1 + vpclmulqdq zmm25, zmm31, [rsp+704], 16 + vpclmulqdq zmm26, zmm31, [rsp+704], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r8d, 11 + vbroadcasti32x4 zmm20, [rax+160] + jl L_AES_GCM_encrypt_update_avx512_a_il_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r8d, 13 + vbroadcasti32x4 zmm20, 
[rax+192] + jl L_AES_GCM_encrypt_update_avx512_a_il_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+224] +L_AES_GCM_encrypt_update_avx512_a_il_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [r11+rdi] + lea rdx, QWORD PTR [r10+rdi] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + add edi, 256 + vbroadcasti32x4 zmm20, [r15] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [r15] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [r15], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vmovdqu64 zmm31, [rsi+256] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vpclmulqdq zmm23, zmm31, [rsp+768], 0 + vpclmulqdq zmm24, zmm31, [rsp+768], 1 + vpclmulqdq zmm25, zmm31, [rsp+768], 16 + vpclmulqdq zmm26, zmm31, [rsp+768], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc 
zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vmovdqu64 zmm31, [rsi+320] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vpclmulqdq zmm23, zmm31, [rsp+832], 0 + vpclmulqdq zmm24, zmm31, [rsp+832], 1 + vpclmulqdq zmm25, zmm31, [rsp+832], 16 + vpclmulqdq zmm26, zmm31, [rsp+832], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vmovdqu64 zmm31, [rsi+384] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vpclmulqdq zmm23, zmm31, [rsp+896], 0 + vpclmulqdq zmm24, zmm31, [rsp+896], 1 + vpclmulqdq zmm25, zmm31, [rsp+896], 16 + vpclmulqdq zmm26, zmm31, [rsp+896], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vmovdqu64 zmm31, [rsi+448] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vpclmulqdq zmm23, zmm31, [rsp+960], 0 + vpclmulqdq zmm24, zmm31, [rsp+960], 1 + vpclmulqdq zmm25, zmm31, [rsp+960], 16 + vpclmulqdq zmm26, zmm31, [rsp+960], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r8d, 11 + vbroadcasti32x4 zmm20, [rax+160] + jl L_AES_GCM_encrypt_update_avx512_b_il_last + vaesenc 
zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r8d, 13 + vbroadcasti32x4 zmm20, [rax+192] + jl L_AES_GCM_encrypt_update_avx512_b_il_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+224] +L_AES_GCM_encrypt_update_avx512_b_il_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [r11+rdi] + lea rdx, QWORD PTR [r10+rdi] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + add edi, 256 + vbroadcasti32x4 zmm31, ptr_L_avx512_aes_gcm_mod2_128 + vpclmulqdq zmm23, zmm31, zmm27, 1 + vpshufd zmm27, zmm27, 78 + vpternlogq zmm28, zmm27, zmm23, 150 + vpclmulqdq zmm23, zmm31, zmm28, 1 + vpshufd zmm28, zmm28, 78 + vpternlogq zmm29, zmm28, zmm23, 150 + vextracti32x4 xmm0, zmm29, 1 + vextracti32x4 xmm4, zmm29, 2 + vextracti32x4 xmm5, zmm29, 3 + vpxorq xmm6, xmm29, xmm0 + vpternlogq xmm6, xmm5, xmm4, 150 + mov rsi, rbx + cmp edi, ebp + jl L_AES_GCM_encrypt_update_avx512_win_loop +L_AES_GCM_encrypt_update_avx512_last_win: + vpxorq zmm20, zmm20, zmm20 + vinserti32x4 zmm20, zmm20, xmm6, 0 + vmovdqu64 zmm23, [rsp+512] + vmovdqu64 zmm24, [rsp+576] + vmovdqu64 zmm25, [rsp+640] + vmovdqu64 zmm26, [rsp+704] + vmovdqu64 zmm21, 
[rsi] + vpshufb zmm21, zmm21, zmm30 + vpxorq zmm21, zmm21, zmm20 + vpclmulqdq zmm16, zmm21, zmm23, 0 + vpclmulqdq zmm17, zmm21, zmm23, 1 + vpclmulqdq zmm18, zmm21, zmm23, 16 + vpclmulqdq zmm19, zmm21, zmm23, 17 + vmovdqa64 zmm27, zmm16 + vpxorq zmm28, zmm18, zmm17 + vmovdqa64 zmm29, zmm19 + vmovdqu64 zmm21, [rsi+64] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm24, 0 + vpclmulqdq zmm17, zmm21, zmm24, 1 + vpclmulqdq zmm18, zmm21, zmm24, 16 + vpclmulqdq zmm19, zmm21, zmm24, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rsi+128] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm25, 0 + vpclmulqdq zmm17, zmm21, zmm25, 1 + vpclmulqdq zmm18, zmm21, zmm25, 16 + vpclmulqdq zmm19, zmm21, zmm25, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rsi+192] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm26, 0 + vpclmulqdq zmm17, zmm21, zmm26, 1 + vpclmulqdq zmm18, zmm21, zmm26, 16 + vpclmulqdq zmm19, zmm21, zmm26, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm23, [rsp+768] + vmovdqu64 zmm24, [rsp+832] + vmovdqu64 zmm25, [rsp+896] + vmovdqu64 zmm26, [rsp+960] + vmovdqu64 zmm21, [rsi+256] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm23, 0 + vpclmulqdq zmm17, zmm21, zmm23, 1 + vpclmulqdq zmm18, zmm21, zmm23, 16 + vpclmulqdq zmm19, zmm21, zmm23, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rsi+320] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm24, 0 + vpclmulqdq zmm17, zmm21, zmm24, 1 + vpclmulqdq zmm18, zmm21, zmm24, 16 + vpclmulqdq zmm19, zmm21, zmm24, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rsi+384] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, 
zmm25, 0 + vpclmulqdq zmm17, zmm21, zmm25, 1 + vpclmulqdq zmm18, zmm21, zmm25, 16 + vpclmulqdq zmm19, zmm21, zmm25, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rsi+448] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm26, 0 + vpclmulqdq zmm17, zmm21, zmm26, 1 + vpclmulqdq zmm18, zmm21, zmm26, 16 + vpclmulqdq zmm19, zmm21, zmm26, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vpclmulqdq zmm21, zmm31, zmm27, 1 + vpshufd zmm27, zmm27, 78 + vpternlogq zmm28, zmm27, zmm21, 150 + vpclmulqdq zmm21, zmm31, zmm28, 1 + vpshufd zmm28, zmm28, 78 + vpternlogq zmm29, zmm28, zmm21, 150 + vextracti32x4 xmm0, zmm29, 1 + vextracti32x4 xmm4, zmm29, 2 + vextracti32x4 xmm5, zmm29, 3 + vpxorq xmm6, xmm29, xmm0 + vpternlogq xmm6, xmm5, xmm4, 150 +L_AES_GCM_encrypt_update_avx512_no_windows: + vmovdqu64 zmm23, [rsp+192] + vshufi64x2 zmm23, zmm23, zmm23, 27 + vmovdqu64 zmm24, [rsp+128] + vshufi64x2 zmm24, zmm24, zmm24, 27 + vmovdqu64 zmm25, [rsp+64] + vshufi64x2 zmm25, zmm25, zmm25, 27 + vmovdqu64 zmm26, [rsp] + vshufi64x2 zmm26, zmm26, zmm26, 27 + mov r13d, r9d + and r13d, 4294967040 + cmp edi, r13d + jge L_AES_GCM_encrypt_update_avx512_after_256 + ; 256 bytes of input + vbroadcasti32x4 zmm20, [r15] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [r15] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [r15], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, 
zmm19, zmm10 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r8d, 11 + vbroadcasti32x4 zmm20, [rax+160] + jl L_AES_GCM_encrypt_update_avx512_pro_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r8d, 13 + vbroadcasti32x4 zmm20, [rax+192] + jl L_AES_GCM_encrypt_update_avx512_pro_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+224] +L_AES_GCM_encrypt_update_avx512_pro_avx512_ctr16_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [r11+rdi] + lea rdx, 
QWORD PTR [r10+rdi] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + mov rsi, rdx + add edi, 256 + cmp edi, r13d + jge L_AES_GCM_encrypt_update_avx512_last_ghash +L_AES_GCM_encrypt_update_avx512_ghash_128: + vbroadcasti32x4 zmm20, [r15] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [r15] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [r15], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, 
zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r8d, 11 + vbroadcasti32x4 zmm20, [rax+160] + jl L_AES_GCM_encrypt_update_avx512_pip_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r8d, 13 + vbroadcasti32x4 zmm20, [rax+192] + jl L_AES_GCM_encrypt_update_avx512_pip_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+224] +L_AES_GCM_encrypt_update_avx512_pip_avx512_ctr16_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [r11+rdi] + lea rdx, QWORD PTR [r10+rdi] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + vpxorq zmm20, zmm20, zmm20 + vinserti32x4 zmm20, zmm20, xmm6, 0 + vmovdqu64 zmm21, [rsi] + vpshufb zmm21, zmm21, zmm30 + vpxorq zmm21, zmm21, zmm20 + vpclmulqdq zmm16, zmm21, zmm23, 0 + vpclmulqdq zmm17, zmm21, zmm23, 1 + vpclmulqdq zmm18, zmm21, zmm23, 16 + vpclmulqdq zmm19, zmm21, zmm23, 17 + vmovdqa64 zmm27, zmm16 + vpxorq zmm28, zmm18, zmm17 + vmovdqa64 zmm29, zmm19 + vmovdqu64 zmm21, [rsi+64] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq 
zmm16, zmm21, zmm24, 0 + vpclmulqdq zmm17, zmm21, zmm24, 1 + vpclmulqdq zmm18, zmm21, zmm24, 16 + vpclmulqdq zmm19, zmm21, zmm24, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rsi+128] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm25, 0 + vpclmulqdq zmm17, zmm21, zmm25, 1 + vpclmulqdq zmm18, zmm21, zmm25, 16 + vpclmulqdq zmm19, zmm21, zmm25, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rsi+192] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm26, 0 + vpclmulqdq zmm17, zmm21, zmm26, 1 + vpclmulqdq zmm18, zmm21, zmm26, 16 + vpclmulqdq zmm19, zmm21, zmm26, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vpclmulqdq zmm21, zmm31, zmm27, 1 + vpshufd zmm27, zmm27, 78 + vpternlogq zmm28, zmm27, zmm21, 150 + vpclmulqdq zmm21, zmm31, zmm28, 1 + vpshufd zmm28, zmm28, 78 + vpternlogq zmm29, zmm28, zmm21, 150 + vextracti32x4 xmm0, zmm29, 1 + vextracti32x4 xmm4, zmm29, 2 + vextracti32x4 xmm5, zmm29, 3 + vpxorq xmm6, xmm29, xmm0 + vpternlogq xmm6, xmm5, xmm4, 150 + mov rsi, rdx + add edi, 256 + cmp edi, r13d + jl L_AES_GCM_encrypt_update_avx512_ghash_128 +L_AES_GCM_encrypt_update_avx512_last_ghash: + vpxorq zmm20, zmm20, zmm20 + vinserti32x4 zmm20, zmm20, xmm6, 0 + vmovdqu64 zmm21, [rsi] + vpshufb zmm21, zmm21, zmm30 + vpxorq zmm21, zmm21, zmm20 + vpclmulqdq zmm16, zmm21, zmm23, 0 + vpclmulqdq zmm17, zmm21, zmm23, 1 + vpclmulqdq zmm18, zmm21, zmm23, 16 + vpclmulqdq zmm19, zmm21, zmm23, 17 + vmovdqa64 zmm27, zmm16 + vpxorq zmm28, zmm18, zmm17 + vmovdqa64 zmm29, zmm19 + vmovdqu64 zmm21, [rsi+64] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm24, 0 + vpclmulqdq zmm17, zmm21, zmm24, 1 + vpclmulqdq zmm18, zmm21, zmm24, 16 + vpclmulqdq zmm19, zmm21, zmm24, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, 
zmm19 + vmovdqu64 zmm21, [rsi+128] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm25, 0 + vpclmulqdq zmm17, zmm21, zmm25, 1 + vpclmulqdq zmm18, zmm21, zmm25, 16 + vpclmulqdq zmm19, zmm21, zmm25, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rsi+192] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm26, 0 + vpclmulqdq zmm17, zmm21, zmm26, 1 + vpclmulqdq zmm18, zmm21, zmm26, 16 + vpclmulqdq zmm19, zmm21, zmm26, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vpclmulqdq zmm21, zmm31, zmm27, 1 + vpshufd zmm27, zmm27, 78 + vpternlogq zmm28, zmm27, zmm21, 150 + vpclmulqdq zmm21, zmm31, zmm28, 1 + vpshufd zmm28, zmm28, 78 + vpternlogq zmm29, zmm28, zmm21, 150 + vextracti32x4 xmm0, zmm29, 1 + vextracti32x4 xmm4, zmm29, 2 + vextracti32x4 xmm5, zmm29, 3 + vpxorq xmm6, xmm29, xmm0 + vpternlogq xmm6, xmm5, xmm4, 150 +L_AES_GCM_encrypt_update_avx512_after_256: + vmovdqu xmm5, OWORD PTR [rsp] +L_AES_GCM_encrypt_update_avx512_done_128: + mov edx, r9d + cmp edi, edx + jge L_AES_GCM_encrypt_update_avx512_done_enc + mov r13d, r9d + and r13d, 4294967280 + cmp edi, r13d + jge L_AES_GCM_encrypt_update_avx512_last_block_done + vmovdqu xmm9, OWORD PTR [r15] + vpshufb xmm8, xmm9, OWORD PTR L_avx512_aes_gcm_bswap_epi64 + vpaddd xmm9, xmm9, OWORD PTR L_avx512_aes_gcm_one + vmovdqu OWORD PTR [r15], xmm9 + vpxor xmm8, xmm8, [rax] + vaesenc xmm8, xmm8, [rax+16] + vaesenc xmm8, xmm8, [rax+32] + vaesenc xmm8, xmm8, [rax+48] + vaesenc xmm8, xmm8, [rax+64] + vaesenc xmm8, xmm8, [rax+80] + vaesenc xmm8, xmm8, [rax+96] + vaesenc xmm8, xmm8, [rax+112] + vaesenc xmm8, xmm8, [rax+128] + vaesenc xmm8, xmm8, [rax+144] + cmp r8d, 11 + vmovdqa xmm9, OWORD PTR [rax+160] + jl L_AES_GCM_encrypt_update_avx512_aesenc_block_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [rax+176] + cmp r8d, 13 + vmovdqa xmm9, OWORD PTR [rax+192] + jl 
L_AES_GCM_encrypt_update_avx512_aesenc_block_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [rax+208] + vmovdqa xmm9, OWORD PTR [rax+224] +L_AES_GCM_encrypt_update_avx512_aesenc_block_last: + vaesenclast xmm8, xmm8, xmm9 + vmovdqu xmm9, OWORD PTR [r11+rdi] + vpxor xmm8, xmm8, xmm9 + vmovdqu OWORD PTR [r10+rdi], xmm8 + vpshufb xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_bswap_mask + vpxor xmm6, xmm6, xmm8 + add edi, 16 + cmp edi, r13d + jge L_AES_GCM_encrypt_update_avx512_last_block_ghash +L_AES_GCM_encrypt_update_avx512_last_block_start: + vmovdqu xmm13, OWORD PTR [r11+rdi] + vmovdqu xmm9, OWORD PTR [r15] + vpshufb xmm8, xmm9, OWORD PTR L_avx512_aes_gcm_bswap_epi64 + vpaddd xmm9, xmm9, OWORD PTR L_avx512_aes_gcm_one + vmovdqu OWORD PTR [r15], xmm9 + vpxor xmm8, xmm8, [rax] + vpclmulqdq xmm10, xmm6, xmm5, 16 + vaesenc xmm8, xmm8, [rax+16] + vaesenc xmm8, xmm8, [rax+32] + vpclmulqdq xmm11, xmm6, xmm5, 1 + vaesenc xmm8, xmm8, [rax+48] + vaesenc xmm8, xmm8, [rax+64] + vpclmulqdq xmm12, xmm6, xmm5, 0 + vaesenc xmm8, xmm8, [rax+80] + vpclmulqdq xmm1, xmm6, xmm5, 17 + vaesenc xmm8, xmm8, [rax+96] + vpxor xmm10, xmm10, xmm11 + vpslldq xmm2, xmm10, 8 + vpsrldq xmm10, xmm10, 8 + vaesenc xmm8, xmm8, [rax+112] + vpxor xmm2, xmm2, xmm12 + vpxor xmm3, xmm1, xmm10 + vmovdqa xmm0, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm2, xmm0, 16 + vaesenc xmm8, xmm8, [rax+128] + vpshufd xmm10, xmm2, 78 + vpxor xmm10, xmm10, xmm11 + vpclmulqdq xmm11, xmm10, xmm0, 16 + vaesenc xmm8, xmm8, [rax+144] + vpshufd xmm10, xmm10, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm6, xmm10, xmm3 + cmp r8d, 11 + vmovdqa xmm9, OWORD PTR [rax+160] + jl L_AES_GCM_encrypt_update_avx512_aesenc_gfmul_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [rax+176] + cmp r8d, 13 + vmovdqa xmm9, OWORD PTR [rax+192] + jl L_AES_GCM_encrypt_update_avx512_aesenc_gfmul_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [rax+208] + vmovdqa xmm9, OWORD PTR [rax+224] 
+L_AES_GCM_encrypt_update_avx512_aesenc_gfmul_last:
+    vaesenclast xmm8, xmm8, xmm9
+    vmovdqa xmm0, xmm13
+    vpxor xmm8, xmm8, xmm0
+    vmovdqu OWORD PTR [r10+rdi], xmm8
+    vpshufb xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_bswap_mask
+    add edi, 16
+    vpxor xmm6, xmm6, xmm8
+    cmp edi, r13d
+    jl L_AES_GCM_encrypt_update_avx512_last_block_start
+L_AES_GCM_encrypt_update_avx512_last_block_ghash:
+    ; ghash_gfmul_red_avx
+    vpshufd xmm9, xmm5, 78
+    vpxor xmm9, xmm9, xmm5
+    vpshufd xmm10, xmm6, 78
+    vpxor xmm10, xmm10, xmm6
+    vpclmulqdq xmm8, xmm6, xmm5, 0
+    vpclmulqdq xmm11, xmm6, xmm5, 17
+    vpclmulqdq xmm9, xmm9, xmm10, 0
+    vpternlogq xmm9, xmm11, xmm8, 150
+    vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+    vpclmulqdq xmm12, xmm10, xmm8, 1
+    vpshufd xmm8, xmm8, 78
+    vpternlogq xmm9, xmm8, xmm12, 150
+    vpclmulqdq xmm12, xmm10, xmm9, 1
+    vpshufd xmm9, xmm9, 78
+    vpternlogq xmm11, xmm9, xmm12, 150
+    vmovdqa xmm6, xmm11
+L_AES_GCM_encrypt_update_avx512_last_block_done:
+L_AES_GCM_encrypt_update_avx512_done_enc:
+    vmovdqa OWORD PTR [r12], xmm6
+    vzeroupper
+    vmovdqu xmm6, OWORD PTR [rsp+1040]
+    vmovdqu xmm7, OWORD PTR [rsp+1056]
+    vmovdqu xmm8, OWORD PTR [rsp+1072]
+    vmovdqu xmm9, OWORD PTR [rsp+1088]
+    vmovdqu xmm10, OWORD PTR [rsp+1104]
+    vmovdqu xmm11, OWORD PTR [rsp+1120]
+    vmovdqu xmm12, OWORD PTR [rsp+1136]
+    vmovdqu xmm13, OWORD PTR [rsp+1152]
+    vmovdqu xmm14, OWORD PTR [rsp+1168]
+    vmovdqu xmm15, OWORD PTR [rsp+1184]
+    add rsp, 1200
+    pop rbp
+    pop rbx
+    pop rsi
+    pop rdi
+    pop r15
+    pop r14
+    pop r12
+    pop r13
+    ret
+AES_GCM_encrypt_update_avx512 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+    ; ---------------------------------------------------------------------
+    ; AES_GCM_encrypt_final_avx512
+    ;
+    ; Finishes a GHASH computation and emits the authentication tag: XORs a
+    ; length block into the 16-byte running value, performs one carry-less
+    ; multiply-and-reduce against the (pre-shifted) 16-byte key value, byte
+    ; shuffles the product, XORs it with a 16-byte mask and stores the first
+    ; r8d bytes to the output buffer.
+    ;
+    ; Arguments arrive in Microsoft x64 order (rcx, rdx, r8, r9, then stack):
+    ;   arg1 rcx     -> 16 bytes loaded with vmovdqa (pointer must be 16-byte
+    ;                   aligned). Presumably the running GHASH state -
+    ;                   TODO confirm against the C prototype.
+    ;   arg2 rdx     -> destination buffer for the tag.
+    ;   arg3 r8d     number of tag bytes to write; 16 uses one vector store,
+    ;                   any other value uses the byte-copy loop.
+    ;   arg4 r9d     byte count that becomes the low qword of the length
+    ;                   block (presumably the ciphertext length - confirm).
+    ;   arg5 stack   32-bit byte count that becomes the high qword of the
+    ;                   length block (presumably the AAD length - confirm).
+    ;   arg6 stack   -> 16 aligned bytes, transformed and used as the
+    ;                   multiplier (presumably the hash key H - confirm).
+    ;   arg7 stack   -> 16 aligned bytes XORed into the final value
+    ;                   (presumably the encrypted initial counter - confirm).
+    ;
+    ; Preserved: r12, r13, r14 (push/pop) and xmm6-xmm13 (spilled to stack).
+    ; Modified:  rax, rcx, rdx, r9, r10, r11, xmm0, xmm1, xmm4, xmm5.
+    ; Nothing is written through arg1/arg6/arg7; only the tag buffer is
+    ; stored to.
+    ;
+    ; NOTE(review): the byte-copy loop tests its bound at the bottom, so
+    ; r8d == 0 is not handled here; presumably the caller validates the tag
+    ; length before calling - verify.
+    ; ---------------------------------------------------------------------
+AES_GCM_encrypt_final_avx512 PROC
+    push r13                                   ; r13 is used as the byte temp in the tag copy loop
+    push r12                                   ; r12 will hold arg6
+    push r14                                   ; r14 will hold arg7
+    mov rax, rcx                               ; rax = arg1 (frees rcx for later scratch use)
+    mov r10d, r9d                              ; r10d = arg4 (first byte count)
+    mov r9, rdx                                ; r9 = arg2 (tag output pointer)
+    mov r11d, DWORD PTR [rsp+64]               ; arg5: 3 pushes (24) + return address (8) + 32-byte home space = 64
+    mov r12, QWORD PTR [rsp+72]                ; arg6
+    mov r14, QWORD PTR [rsp+80]                ; arg7
+    sub rsp, 144                               ; frame: [rsp+0..15] tag scratch, [rsp+16..143] eight saved xmm registers
+    vmovdqu OWORD PTR [rsp+16], xmm6           ; spill xmm6-xmm13 (restored before ret)
+    vmovdqu OWORD PTR [rsp+32], xmm7
+    vmovdqu OWORD PTR [rsp+48], xmm8
+    vmovdqu OWORD PTR [rsp+64], xmm9
+    vmovdqu OWORD PTR [rsp+80], xmm10
+    vmovdqu OWORD PTR [rsp+96], xmm11
+    vmovdqu OWORD PTR [rsp+112], xmm12         ; xmm12/xmm13 are saved although this routine never writes them
+    vmovdqu OWORD PTR [rsp+128], xmm13
+    vmovdqa xmm4, OWORD PTR [rax]              ; xmm4 = 16 bytes at arg1 (aligned load)
+    vmovdqa xmm5, OWORD PTR [r12]              ; xmm5 = 16 bytes at arg6 (aligned load)
+    vmovdqa xmm6, OWORD PTR [r14]              ; xmm6 = 16 bytes at arg7 (aligned load)
+    vpsrlq xmm8, xmm5, 63                      ; isolate the top bit of each qword
+    vpsllq xmm7, xmm5, 1                       ; shift each qword left by one
+    vpslldq xmm8, xmm8, 8                      ; move the low qword's carry-out up to the high qword
+    vpor xmm7, xmm7, xmm8                      ; xmm7 = xmm5 shifted left by 1 as a full 128-bit value
+    vpshufd xmm5, xmm5, 255                    ; replicate the top dword into all four lanes
+    vpsrad xmm5, xmm5, 31                      ; all-ones if bit 127 of the original was set, else zero
+    vpand xmm5, xmm5, OWORD PTR L_avx512_aes_gcm_mod2_128 ; keep the constant only when bit 127 was set
+    vpxor xmm5, xmm5, xmm7                     ; xmm5 = (value << 1), conditionally XORed with the constant
+    mov edx, r10d                              ; 32-bit moves zero-extend into rdx/rcx
+    mov ecx, r11d
+    shl rdx, 3                                 ; byte counts -> bit counts
+    shl rcx, 3
+    vmovq xmm0, rdx
+    vmovq xmm1, rcx
+    vpunpcklqdq xmm0, xmm0, xmm1               ; length block: low qword = arg4*8, high qword = arg5*8
+    vpxor xmm4, xmm4, xmm0                     ; fold the length block into the running value
+    ; ghash_gfmul_red_avx
+    vpshufd xmm8, xmm5, 78                     ; imm 78 swaps the two qwords
+    vpxor xmm8, xmm8, xmm5                     ; low qword = hi ^ lo of xmm5
+    vpshufd xmm9, xmm4, 78
+    vpxor xmm9, xmm9, xmm4                     ; low qword = hi ^ lo of xmm4
+    vpclmulqdq xmm7, xmm4, xmm5, 0             ; low qword x low qword
+    vpclmulqdq xmm10, xmm4, xmm5, 17           ; high qword x high qword
+    vpclmulqdq xmm8, xmm8, xmm9, 0             ; (hi^lo) x (hi^lo): Karatsuba-style middle product
+    vpternlogq xmm8, xmm10, xmm7, 150          ; imm 150 (0x96) = three-way XOR: middle ^= high ^ low
+    vmovdqa xmm9, OWORD PTR L_avx512_aes_gcm_mod2_128 ; constant used for both reduction folds
+    vpclmulqdq xmm11, xmm9, xmm7, 1            ; first fold: high qword of constant x low qword of low product
+    vpshufd xmm7, xmm7, 78
+    vpternlogq xmm8, xmm7, xmm11, 150          ; middle ^= swapped low product ^ fold term
+    vpclmulqdq xmm11, xmm9, xmm8, 1            ; second fold on the updated middle term
+    vpshufd xmm8, xmm8, 78
+    vpternlogq xmm10, xmm8, xmm11, 150         ; high ^= swapped middle ^ fold term -> reduced 128-bit result
+    vmovdqa xmm4, xmm10
+    vpshufb xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_bswap_mask ; byte shuffle via mask constant (defined elsewhere in this file)
+    vpxor xmm0, xmm4, xmm6                     ; xmm0 = shuffled result ^ arg7 value = full 16-byte tag
+    cmp r8d, 16
+    je L_AES_GCM_encrypt_final_avx512_store_tag_16 ; full-length tag: single 16-byte store
+    xor rcx, rcx                               ; rcx = byte index 0
+    vmovdqu OWORD PTR [rsp], xmm0              ; stage the tag in the scratch slot for byte-wise copy
+L_AES_GCM_encrypt_final_avx512_store_tag_loop:
+    movzx r13d, BYTE PTR [rsp+rcx]
+    mov BYTE PTR [r9+rcx], r13b                ; copy one tag byte to the output buffer
+    inc ecx
+    cmp ecx, r8d
+    jne L_AES_GCM_encrypt_final_avx512_store_tag_loop ; bottom-tested: always copies at least one byte
+    jmp L_AES_GCM_encrypt_final_avx512_store_tag_done
+L_AES_GCM_encrypt_final_avx512_store_tag_16:
+    vmovdqu OWORD PTR [r9], xmm0               ; unaligned store, so the tag buffer needs no alignment
+L_AES_GCM_encrypt_final_avx512_store_tag_done:
+    vzeroupper                                 ; clear upper vector-register bits before returning
+    vmovdqu xmm6, OWORD PTR [rsp+16]           ; restore the spilled xmm registers
+    vmovdqu xmm7, OWORD PTR [rsp+32]
+    vmovdqu xmm8, OWORD PTR [rsp+48]
+    vmovdqu xmm9, OWORD PTR [rsp+64]
+    vmovdqu xmm10, OWORD PTR [rsp+80]
+    vmovdqu xmm11, OWORD PTR [rsp+96]
+    vmovdqu xmm12, OWORD PTR [rsp+112]
+    vmovdqu xmm13, OWORD PTR [rsp+128]
+    add rsp, 144
+    pop r14                                    ; pops mirror the pushes in reverse order
+    pop r12
+    pop r13
+    ret
+AES_GCM_encrypt_final_avx512 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_decrypt_update_avx512 PROC
+    push r13
+    push r12
+    push r14
+    push r15
+    push rdi
+    push rsi
+    push rbx
+    mov rax, rcx
+    mov r10, r8
+    mov r8d, edx
+    mov r11, r9
+    mov r9d, DWORD PTR [rsp+96]
+    mov r12, QWORD PTR [rsp+104]
+    mov r14, QWORD PTR [rsp+112]
+    mov r15, QWORD PTR [rsp+120]
+    sub rsp, 1200
+    vmovdqu OWORD PTR [rsp+1040], xmm6
+    vmovdqu OWORD PTR [rsp+1056], xmm7
+    vmovdqu OWORD PTR [rsp+1072], xmm8
+    vmovdqu OWORD PTR [rsp+1088], xmm9
+    vmovdqu OWORD PTR [rsp+1104], xmm10
+    vmovdqu OWORD PTR [rsp+1120], xmm11
+    vmovdqu OWORD PTR [rsp+1136], xmm12
+    vmovdqu OWORD PTR [rsp+1152], xmm13
+    vmovdqu OWORD PTR [rsp+1168], xmm14
+    vmovdqu OWORD PTR [rsp+1184], xmm15
+    vmovdqa xmm6, OWORD PTR [r12]
+    vmovdqa xmm5, OWORD PTR [r14]
+    vpsrlq xmm9, xmm5, 63
+    vpsllq xmm8, xmm5, 1
+    vpslldq xmm9, xmm9, 8
+    vpor xmm8, xmm8, xmm9
+    vpshufd xmm5, xmm5, 255
+    vpsrad xmm5, xmm5, 31
+    vpand xmm5, xmm5, OWORD PTR L_avx512_aes_gcm_mod2_128
+    vpxor xmm5, xmm5, xmm8
+    xor edi, edi
+    cmp r9d, 256
+    jl L_AES_GCM_decrypt_update_avx512_done_128
+    vmovdqa xmm2, xmm6
+    ; H ^ 1
+    vmovdqu OWORD PTR [rsp], xmm5
+    ; H ^ 2
+    vpclmulqdq xmm8, xmm5, xmm5, 0
+    vpclmulqdq xmm11, xmm5, xmm5, 17
+    vpxor xmm9, xmm9, xmm9
+    vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+    vpclmulqdq xmm12, xmm10, xmm8, 1
+    vpshufd xmm8, xmm8, 78
+    vpternlogq xmm9, xmm8, xmm12, 150
+    vpclmulqdq xmm12, xmm10, xmm9, 1
+    vpshufd xmm9, xmm9, 78
+    vpternlogq xmm11, xmm9, xmm12, 150
+    vmovdqa xmm0, xmm11
+    vmovdqu OWORD PTR [rsp+16], xmm0
+    ; H ^ 3
+    ; ghash_gfmul_red_avx
+    vpshufd xmm9, xmm5, 78
+    vpxor xmm9, xmm9, xmm5
+    vpshufd xmm10, xmm0, 78
+    vpxor xmm10, xmm10, xmm0
+    vpclmulqdq xmm8, xmm0, xmm5, 0
+    vpclmulqdq xmm11, xmm0, xmm5, 17
+    vpclmulqdq xmm9, xmm9, xmm10, 0
+    vpternlogq xmm9, xmm11, xmm8, 150
+    vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+    vpclmulqdq xmm12, xmm10, xmm8, 1 +
vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm1, xmm11 + vmovdqu OWORD PTR [rsp+32], xmm1 + ; H ^ 4 + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm3, xmm11 + vmovdqu OWORD PTR [rsp+48], xmm3 + ; H ^ 5 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+64], xmm7 + ; H ^ 6 + vpclmulqdq xmm8, xmm1, xmm1, 0 + vpclmulqdq xmm11, xmm1, xmm1, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+80], xmm7 + ; H ^ 7 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm1, 78 + vpxor xmm9, xmm9, xmm1 + vpshufd xmm10, xmm3, 78 + vpxor xmm10, xmm10, xmm3 + vpclmulqdq xmm8, xmm3, xmm1, 0 + vpclmulqdq xmm11, xmm3, xmm1, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + 
vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+96], xmm7 + ; H ^ 8 + vpclmulqdq xmm8, xmm3, xmm3, 0 + vpclmulqdq xmm11, xmm3, xmm3, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+112], xmm7 + ; H ^ 9 + vmovdqu xmm0, OWORD PTR [rsp+48] + vmovdqu xmm1, OWORD PTR [rsp+64] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+128], xmm7 + ; H ^ 10 + vmovdqu xmm0, OWORD PTR [rsp+64] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+144], xmm7 + ; H ^ 11 + vmovdqu xmm0, OWORD PTR [rsp+64] + vmovdqu xmm1, OWORD PTR [rsp+80] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + 
vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+160], xmm7 + ; H ^ 12 + vmovdqu xmm0, OWORD PTR [rsp+80] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+176], xmm7 + ; H ^ 13 + vmovdqu xmm0, OWORD PTR [rsp+80] + vmovdqu xmm1, OWORD PTR [rsp+96] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+192], xmm7 + ; H ^ 14 + vmovdqu xmm0, OWORD PTR [rsp+96] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+208], xmm7 + ; H ^ 15 + vmovdqu xmm0, OWORD PTR [rsp+96] + vmovdqu xmm1, OWORD PTR [rsp+112] + ; ghash_gfmul_red_avx + vpshufd xmm9, 
xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+224], xmm7 + ; H ^ 16 + vmovdqu xmm0, OWORD PTR [rsp+112] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+240], xmm7 + cmp r9d, 512 + jl L_AES_GCM_decrypt_update_avx512_no_ext + ; H ^ 17 + vmovdqu xmm0, OWORD PTR [rsp+112] + vmovdqu xmm1, OWORD PTR [rsp+128] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+256], xmm7 + ; H ^ 18 + vmovdqu xmm0, OWORD PTR [rsp+128] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, 
xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+272], xmm7 + ; H ^ 19 + vmovdqu xmm0, OWORD PTR [rsp+128] + vmovdqu xmm1, OWORD PTR [rsp+144] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+288], xmm7 + ; H ^ 20 + vmovdqu xmm0, OWORD PTR [rsp+144] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+304], xmm7 + ; H ^ 21 + vmovdqu xmm0, OWORD PTR [rsp+144] + vmovdqu xmm1, OWORD PTR [rsp+160] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+320], xmm7 + ; H ^ 22 + vmovdqu xmm0, OWORD PTR [rsp+160] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor 
xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+336], xmm7 + ; H ^ 23 + vmovdqu xmm0, OWORD PTR [rsp+160] + vmovdqu xmm1, OWORD PTR [rsp+176] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+352], xmm7 + ; H ^ 24 + vmovdqu xmm0, OWORD PTR [rsp+176] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+368], xmm7 + ; H ^ 25 + vmovdqu xmm0, OWORD PTR [rsp+176] + vmovdqu xmm1, OWORD PTR [rsp+192] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, 
xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+384], xmm7 + ; H ^ 26 + vmovdqu xmm0, OWORD PTR [rsp+192] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+400], xmm7 + ; H ^ 27 + vmovdqu xmm0, OWORD PTR [rsp+192] + vmovdqu xmm1, OWORD PTR [rsp+208] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+416], xmm7 + ; H ^ 28 + vmovdqu xmm0, OWORD PTR [rsp+208] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+432], xmm7 + ; H ^ 29 + vmovdqu xmm0, OWORD PTR [rsp+208] + vmovdqu xmm1, OWORD PTR [rsp+224] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR 
L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+448], xmm7 + ; H ^ 30 + vmovdqu xmm0, OWORD PTR [rsp+224] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+464], xmm7 + ; H ^ 31 + vmovdqu xmm0, OWORD PTR [rsp+224] + vmovdqu xmm1, OWORD PTR [rsp+240] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+480], xmm7 + ; H ^ 32 + vmovdqu xmm0, OWORD PTR [rsp+240] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+496], xmm7 +L_AES_GCM_decrypt_update_avx512_no_ext: + vbroadcasti32x4 zmm22, ptr_L_avx512_aes_gcm_bswap_epi64 + vbroadcasti32x4 zmm30, ptr_L_avx512_aes_gcm_bswap_mask + vbroadcasti32x4 zmm31, 
ptr_L_avx512_aes_gcm_mod2_128 + vbroadcasti32x4 zmm9, [rax] + vbroadcasti32x4 zmm10, [rax+16] + vbroadcasti32x4 zmm11, [rax+32] + vbroadcasti32x4 zmm12, [rax+48] + vbroadcasti32x4 zmm13, [rax+64] + vbroadcasti32x4 zmm14, [rax+80] + vbroadcasti32x4 zmm15, [rax+96] + vbroadcasti32x4 zmm1, [rax+112] + vbroadcasti32x4 zmm2, [rax+128] + vbroadcasti32x4 zmm3, [rax+144] + cmp r9d, 512 + jl L_AES_GCM_decrypt_update_avx512_no_windows + mov r13d, r9d + and r13d, 4294966784 + vmovdqu64 zmm23, [rsp+448] + vshufi64x2 zmm23, zmm23, zmm23, 27 + vmovdqu64 zmm24, [rsp+384] + vshufi64x2 zmm24, zmm24, zmm24, 27 + vmovdqu64 zmm25, [rsp+320] + vshufi64x2 zmm25, zmm25, zmm25, 27 + vmovdqu64 zmm26, [rsp+256] + vshufi64x2 zmm26, zmm26, zmm26, 27 + vmovdqu64 [rsp+512], zmm23 + vmovdqu64 [rsp+576], zmm24 + vmovdqu64 [rsp+640], zmm25 + vmovdqu64 [rsp+704], zmm26 + vmovdqu64 zmm23, [rsp+192] + vshufi64x2 zmm23, zmm23, zmm23, 27 + vmovdqu64 zmm24, [rsp+128] + vshufi64x2 zmm24, zmm24, zmm24, 27 + vmovdqu64 zmm25, [rsp+64] + vshufi64x2 zmm25, zmm25, zmm25, 27 + vmovdqu64 zmm26, [rsp] + vshufi64x2 zmm26, zmm26, zmm26, 27 + vmovdqu64 [rsp+768], zmm23 + vmovdqu64 [rsp+832], zmm24 + vmovdqu64 [rsp+896], zmm25 + vmovdqu64 [rsp+960], zmm26 + ; 512 bytes of input + xor esi, esi + lea rbx, QWORD PTR [r11+rdi] + vpxorq zmm20, zmm20, zmm20 + vinserti32x4 zmm20, zmm20, xmm6, 0 + vmovdqu64 zmm23, [rsp+512] + vmovdqu64 zmm24, [rsp+576] + vmovdqu64 zmm25, [rsp+640] + vmovdqu64 zmm26, [rsp+704] + vmovdqu64 zmm21, [rbx] + vpshufb zmm21, zmm21, zmm30 + vpxorq zmm21, zmm21, zmm20 + vpclmulqdq zmm16, zmm21, zmm23, 0 + vpclmulqdq zmm17, zmm21, zmm23, 1 + vpclmulqdq zmm18, zmm21, zmm23, 16 + vpclmulqdq zmm19, zmm21, zmm23, 17 + vmovdqa64 zmm27, zmm16 + vpxorq zmm28, zmm18, zmm17 + vmovdqa64 zmm29, zmm19 + vmovdqu64 zmm21, [rbx+64] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm24, 0 + vpclmulqdq zmm17, zmm21, zmm24, 1 + vpclmulqdq zmm18, zmm21, zmm24, 16 + vpclmulqdq zmm19, zmm21, zmm24, 17 + vpxorq 
zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rbx+128] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm25, 0 + vpclmulqdq zmm17, zmm21, zmm25, 1 + vpclmulqdq zmm18, zmm21, zmm25, 16 + vpclmulqdq zmm19, zmm21, zmm25, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rbx+192] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm26, 0 + vpclmulqdq zmm17, zmm21, zmm26, 1 + vpclmulqdq zmm18, zmm21, zmm26, 16 + vpclmulqdq zmm19, zmm21, zmm26, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm23, [rsp+768] + vmovdqu64 zmm24, [rsp+832] + vmovdqu64 zmm25, [rsp+896] + vmovdqu64 zmm26, [rsp+960] + vmovdqu64 zmm21, [rbx+256] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm23, 0 + vpclmulqdq zmm17, zmm21, zmm23, 1 + vpclmulqdq zmm18, zmm21, zmm23, 16 + vpclmulqdq zmm19, zmm21, zmm23, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rbx+320] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm24, 0 + vpclmulqdq zmm17, zmm21, zmm24, 1 + vpclmulqdq zmm18, zmm21, zmm24, 16 + vpclmulqdq zmm19, zmm21, zmm24, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rbx+384] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm25, 0 + vpclmulqdq zmm17, zmm21, zmm25, 1 + vpclmulqdq zmm18, zmm21, zmm25, 16 + vpclmulqdq zmm19, zmm21, zmm25, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rbx+448] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm26, 0 + vpclmulqdq zmm17, zmm21, zmm26, 1 + vpclmulqdq zmm18, zmm21, zmm26, 16 + vpclmulqdq zmm19, zmm21, zmm26, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, 
zmm29, zmm19 + vpclmulqdq zmm21, zmm31, zmm27, 1 + vpshufd zmm27, zmm27, 78 + vpternlogq zmm28, zmm27, zmm21, 150 + vpclmulqdq zmm21, zmm31, zmm28, 1 + vpshufd zmm28, zmm28, 78 + vpternlogq zmm29, zmm28, zmm21, 150 + vextracti32x4 xmm0, zmm29, 1 + vextracti32x4 xmm4, zmm29, 2 + vextracti32x4 xmm5, zmm29, 3 + vpxorq xmm6, xmm29, xmm0 + vpternlogq xmm6, xmm5, xmm4, 150 + add edi, 512 + cmp edi, r13d + jge L_AES_GCM_decrypt_update_avx512_last_aes +L_AES_GCM_decrypt_update_avx512_win_loop: + lea rbx, QWORD PTR [r11+rdi] + vpxorq zmm21, zmm21, zmm21 + vinserti32x4 zmm21, zmm21, xmm6, 0 + vbroadcasti32x4 zmm20, [r15] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [r15] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [r15], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vmovdqu64 zmm31, [rbx] + vpshufb zmm31, zmm31, zmm30 + vpxorq zmm31, zmm31, zmm21 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vpclmulqdq zmm23, zmm31, [rsp+512], 0 + vpclmulqdq zmm24, zmm31, [rsp+512], 1 + vpclmulqdq zmm25, zmm31, [rsp+512], 16 + vpclmulqdq zmm26, zmm31, [rsp+512], 17 + vmovdqa64 zmm27, zmm23 + vpxorq zmm28, zmm25, zmm24 + vmovdqa64 zmm29, zmm26 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vmovdqu64 zmm31, [rbx+64] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vpclmulqdq zmm23, zmm31, [rsp+576], 0 + vpclmulqdq zmm24, zmm31, [rsp+576], 1 + 
vpclmulqdq zmm25, zmm31, [rsp+576], 16 + vpclmulqdq zmm26, zmm31, [rsp+576], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vmovdqu64 zmm31, [rbx+128] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vpclmulqdq zmm23, zmm31, [rsp+640], 0 + vpclmulqdq zmm24, zmm31, [rsp+640], 1 + vpclmulqdq zmm25, zmm31, [rsp+640], 16 + vpclmulqdq zmm26, zmm31, [rsp+640], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vmovdqu64 zmm31, [rbx+192] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vpclmulqdq zmm23, zmm31, [rsp+704], 0 + vpclmulqdq zmm24, zmm31, [rsp+704], 1 + vpclmulqdq zmm25, zmm31, [rsp+704], 16 + vpclmulqdq zmm26, zmm31, [rsp+704], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r8d, 11 + vbroadcasti32x4 zmm20, [rax+160] + jl L_AES_GCM_decrypt_update_avx512_a_il_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r8d, 13 + vbroadcasti32x4 zmm20, [rax+192] + jl L_AES_GCM_decrypt_update_avx512_a_il_last + vaesenc 
zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+224] +L_AES_GCM_decrypt_update_avx512_a_il_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [r11+rsi] + lea rdx, QWORD PTR [r10+rsi] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + add esi, 256 + vbroadcasti32x4 zmm20, [r15] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [r15] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [r15], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vmovdqu64 zmm31, [rbx+256] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vpclmulqdq zmm23, zmm31, [rsp+768], 0 + vpclmulqdq zmm24, zmm31, [rsp+768], 1 + vpclmulqdq zmm25, zmm31, [rsp+768], 16 + vpclmulqdq zmm26, zmm31, [rsp+768], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, 
zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vmovdqu64 zmm31, [rbx+320] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vpclmulqdq zmm23, zmm31, [rsp+832], 0 + vpclmulqdq zmm24, zmm31, [rsp+832], 1 + vpclmulqdq zmm25, zmm31, [rsp+832], 16 + vpclmulqdq zmm26, zmm31, [rsp+832], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vmovdqu64 zmm31, [rbx+384] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vpclmulqdq zmm23, zmm31, [rsp+896], 0 + vpclmulqdq zmm24, zmm31, [rsp+896], 1 + vpclmulqdq zmm25, zmm31, [rsp+896], 16 + vpclmulqdq zmm26, zmm31, [rsp+896], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vmovdqu64 zmm31, [rbx+448] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vpclmulqdq zmm23, zmm31, [rsp+960], 0 + vpclmulqdq zmm24, zmm31, [rsp+960], 1 + vpclmulqdq zmm25, zmm31, [rsp+960], 16 + vpclmulqdq zmm26, zmm31, [rsp+960], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r8d, 11 + vbroadcasti32x4 zmm20, [rax+160] + jl L_AES_GCM_decrypt_update_avx512_b_il_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, 
zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r8d, 13 + vbroadcasti32x4 zmm20, [rax+192] + jl L_AES_GCM_decrypt_update_avx512_b_il_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+224] +L_AES_GCM_decrypt_update_avx512_b_il_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [r11+rsi] + lea rdx, QWORD PTR [r10+rsi] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + add esi, 256 + vbroadcasti32x4 zmm31, ptr_L_avx512_aes_gcm_mod2_128 + vpclmulqdq zmm23, zmm31, zmm27, 1 + vpshufd zmm27, zmm27, 78 + vpternlogq zmm28, zmm27, zmm23, 150 + vpclmulqdq zmm23, zmm31, zmm28, 1 + vpshufd zmm28, zmm28, 78 + vpternlogq zmm29, zmm28, zmm23, 150 + vextracti32x4 xmm0, zmm29, 1 + vextracti32x4 xmm4, zmm29, 2 + vextracti32x4 xmm5, zmm29, 3 + vpxorq xmm6, xmm29, xmm0 + vpternlogq xmm6, xmm5, xmm4, 150 + add edi, 512 + cmp edi, r13d + jl L_AES_GCM_decrypt_update_avx512_win_loop +L_AES_GCM_decrypt_update_avx512_last_aes: + vbroadcasti32x4 zmm20, [r15] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, 
zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [r15] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [r15], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r8d, 11 + vbroadcasti32x4 zmm20, [rax+160] + jl L_AES_GCM_decrypt_update_avx512_l1_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r8d, 13 + vbroadcasti32x4 zmm20, [rax+192] + jl L_AES_GCM_decrypt_update_avx512_l1_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + 
vbroadcasti32x4 zmm20, [rax+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+224] +L_AES_GCM_decrypt_update_avx512_l1_avx512_ctr16_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [r11+rsi] + lea rdx, QWORD PTR [r10+rsi] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + add esi, 256 + vbroadcasti32x4 zmm20, [r15] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [r15] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [r15], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vaesenc zmm16, 
zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r8d, 11 + vbroadcasti32x4 zmm20, [rax+160] + jl L_AES_GCM_decrypt_update_avx512_l2_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r8d, 13 + vbroadcasti32x4 zmm20, [rax+192] + jl L_AES_GCM_decrypt_update_avx512_l2_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+224] +L_AES_GCM_decrypt_update_avx512_l2_avx512_ctr16_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [r11+rsi] + lea rdx, QWORD PTR [r10+rsi] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + add esi, 256 +L_AES_GCM_decrypt_update_avx512_no_windows: + vmovdqu64 zmm23, [rsp+192] + vshufi64x2 zmm23, zmm23, zmm23, 27 + vmovdqu64 zmm24, [rsp+128] + 
vshufi64x2 zmm24, zmm24, zmm24, 27 + vmovdqu64 zmm25, [rsp+64] + vshufi64x2 zmm25, zmm25, zmm25, 27 + vmovdqu64 zmm26, [rsp] + vshufi64x2 zmm26, zmm26, zmm26, 27 + mov r13d, r9d + and r13d, 4294967040 + cmp edi, r13d + jge L_AES_GCM_decrypt_update_avx512_after_256 + ; 256 bytes of input + lea rbx, QWORD PTR [r11+rdi] + vpxorq zmm20, zmm20, zmm20 + vinserti32x4 zmm20, zmm20, xmm6, 0 + vmovdqu64 zmm21, [rbx] + vpshufb zmm21, zmm21, zmm30 + vpxorq zmm21, zmm21, zmm20 + vpclmulqdq zmm16, zmm21, zmm23, 0 + vpclmulqdq zmm17, zmm21, zmm23, 1 + vpclmulqdq zmm18, zmm21, zmm23, 16 + vpclmulqdq zmm19, zmm21, zmm23, 17 + vmovdqa64 zmm27, zmm16 + vpxorq zmm28, zmm18, zmm17 + vmovdqa64 zmm29, zmm19 + vmovdqu64 zmm21, [rbx+64] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm24, 0 + vpclmulqdq zmm17, zmm21, zmm24, 1 + vpclmulqdq zmm18, zmm21, zmm24, 16 + vpclmulqdq zmm19, zmm21, zmm24, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rbx+128] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm25, 0 + vpclmulqdq zmm17, zmm21, zmm25, 1 + vpclmulqdq zmm18, zmm21, zmm25, 16 + vpclmulqdq zmm19, zmm21, zmm25, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rbx+192] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm26, 0 + vpclmulqdq zmm17, zmm21, zmm26, 1 + vpclmulqdq zmm18, zmm21, zmm26, 16 + vpclmulqdq zmm19, zmm21, zmm26, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vpclmulqdq zmm21, zmm31, zmm27, 1 + vpshufd zmm27, zmm27, 78 + vpternlogq zmm28, zmm27, zmm21, 150 + vpclmulqdq zmm21, zmm31, zmm28, 1 + vpshufd zmm28, zmm28, 78 + vpternlogq zmm29, zmm28, zmm21, 150 + vextracti32x4 xmm0, zmm29, 1 + vextracti32x4 xmm4, zmm29, 2 + vextracti32x4 xmm5, zmm29, 3 + vpxorq xmm6, xmm29, xmm0 + vpternlogq xmm6, xmm5, xmm4, 150 + vbroadcasti32x4 zmm20, [r15] + vpaddd zmm16, 
zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [r15] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [r15], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r8d, 11 + vbroadcasti32x4 zmm20, [rax+160] + jl L_AES_GCM_decrypt_update_avx512_t_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r8d, 
13 + vbroadcasti32x4 zmm20, [rax+192] + jl L_AES_GCM_decrypt_update_avx512_t_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+224] +L_AES_GCM_decrypt_update_avx512_t_avx512_ctr16_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [r11+rdi] + lea rdx, QWORD PTR [r10+rdi] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + add edi, 256 +L_AES_GCM_decrypt_update_avx512_after_256: + vmovdqu xmm5, OWORD PTR [rsp] +L_AES_GCM_decrypt_update_avx512_done_128: + mov edx, r9d + cmp edi, edx + jge L_AES_GCM_decrypt_update_avx512_done_dec + mov r13d, r9d + and r13d, 4294967280 + cmp edi, r13d + jge L_AES_GCM_decrypt_update_avx512_last_block_done +L_AES_GCM_decrypt_update_avx512_last_block_start: + vmovdqu xmm13, OWORD PTR [r11+rdi] + vmovdqa xmm0, xmm5 + vpshufb xmm1, xmm13, OWORD PTR L_avx512_aes_gcm_bswap_mask + vpxor xmm1, xmm1, xmm6 + vmovdqu xmm9, OWORD PTR [r15] + vpshufb xmm8, xmm9, OWORD PTR L_avx512_aes_gcm_bswap_epi64 + vpaddd xmm9, xmm9, OWORD PTR L_avx512_aes_gcm_one + vmovdqu OWORD PTR [r15], xmm9 + vpxor xmm8, xmm8, [rax] + vpclmulqdq xmm10, xmm1, xmm0, 16 + vaesenc xmm8, xmm8, [rax+16] + vaesenc xmm8, xmm8, [rax+32] + vpclmulqdq xmm11, xmm1, xmm0, 1 + vaesenc xmm8, xmm8, [rax+48] + vaesenc xmm8, xmm8, [rax+64] + vpclmulqdq xmm12, xmm1, xmm0, 0 + vaesenc xmm8, xmm8, [rax+80] + vpclmulqdq xmm1, xmm1, 
xmm0, 17 + vaesenc xmm8, xmm8, [rax+96] + vpxor xmm10, xmm10, xmm11 + vpslldq xmm2, xmm10, 8 + vpsrldq xmm10, xmm10, 8 + vaesenc xmm8, xmm8, [rax+112] + vpxor xmm2, xmm2, xmm12 + vpxor xmm3, xmm1, xmm10 + vmovdqa xmm0, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm2, xmm0, 16 + vaesenc xmm8, xmm8, [rax+128] + vpshufd xmm10, xmm2, 78 + vpxor xmm10, xmm10, xmm11 + vpclmulqdq xmm11, xmm10, xmm0, 16 + vaesenc xmm8, xmm8, [rax+144] + vpshufd xmm10, xmm10, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm6, xmm10, xmm3 + cmp r8d, 11 + vmovdqa xmm9, OWORD PTR [rax+160] + jl L_AES_GCM_decrypt_update_avx512_aesenc_gfmul_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [rax+176] + cmp r8d, 13 + vmovdqa xmm9, OWORD PTR [rax+192] + jl L_AES_GCM_decrypt_update_avx512_aesenc_gfmul_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [rax+208] + vmovdqa xmm9, OWORD PTR [rax+224] +L_AES_GCM_decrypt_update_avx512_aesenc_gfmul_last: + vaesenclast xmm8, xmm8, xmm9 + vmovdqa xmm0, xmm13 + vpxor xmm8, xmm8, xmm0 + vmovdqu OWORD PTR [r10+rdi], xmm8 + add edi, 16 + cmp edi, r13d + jl L_AES_GCM_decrypt_update_avx512_last_block_start +L_AES_GCM_decrypt_update_avx512_last_block_done: +L_AES_GCM_decrypt_update_avx512_done_dec: + vmovdqa OWORD PTR [r12], xmm6 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp+1040] + vmovdqu xmm7, OWORD PTR [rsp+1056] + vmovdqu xmm8, OWORD PTR [rsp+1072] + vmovdqu xmm9, OWORD PTR [rsp+1088] + vmovdqu xmm10, OWORD PTR [rsp+1104] + vmovdqu xmm11, OWORD PTR [rsp+1120] + vmovdqu xmm12, OWORD PTR [rsp+1136] + vmovdqu xmm13, OWORD PTR [rsp+1152] + vmovdqu xmm14, OWORD PTR [rsp+1168] + vmovdqu xmm15, OWORD PTR [rsp+1184] + add rsp, 1200 + pop rbx + pop rsi + pop rdi + pop r15 + pop r14 + pop r12 + pop r13 + ret +AES_GCM_decrypt_update_avx512 ENDP +_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_decrypt_final_avx512 PROC ; Folds the length block into the 16-byte value at arg 1, multiplies by the doubled value from arg 6, XORs with the block at arg 7, compares the result with the tag at arg 2 over arg 3 bytes and stores 1 (equal) or 0 as a DWORD through arg 8.
+        push r13 ; r13, r12, r14, rbp and r15 are all overwritten below, so they are saved first
+        push r12
+        push r14
+        push rbp
+        push r15
+        mov rax, rcx ; arg 1: pointer to the 16-byte block loaded into xmm6 (presumably the running GHASH value - confirm against the C prototype)
+        mov r10d, r9d ; arg 4: 32-bit byte count that becomes the low 64 bits of the length block
+        mov r9, rdx ; arg 2: pointer to the tag the computed tag is compared with
+        mov r11d, DWORD PTR [rsp+80] ; arg 5: 32-bit byte count for the high 64 bits of the length block; 80 = 5 pushes (40) plus return address (8) plus 32 bytes of home space
+        mov r12, QWORD PTR [rsp+88] ; arg 6: pointer to the 16-byte multiplier loaded into xmm5 (presumably the hash key H - confirm)
+        mov r14, QWORD PTR [rsp+96] ; arg 7: pointer to the 16-byte block XORed into the result (presumably the encrypted initial counter - confirm)
+        mov rbp, QWORD PTR [rsp+104] ; arg 8: pointer to the 32-bit result
+        sub rsp, 160 ; scratch area; the nine XMM saves below occupy [rsp+16] through [rsp+159]
+        vmovdqu OWORD PTR [rsp+16], xmm6 ; save every XMM register from xmm6 upward that this routine modifies (xmm14 is never touched)
+        vmovdqu OWORD PTR [rsp+32], xmm7
+        vmovdqu OWORD PTR [rsp+48], xmm8
+        vmovdqu OWORD PTR [rsp+64], xmm9
+        vmovdqu OWORD PTR [rsp+80], xmm10
+        vmovdqu OWORD PTR [rsp+96], xmm11
+        vmovdqu OWORD PTR [rsp+112], xmm12
+        vmovdqu OWORD PTR [rsp+128], xmm13
+        vmovdqu OWORD PTR [rsp+144], xmm15
+        vmovdqa xmm6, OWORD PTR [rax] ; aligned loads: all three pointers must be 16-byte aligned
+        vmovdqa xmm5, OWORD PTR [r12]
+        vmovdqa xmm15, OWORD PTR [r14]
+        vpsrlq xmm8, xmm5, 63 ; top bit of each 64-bit half of xmm5
+        vpsllq xmm7, xmm5, 1 ; each half shifted left by one bit
+        vpslldq xmm8, xmm8, 8 ; carry the low half's top bit into the high half
+        vpor xmm7, xmm7, xmm8 ; xmm7 = xmm5 shifted left by one as a 128-bit value
+        vpshufd xmm5, xmm5, 255 ; broadcast the most significant dword
+        vpsrad xmm5, xmm5, 31 ; all ones when the original top bit was set, else zero
+        vpand xmm5, xmm5, OWORD PTR L_avx512_aes_gcm_mod2_128 ; select the reduction constant only when a bit was shifted out
+        vpxor xmm5, xmm5, xmm7 ; xmm5 = doubled multiplier with the conditional reduction applied
+        mov edx, r10d
+        mov ecx, r11d
+        shl rdx, 3 ; bytes to bits
+        shl rcx, 3 ; bytes to bits
+        vmovq xmm0, rdx
+        vmovq xmm1, rcx
+        vpunpcklqdq xmm0, xmm0, xmm1 ; length block: low qword = arg 4 in bits, high qword = arg 5 in bits
+        vpxor xmm6, xmm6, xmm0 ; fold the length block into xmm6
+        ; ghash_gfmul_red_avx
+        vpshufd xmm8, xmm5, 78 ; swap the 64-bit halves of xmm5
+        vpxor xmm8, xmm8, xmm5 ; high half XOR low half of xmm5
+        vpshufd xmm9, xmm6, 78
+        vpxor xmm9, xmm9, xmm6 ; high half XOR low half of xmm6
+        vpclmulqdq xmm7, xmm6, xmm5, 0 ; low half times low half
+        vpclmulqdq xmm10, xmm6, xmm5, 17 ; high half times high half
+        vpclmulqdq xmm8, xmm8, xmm9, 0 ; product of the two half-sums
+        vpternlogq xmm8, xmm10, xmm7, 150 ; immediate 150 (0x96) is a three-way XOR: xmm8 becomes the middle term
+        vmovdqa xmm9, OWORD PTR L_avx512_aes_gcm_mod2_128 ; reduction constant
+        vpclmulqdq xmm11, xmm9, xmm7, 1 ; first reduction step on the low product
+        vpshufd xmm7, xmm7, 78
+        vpternlogq xmm8, xmm7, xmm11, 150
+        vpclmulqdq xmm11, xmm9, xmm8, 1 ; second reduction step on the folded middle term
+        vpshufd xmm8, xmm8, 78
+        vpternlogq xmm10, xmm8, xmm11, 150 ; xmm10 = reduced 128-bit product
+        vmovdqa xmm6, xmm10
+        vpshufb xmm6, xmm6, OWORD PTR L_avx512_aes_gcm_bswap_mask ; byte shuffle by the bswap mask constant
+        vpxor xmm0, xmm6, xmm15 ; xmm0 = computed tag
+        cmp r8d, 16 ; arg 3: number of tag bytes to compare
+        je L_AES_GCM_decrypt_final_avx512_cmp_tag_16
+        sub rsp, 16 ; temporary 16-byte buffer for the computed tag
+        xor rcx, rcx ; byte index
+        xor r15, r15 ; accumulated difference
+        vmovdqu OWORD PTR [rsp], xmm0
+L_AES_GCM_decrypt_final_avx512_cmp_tag_loop: ; every byte is always examined: there is no early exit on mismatch, and the loop assumes arg 3 is non-zero
+        movzx r13d, BYTE PTR [rsp+rcx]
+        xor r13b, BYTE PTR [r9+rcx] ; non-zero when the two bytes differ
+        or r15b, r13b ; remember any difference seen so far
+        inc ecx
+        cmp ecx, r8d
+        jne L_AES_GCM_decrypt_final_avx512_cmp_tag_loop
+        cmp r15b, 0
+        sete r15b ; r15 = 1 when no byte differed, else 0
+        add rsp, 16 ; release the temporary buffer
+        xor rcx, rcx ; clear the loop index
+        jmp L_AES_GCM_decrypt_final_avx512_cmp_tag_done
+L_AES_GCM_decrypt_final_avx512_cmp_tag_16: ; full 16-byte tag: compare in one vector operation
+        vmovdqu xmm1, OWORD PTR [r9]
+        vpcmpeqb xmm0, xmm0, xmm1 ; 0xFF in every byte position that matches
+        vpmovmskb rdx, xmm0 ; one mask bit per byte
+        ; edx == 0xFFFF (all 16 bytes equal) then return 1 else return 0
+        xor r15d, r15d
+        cmp edx, 65535
+        sete r15b
+L_AES_GCM_decrypt_final_avx512_cmp_tag_done:
+        mov DWORD PTR [rbp], r15d ; store the 1/0 result through arg 8
+        vzeroupper
+        vmovdqu xmm6, OWORD PTR [rsp+16] ; restore the XMM registers saved in the prologue
+        vmovdqu xmm7, OWORD PTR [rsp+32]
+        vmovdqu xmm8, OWORD PTR [rsp+48]
+        vmovdqu xmm9, OWORD PTR [rsp+64]
+        vmovdqu xmm10, OWORD PTR [rsp+80]
+        vmovdqu xmm11, OWORD PTR [rsp+96]
+        vmovdqu xmm12, OWORD PTR [rsp+112]
+        vmovdqu xmm13, OWORD PTR [rsp+128]
+        vmovdqu xmm15, OWORD PTR [rsp+144]
+        add rsp, 160
+        pop r15 ; pops mirror the pushes in reverse order
+        pop rbp
+        pop r14
+        pop r12
+        pop r13
+        ret
+AES_GCM_decrypt_final_avx512 ENDP
+_TEXT ENDS
+ENDIF END diff --git a/wolfcrypt/src/aes_x86_64_asm.S b/wolfcrypt/src/aes_x86_64_asm.S new file mode 100644 index 00000000000..9eb85b49c73 --- /dev/null +++ b/wolfcrypt/src/aes_x86_64_asm.S @@ -0,0 +1,4375 @@ +/* aes_x86_64_asm.S */ +/* + * Copyright (C) 2006-2026 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +#ifdef WOLFSSL_USER_SETTINGS +#ifdef WOLFSSL_USER_SETTINGS_ASM +/* + * user_settings_asm.h is a file generated by the script user_settings_asm.sh.
+ * The script takes in a user_settings.h and produces user_settings_asm.h, which + * is a stripped down version of user_settings.h containing only preprocessor + * directives. This makes the header safe to include in assembly (.S) files. + */ +#include "user_settings_asm.h" +#else +/* + * Note: if user_settings.h contains any C code (e.g. a typedef or function + * prototype), including it here in an assembly (.S) file will cause an + * assembler failure. See user_settings_asm.h above. + */ +#include "user_settings.h" +#endif /* WOLFSSL_USER_SETTINGS_ASM */ +#endif /* WOLFSSL_USER_SETTINGS */ + +#ifndef HAVE_INTEL_AVX1 +#define HAVE_INTEL_AVX1 +#endif /* HAVE_INTEL_AVX1 */ +#ifndef NO_AVX2_SUPPORT +#ifndef HAVE_INTEL_AVX2 +#define HAVE_INTEL_AVX2 +#endif /* HAVE_INTEL_AVX2 */ +#endif /* NO_AVX2_SUPPORT */ +#ifndef NO_VAES_SUPPORT +#ifndef HAVE_INTEL_VAES +#define HAVE_INTEL_VAES +#endif /* HAVE_INTEL_VAES */ +#endif /* NO_VAES_SUPPORT */ +#ifndef NO_AVX512_SUPPORT +#ifndef HAVE_INTEL_AVX512 +#define HAVE_INTEL_AVX512 +#endif /* HAVE_INTEL_AVX512 */ +#endif /* NO_AVX512_SUPPORT */ + +#ifdef WOLFSSL_X86_64_BUILD
+#ifndef __APPLE__
+.text
+.globl AES_128_Key_Expansion_AESNI
+.type AES_128_Key_Expansion_AESNI,@function
+.align 16
+AES_128_Key_Expansion_AESNI: /* Reads the 16-byte key at (%rdi) and writes eleven 16-byte round keys to (%rsi) at offsets 0 through 160; only xmm0, xmm1 and xmm2 are modified. */
+#else
+.section __TEXT,__text
+.globl _AES_128_Key_Expansion_AESNI
+.p2align 4
+_AES_128_Key_Expansion_AESNI: /* same routine under the underscore-prefixed symbol name used when __APPLE__ is defined */
+#endif /* __APPLE__ */
+        movdqu (%rdi), %xmm0 /* unaligned load of the 128-bit key */
+        movdqu %xmm0, (%rsi) /* round key 0 is the key itself */
+        aeskeygenassist $0x01, %xmm0, %xmm1 /* round constant 0x01; dword 3 of xmm1 = RotWord(SubWord(last key word)) XOR rcon */
+        pshufd $0xff, %xmm1, %xmm1 /* broadcast that dword to all four lanes */
+        movdqa %xmm0, %xmm2 /* xmm2 = copy of the previous round key, shifted up a word at a time below */
+        pslldq $4, %xmm2
+        pxor %xmm2, %xmm0
+        pslldq $4, %xmm2
+        pxor %xmm2, %xmm0
+        pslldq $4, %xmm2
+        pxor %xmm2, %xmm0 /* word i of xmm0 is now the XOR of previous-key words 0 through i */
+        pxor %xmm1, %xmm0 /* mix in the broadcast word: xmm0 = next round key */
+        movdqu %xmm0, 16(%rsi) /* round key 1 */
+        aeskeygenassist $2, %xmm0, %xmm1 /* round constant 0x02; same sequence as above for every remaining round */
+        pshufd $0xff, %xmm1, %xmm1
+        movdqa %xmm0, %xmm2
+        pslldq $4, %xmm2
+        pxor %xmm2, %xmm0
+        pslldq $4, %xmm2
+        pxor %xmm2, %xmm0
+        pslldq $4, %xmm2
+        pxor %xmm2, %xmm0
+        pxor %xmm1, %xmm0
+        movdqu %xmm0, 32(%rsi) /* round key 2 */
+        aeskeygenassist $4, %xmm0, %xmm1 /* round constant 0x04 */
+        pshufd $0xff, %xmm1, %xmm1
+        movdqa %xmm0, %xmm2
+        pslldq $4, %xmm2
+        pxor %xmm2, %xmm0
+        pslldq $4, %xmm2
+        pxor %xmm2, %xmm0
+        pslldq $4, %xmm2
+        pxor %xmm2, %xmm0
+        pxor %xmm1, %xmm0
+        movdqu %xmm0, 48(%rsi) /* round key 3 */
+        aeskeygenassist $8, %xmm0, %xmm1 /* round constant 0x08 */
+        pshufd $0xff, %xmm1, %xmm1
+        movdqa %xmm0, %xmm2
+        pslldq $4, %xmm2
+        pxor %xmm2, %xmm0
+        pslldq $4, %xmm2
+        pxor %xmm2, %xmm0
+        pslldq $4, %xmm2
+        pxor %xmm2, %xmm0
+        pxor %xmm1, %xmm0
+        movdqu %xmm0, 64(%rsi) /* round key 4 */
+        aeskeygenassist $16, %xmm0, %xmm1 /* round constant 0x10 */
+        pshufd $0xff, %xmm1, %xmm1
+        movdqa %xmm0, %xmm2
+        pslldq $4, %xmm2
+        pxor %xmm2, %xmm0
+        pslldq $4, %xmm2
+        pxor %xmm2, %xmm0
+        pslldq $4, %xmm2
+        pxor %xmm2, %xmm0
+        pxor %xmm1, %xmm0
+        movdqu %xmm0, 80(%rsi) /* round key 5 */
+        aeskeygenassist $32, %xmm0, %xmm1 /* round constant 0x20 */
+        pshufd $0xff, %xmm1, %xmm1
+        movdqa %xmm0, %xmm2
+        pslldq $4, %xmm2
+        pxor %xmm2, %xmm0
+        pslldq $4, %xmm2
+        pxor %xmm2, %xmm0
+        pslldq $4, %xmm2
+        pxor %xmm2, %xmm0
+        pxor %xmm1, %xmm0
+        movdqu %xmm0, 96(%rsi) /* round key 6 */
+        aeskeygenassist $0x40, %xmm0, %xmm1 /* round constant 0x40 */
+        pshufd $0xff, %xmm1, %xmm1
+        movdqa %xmm0, %xmm2
+        pslldq $4, %xmm2
+        pxor %xmm2, %xmm0
+        pslldq $4, %xmm2
+        pxor %xmm2, %xmm0
+        pslldq $4, %xmm2
+        pxor %xmm2, %xmm0
+        pxor %xmm1, %xmm0
+        movdqu %xmm0, 112(%rsi) /* round key 7 */
+        aeskeygenassist $0x80, %xmm0, %xmm1 /* round constant 0x80 */
+        pshufd $0xff, %xmm1, %xmm1
+        movdqa %xmm0, %xmm2
+        pslldq $4, %xmm2
+        pxor %xmm2, %xmm0
+        pslldq $4, %xmm2
+        pxor %xmm2, %xmm0
+        pslldq $4, %xmm2
+        pxor %xmm2, %xmm0
+        pxor %xmm1, %xmm0
+        movdqu %xmm0, 128(%rsi) /* round key 8 */
+        aeskeygenassist $27, %xmm0, %xmm1 /* round constant 0x1b */
+        pshufd $0xff, %xmm1, %xmm1
+        movdqa %xmm0, %xmm2
+        pslldq $4, %xmm2
+        pxor %xmm2, %xmm0
+        pslldq $4, %xmm2
+        pxor %xmm2, %xmm0
+        pslldq $4, %xmm2
+        pxor %xmm2, %xmm0
+        pxor %xmm1, %xmm0
+        movdqu %xmm0, 144(%rsi) /* round key 9 */
+        aeskeygenassist $54, %xmm0, %xmm1 /* round constant 0x36 */
+        pshufd $0xff, %xmm1, %xmm1
+        movdqa %xmm0, %xmm2
+        pslldq $4, %xmm2
+        pxor %xmm2, %xmm0
+        pslldq $4, %xmm2
+        pxor %xmm2, %xmm0
+        pslldq $4, %xmm2
+        pxor %xmm2, %xmm0
+        pxor %xmm1, %xmm0
+        movdqu %xmm0, 160(%rsi) /* round key 10, the last one for a 128-bit key */
+        repz retq /* return, encoded with a rep prefix */
+#ifndef __APPLE__
+.size AES_128_Key_Expansion_AESNI,.-AES_128_Key_Expansion_AESNI
+#endif /* __APPLE__ */
+#ifndef __APPLE__ +.text +.globl AES_192_Key_Expansion_AESNI +.type AES_192_Key_Expansion_AESNI,@function +.align 16 +AES_192_Key_Expansion_AESNI: +#else +.section __TEXT,__text +.globl _AES_192_Key_Expansion_AESNI +.p2align 4 +_AES_192_Key_Expansion_AESNI: +#endif /* __APPLE__ */ + movdqu (%rdi), %xmm0 + pxor %xmm1, %xmm1 + pinsrq $0x00, 16(%rdi), %xmm1 + movdqu %xmm0, (%rsi) + movdqa %xmm1, %xmm4 + aeskeygenassist $0x01, %xmm1, %xmm2 + pshufd $0x55, %xmm2, %xmm2 + movdqa %xmm0, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pxor %xmm2, %xmm0 + pshufd $0xff, %xmm0, %xmm2 + movdqa %xmm1, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + pxor %xmm2, %xmm1 + shufpd $0x00, %xmm0, %xmm4 + movdqu %xmm4, 16(%rsi) + movdqa %xmm0, %xmm5 + shufpd $0x01, %xmm1, %xmm5 + movdqu %xmm5, 32(%rsi) + aeskeygenassist $2, %xmm1, %xmm2 + pshufd $0x55, %xmm2, %xmm2 + movdqa %xmm0, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pxor %xmm2, %xmm0 + pshufd $0xff, %xmm0, %xmm2 + movdqa %xmm1, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + pxor %xmm2, %xmm1 + movdqu %xmm0, 48(%rsi) + movdqa %xmm1, %xmm4 + aeskeygenassist $4, %xmm1, %xmm2 + pshufd $0x55, %xmm2, %xmm2 + movdqa %xmm0, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pxor %xmm2, %xmm0 + pshufd $0xff, %xmm0, %xmm2 + movdqa %xmm1, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + pxor %xmm2, %xmm1 + shufpd $0x00, %xmm0, %xmm4 + movdqu %xmm4, 64(%rsi) + movdqa %xmm0, %xmm5 + shufpd $0x01, %xmm1, %xmm5 + movdqu %xmm5, 80(%rsi) + aeskeygenassist $8, %xmm1, %xmm2 + pshufd $0x55, %xmm2, %xmm2 + movdqa %xmm0, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 +
pxor %xmm2, %xmm0 + pshufd $0xff, %xmm0, %xmm2 + movdqa %xmm1, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + pxor %xmm2, %xmm1 + movdqu %xmm0, 96(%rsi) + movdqa %xmm1, %xmm4 + aeskeygenassist $16, %xmm1, %xmm2 + pshufd $0x55, %xmm2, %xmm2 + movdqa %xmm0, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pxor %xmm2, %xmm0 + pshufd $0xff, %xmm0, %xmm2 + movdqa %xmm1, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + pxor %xmm2, %xmm1 + shufpd $0x00, %xmm0, %xmm4 + movdqu %xmm4, 112(%rsi) + movdqa %xmm0, %xmm5 + shufpd $0x01, %xmm1, %xmm5 + movdqu %xmm5, 128(%rsi) + aeskeygenassist $32, %xmm1, %xmm2 + pshufd $0x55, %xmm2, %xmm2 + movdqa %xmm0, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pxor %xmm2, %xmm0 + pshufd $0xff, %xmm0, %xmm2 + movdqa %xmm1, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + pxor %xmm2, %xmm1 + movdqu %xmm0, 144(%rsi) + movdqa %xmm1, %xmm4 + aeskeygenassist $0x40, %xmm1, %xmm2 + pshufd $0x55, %xmm2, %xmm2 + movdqa %xmm0, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pxor %xmm2, %xmm0 + pshufd $0xff, %xmm0, %xmm2 + movdqa %xmm1, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + pxor %xmm2, %xmm1 + shufpd $0x00, %xmm0, %xmm4 + movdqu %xmm4, 160(%rsi) + movdqa %xmm0, %xmm5 + shufpd $0x01, %xmm1, %xmm5 + movdqu %xmm5, 176(%rsi) + aeskeygenassist $0x80, %xmm1, %xmm2 + pshufd $0x55, %xmm2, %xmm2 + movdqa %xmm0, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pxor %xmm2, %xmm0 + pshufd $0xff, %xmm0, %xmm2 + movdqa %xmm1, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + pxor %xmm2, %xmm1 + movdqu %xmm0, 192(%rsi) + movdqu %xmm1, 208(%rsi) + repz retq +#ifndef __APPLE__ +.size AES_192_Key_Expansion_AESNI,.-AES_192_Key_Expansion_AESNI +#endif /* __APPLE__ */ 
+#ifndef __APPLE__
+.text
+.globl AES_256_Key_Expansion_AESNI
+.type AES_256_Key_Expansion_AESNI,@function
+.align 16
+AES_256_Key_Expansion_AESNI: /* reads a 32-byte key at (%rdi), stores 15 round keys (240 bytes) at (%rsi); uses xmm0-xmm3 */
+#else
+.section __TEXT,__text
+.globl _AES_256_Key_Expansion_AESNI
+.p2align 4
+_AES_256_Key_Expansion_AESNI: /* Mach-O spelling of the same entry point */
+#endif /* __APPLE__ */
+ movdqu (%rdi), %xmm0
+ movdqu 16(%rdi), %xmm1
+ movdqu %xmm0, (%rsi) /* round keys 0 and 1 are the two key halves */
+ movdqu %xmm1, 16(%rsi)
+ aeskeygenassist $0x01, %xmm1, %xmm2 /* even step: round constant in the immediate, dword 3 broadcast, result updates xmm0 */
+ pshufd $0xff, %xmm2, %xmm2
+ movdqa %xmm0, %xmm3
+ pslldq $4, %xmm3 /* three shift-left-4-bytes/xor steps leave each dword as the xor of itself and all lower dwords */
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pxor %xmm2, %xmm0
+ movdqu %xmm0, 32(%rsi)
+ aeskeygenassist $0x00, %xmm0, %xmm2 /* odd step: zero immediate, dword 2 broadcast (pshufd $0xaa), result updates xmm1 */
+ pshufd $0xaa, %xmm2, %xmm2
+ movdqa %xmm1, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pxor %xmm2, %xmm1
+ movdqu %xmm1, 48(%rsi)
+ aeskeygenassist $2, %xmm1, %xmm2
+ pshufd $0xff, %xmm2, %xmm2
+ movdqa %xmm0, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pxor %xmm2, %xmm0
+ movdqu %xmm0, 64(%rsi)
+ aeskeygenassist $0x00, %xmm0, %xmm2
+ pshufd $0xaa, %xmm2, %xmm2
+ movdqa %xmm1, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pxor %xmm2, %xmm1
+ movdqu %xmm1, 80(%rsi)
+ aeskeygenassist $4, %xmm1, %xmm2
+ pshufd $0xff, %xmm2, %xmm2
+ movdqa %xmm0, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pxor %xmm2, %xmm0
+ movdqu %xmm0, 96(%rsi)
+ aeskeygenassist $0x00, %xmm0, %xmm2
+ pshufd $0xaa, %xmm2, %xmm2
+ movdqa %xmm1, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pxor %xmm2, %xmm1
+ movdqu %xmm1, 112(%rsi)
+ aeskeygenassist $8, %xmm1, %xmm2
+ pshufd $0xff, %xmm2, %xmm2
+ movdqa %xmm0, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pxor %xmm2, %xmm0
+ movdqu %xmm0, 128(%rsi)
+ aeskeygenassist $0x00, %xmm0, %xmm2
+ pshufd $0xaa, %xmm2, %xmm2
+ movdqa %xmm1, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pxor %xmm2, %xmm1
+ movdqu %xmm1, 144(%rsi)
+ aeskeygenassist $16, %xmm1, %xmm2
+ pshufd $0xff, %xmm2, %xmm2
+ movdqa %xmm0, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pxor %xmm2, %xmm0
+ movdqu %xmm0, 160(%rsi)
+ aeskeygenassist $0x00, %xmm0, %xmm2
+ pshufd $0xaa, %xmm2, %xmm2
+ movdqa %xmm1, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pxor %xmm2, %xmm1
+ movdqu %xmm1, 176(%rsi)
+ aeskeygenassist $32, %xmm1, %xmm2
+ pshufd $0xff, %xmm2, %xmm2
+ movdqa %xmm0, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pxor %xmm2, %xmm0
+ movdqu %xmm0, 192(%rsi)
+ aeskeygenassist $0x00, %xmm0, %xmm2
+ pshufd $0xaa, %xmm2, %xmm2
+ movdqa %xmm1, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm1
+ pxor %xmm2, %xmm1
+ movdqu %xmm1, 208(%rsi)
+ aeskeygenassist $0x40, %xmm1, %xmm2
+ pshufd $0xff, %xmm2, %xmm2
+ movdqa %xmm0, %xmm3
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pslldq $4, %xmm3
+ pxor %xmm3, %xmm0
+ pxor %xmm2, %xmm0
+ movdqu %xmm0, 224(%rsi) /* 15th and last round key */
+ repz retq
+#ifndef __APPLE__
+.size AES_256_Key_Expansion_AESNI,.-AES_256_Key_Expansion_AESNI
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl AES_ECB_encrypt_AESNI
+.type AES_ECB_encrypt_AESNI,@function
+.align 16
+AES_ECB_encrypt_AESNI: /* ECB encrypt: %rdi=input, %rsi=output, %edx=byte count, %rcx=round keys, %r8d picks 10/12/14 rounds */
+#else
+.section __TEXT,__text
+.globl _AES_ECB_encrypt_AESNI
+.p2align 4
+_AES_ECB_encrypt_AESNI: /* Mach-O spelling of the same entry point */
+#endif /* __APPLE__ */
+ xorl %eax, %eax /* %eax = byte offset processed so far */
+ cmpl $0x40, %edx
+ movl %edx, %r9d
+ jl L_AES_ECB_encrypt_AESNI_done_64 /* NOTE(review): signed compare - a count with bit 31 set also skips the 64-byte loop; confirm callers never pass one */
+ andl $0xffffffc0, %r9d /* %r9d = byte count rounded down to a multiple of 64 */
+L_AES_ECB_encrypt_AESNI_enc_64:
+ # 64 bytes of input
+ # aes_ecb_enc_64
+ leaq (%rdi,%rax,1), %r10
+ leaq (%rsi,%rax,1), %r11
+ movdqu (%r10), %xmm0 /* four blocks are kept in xmm0-xmm3 and run through the rounds together */
+ movdqu 16(%r10), %xmm1
+ movdqu 32(%r10), %xmm2
+ movdqu 48(%r10), %xmm3
+ # aes_enc_block
+ movdqu (%rcx), %xmm4
+ pxor %xmm4, %xmm0
+ pxor %xmm4, %xmm1
+ pxor %xmm4, %xmm2
+ pxor %xmm4, %xmm3
+ movdqu 16(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 32(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 48(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 64(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 80(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 96(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 112(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 128(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 144(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ cmpl $11, %r8d /* %r8d < 11: the key at 160(%rcx) is the final-round key (movdqu leaves the flags intact) */
+ movdqu 160(%rcx), %xmm4
+ jl L_AES_ECB_encrypt_AESNI_64_aes_enc_block_last
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 176(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ cmpl $13, %r8d /* %r8d < 13: the key at 192(%rcx) is the final-round key; otherwise 224(%rcx) is */
+ movdqu 192(%rcx), %xmm4
+ jl L_AES_ECB_encrypt_AESNI_64_aes_enc_block_last
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 208(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 224(%rcx), %xmm4
+L_AES_ECB_encrypt_AESNI_64_aes_enc_block_last:
+ aesenclast %xmm4, %xmm0
+ aesenclast %xmm4, %xmm1
+ aesenclast %xmm4, %xmm2
+ aesenclast %xmm4, %xmm3
+ movdqu %xmm0, (%r11)
+ movdqu %xmm1, 16(%r11)
+ movdqu %xmm2, 32(%r11)
+ movdqu %xmm3, 48(%r11)
+ addl $0x40, %eax
+ cmpl %r9d, %eax
+ jl L_AES_ECB_encrypt_AESNI_enc_64
+L_AES_ECB_encrypt_AESNI_done_64:
+ cmpl %edx, %eax /* nothing left when the offset already equals the byte count */
+ movl %edx, %r9d
+ je L_AES_ECB_encrypt_AESNI_done_enc
+ andl $0xfffffff0, %r9d /* %r9d = byte count rounded down to a multiple of 16 */
+L_AES_ECB_encrypt_AESNI_enc_16: /* NOTE(review): the bound is tested after the body, so a count that is not a multiple of 16 still processes one whole block - presumably callers pass whole blocks; confirm */
+ # 16 bytes of input
+ leaq (%rdi,%rax,1), %r10
+ movdqu (%r10), %xmm0
+ # aes_enc_block
+ pxor (%rcx), %xmm0 /* legacy-SSE pxor with a memory operand requires (%rcx) to be 16-byte aligned */
+ movdqu 16(%rcx), %xmm5
+ aesenc %xmm5, %xmm0
+ movdqu 32(%rcx), %xmm5
+ aesenc %xmm5, %xmm0
+ movdqu 48(%rcx), %xmm5
+ aesenc %xmm5, %xmm0
+ movdqu 64(%rcx), %xmm5
+ aesenc %xmm5, %xmm0
+ movdqu 80(%rcx), %xmm5
+ aesenc %xmm5, %xmm0
+ movdqu 96(%rcx), %xmm5
+ aesenc %xmm5, %xmm0
+ movdqu 112(%rcx), %xmm5
+ aesenc %xmm5, %xmm0
+ movdqu 128(%rcx), %xmm5
+ aesenc %xmm5, %xmm0
+ movdqu 144(%rcx), %xmm5
+ aesenc %xmm5, %xmm0
+ cmpl $11, %r8d
+ movdqu 160(%rcx), %xmm5
+ jl L_AES_ECB_encrypt_AESNI_16_aes_enc_block_last
+ aesenc %xmm5, %xmm0
+ movdqu 176(%rcx), %xmm6
+ aesenc %xmm6, %xmm0
+ cmpl $13, %r8d
+ movdqu 192(%rcx), %xmm5
+ jl L_AES_ECB_encrypt_AESNI_16_aes_enc_block_last
+ aesenc %xmm5, %xmm0
+ movdqu 208(%rcx), %xmm6
+ aesenc %xmm6, %xmm0
+ movdqu 224(%rcx), %xmm5
+L_AES_ECB_encrypt_AESNI_16_aes_enc_block_last: /* on every path here %xmm5 holds the final-round key */
+ aesenclast %xmm5, %xmm0
+ leaq (%rsi,%rax,1), %r10
+ movdqu %xmm0, (%r10)
+ addl $16, %eax
+ cmpl %r9d, %eax
+ jl L_AES_ECB_encrypt_AESNI_enc_16
+L_AES_ECB_encrypt_AESNI_done_enc:
+ repz retq
+#ifndef __APPLE__
+.size AES_ECB_encrypt_AESNI,.-AES_ECB_encrypt_AESNI
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl AES_ECB_decrypt_AESNI
+.type AES_ECB_decrypt_AESNI,@function
+.align 16
+AES_ECB_decrypt_AESNI: /* ECB decrypt: %rdi=input, %rsi=output, %edx=byte count, %rcx=round keys, %r8d picks 10/12/14 rounds */
+#else
+.section __TEXT,__text
+.globl _AES_ECB_decrypt_AESNI
+.p2align 4
+_AES_ECB_decrypt_AESNI: /* ECB decrypt (Mach-O label): %rdi=input, %rsi=output, %edx=byte count, %rcx=round keys, %r8d picks 10/12/14 rounds */
+#endif /* __APPLE__ */
+ xorl %eax, %eax /* %eax = byte offset processed so far */
+ cmpl $0x40, %edx
+ movl %edx, %r9d
+ jl L_AES_ECB_decrypt_AESNI_done_64 /* NOTE(review): signed compare - a count with bit 31 set also skips the 64-byte loop; confirm callers never pass one */
+ andl $0xffffffc0, %r9d /* %r9d = byte count rounded down to a multiple of 64 */
+L_AES_ECB_decrypt_AESNI_dec_64:
+ # 64 bytes of input
+ # aes_ecb_dec_64
+ leaq (%rdi,%rax,1), %r10
+ leaq (%rsi,%rax,1), %r11
+ movdqu (%r10), %xmm0
+ movdqu 16(%r10), %xmm1
+ movdqu 32(%r10), %xmm2
+ movdqu 48(%r10), %xmm3
+ # aes_dec_block
+ movdqu (%rcx), %xmm4 /* round keys are consumed from (%rcx) upward with aesdec - presumably the schedule is already in decryption form; verify against the caller */
+ pxor %xmm4, %xmm0
+ pxor %xmm4, %xmm1
+ pxor %xmm4, %xmm2
+ pxor %xmm4, %xmm3
+ movdqu 16(%rcx), %xmm4
+ aesdec %xmm4, %xmm0
+ aesdec %xmm4, %xmm1
+ aesdec %xmm4, %xmm2
+ aesdec %xmm4, %xmm3
+ movdqu 32(%rcx), %xmm4
+ aesdec %xmm4, %xmm0
+ aesdec %xmm4, %xmm1
+ aesdec %xmm4, %xmm2
+ aesdec %xmm4, %xmm3
+ movdqu 48(%rcx), %xmm4
+ aesdec %xmm4, %xmm0
+ aesdec %xmm4, %xmm1
+ aesdec %xmm4, %xmm2
+ aesdec %xmm4, %xmm3
+ movdqu 64(%rcx), %xmm4
+ aesdec %xmm4, %xmm0
+ aesdec %xmm4, %xmm1
+ aesdec %xmm4, %xmm2
+ aesdec %xmm4, %xmm3
+ movdqu 80(%rcx), %xmm4
+ aesdec %xmm4, %xmm0
+ aesdec %xmm4, %xmm1
+ aesdec %xmm4, %xmm2
+ aesdec %xmm4, %xmm3
+ movdqu 96(%rcx), %xmm4
+ aesdec %xmm4, %xmm0
+ aesdec %xmm4, %xmm1
+ aesdec %xmm4, %xmm2
+ aesdec %xmm4, %xmm3
+ movdqu 112(%rcx), %xmm4
+ aesdec %xmm4, %xmm0
+ aesdec %xmm4, %xmm1
+ aesdec %xmm4, %xmm2
+ aesdec %xmm4, %xmm3
+ movdqu 128(%rcx), %xmm4
+ aesdec %xmm4, %xmm0
+ aesdec %xmm4, %xmm1
+ aesdec %xmm4, %xmm2
+ aesdec %xmm4, %xmm3
+ movdqu 144(%rcx), %xmm4
+ aesdec %xmm4, %xmm0
+ aesdec %xmm4, %xmm1
+ aesdec %xmm4, %xmm2
+ aesdec %xmm4, %xmm3
+ cmpl $11, %r8d /* %r8d < 11: the key at 160(%rcx) is the final-round key */
+ movdqu 160(%rcx), %xmm4
+ jl L_AES_ECB_decrypt_AESNI_64_aes_dec_block_last
+ aesdec %xmm4, %xmm0
+ aesdec %xmm4, %xmm1
+ aesdec %xmm4, %xmm2
+ aesdec %xmm4, %xmm3
+ movdqu 176(%rcx), %xmm4
+ aesdec %xmm4, %xmm0
+ aesdec %xmm4, %xmm1
+ aesdec %xmm4, %xmm2
+ aesdec %xmm4, %xmm3
+ cmpl $13, %r8d /* %r8d < 13: the key at 192(%rcx) is the final-round key; otherwise 224(%rcx) is */
+ movdqu 192(%rcx), %xmm4
+ jl L_AES_ECB_decrypt_AESNI_64_aes_dec_block_last
+ aesdec %xmm4, %xmm0
+ aesdec %xmm4, %xmm1
+ aesdec %xmm4, %xmm2
+ aesdec %xmm4, %xmm3
+ movdqu 208(%rcx), %xmm4
+ aesdec %xmm4, %xmm0
+ aesdec %xmm4, %xmm1
+ aesdec %xmm4, %xmm2
+ aesdec %xmm4, %xmm3
+ movdqu 224(%rcx), %xmm4
+L_AES_ECB_decrypt_AESNI_64_aes_dec_block_last:
+ aesdeclast %xmm4, %xmm0
+ aesdeclast %xmm4, %xmm1
+ aesdeclast %xmm4, %xmm2
+ aesdeclast %xmm4, %xmm3
+ movdqu %xmm0, (%r11)
+ movdqu %xmm1, 16(%r11)
+ movdqu %xmm2, 32(%r11)
+ movdqu %xmm3, 48(%r11)
+ addl $0x40, %eax
+ cmpl %r9d, %eax
+ jl L_AES_ECB_decrypt_AESNI_dec_64
+L_AES_ECB_decrypt_AESNI_done_64:
+ cmpl %edx, %eax /* nothing left when the offset already equals the byte count */
+ movl %edx, %r9d
+ je L_AES_ECB_decrypt_AESNI_done_dec
+ andl $0xfffffff0, %r9d /* %r9d = byte count rounded down to a multiple of 16 */
+L_AES_ECB_decrypt_AESNI_dec_16: /* NOTE(review): the bound is tested after the body, so a count that is not a multiple of 16 still processes one whole block - confirm callers pass whole blocks */
+ # 16 bytes of input
+ leaq (%rdi,%rax,1), %r10
+ movdqu (%r10), %xmm0
+ # aes_dec_block
+ pxor (%rcx), %xmm0 /* legacy-SSE pxor with a memory operand requires (%rcx) to be 16-byte aligned */
+ movdqu 16(%rcx), %xmm5
+ aesdec %xmm5, %xmm0
+ movdqu 32(%rcx), %xmm5
+ aesdec %xmm5, %xmm0
+ movdqu 48(%rcx), %xmm5
+ aesdec %xmm5, %xmm0
+ movdqu 64(%rcx), %xmm5
+ aesdec %xmm5, %xmm0
+ movdqu 80(%rcx), %xmm5
+ aesdec %xmm5, %xmm0
+ movdqu 96(%rcx), %xmm5
+ aesdec %xmm5, %xmm0
+ movdqu 112(%rcx), %xmm5
+ aesdec %xmm5, %xmm0
+ movdqu 128(%rcx), %xmm5
+ aesdec %xmm5, %xmm0
+ movdqu 144(%rcx), %xmm5
+ aesdec %xmm5, %xmm0
+ cmpl $11, %r8d
+ movdqu 160(%rcx), %xmm5
+ jl L_AES_ECB_decrypt_AESNI_16_aes_dec_block_last
+ aesdec %xmm5, %xmm0
+ movdqu 176(%rcx), %xmm6
+ aesdec %xmm6, %xmm0
+ cmpl $13, %r8d
+ movdqu 192(%rcx), %xmm5
+ jl L_AES_ECB_decrypt_AESNI_16_aes_dec_block_last
+ aesdec %xmm5, %xmm0
+ movdqu 208(%rcx), %xmm6
+ aesdec %xmm6, %xmm0
+ movdqu 224(%rcx), %xmm5
+L_AES_ECB_decrypt_AESNI_16_aes_dec_block_last: /* on every path here %xmm5 holds the final-round key */
+ aesdeclast %xmm5, %xmm0
+ leaq (%rsi,%rax,1), %r10
+ movdqu %xmm0, (%r10)
+ addl $16, %eax
+ cmpl %r9d, %eax
+ jl L_AES_ECB_decrypt_AESNI_dec_16
+L_AES_ECB_decrypt_AESNI_done_dec:
+ repz retq
+#ifndef __APPLE__
+.size AES_ECB_decrypt_AESNI,.-AES_ECB_decrypt_AESNI
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl AES_CBC_encrypt_AESNI
+.type AES_CBC_encrypt_AESNI,@function
+.align 16
+AES_CBC_encrypt_AESNI: /* CBC encrypt: %rdi=input, %rsi=output, %rdx=16-byte chaining block (read on entry, rewritten on exit), %ecx=byte count, %r8=round keys, %r9d picks 10/12/14 rounds */
+#else
+.section __TEXT,__text
+.globl _AES_CBC_encrypt_AESNI
+.p2align 4
+_AES_CBC_encrypt_AESNI: /* Mach-O spelling of the same entry point */
+#endif /* __APPLE__ */
+ movdqu (%rdx), %xmm0 /* %xmm0 = chaining value, initially the block at (%rdx) */
+ xorl %eax, %eax
+ cmpl %ecx, %eax
+ je L_AES_CBC_encrypt_AESNI_done /* zero-length input: only the write-back at the end runs */
+L_AES_CBC_encrypt_AESNI_loop: /* one block per iteration; each block depends on the previous output, so there is no 4-wide path */
+ # 16 bytes of input
+ leaq (%rdi,%rax,1), %r10
+ movdqu (%r10), %xmm1
+ pxor %xmm0, %xmm1 /* xor the input block with the chaining value */
+ # aes_enc_block
+ pxor (%r8), %xmm1 /* legacy-SSE pxor with a memory operand requires (%r8) to be 16-byte aligned */
+ movdqu 16(%r8), %xmm3
+ aesenc %xmm3, %xmm1
+ movdqu 32(%r8), %xmm3
+ aesenc %xmm3, %xmm1
+ movdqu 48(%r8), %xmm3
+ aesenc %xmm3, %xmm1
+ movdqu 64(%r8), %xmm3
+ aesenc %xmm3, %xmm1
+ movdqu 80(%r8), %xmm3
+ aesenc %xmm3, %xmm1
+ movdqu 96(%r8), %xmm3
+ aesenc %xmm3, %xmm1
+ movdqu 112(%r8), %xmm3
+ aesenc %xmm3, %xmm1
+ movdqu 128(%r8), %xmm3
+ aesenc %xmm3, %xmm1
+ movdqu 144(%r8), %xmm3
+ aesenc %xmm3, %xmm1
+ cmpl $11, %r9d /* %r9d < 11: the key at 160(%r8) is the final-round key */
+ movdqu 160(%r8), %xmm3
+ jl L_AES_CBC_encrypt_AESNI_aes_enc_block_last
+ aesenc %xmm3, %xmm1
+ movdqu 176(%r8), %xmm4
+ aesenc %xmm4, %xmm1
+ cmpl $13, %r9d /* %r9d < 13: the key at 192(%r8) is the final-round key; otherwise 224(%r8) is */
+ movdqu 192(%r8), %xmm3
+ jl L_AES_CBC_encrypt_AESNI_aes_enc_block_last
+ aesenc %xmm3, %xmm1
+ movdqu 208(%r8), %xmm4
+ aesenc %xmm4, %xmm1
+ movdqu 224(%r8), %xmm3
+L_AES_CBC_encrypt_AESNI_aes_enc_block_last:
+ aesenclast %xmm3, %xmm1
+ leaq (%rsi,%rax,1), %r11
+ movdqu %xmm1, (%r11)
+ movdqa %xmm1, %xmm0 /* the output block becomes the next chaining value */
+ addl $16, %eax
+ cmpl %ecx, %eax
+ jl L_AES_CBC_encrypt_AESNI_loop
+L_AES_CBC_encrypt_AESNI_done:
+ movdqu %xmm0, (%rdx) /* store the final chaining value back through %rdx */
+ repz retq
+#ifndef __APPLE__
+.size AES_CBC_encrypt_AESNI,.-AES_CBC_encrypt_AESNI
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl AES_CBC_decrypt_AESNI
+.type AES_CBC_decrypt_AESNI,@function
+.align 16
+AES_CBC_decrypt_AESNI: /* CBC decrypt: %rdi=input, %rsi=output, %rdx=16-byte chaining block (read on entry, rewritten on exit), %ecx=byte count, %r8=round keys, %r9d picks 10/12/14 rounds */
+#else
+.section __TEXT,__text
+.globl _AES_CBC_decrypt_AESNI
+.p2align 4
+_AES_CBC_decrypt_AESNI: /* Mach-O spelling of the same entry point */
+#endif /* __APPLE__ */
+ pushq %r12 /* %r12 is saved here and restored with popq before the function returns */
+ movdqu (%rdx), %xmm4 /* %xmm4 = chaining value, initially the block at (%rdx) */
+ xorl %eax, %eax /* %eax = byte offset processed so far */
+ cmpl $0x40, %ecx
+ movl %ecx, %r10d
+ jl L_AES_CBC_decrypt_AESNI_done_64 /* NOTE(review): signed compare - a count with bit 31 set also skips the 64-byte loop; confirm callers never pass one */
+ andl $0xffffffc0, %r10d /* %r10d = byte count rounded down to a multiple of 64 */
+L_AES_CBC_decrypt_AESNI_dec_64: /* 4-block loop: %r8=round keys, %r9d picks 10/12/14 rounds, %xmm4=chaining value from the previous block */
+ # 64 bytes of input
+ # aes_cbc_dec_64
+ leaq (%rdi,%rax,1), %r11 /* %r11 = input pointer for this group */
+ leaq (%rsi,%rax,1), %r12 /* %r12 = output pointer for this group */
+ movdqu (%r11), %xmm0
+ movdqu 16(%r11), %xmm1
+ movdqu 32(%r11), %xmm2
+ movdqu 48(%r11), %xmm3
+ # aes_dec_block
+ movdqu (%r8), %xmm5
+ pxor %xmm5, %xmm0
+ pxor %xmm5, %xmm1
+ pxor %xmm5, %xmm2
+ pxor %xmm5, %xmm3
+ movdqu 16(%r8), %xmm5
+ aesdec %xmm5, %xmm0
+ aesdec %xmm5, %xmm1
+ aesdec %xmm5, %xmm2
+ aesdec %xmm5, %xmm3
+ movdqu 32(%r8), %xmm5
+ aesdec %xmm5, %xmm0
+ aesdec %xmm5, %xmm1
+ aesdec %xmm5, %xmm2
+ aesdec %xmm5, %xmm3
+ movdqu 48(%r8), %xmm5
+ aesdec %xmm5, %xmm0
+ aesdec %xmm5, %xmm1
+ aesdec %xmm5, %xmm2
+ aesdec %xmm5, %xmm3
+ movdqu 64(%r8), %xmm5
+ aesdec %xmm5, %xmm0
+ aesdec %xmm5, %xmm1
+ aesdec %xmm5, %xmm2
+ aesdec %xmm5, %xmm3
+ movdqu 80(%r8), %xmm5
+ aesdec %xmm5, %xmm0
+ aesdec %xmm5, %xmm1
+ aesdec %xmm5, %xmm2
+ aesdec %xmm5, %xmm3
+ movdqu 96(%r8), %xmm5
+ aesdec %xmm5, %xmm0
+ aesdec %xmm5, %xmm1
+ aesdec %xmm5, %xmm2
+ aesdec %xmm5, %xmm3
+ movdqu 112(%r8), %xmm5
+ aesdec %xmm5, %xmm0
+ aesdec %xmm5, %xmm1
+ aesdec %xmm5, %xmm2
+ aesdec %xmm5, %xmm3
+ movdqu 128(%r8), %xmm5
+ aesdec %xmm5, %xmm0
+ aesdec %xmm5, %xmm1
+ aesdec %xmm5, %xmm2
+ aesdec %xmm5, %xmm3
+ movdqu 144(%r8), %xmm5
+ aesdec %xmm5, %xmm0
+ aesdec %xmm5, %xmm1
+ aesdec %xmm5, %xmm2
+ aesdec %xmm5, %xmm3
+ cmpl $11, %r9d /* %r9d < 11: the key at 160(%r8) is the final-round key */
+ movdqu 160(%r8), %xmm5
+ jl L_AES_CBC_decrypt_AESNI_64_aes_dec_block_last
+ aesdec %xmm5, %xmm0
+ aesdec %xmm5, %xmm1
+ aesdec %xmm5, %xmm2
+ aesdec %xmm5, %xmm3
+ movdqu 176(%r8), %xmm5
+ aesdec %xmm5, %xmm0
+ aesdec %xmm5, %xmm1
+ aesdec %xmm5, %xmm2
+ aesdec %xmm5, %xmm3
+ cmpl $13, %r9d /* %r9d < 13: the key at 192(%r8) is the final-round key; otherwise 224(%r8) is */
+ movdqu 192(%r8), %xmm5
+ jl L_AES_CBC_decrypt_AESNI_64_aes_dec_block_last
+ aesdec %xmm5, %xmm0
+ aesdec %xmm5, %xmm1
+ aesdec %xmm5, %xmm2
+ aesdec %xmm5, %xmm3
+ movdqu 208(%r8), %xmm5
+ aesdec %xmm5, %xmm0
+ aesdec %xmm5, %xmm1
+ aesdec %xmm5, %xmm2
+ aesdec %xmm5, %xmm3
+ movdqu 224(%r8), %xmm5
+L_AES_CBC_decrypt_AESNI_64_aes_dec_block_last:
+ aesdeclast %xmm5, %xmm0
+ aesdeclast %xmm5, %xmm1
+ aesdeclast %xmm5, %xmm2
+ aesdeclast %xmm5, %xmm3
+ pxor %xmm4, %xmm0 /* first block is xored with the chaining value carried in %xmm4 */
+ movdqu (%r11), %xmm5 /* later blocks are xored with the preceding input block, re-read from the input */
+ pxor %xmm5, %xmm1
+ movdqu 16(%r11), %xmm5
+ pxor %xmm5, %xmm2
+ movdqu 32(%r11), %xmm5
+ pxor %xmm5, %xmm3
+ movdqu 48(%r11), %xmm4 /* last input block of the group becomes the next chaining value */
+ movdqu %xmm0, (%r12) /* all input re-reads above happen before these stores */
+ movdqu %xmm1, 16(%r12)
+ movdqu %xmm2, 32(%r12)
+ movdqu %xmm3, 48(%r12)
+ addl $0x40, %eax
+ cmpl %r10d, %eax
+ jl L_AES_CBC_decrypt_AESNI_dec_64
+L_AES_CBC_decrypt_AESNI_done_64:
+ cmpl %ecx, %eax /* nothing left when the offset already equals the byte count in %ecx */
+ movl %ecx, %r10d
+ je L_AES_CBC_decrypt_AESNI_done_dec
+ andl $0xfffffff0, %r10d /* %r10d = byte count rounded down to a multiple of 16 */
+L_AES_CBC_decrypt_AESNI_dec_16: /* NOTE(review): the bound is tested after the body, so a count that is not a multiple of 16 still processes one whole block - confirm callers pass whole blocks */
+ # 16 bytes of input
+ leaq (%rdi,%rax,1), %r11
+ movdqu (%r11), %xmm0
+ movdqa %xmm0, %xmm8 /* keep the input block: it is the chaining value for the next block */
+ # aes_dec_block
+ pxor (%r8), %xmm0 /* legacy-SSE pxor with a memory operand requires (%r8) to be 16-byte aligned */
+ movdqu 16(%r8), %xmm6
+ aesdec %xmm6, %xmm0
+ movdqu 32(%r8), %xmm6
+ aesdec %xmm6, %xmm0
+ movdqu 48(%r8), %xmm6
+ aesdec %xmm6, %xmm0
+ movdqu 64(%r8), %xmm6
+ aesdec %xmm6, %xmm0
+ movdqu 80(%r8), %xmm6
+ aesdec %xmm6, %xmm0
+ movdqu 96(%r8), %xmm6
+ aesdec %xmm6, %xmm0
+ movdqu 112(%r8), %xmm6
+ aesdec %xmm6, %xmm0
+ movdqu 128(%r8), %xmm6
+ aesdec %xmm6, %xmm0
+ movdqu 144(%r8), %xmm6
+ aesdec %xmm6, %xmm0
+ cmpl $11, %r9d
+ movdqu 160(%r8), %xmm6
+ jl L_AES_CBC_decrypt_AESNI_16_aes_dec_block_last
+ aesdec %xmm6, %xmm0
+ movdqu 176(%r8), %xmm7
+ aesdec %xmm7, %xmm0
+ cmpl $13, %r9d
+ movdqu 192(%r8), %xmm6
+ jl L_AES_CBC_decrypt_AESNI_16_aes_dec_block_last
+ aesdec %xmm6, %xmm0
+ movdqu 208(%r8), %xmm7
+ aesdec %xmm7, %xmm0
+ movdqu 224(%r8), %xmm6
+L_AES_CBC_decrypt_AESNI_16_aes_dec_block_last: /* on every path here %xmm6 holds the final-round key */
+ aesdeclast %xmm6, %xmm0
+ pxor %xmm4, %xmm0 /* xor with the chaining value to produce the output block */
+ movdqa %xmm8, %xmm4 /* saved input block becomes the next chaining value */
+ leaq (%rsi,%rax,1), %r11
+ movdqu %xmm0, (%r11)
+ addl $16, %eax
+ cmpl %r10d, %eax
+ jl L_AES_CBC_decrypt_AESNI_dec_16
+L_AES_CBC_decrypt_AESNI_done_dec:
+ movdqu %xmm4, (%rdx) /* store the final chaining value back through %rdx */
+ popq %r12
+ repz retq
+#ifndef __APPLE__
+.size AES_CBC_decrypt_AESNI,.-AES_CBC_decrypt_AESNI
+#endif /* __APPLE__ */
+#ifndef
__APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_aes_ctr_aesni_bswap: /* pshufb mask that reverses the order of all 16 bytes */
+.quad 0x08090a0b0c0d0e0f,0x0001020304050607
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_aes_ctr_aesni_one: /* 128-bit constant with 1 in the low qword and 0 in the high qword */
+.quad 0x0000000000000001,0x0000000000000000
+#ifndef __APPLE__
+.text
+.globl AES_CTR_encrypt_AESNI
+.type AES_CTR_encrypt_AESNI,@function
+.align 16
+AES_CTR_encrypt_AESNI: /* CTR: %rdi=input, %rsi=output, %edx=byte count, %rcx=round keys, %r8d picks 10/12/14 rounds, %r9=16-byte counter block (read on entry, rewritten on exit) */
+#else
+.section __TEXT,__text
+.globl _AES_CTR_encrypt_AESNI
+.p2align 4
+_AES_CTR_encrypt_AESNI: /* Mach-O spelling of the same entry point */
+#endif /* __APPLE__ */
+ pushq %rbx /* %rbx is used as the output pointer in the 64-byte loop and restored before return */
+ movdqu L_aes_ctr_aesni_bswap(%rip), %xmm8 /* %xmm8 = byte-reverse mask */
+ movdqu L_aes_ctr_aesni_one(%rip), %xmm9 /* %xmm9 = increment */
+ pxor %xmm10, %xmm10 /* %xmm10 = zero, compared against to detect low-qword wrap */
+ movdqu (%r9), %xmm7
+ pshufb %xmm8, %xmm7 /* %xmm7 = counter block byte-reversed, so paddq on the low qword increments its last 8 bytes */
+ xorl %eax, %eax /* %eax = byte offset processed so far */
+ cmpl $0x40, %edx
+ movl %edx, %r10d
+ jl L_AES_CTR_encrypt_AESNI_done_64 /* NOTE(review): signed compare - a count with bit 31 set also skips the 64-byte loop; confirm callers never pass one */
+ andl $0xffffffc0, %r10d /* %r10d = byte count rounded down to a multiple of 64 */
+L_AES_CTR_encrypt_AESNI_enc_64:
+ # 64 bytes of input
+ # aes_ctr_enc_64
+ leaq (%rdi,%rax,1), %r11
+ leaq (%rsi,%rax,1), %rbx
+ movdqa %xmm7, %xmm0
+ pshufb %xmm8, %xmm0 /* counter block in its stored byte order, ready to encrypt */
+ paddq %xmm9, %xmm7 /* low qword += 1 */
+ movdqa %xmm7, %xmm11
+ pcmpeqq %xmm10, %xmm11 /* all-ones in each qword that is now zero */
+ pslldq $8, %xmm11 /* move the low-qword result into the high qword */
+ psrlq $63, %xmm11 /* reduce it to 1 when the low qword wrapped to zero, else 0 */
+ paddq %xmm11, %xmm7 /* propagate the carry into the high qword */
+ movdqa %xmm7, %xmm1
+ pshufb %xmm8, %xmm1
+ paddq %xmm9, %xmm7
+ movdqa %xmm7, %xmm11
+ pcmpeqq %xmm10, %xmm11
+ pslldq $8, %xmm11
+ psrlq $63, %xmm11
+ paddq %xmm11, %xmm7
+ movdqa %xmm7, %xmm2
+ pshufb %xmm8, %xmm2
+ paddq %xmm9, %xmm7
+ movdqa %xmm7, %xmm11
+ pcmpeqq %xmm10, %xmm11
+ pslldq $8, %xmm11
+ psrlq $63, %xmm11
+ paddq %xmm11, %xmm7
+ movdqa %xmm7, %xmm3
+ pshufb %xmm8, %xmm3
+ paddq %xmm9, %xmm7
+ movdqa %xmm7, %xmm11
+ pcmpeqq %xmm10, %xmm11
+ pslldq $8, %xmm11
+ psrlq $63, %xmm11
+ paddq %xmm11, %xmm7
+ # aes_enc_block
+ movdqu (%rcx), %xmm4
+ pxor %xmm4, %xmm0
+ pxor %xmm4, %xmm1
+ pxor %xmm4, %xmm2
+ pxor %xmm4, %xmm3
+ movdqu 16(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 32(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 48(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 64(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 80(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 96(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 112(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 128(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 144(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ cmpl $11, %r8d /* %r8d < 11: the key at 160(%rcx) is the final-round key */
+ movdqu 160(%rcx), %xmm4
+ jl L_AES_CTR_encrypt_AESNI_64_aes_enc_block_last
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 176(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ cmpl $13, %r8d /* %r8d < 13: the key at 192(%rcx) is the final-round key; otherwise 224(%rcx) is */
+ movdqu 192(%rcx), %xmm4
+ jl L_AES_CTR_encrypt_AESNI_64_aes_enc_block_last
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 208(%rcx), %xmm4
+ aesenc %xmm4, %xmm0
+ aesenc %xmm4, %xmm1
+ aesenc %xmm4, %xmm2
+ aesenc %xmm4, %xmm3
+ movdqu 224(%rcx), %xmm4
+L_AES_CTR_encrypt_AESNI_64_aes_enc_block_last:
+ aesenclast %xmm4, %xmm0
+ aesenclast %xmm4, %xmm1
+ aesenclast %xmm4, %xmm2
+ aesenclast %xmm4, %xmm3
+ movdqu (%r11), %xmm4 /* xor the encrypted counter blocks with the input */
+ pxor %xmm4, %xmm0
+ movdqu 16(%r11), %xmm4
+ pxor %xmm4, %xmm1
+ movdqu 32(%r11), %xmm4
+ pxor %xmm4, %xmm2
+ movdqu 48(%r11), %xmm4
+ pxor %xmm4, %xmm3
+ movdqu %xmm0, (%rbx)
+ movdqu %xmm1, 16(%rbx)
+ movdqu %xmm2, 32(%rbx)
+ movdqu %xmm3, 48(%rbx)
+ addl $0x40, %eax
+ cmpl %r10d, %eax
+ jl L_AES_CTR_encrypt_AESNI_enc_64
+L_AES_CTR_encrypt_AESNI_done_64:
+ cmpl %edx, %eax /* nothing left when the offset already equals the byte count */
+ movl %edx, %r10d
+ je L_AES_CTR_encrypt_AESNI_done_enc
+ andl $0xfffffff0, %r10d /* %r10d = byte count rounded down to a multiple of 16 */
+L_AES_CTR_encrypt_AESNI_enc_16: /* NOTE(review): the bound is tested after the body, so a count that is not a multiple of 16 still processes one whole block - confirm callers pass whole blocks */
+ # 16 bytes of input
+ movdqa %xmm7, %xmm0
+ pshufb %xmm8, %xmm0
+ paddq %xmm9, %xmm7 /* same increment-with-carry sequence as in the 64-byte loop */
+ movdqa %xmm7, %xmm11
+ pcmpeqq %xmm10, %xmm11
+ pslldq $8, %xmm11
+ psrlq $63, %xmm11
+ paddq %xmm11, %xmm7
+ # aes_enc_block
+ pxor (%rcx), %xmm0 /* legacy-SSE pxor with a memory operand requires (%rcx) to be 16-byte aligned */
+ movdqu 16(%rcx), %xmm5
+ aesenc %xmm5, %xmm0
+ movdqu 32(%rcx), %xmm5
+ aesenc %xmm5, %xmm0
+ movdqu 48(%rcx), %xmm5
+ aesenc %xmm5, %xmm0
+ movdqu 64(%rcx), %xmm5
+ aesenc %xmm5, %xmm0
+ movdqu 80(%rcx), %xmm5
+ aesenc %xmm5, %xmm0
+ movdqu 96(%rcx), %xmm5
+ aesenc %xmm5, %xmm0
+ movdqu 112(%rcx), %xmm5
+ aesenc %xmm5, %xmm0
+ movdqu 128(%rcx), %xmm5
+ aesenc %xmm5, %xmm0
+ movdqu 144(%rcx), %xmm5
+ aesenc %xmm5, %xmm0
+ cmpl $11, %r8d
+ movdqu 160(%rcx), %xmm5
+ jl L_AES_CTR_encrypt_AESNI_16_aes_enc_block_last
+ aesenc %xmm5, %xmm0
+ movdqu 176(%rcx), %xmm6
+ aesenc %xmm6, %xmm0
+ cmpl $13, %r8d
+ movdqu 192(%rcx), %xmm5
+ jl L_AES_CTR_encrypt_AESNI_16_aes_enc_block_last
+ aesenc %xmm5, %xmm0
+ movdqu 208(%rcx), %xmm6
+ aesenc %xmm6, %xmm0
+ movdqu 224(%rcx), %xmm5
+L_AES_CTR_encrypt_AESNI_16_aes_enc_block_last: /* on every path here %xmm5 holds the final-round key */
+ aesenclast %xmm5, %xmm0
+ leaq (%rdi,%rax,1), %r11
+ movdqu (%r11), %xmm4
+ pxor %xmm4, %xmm0
+ leaq (%rsi,%rax,1), %r11
+ movdqu %xmm0, (%r11)
+ addl $16, %eax
+ cmpl %r10d, %eax
+ jl L_AES_CTR_encrypt_AESNI_enc_16
+L_AES_CTR_encrypt_AESNI_done_enc:
+ pshufb %xmm8, %xmm7 /* restore the stored byte order */
+ movdqu %xmm7, (%r9) /* write the advanced counter block back through %r9 */
+ popq %rbx
+ repz retq
+#ifndef __APPLE__
+.size AES_CTR_encrypt_AESNI,.-AES_CTR_encrypt_AESNI
+#endif /* __APPLE__ */
+#ifdef HAVE_INTEL_AVX1
+#ifndef __APPLE__
+.text
+.globl AES_ECB_encrypt_avx1
+.type AES_ECB_encrypt_avx1,@function
+.align 16
+AES_ECB_encrypt_avx1: /* ECB encrypt using VEX-encoded instructions: %rdi=input, %rsi=output, %edx=byte count, %rcx=round keys, %r8d picks 10/12/14 rounds */
+#else
+.section __TEXT,__text
+.globl _AES_ECB_encrypt_avx1
+.p2align 4
+_AES_ECB_encrypt_avx1: /* Mach-O spelling of the same entry point */
+#endif /* __APPLE__ */
+ xorl
%eax, %eax /* first instruction of AES_ECB_encrypt_avx1: %eax = byte offset processed so far */
+ cmpl $0x40, %edx
+ movl %edx, %r9d
+ jl L_AES_ECB_encrypt_avx1_done_64 /* NOTE(review): signed compare - a count with bit 31 set also skips the 64-byte loop; confirm callers never pass one */
+ andl $0xffffffc0, %r9d /* %r9d = byte count in %edx rounded down to a multiple of 64 */
+L_AES_ECB_encrypt_avx1_enc_64: /* 4-block loop: %rdi=input, %rsi=output, %rcx=round keys, %r8d picks 10/12/14 rounds */
+ # 64 bytes of input
+ # aes_ecb_enc_64
+ leaq (%rdi,%rax,1), %r10
+ leaq (%rsi,%rax,1), %r11
+ vmovdqu (%r10), %xmm0
+ vmovdqu 16(%r10), %xmm1
+ vmovdqu 32(%r10), %xmm2
+ vmovdqu 48(%r10), %xmm3
+ # aes_enc_block
+ vmovdqu (%rcx), %xmm4
+ vpxor %xmm4, %xmm0, %xmm0
+ vpxor %xmm4, %xmm1, %xmm1
+ vpxor %xmm4, %xmm2, %xmm2
+ vpxor %xmm4, %xmm3, %xmm3
+ vmovdqu 16(%rcx), %xmm4
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ vmovdqu 32(%rcx), %xmm4
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ vmovdqu 48(%rcx), %xmm4
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ vmovdqu 64(%rcx), %xmm4
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ vmovdqu 80(%rcx), %xmm4
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ vmovdqu 96(%rcx), %xmm4
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ vmovdqu 112(%rcx), %xmm4
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ vmovdqu 128(%rcx), %xmm4
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ vmovdqu 144(%rcx), %xmm4
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ cmpl $11, %r8d /* %r8d < 11: the key at 160(%rcx) is the final-round key (vmovdqu leaves the flags intact) */
+ vmovdqu 160(%rcx), %xmm4
+ jl L_AES_ECB_encrypt_avx1_64_aes_enc_block_last
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ vmovdqu 176(%rcx), %xmm4
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ cmpl $13, %r8d /* %r8d < 13: the key at 192(%rcx) is the final-round key; otherwise 224(%rcx) is */
+ vmovdqu 192(%rcx), %xmm4
+ jl L_AES_ECB_encrypt_avx1_64_aes_enc_block_last
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ vmovdqu 208(%rcx), %xmm4
+ vaesenc %xmm4, %xmm0, %xmm0
+ vaesenc %xmm4, %xmm1, %xmm1
+ vaesenc %xmm4, %xmm2, %xmm2
+ vaesenc %xmm4, %xmm3, %xmm3
+ vmovdqu 224(%rcx), %xmm4
+L_AES_ECB_encrypt_avx1_64_aes_enc_block_last:
+ vaesenclast %xmm4, %xmm0, %xmm0
+ vaesenclast %xmm4, %xmm1, %xmm1
+ vaesenclast %xmm4, %xmm2, %xmm2
+ vaesenclast %xmm4, %xmm3, %xmm3
+ vmovdqu %xmm0, (%r11)
+ vmovdqu %xmm1, 16(%r11)
+ vmovdqu %xmm2, 32(%r11)
+ vmovdqu %xmm3, 48(%r11)
+ addl $0x40, %eax
+ cmpl %r9d, %eax
+ jl L_AES_ECB_encrypt_avx1_enc_64
+L_AES_ECB_encrypt_avx1_done_64:
+ cmpl %edx, %eax /* nothing left when the offset already equals the byte count */
+ movl %edx, %r9d
+ je L_AES_ECB_encrypt_avx1_done_enc
+ andl $0xfffffff0, %r9d /* %r9d = byte count rounded down to a multiple of 16 */
+L_AES_ECB_encrypt_avx1_enc_16: /* NOTE(review): the bound is tested after the body, so a count that is not a multiple of 16 still processes one whole block - confirm callers pass whole blocks */
+ # 16 bytes of input
+ leaq (%rdi,%rax,1), %r10
+ vmovdqu (%r10), %xmm0
+ # aes_enc_block
+ vpxor (%rcx), %xmm0, %xmm0 /* round key 0 is read directly from memory by the VEX-encoded xor */
+ vmovdqu 16(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 32(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 48(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 64(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 80(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 96(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 112(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 128(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 144(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ cmpl $11, %r8d
+ vmovdqu 160(%rcx), %xmm5
+ jl L_AES_ECB_encrypt_avx1_16_aes_enc_block_last
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 176(%rcx), %xmm6
+ vaesenc %xmm6, %xmm0, %xmm0
+ cmpl $13, %r8d
+ vmovdqu 192(%rcx), %xmm5
+ jl L_AES_ECB_encrypt_avx1_16_aes_enc_block_last
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 208(%rcx), %xmm6
+ vaesenc %xmm6, %xmm0, %xmm0
+ vmovdqu 224(%rcx), %xmm5
+L_AES_ECB_encrypt_avx1_16_aes_enc_block_last: /* on every path here %xmm5 holds the final-round key */
+ vaesenclast %xmm5, %xmm0, %xmm0
+ leaq (%rsi,%rax,1), %r10
+ vmovdqu %xmm0, (%r10)
+ addl $16, %eax
+ cmpl %r9d, %eax
+ jl L_AES_ECB_encrypt_avx1_enc_16
+L_AES_ECB_encrypt_avx1_done_enc:
+ repz retq
+#ifndef __APPLE__
+.size AES_ECB_encrypt_avx1,.-AES_ECB_encrypt_avx1
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl AES_ECB_decrypt_avx1
+.type AES_ECB_decrypt_avx1,@function
+.align 16
+AES_ECB_decrypt_avx1:
+#else
+.section __TEXT,__text
+.globl _AES_ECB_decrypt_avx1
+.p2align 4
+_AES_ECB_decrypt_avx1:
+#endif /* __APPLE__ */
+ xorl %eax, %eax
+ cmpl $0x40, %edx
+ movl %edx, %r9d
+ jl L_AES_ECB_decrypt_avx1_done_64
+ andl $0xffffffc0, %r9d
+L_AES_ECB_decrypt_avx1_dec_64:
+ # 64 bytes of input
+ # aes_ecb_dec_64
+ leaq (%rdi,%rax,1), %r10
+ leaq (%rsi,%rax,1), %r11
+ vmovdqu (%r10), %xmm0
+ vmovdqu 16(%r10), %xmm1
+ vmovdqu 32(%r10), %xmm2
+ vmovdqu 48(%r10), %xmm3
+ # aes_dec_block
+ vmovdqu (%rcx), %xmm4
+ vpxor %xmm4, %xmm0, %xmm0
+ vpxor %xmm4, %xmm1, %xmm1
+ vpxor %xmm4, %xmm2, %xmm2
+ vpxor %xmm4, %xmm3, %xmm3
+ vmovdqu 16(%rcx), %xmm4
+ vaesdec %xmm4, %xmm0, %xmm0
+ vaesdec %xmm4, %xmm1, %xmm1
+ vaesdec %xmm4, %xmm2, %xmm2
+ vaesdec %xmm4, %xmm3, %xmm3
+ vmovdqu 32(%rcx), %xmm4
+ vaesdec %xmm4, %xmm0, %xmm0
+ vaesdec %xmm4, %xmm1, %xmm1
+ vaesdec %xmm4, %xmm2, %xmm2
+ vaesdec %xmm4, %xmm3, %xmm3
+ vmovdqu 48(%rcx), %xmm4
+ vaesdec %xmm4, %xmm0, %xmm0
+ vaesdec %xmm4, %xmm1, %xmm1
+ vaesdec %xmm4, %xmm2, %xmm2
+ vaesdec %xmm4, %xmm3, %xmm3
+ vmovdqu 64(%rcx), %xmm4
+ vaesdec %xmm4, %xmm0, %xmm0
+ vaesdec %xmm4, %xmm1, %xmm1
+ vaesdec %xmm4, %xmm2, %xmm2
+ vaesdec %xmm4, %xmm3, %xmm3
+ vmovdqu 80(%rcx), %xmm4
+ vaesdec %xmm4, %xmm0, %xmm0
+ vaesdec %xmm4, %xmm1, %xmm1
+ vaesdec %xmm4, %xmm2, %xmm2
+ vaesdec %xmm4, %xmm3, %xmm3
+ vmovdqu 96(%rcx), %xmm4
+ vaesdec %xmm4, %xmm0, %xmm0
+ vaesdec %xmm4,
%xmm1, %xmm1 + vaesdec %xmm4, %xmm2, %xmm2 + vaesdec %xmm4, %xmm3, %xmm3 + vmovdqu 112(%rcx), %xmm4 + vaesdec %xmm4, %xmm0, %xmm0 + vaesdec %xmm4, %xmm1, %xmm1 + vaesdec %xmm4, %xmm2, %xmm2 + vaesdec %xmm4, %xmm3, %xmm3 + vmovdqu 128(%rcx), %xmm4 + vaesdec %xmm4, %xmm0, %xmm0 + vaesdec %xmm4, %xmm1, %xmm1 + vaesdec %xmm4, %xmm2, %xmm2 + vaesdec %xmm4, %xmm3, %xmm3 + vmovdqu 144(%rcx), %xmm4 + vaesdec %xmm4, %xmm0, %xmm0 + vaesdec %xmm4, %xmm1, %xmm1 + vaesdec %xmm4, %xmm2, %xmm2 + vaesdec %xmm4, %xmm3, %xmm3 + cmpl $11, %r8d + vmovdqu 160(%rcx), %xmm4 + jl L_AES_ECB_decrypt_avx1_64_aes_dec_block_last + vaesdec %xmm4, %xmm0, %xmm0 + vaesdec %xmm4, %xmm1, %xmm1 + vaesdec %xmm4, %xmm2, %xmm2 + vaesdec %xmm4, %xmm3, %xmm3 + vmovdqu 176(%rcx), %xmm4 + vaesdec %xmm4, %xmm0, %xmm0 + vaesdec %xmm4, %xmm1, %xmm1 + vaesdec %xmm4, %xmm2, %xmm2 + vaesdec %xmm4, %xmm3, %xmm3 + cmpl $13, %r8d + vmovdqu 192(%rcx), %xmm4 + jl L_AES_ECB_decrypt_avx1_64_aes_dec_block_last + vaesdec %xmm4, %xmm0, %xmm0 + vaesdec %xmm4, %xmm1, %xmm1 + vaesdec %xmm4, %xmm2, %xmm2 + vaesdec %xmm4, %xmm3, %xmm3 + vmovdqu 208(%rcx), %xmm4 + vaesdec %xmm4, %xmm0, %xmm0 + vaesdec %xmm4, %xmm1, %xmm1 + vaesdec %xmm4, %xmm2, %xmm2 + vaesdec %xmm4, %xmm3, %xmm3 + vmovdqu 224(%rcx), %xmm4 +L_AES_ECB_decrypt_avx1_64_aes_dec_block_last: + vaesdeclast %xmm4, %xmm0, %xmm0 + vaesdeclast %xmm4, %xmm1, %xmm1 + vaesdeclast %xmm4, %xmm2, %xmm2 + vaesdeclast %xmm4, %xmm3, %xmm3 + vmovdqu %xmm0, (%r11) + vmovdqu %xmm1, 16(%r11) + vmovdqu %xmm2, 32(%r11) + vmovdqu %xmm3, 48(%r11) + addl $0x40, %eax + cmpl %r9d, %eax + jl L_AES_ECB_decrypt_avx1_dec_64 +L_AES_ECB_decrypt_avx1_done_64: + cmpl %edx, %eax + movl %edx, %r9d + je L_AES_ECB_decrypt_avx1_done_dec + andl $0xfffffff0, %r9d +L_AES_ECB_decrypt_avx1_dec_16: + # 16 bytes of input + leaq (%rdi,%rax,1), %r10 + vmovdqu (%r10), %xmm0 + # aes_dec_block + vpxor (%rcx), %xmm0, %xmm0 + vmovdqu 16(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 32(%rcx), %xmm5 + vaesdec 
%xmm5, %xmm0, %xmm0 + vmovdqu 48(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 64(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 80(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 96(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 112(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 128(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 144(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + cmpl $11, %r8d + vmovdqu 160(%rcx), %xmm5 + jl L_AES_ECB_decrypt_avx1_16_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 176(%rcx), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + cmpl $13, %r8d + vmovdqu 192(%rcx), %xmm5 + jl L_AES_ECB_decrypt_avx1_16_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 208(%rcx), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 224(%rcx), %xmm5 +L_AES_ECB_decrypt_avx1_16_aes_dec_block_last: + vaesdeclast %xmm5, %xmm0, %xmm0 + leaq (%rsi,%rax,1), %r10 + vmovdqu %xmm0, (%r10) + addl $16, %eax + cmpl %r9d, %eax + jl L_AES_ECB_decrypt_avx1_dec_16 +L_AES_ECB_decrypt_avx1_done_dec: + repz retq +#ifndef __APPLE__ +.size AES_ECB_decrypt_avx1,.-AES_ECB_decrypt_avx1 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_CBC_encrypt_avx1 +.type AES_CBC_encrypt_avx1,@function +.align 16 +AES_CBC_encrypt_avx1: +#else +.section __TEXT,__text +.globl _AES_CBC_encrypt_avx1 +.p2align 4 +_AES_CBC_encrypt_avx1: +#endif /* __APPLE__ */ + vmovdqu (%rdx), %xmm0 + xorl %eax, %eax + cmpl %ecx, %eax + je L_AES_CBC_encrypt_avx1_done +L_AES_CBC_encrypt_avx1_loop: + # 16 bytes of input + leaq (%rdi,%rax,1), %r10 + vmovdqu (%r10), %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + # aes_enc_block + vpxor (%r8), %xmm1, %xmm1 + vmovdqu 16(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 32(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 48(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 64(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 80(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 96(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + 
vmovdqu 112(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 128(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 144(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + cmpl $11, %r9d + vmovdqu 160(%r8), %xmm3 + jl L_AES_CBC_encrypt_avx1_aes_enc_block_last + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 176(%r8), %xmm4 + vaesenc %xmm4, %xmm1, %xmm1 + cmpl $13, %r9d + vmovdqu 192(%r8), %xmm3 + jl L_AES_CBC_encrypt_avx1_aes_enc_block_last + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 208(%r8), %xmm4 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqu 224(%r8), %xmm3 +L_AES_CBC_encrypt_avx1_aes_enc_block_last: + vaesenclast %xmm3, %xmm1, %xmm1 + leaq (%rsi,%rax,1), %r11 + vmovdqu %xmm1, (%r11) + vmovdqa %xmm1, %xmm0 + addl $16, %eax + cmpl %ecx, %eax + jl L_AES_CBC_encrypt_avx1_loop +L_AES_CBC_encrypt_avx1_done: + vmovdqu %xmm0, (%rdx) + repz retq +#ifndef __APPLE__ +.size AES_CBC_encrypt_avx1,.-AES_CBC_encrypt_avx1 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_CBC_decrypt_avx1 +.type AES_CBC_decrypt_avx1,@function +.align 16 +AES_CBC_decrypt_avx1: +#else +.section __TEXT,__text +.globl _AES_CBC_decrypt_avx1 +.p2align 4 +_AES_CBC_decrypt_avx1: +#endif /* __APPLE__ */ + pushq %r12 + vmovdqu (%rdx), %xmm4 + xorl %eax, %eax + cmpl $0x40, %ecx + movl %ecx, %r10d + jl L_AES_CBC_decrypt_avx1_done_64 + andl $0xffffffc0, %r10d +L_AES_CBC_decrypt_avx1_dec_64: + # 64 bytes of input + # aes_cbc_dec_64 + leaq (%rdi,%rax,1), %r11 + leaq (%rsi,%rax,1), %r12 + vmovdqu (%r11), %xmm0 + vmovdqu 16(%r11), %xmm1 + vmovdqu 32(%r11), %xmm2 + vmovdqu 48(%r11), %xmm3 + # aes_dec_block + vmovdqu (%r8), %xmm5 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm5, %xmm3, %xmm3 + vmovdqu 16(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vaesdec %xmm5, %xmm1, %xmm1 + vaesdec %xmm5, %xmm2, %xmm2 + vaesdec %xmm5, %xmm3, %xmm3 + vmovdqu 32(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vaesdec %xmm5, %xmm1, %xmm1 + vaesdec %xmm5, %xmm2, %xmm2 + vaesdec %xmm5, %xmm3, %xmm3 
+ vmovdqu 48(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vaesdec %xmm5, %xmm1, %xmm1 + vaesdec %xmm5, %xmm2, %xmm2 + vaesdec %xmm5, %xmm3, %xmm3 + vmovdqu 64(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vaesdec %xmm5, %xmm1, %xmm1 + vaesdec %xmm5, %xmm2, %xmm2 + vaesdec %xmm5, %xmm3, %xmm3 + vmovdqu 80(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vaesdec %xmm5, %xmm1, %xmm1 + vaesdec %xmm5, %xmm2, %xmm2 + vaesdec %xmm5, %xmm3, %xmm3 + vmovdqu 96(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vaesdec %xmm5, %xmm1, %xmm1 + vaesdec %xmm5, %xmm2, %xmm2 + vaesdec %xmm5, %xmm3, %xmm3 + vmovdqu 112(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vaesdec %xmm5, %xmm1, %xmm1 + vaesdec %xmm5, %xmm2, %xmm2 + vaesdec %xmm5, %xmm3, %xmm3 + vmovdqu 128(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vaesdec %xmm5, %xmm1, %xmm1 + vaesdec %xmm5, %xmm2, %xmm2 + vaesdec %xmm5, %xmm3, %xmm3 + vmovdqu 144(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vaesdec %xmm5, %xmm1, %xmm1 + vaesdec %xmm5, %xmm2, %xmm2 + vaesdec %xmm5, %xmm3, %xmm3 + cmpl $11, %r9d + vmovdqu 160(%r8), %xmm5 + jl L_AES_CBC_decrypt_avx1_64_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vaesdec %xmm5, %xmm1, %xmm1 + vaesdec %xmm5, %xmm2, %xmm2 + vaesdec %xmm5, %xmm3, %xmm3 + vmovdqu 176(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vaesdec %xmm5, %xmm1, %xmm1 + vaesdec %xmm5, %xmm2, %xmm2 + vaesdec %xmm5, %xmm3, %xmm3 + cmpl $13, %r9d + vmovdqu 192(%r8), %xmm5 + jl L_AES_CBC_decrypt_avx1_64_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vaesdec %xmm5, %xmm1, %xmm1 + vaesdec %xmm5, %xmm2, %xmm2 + vaesdec %xmm5, %xmm3, %xmm3 + vmovdqu 208(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vaesdec %xmm5, %xmm1, %xmm1 + vaesdec %xmm5, %xmm2, %xmm2 + vaesdec %xmm5, %xmm3, %xmm3 + vmovdqu 224(%r8), %xmm5 +L_AES_CBC_decrypt_avx1_64_aes_dec_block_last: + vaesdeclast %xmm5, %xmm0, %xmm0 + vaesdeclast %xmm5, %xmm1, %xmm1 + vaesdeclast %xmm5, %xmm2, %xmm2 + vaesdeclast %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm0, %xmm0 + vpxor (%r11), %xmm1, 
%xmm1 + vpxor 16(%r11), %xmm2, %xmm2 + vpxor 32(%r11), %xmm3, %xmm3 + vmovdqu 48(%r11), %xmm4 + vmovdqu %xmm0, (%r12) + vmovdqu %xmm1, 16(%r12) + vmovdqu %xmm2, 32(%r12) + vmovdqu %xmm3, 48(%r12) + addl $0x40, %eax + cmpl %r10d, %eax + jl L_AES_CBC_decrypt_avx1_dec_64 +L_AES_CBC_decrypt_avx1_done_64: + cmpl %ecx, %eax + movl %ecx, %r10d + je L_AES_CBC_decrypt_avx1_done_dec + andl $0xfffffff0, %r10d +L_AES_CBC_decrypt_avx1_dec_16: + # 16 bytes of input + leaq (%rdi,%rax,1), %r11 + vmovdqu (%r11), %xmm0 + vmovdqa %xmm0, %xmm8 + # aes_dec_block + vpxor (%r8), %xmm0, %xmm0 + vmovdqu 16(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 32(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 48(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 64(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 80(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 96(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 112(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 128(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 144(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + cmpl $11, %r9d + vmovdqu 160(%r8), %xmm6 + jl L_AES_CBC_decrypt_avx1_16_aes_dec_block_last + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 176(%r8), %xmm7 + vaesdec %xmm7, %xmm0, %xmm0 + cmpl $13, %r9d + vmovdqu 192(%r8), %xmm6 + jl L_AES_CBC_decrypt_avx1_16_aes_dec_block_last + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 208(%r8), %xmm7 + vaesdec %xmm7, %xmm0, %xmm0 + vmovdqu 224(%r8), %xmm6 +L_AES_CBC_decrypt_avx1_16_aes_dec_block_last: + vaesdeclast %xmm6, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vmovdqa %xmm8, %xmm4 + leaq (%rsi,%rax,1), %r11 + vmovdqu %xmm0, (%r11) + addl $16, %eax + cmpl %r10d, %eax + jl L_AES_CBC_decrypt_avx1_dec_16 +L_AES_CBC_decrypt_avx1_done_dec: + vmovdqu %xmm4, (%rdx) + popq %r12 + repz retq +#ifndef __APPLE__ +.size AES_CBC_decrypt_avx1,.-AES_CBC_decrypt_avx1 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 
+#else +.p2align 4 +#endif /* __APPLE__ */ +L_aes_ctr_avx1_bswap: +.quad 0x08090a0b0c0d0e0f,0x0001020304050607 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_aes_ctr_avx1_one: +.quad 0x0000000000000001,0x0000000000000000 +#ifndef __APPLE__ +.text +.globl AES_CTR_encrypt_avx1 +.type AES_CTR_encrypt_avx1,@function +.align 16 +AES_CTR_encrypt_avx1: +#else +.section __TEXT,__text +.globl _AES_CTR_encrypt_avx1 +.p2align 4 +_AES_CTR_encrypt_avx1: +#endif /* __APPLE__ */ + pushq %rbx + vmovdqu L_aes_ctr_avx1_bswap(%rip), %xmm8 + vmovdqu L_aes_ctr_avx1_one(%rip), %xmm9 + vpxor %xmm10, %xmm10, %xmm10 + vmovdqu (%r9), %xmm7 + vpshufb %xmm8, %xmm7, %xmm7 + xorl %eax, %eax + cmpl $0x40, %edx + movl %edx, %r10d + jl L_AES_CTR_encrypt_avx1_done_64 + andl $0xffffffc0, %r10d +L_AES_CTR_encrypt_avx1_enc_64: + # 64 bytes of input + # aes_ctr_enc_64 + leaq (%rdi,%rax,1), %r11 + leaq (%rsi,%rax,1), %rbx + vpshufb %xmm8, %xmm7, %xmm0 + vpaddq %xmm9, %xmm7, %xmm7 + vpcmpeqq %xmm10, %xmm7, %xmm11 + vpslldq $8, %xmm11, %xmm11 + vpsrlq $63, %xmm11, %xmm11 + vpaddq %xmm11, %xmm7, %xmm7 + vpshufb %xmm8, %xmm7, %xmm1 + vpaddq %xmm9, %xmm7, %xmm7 + vpcmpeqq %xmm10, %xmm7, %xmm11 + vpslldq $8, %xmm11, %xmm11 + vpsrlq $63, %xmm11, %xmm11 + vpaddq %xmm11, %xmm7, %xmm7 + vpshufb %xmm8, %xmm7, %xmm2 + vpaddq %xmm9, %xmm7, %xmm7 + vpcmpeqq %xmm10, %xmm7, %xmm11 + vpslldq $8, %xmm11, %xmm11 + vpsrlq $63, %xmm11, %xmm11 + vpaddq %xmm11, %xmm7, %xmm7 + vpshufb %xmm8, %xmm7, %xmm3 + vpaddq %xmm9, %xmm7, %xmm7 + vpcmpeqq %xmm10, %xmm7, %xmm11 + vpslldq $8, %xmm11, %xmm11 + vpsrlq $63, %xmm11, %xmm11 + vpaddq %xmm11, %xmm7, %xmm7 + # aes_enc_block + vmovdqu (%rcx), %xmm4 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm4, %xmm3, %xmm3 + vmovdqu 16(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + 
vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 32(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 48(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 64(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 80(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 96(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 112(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 128(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 144(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + cmpl $11, %r8d + vmovdqu 160(%rcx), %xmm4 + jl L_AES_CTR_encrypt_avx1_64_aes_enc_block_last + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 176(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + cmpl $13, %r8d + vmovdqu 192(%rcx), %xmm4 + jl L_AES_CTR_encrypt_avx1_64_aes_enc_block_last + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 208(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 224(%rcx), %xmm4 +L_AES_CTR_encrypt_avx1_64_aes_enc_block_last: + 
vaesenclast %xmm4, %xmm0, %xmm0 + vaesenclast %xmm4, %xmm1, %xmm1 + vaesenclast %xmm4, %xmm2, %xmm2 + vaesenclast %xmm4, %xmm3, %xmm3 + vpxor (%r11), %xmm0, %xmm0 + vpxor 16(%r11), %xmm1, %xmm1 + vpxor 32(%r11), %xmm2, %xmm2 + vpxor 48(%r11), %xmm3, %xmm3 + vmovdqu %xmm0, (%rbx) + vmovdqu %xmm1, 16(%rbx) + vmovdqu %xmm2, 32(%rbx) + vmovdqu %xmm3, 48(%rbx) + addl $0x40, %eax + cmpl %r10d, %eax + jl L_AES_CTR_encrypt_avx1_enc_64 +L_AES_CTR_encrypt_avx1_done_64: + cmpl %edx, %eax + movl %edx, %r10d + je L_AES_CTR_encrypt_avx1_done_enc + andl $0xfffffff0, %r10d +L_AES_CTR_encrypt_avx1_enc_16: + # 16 bytes of input + vpshufb %xmm8, %xmm7, %xmm0 + vpaddq %xmm9, %xmm7, %xmm7 + vpcmpeqq %xmm10, %xmm7, %xmm11 + vpslldq $8, %xmm11, %xmm11 + vpsrlq $63, %xmm11, %xmm11 + vpaddq %xmm11, %xmm7, %xmm7 + # aes_enc_block + vpxor (%rcx), %xmm0, %xmm0 + vmovdqu 16(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 32(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 48(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 64(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 80(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 96(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 112(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 128(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 144(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + cmpl $11, %r8d + vmovdqu 160(%rcx), %xmm5 + jl L_AES_CTR_encrypt_avx1_16_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 176(%rcx), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + cmpl $13, %r8d + vmovdqu 192(%rcx), %xmm5 + jl L_AES_CTR_encrypt_avx1_16_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 208(%rcx), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + vmovdqu 224(%rcx), %xmm5 +L_AES_CTR_encrypt_avx1_16_aes_enc_block_last: + vaesenclast %xmm5, %xmm0, %xmm0 + leaq (%rdi,%rax,1), %r11 + vpxor (%r11), %xmm0, %xmm0 + leaq (%rsi,%rax,1), %r11 + vmovdqu %xmm0, (%r11) + addl $16, %eax + cmpl %r10d, %eax + jl L_AES_CTR_encrypt_avx1_enc_16 
+L_AES_CTR_encrypt_avx1_done_enc: + vpshufb %xmm8, %xmm7, %xmm7 + vmovdqu %xmm7, (%r9) + popq %rbx + repz retq +#ifndef __APPLE__ +.size AES_CTR_encrypt_avx1,.-AES_CTR_encrypt_avx1 +#endif /* __APPLE__ */ +#endif /* HAVE_INTEL_AVX1 */ +#ifdef HAVE_INTEL_VAES +#ifndef __APPLE__ +.text +.globl AES_ECB_encrypt_vaes +.type AES_ECB_encrypt_vaes,@function +.align 16 +AES_ECB_encrypt_vaes: +#else +.section __TEXT,__text +.globl _AES_ECB_encrypt_vaes +.p2align 4 +_AES_ECB_encrypt_vaes: +#endif /* __APPLE__ */ + xorl %eax, %eax + cmpl $0x80, %edx + movl %edx, %r9d + jl L_AES_ECB_encrypt_vaes_done_128 + andl $0xffffff80, %r9d +L_AES_ECB_encrypt_vaes_enc_128: + # 128 bytes of input + # aes_ecb_enc_128 + leaq (%rdi,%rax,1), %r10 + leaq (%rsi,%rax,1), %r11 + vmovdqu (%r10), %ymm0 + vmovdqu 32(%r10), %ymm1 + vmovdqu 64(%r10), %ymm2 + vmovdqu 96(%r10), %ymm3 + # aes_enc_block + vbroadcasti128 (%rcx), %ymm7 + vpxor %ymm7, %ymm0, %ymm0 + vpxor %ymm7, %ymm1, %ymm1 + vpxor %ymm7, %ymm2, %ymm2 + vpxor %ymm7, %ymm3, %ymm3 + vbroadcasti128 16(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vaesenc %ymm7, %ymm1, %ymm1 + vaesenc %ymm7, %ymm2, %ymm2 + vaesenc %ymm7, %ymm3, %ymm3 + vbroadcasti128 32(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vaesenc %ymm7, %ymm1, %ymm1 + vaesenc %ymm7, %ymm2, %ymm2 + vaesenc %ymm7, %ymm3, %ymm3 + vbroadcasti128 48(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vaesenc %ymm7, %ymm1, %ymm1 + vaesenc %ymm7, %ymm2, %ymm2 + vaesenc %ymm7, %ymm3, %ymm3 + vbroadcasti128 64(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vaesenc %ymm7, %ymm1, %ymm1 + vaesenc %ymm7, %ymm2, %ymm2 + vaesenc %ymm7, %ymm3, %ymm3 + vbroadcasti128 80(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vaesenc %ymm7, %ymm1, %ymm1 + vaesenc %ymm7, %ymm2, %ymm2 + vaesenc %ymm7, %ymm3, %ymm3 + vbroadcasti128 96(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vaesenc %ymm7, %ymm1, %ymm1 + vaesenc %ymm7, %ymm2, %ymm2 + vaesenc %ymm7, %ymm3, %ymm3 + vbroadcasti128 112(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 
+ vaesenc %ymm7, %ymm1, %ymm1 + vaesenc %ymm7, %ymm2, %ymm2 + vaesenc %ymm7, %ymm3, %ymm3 + vbroadcasti128 128(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vaesenc %ymm7, %ymm1, %ymm1 + vaesenc %ymm7, %ymm2, %ymm2 + vaesenc %ymm7, %ymm3, %ymm3 + vbroadcasti128 144(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vaesenc %ymm7, %ymm1, %ymm1 + vaesenc %ymm7, %ymm2, %ymm2 + vaesenc %ymm7, %ymm3, %ymm3 + cmpl $11, %r8d + vbroadcasti128 160(%rcx), %ymm7 + jl L_AES_ECB_encrypt_vaes_128_aes_enc_block_last + vaesenc %ymm7, %ymm0, %ymm0 + vaesenc %ymm7, %ymm1, %ymm1 + vaesenc %ymm7, %ymm2, %ymm2 + vaesenc %ymm7, %ymm3, %ymm3 + vbroadcasti128 176(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vaesenc %ymm7, %ymm1, %ymm1 + vaesenc %ymm7, %ymm2, %ymm2 + vaesenc %ymm7, %ymm3, %ymm3 + cmpl $13, %r8d + vbroadcasti128 192(%rcx), %ymm7 + jl L_AES_ECB_encrypt_vaes_128_aes_enc_block_last + vaesenc %ymm7, %ymm0, %ymm0 + vaesenc %ymm7, %ymm1, %ymm1 + vaesenc %ymm7, %ymm2, %ymm2 + vaesenc %ymm7, %ymm3, %ymm3 + vbroadcasti128 208(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vaesenc %ymm7, %ymm1, %ymm1 + vaesenc %ymm7, %ymm2, %ymm2 + vaesenc %ymm7, %ymm3, %ymm3 + vbroadcasti128 224(%rcx), %ymm7 +L_AES_ECB_encrypt_vaes_128_aes_enc_block_last: + vaesenclast %ymm7, %ymm0, %ymm0 + vaesenclast %ymm7, %ymm1, %ymm1 + vaesenclast %ymm7, %ymm2, %ymm2 + vaesenclast %ymm7, %ymm3, %ymm3 + vmovdqu %ymm0, (%r11) + vmovdqu %ymm1, 32(%r11) + vmovdqu %ymm2, 64(%r11) + vmovdqu %ymm3, 96(%r11) + addl $0x80, %eax + cmpl %r9d, %eax + jl L_AES_ECB_encrypt_vaes_enc_128 +L_AES_ECB_encrypt_vaes_done_128: + movl %edx, %r9d + andl $0xffffffe0, %r9d + cmpl %r9d, %eax + je L_AES_ECB_encrypt_vaes_done_32 +L_AES_ECB_encrypt_vaes_enc_32: + # 32 bytes of input + # aes_ecb_enc_32 + leaq (%rdi,%rax,1), %r10 + leaq (%rsi,%rax,1), %r11 + vmovdqu (%r10), %ymm0 + # aes_enc_block + vbroadcasti128 (%rcx), %ymm7 + vpxor %ymm7, %ymm0, %ymm0 + vbroadcasti128 16(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vbroadcasti128 32(%rcx), %ymm7 + 
vaesenc %ymm7, %ymm0, %ymm0 + vbroadcasti128 48(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vbroadcasti128 64(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vbroadcasti128 80(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vbroadcasti128 96(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vbroadcasti128 112(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vbroadcasti128 128(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vbroadcasti128 144(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + cmpl $11, %r8d + vbroadcasti128 160(%rcx), %ymm7 + jl L_AES_ECB_encrypt_vaes_32_aes_enc_block_last + vaesenc %ymm7, %ymm0, %ymm0 + vbroadcasti128 176(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + cmpl $13, %r8d + vbroadcasti128 192(%rcx), %ymm7 + jl L_AES_ECB_encrypt_vaes_32_aes_enc_block_last + vaesenc %ymm7, %ymm0, %ymm0 + vbroadcasti128 208(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vbroadcasti128 224(%rcx), %ymm7 +L_AES_ECB_encrypt_vaes_32_aes_enc_block_last: + vaesenclast %ymm7, %ymm0, %ymm0 + vmovdqu %ymm0, (%r11) + addl $32, %eax + cmpl %r9d, %eax + jl L_AES_ECB_encrypt_vaes_enc_32 +L_AES_ECB_encrypt_vaes_done_32: + cmpl %edx, %eax + movl %edx, %r9d + je L_AES_ECB_encrypt_vaes_done_enc + andl $0xfffffff0, %r9d +L_AES_ECB_encrypt_vaes_enc_16: + # 16 bytes of input + leaq (%rdi,%rax,1), %r10 + vmovdqu (%r10), %xmm0 + # aes_enc_block + vpxor (%rcx), %xmm0, %xmm0 + vmovdqu 16(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 32(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 48(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 64(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 80(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 96(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 112(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 128(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 144(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + cmpl $11, %r8d + vmovdqu 160(%rcx), %xmm5 + jl L_AES_ECB_encrypt_vaes_16_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 
176(%rcx), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + cmpl $13, %r8d + vmovdqu 192(%rcx), %xmm5 + jl L_AES_ECB_encrypt_vaes_16_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 208(%rcx), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + vmovdqu 224(%rcx), %xmm5 +L_AES_ECB_encrypt_vaes_16_aes_enc_block_last: + vaesenclast %xmm5, %xmm0, %xmm0 + leaq (%rsi,%rax,1), %r10 + vmovdqu %xmm0, (%r10) + addl $16, %eax + cmpl %r9d, %eax + jl L_AES_ECB_encrypt_vaes_enc_16 +L_AES_ECB_encrypt_vaes_done_enc: + repz retq +#ifndef __APPLE__ +.size AES_ECB_encrypt_vaes,.-AES_ECB_encrypt_vaes +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_ECB_decrypt_vaes +.type AES_ECB_decrypt_vaes,@function +.align 16 +AES_ECB_decrypt_vaes: +#else +.section __TEXT,__text +.globl _AES_ECB_decrypt_vaes +.p2align 4 +_AES_ECB_decrypt_vaes: +#endif /* __APPLE__ */ + xorl %eax, %eax + cmpl $0x80, %edx + movl %edx, %r9d + jl L_AES_ECB_decrypt_vaes_done_128 + andl $0xffffff80, %r9d +L_AES_ECB_decrypt_vaes_dec_128: + # 128 bytes of input + # aes_ecb_dec_128 + leaq (%rdi,%rax,1), %r10 + leaq (%rsi,%rax,1), %r11 + vmovdqu (%r10), %ymm0 + vmovdqu 32(%r10), %ymm1 + vmovdqu 64(%r10), %ymm2 + vmovdqu 96(%r10), %ymm3 + # aes_dec_block + vbroadcasti128 (%rcx), %ymm7 + vpxor %ymm7, %ymm0, %ymm0 + vpxor %ymm7, %ymm1, %ymm1 + vpxor %ymm7, %ymm2, %ymm2 + vpxor %ymm7, %ymm3, %ymm3 + vbroadcasti128 16(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vaesdec %ymm7, %ymm1, %ymm1 + vaesdec %ymm7, %ymm2, %ymm2 + vaesdec %ymm7, %ymm3, %ymm3 + vbroadcasti128 32(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vaesdec %ymm7, %ymm1, %ymm1 + vaesdec %ymm7, %ymm2, %ymm2 + vaesdec %ymm7, %ymm3, %ymm3 + vbroadcasti128 48(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vaesdec %ymm7, %ymm1, %ymm1 + vaesdec %ymm7, %ymm2, %ymm2 + vaesdec %ymm7, %ymm3, %ymm3 + vbroadcasti128 64(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vaesdec %ymm7, %ymm1, %ymm1 + vaesdec %ymm7, %ymm2, %ymm2 + vaesdec %ymm7, %ymm3, %ymm3 + vbroadcasti128 80(%rcx), %ymm7 
+ vaesdec %ymm7, %ymm0, %ymm0 + vaesdec %ymm7, %ymm1, %ymm1 + vaesdec %ymm7, %ymm2, %ymm2 + vaesdec %ymm7, %ymm3, %ymm3 + vbroadcasti128 96(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vaesdec %ymm7, %ymm1, %ymm1 + vaesdec %ymm7, %ymm2, %ymm2 + vaesdec %ymm7, %ymm3, %ymm3 + vbroadcasti128 112(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vaesdec %ymm7, %ymm1, %ymm1 + vaesdec %ymm7, %ymm2, %ymm2 + vaesdec %ymm7, %ymm3, %ymm3 + vbroadcasti128 128(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vaesdec %ymm7, %ymm1, %ymm1 + vaesdec %ymm7, %ymm2, %ymm2 + vaesdec %ymm7, %ymm3, %ymm3 + vbroadcasti128 144(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vaesdec %ymm7, %ymm1, %ymm1 + vaesdec %ymm7, %ymm2, %ymm2 + vaesdec %ymm7, %ymm3, %ymm3 + cmpl $11, %r8d + vbroadcasti128 160(%rcx), %ymm7 + jl L_AES_ECB_decrypt_vaes_128_aes_dec_block_last + vaesdec %ymm7, %ymm0, %ymm0 + vaesdec %ymm7, %ymm1, %ymm1 + vaesdec %ymm7, %ymm2, %ymm2 + vaesdec %ymm7, %ymm3, %ymm3 + vbroadcasti128 176(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vaesdec %ymm7, %ymm1, %ymm1 + vaesdec %ymm7, %ymm2, %ymm2 + vaesdec %ymm7, %ymm3, %ymm3 + cmpl $13, %r8d + vbroadcasti128 192(%rcx), %ymm7 + jl L_AES_ECB_decrypt_vaes_128_aes_dec_block_last + vaesdec %ymm7, %ymm0, %ymm0 + vaesdec %ymm7, %ymm1, %ymm1 + vaesdec %ymm7, %ymm2, %ymm2 + vaesdec %ymm7, %ymm3, %ymm3 + vbroadcasti128 208(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vaesdec %ymm7, %ymm1, %ymm1 + vaesdec %ymm7, %ymm2, %ymm2 + vaesdec %ymm7, %ymm3, %ymm3 + vbroadcasti128 224(%rcx), %ymm7 +L_AES_ECB_decrypt_vaes_128_aes_dec_block_last: + vaesdeclast %ymm7, %ymm0, %ymm0 + vaesdeclast %ymm7, %ymm1, %ymm1 + vaesdeclast %ymm7, %ymm2, %ymm2 + vaesdeclast %ymm7, %ymm3, %ymm3 + vmovdqu %ymm0, (%r11) + vmovdqu %ymm1, 32(%r11) + vmovdqu %ymm2, 64(%r11) + vmovdqu %ymm3, 96(%r11) + addl $0x80, %eax + cmpl %r9d, %eax + jl L_AES_ECB_decrypt_vaes_dec_128 +L_AES_ECB_decrypt_vaes_done_128: + movl %edx, %r9d + andl $0xffffffe0, %r9d + cmpl %r9d, %eax + je 
L_AES_ECB_decrypt_vaes_done_32 +L_AES_ECB_decrypt_vaes_dec_32: + # 32 bytes of input + # aes_ecb_dec_32 + leaq (%rdi,%rax,1), %r10 + leaq (%rsi,%rax,1), %r11 + vmovdqu (%r10), %ymm0 + # aes_dec_block + vbroadcasti128 (%rcx), %ymm7 + vpxor %ymm7, %ymm0, %ymm0 + vbroadcasti128 16(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vbroadcasti128 32(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vbroadcasti128 48(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vbroadcasti128 64(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vbroadcasti128 80(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vbroadcasti128 96(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vbroadcasti128 112(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vbroadcasti128 128(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vbroadcasti128 144(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + cmpl $11, %r8d + vbroadcasti128 160(%rcx), %ymm7 + jl L_AES_ECB_decrypt_vaes_32_aes_dec_block_last + vaesdec %ymm7, %ymm0, %ymm0 + vbroadcasti128 176(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + cmpl $13, %r8d + vbroadcasti128 192(%rcx), %ymm7 + jl L_AES_ECB_decrypt_vaes_32_aes_dec_block_last + vaesdec %ymm7, %ymm0, %ymm0 + vbroadcasti128 208(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vbroadcasti128 224(%rcx), %ymm7 +L_AES_ECB_decrypt_vaes_32_aes_dec_block_last: + vaesdeclast %ymm7, %ymm0, %ymm0 + vmovdqu %ymm0, (%r11) + addl $32, %eax + cmpl %r9d, %eax + jl L_AES_ECB_decrypt_vaes_dec_32 +L_AES_ECB_decrypt_vaes_done_32: + cmpl %edx, %eax + movl %edx, %r9d + je L_AES_ECB_decrypt_vaes_done_dec + andl $0xfffffff0, %r9d +L_AES_ECB_decrypt_vaes_dec_16: + # 16 bytes of input + leaq (%rdi,%rax,1), %r10 + vmovdqu (%r10), %xmm0 + # aes_dec_block + vpxor (%rcx), %xmm0, %xmm0 + vmovdqu 16(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 32(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 48(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 64(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 80(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + 
vmovdqu 96(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 112(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 128(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 144(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + cmpl $11, %r8d + vmovdqu 160(%rcx), %xmm5 + jl L_AES_ECB_decrypt_vaes_16_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 176(%rcx), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + cmpl $13, %r8d + vmovdqu 192(%rcx), %xmm5 + jl L_AES_ECB_decrypt_vaes_16_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 208(%rcx), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 224(%rcx), %xmm5 +L_AES_ECB_decrypt_vaes_16_aes_dec_block_last: + vaesdeclast %xmm5, %xmm0, %xmm0 + leaq (%rsi,%rax,1), %r10 + vmovdqu %xmm0, (%r10) + addl $16, %eax + cmpl %r9d, %eax + jl L_AES_ECB_decrypt_vaes_dec_16 +L_AES_ECB_decrypt_vaes_done_dec: + repz retq +#ifndef __APPLE__ +.size AES_ECB_decrypt_vaes,.-AES_ECB_decrypt_vaes +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_CBC_encrypt_vaes +.type AES_CBC_encrypt_vaes,@function +.align 16 +AES_CBC_encrypt_vaes: +#else +.section __TEXT,__text +.globl _AES_CBC_encrypt_vaes +.p2align 4 +_AES_CBC_encrypt_vaes: +#endif /* __APPLE__ */ + vmovdqu (%rdx), %xmm0 + xorl %eax, %eax + cmpl %ecx, %eax + je L_AES_CBC_encrypt_vaes_done +L_AES_CBC_encrypt_vaes_loop: + # 16 bytes of input + leaq (%rdi,%rax,1), %r10 + vmovdqu (%r10), %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + # aes_enc_block + vpxor (%r8), %xmm1, %xmm1 + vmovdqu 16(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 32(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 48(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 64(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 80(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 96(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 112(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 128(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 144(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + cmpl $11, %r9d + vmovdqu 
160(%r8), %xmm3 + jl L_AES_CBC_encrypt_vaes_aes_enc_block_last + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 176(%r8), %xmm4 + vaesenc %xmm4, %xmm1, %xmm1 + cmpl $13, %r9d + vmovdqu 192(%r8), %xmm3 + jl L_AES_CBC_encrypt_vaes_aes_enc_block_last + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 208(%r8), %xmm4 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqu 224(%r8), %xmm3 +L_AES_CBC_encrypt_vaes_aes_enc_block_last: + vaesenclast %xmm3, %xmm1, %xmm1 + leaq (%rsi,%rax,1), %r11 + vmovdqu %xmm1, (%r11) + vmovdqa %xmm1, %xmm0 + addl $16, %eax + cmpl %ecx, %eax + jl L_AES_CBC_encrypt_vaes_loop +L_AES_CBC_encrypt_vaes_done: + vmovdqu %xmm0, (%rdx) + repz retq +#ifndef __APPLE__ +.size AES_CBC_encrypt_vaes,.-AES_CBC_encrypt_vaes +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_CBC_decrypt_vaes +.type AES_CBC_decrypt_vaes,@function +.align 16 +AES_CBC_decrypt_vaes: +#else +.section __TEXT,__text +.globl _AES_CBC_decrypt_vaes +.p2align 4 +_AES_CBC_decrypt_vaes: +#endif /* __APPLE__ */ + pushq %r12 + vmovdqu (%rdx), %xmm8 + xorl %eax, %eax + cmpl $0x80, %ecx + movl %ecx, %r10d + jl L_AES_CBC_decrypt_vaes_done_128 + andl $0xffffff80, %r10d +L_AES_CBC_decrypt_vaes_dec_128: + # 128 bytes of input + # aes_cbc_dec_128 + leaq (%rdi,%rax,1), %r11 + leaq (%rsi,%rax,1), %r12 + vmovdqu (%r11), %ymm0 + vmovdqu 32(%r11), %ymm1 + vmovdqu 64(%r11), %ymm2 + vmovdqu 96(%r11), %ymm3 + vinserti128 $0x01, %xmm0, %ymm8, %ymm10 + vmovdqu 16(%r11), %ymm11 + vmovdqu 48(%r11), %ymm12 + vmovdqu 80(%r11), %ymm13 + vextracti128 $0x01, %ymm3, %xmm8 + # aes_dec_block + vbroadcasti128 (%r8), %ymm9 + vpxor %ymm9, %ymm0, %ymm0 + vpxor %ymm9, %ymm1, %ymm1 + vpxor %ymm9, %ymm2, %ymm2 + vpxor %ymm9, %ymm3, %ymm3 + vbroadcasti128 16(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 32(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + 
vbroadcasti128 48(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 64(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 80(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 96(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 112(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 128(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 144(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + cmpl $11, %r9d + vbroadcasti128 160(%r8), %ymm9 + jl L_AES_CBC_decrypt_vaes_128_aes_dec_block_last + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 176(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + cmpl $13, %r9d + vbroadcasti128 192(%r8), %ymm9 + jl L_AES_CBC_decrypt_vaes_128_aes_dec_block_last + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 208(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 224(%r8), %ymm9 +L_AES_CBC_decrypt_vaes_128_aes_dec_block_last: + vaesdeclast %ymm9, %ymm0, %ymm0 + vaesdeclast %ymm9, %ymm1, %ymm1 + vaesdeclast %ymm9, %ymm2, %ymm2 + 
vaesdeclast %ymm9, %ymm3, %ymm3 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vmovdqu %ymm0, (%r12) + vmovdqu %ymm1, 32(%r12) + vmovdqu %ymm2, 64(%r12) + vmovdqu %ymm3, 96(%r12) + addl $0x80, %eax + cmpl %r10d, %eax + jl L_AES_CBC_decrypt_vaes_dec_128 +L_AES_CBC_decrypt_vaes_done_128: + movl %ecx, %r10d + andl $0xffffffe0, %r10d + cmpl %r10d, %eax + je L_AES_CBC_decrypt_vaes_done_32 +L_AES_CBC_decrypt_vaes_dec_32: + # 32 bytes of input + # aes_cbc_dec_32 + leaq (%rdi,%rax,1), %r11 + leaq (%rsi,%rax,1), %r12 + vmovdqu (%r11), %ymm0 + vinserti128 $0x01, %xmm0, %ymm8, %ymm10 + vextracti128 $0x01, %ymm0, %xmm8 + # aes_dec_block + vbroadcasti128 (%r8), %ymm9 + vpxor %ymm9, %ymm0, %ymm0 + vbroadcasti128 16(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 32(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 48(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 64(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 80(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 96(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 112(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 128(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 144(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + cmpl $11, %r9d + vbroadcasti128 160(%r8), %ymm9 + jl L_AES_CBC_decrypt_vaes_32_aes_dec_block_last + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 176(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + cmpl $13, %r9d + vbroadcasti128 192(%r8), %ymm9 + jl L_AES_CBC_decrypt_vaes_32_aes_dec_block_last + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 208(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 224(%r8), %ymm9 +L_AES_CBC_decrypt_vaes_32_aes_dec_block_last: + vaesdeclast %ymm9, %ymm0, %ymm0 + vpxor %ymm10, %ymm0, %ymm0 + vmovdqu %ymm0, (%r12) + addl $32, %eax + cmpl %r10d, %eax + jl L_AES_CBC_decrypt_vaes_dec_32 +L_AES_CBC_decrypt_vaes_done_32: + cmpl %ecx, %eax + 
movl %ecx, %r10d + je L_AES_CBC_decrypt_vaes_done_dec + andl $0xfffffff0, %r10d +L_AES_CBC_decrypt_vaes_dec_16: + # 16 bytes of input + leaq (%rdi,%rax,1), %r11 + vmovdqu (%r11), %xmm0 + vmovdqa %xmm0, %xmm7 + # aes_dec_block + vpxor (%r8), %xmm0, %xmm0 + vmovdqu 16(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + cmpl $11, %r9d + vmovdqu 160(%r8), %xmm5 + jl L_AES_CBC_decrypt_vaes_16_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + cmpl $13, %r9d + vmovdqu 192(%r8), %xmm5 + jl L_AES_CBC_decrypt_vaes_16_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r8), %xmm5 +L_AES_CBC_decrypt_vaes_16_aes_dec_block_last: + vaesdeclast %xmm5, %xmm0, %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + vmovdqa %xmm7, %xmm8 + leaq (%rsi,%rax,1), %r11 + vmovdqu %xmm0, (%r11) + addl $16, %eax + cmpl %r10d, %eax + jl L_AES_CBC_decrypt_vaes_dec_16 +L_AES_CBC_decrypt_vaes_done_dec: + vmovdqu %xmm8, (%rdx) + popq %r12 + repz retq +#ifndef __APPLE__ +.size AES_CBC_decrypt_vaes,.-AES_CBC_decrypt_vaes +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_aes_ctr_bswap_vaes: +.quad 0x08090a0b0c0d0e0f,0x0001020304050607 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 32 +#else +.p2align 5 +#endif /* __APPLE__ */ +L_aes_ctr_inc_vaes: +.quad 
0x0000000000000000,0x0000000000000000 +.quad 0x0000000000000001,0x0000000000000000 +.quad 0x0000000000000002,0x0000000000000000 +.quad 0x0000000000000003,0x0000000000000000 +.quad 0x0000000000000004,0x0000000000000000 +.quad 0x0000000000000005,0x0000000000000000 +.quad 0x0000000000000006,0x0000000000000000 +.quad 0x0000000000000007,0x0000000000000000 +.quad 0x0000000000000008,0x0000000000000000 +.quad 0x0000000000000009,0x0000000000000000 +.quad 0x000000000000000a,0x0000000000000000 +.quad 0x000000000000000b,0x0000000000000000 +.quad 0x000000000000000c,0x0000000000000000 +.quad 0x000000000000000d,0x0000000000000000 +.quad 0x000000000000000e,0x0000000000000000 +.quad 0x000000000000000f,0x0000000000000000 +.quad 0x0000000000000010,0x0000000000000000 +#ifndef __APPLE__ +.text +.globl AES_CTR_encrypt_vaes +.type AES_CTR_encrypt_vaes,@function +.align 16 +AES_CTR_encrypt_vaes: +#else +.section __TEXT,__text +.globl _AES_CTR_encrypt_vaes +.p2align 4 +_AES_CTR_encrypt_vaes: +#endif /* __APPLE__ */ + pushq %rbx + vbroadcasti128 L_aes_ctr_bswap_vaes(%rip), %ymm8 + vbroadcasti128 (%r9), %ymm7 + vpshufb %ymm8, %ymm7, %ymm7 + vbroadcasti128 128+L_aes_ctr_inc_vaes(%rip), %ymm10 + vbroadcasti128 32+L_aes_ctr_inc_vaes(%rip), %ymm11 + vbroadcasti128 16+L_aes_ctr_inc_vaes(%rip), %ymm12 + xorl %eax, %eax + cmpl $0x80, %edx + movl %edx, %r10d + jl L_AES_CTR_encrypt_vaes_done_128 + andl $0xffffff80, %r10d + vmovdqa %ymm7, %ymm9 + vpaddq 0+L_aes_ctr_inc_vaes(%rip), %ymm7, %ymm4 + vpand 0+L_aes_ctr_inc_vaes(%rip), %ymm9, %ymm14 + vpor 0+L_aes_ctr_inc_vaes(%rip), %ymm9, %ymm9 + vpandn %ymm9, %ymm4, %ymm9 + vpor %ymm14, %ymm9, %ymm9 + vpsrlq $63, %ymm9, %ymm9 + vpslldq $8, %ymm9, %ymm9 + vpaddq %ymm9, %ymm4, %ymm4 + vmovdqa %ymm7, %ymm9 + vpaddq 32+L_aes_ctr_inc_vaes(%rip), %ymm7, %ymm5 + vpand 32+L_aes_ctr_inc_vaes(%rip), %ymm9, %ymm14 + vpor 32+L_aes_ctr_inc_vaes(%rip), %ymm9, %ymm9 + vpandn %ymm9, %ymm5, %ymm9 + vpor %ymm14, %ymm9, %ymm9 + vpsrlq $63, %ymm9, %ymm9 + vpslldq $8, %ymm9, 
%ymm9 + vpaddq %ymm9, %ymm5, %ymm5 + vmovdqa %ymm7, %ymm9 + vpaddq 64+L_aes_ctr_inc_vaes(%rip), %ymm7, %ymm6 + vpand 64+L_aes_ctr_inc_vaes(%rip), %ymm9, %ymm14 + vpor 64+L_aes_ctr_inc_vaes(%rip), %ymm9, %ymm9 + vpandn %ymm9, %ymm6, %ymm9 + vpor %ymm14, %ymm9, %ymm9 + vpsrlq $63, %ymm9, %ymm9 + vpslldq $8, %ymm9, %ymm9 + vpaddq %ymm9, %ymm6, %ymm6 + vmovdqa %ymm7, %ymm9 + vpaddq 96+L_aes_ctr_inc_vaes(%rip), %ymm7, %ymm7 + vpand 96+L_aes_ctr_inc_vaes(%rip), %ymm9, %ymm14 + vpor 96+L_aes_ctr_inc_vaes(%rip), %ymm9, %ymm9 + vpandn %ymm9, %ymm7, %ymm9 + vpor %ymm14, %ymm9, %ymm9 + vpsrlq $63, %ymm9, %ymm9 + vpslldq $8, %ymm9, %ymm9 + vpaddq %ymm9, %ymm7, %ymm7 +L_AES_CTR_encrypt_vaes_enc_128: + # 128 bytes of input + leaq (%rdi,%rax,1), %r11 + leaq (%rsi,%rax,1), %rbx + vpshufb %ymm8, %ymm4, %ymm0 + vpshufb %ymm8, %ymm5, %ymm1 + vpshufb %ymm8, %ymm6, %ymm2 + vpshufb %ymm8, %ymm7, %ymm3 + vmovdqa %ymm4, %ymm9 + vpaddq %ymm10, %ymm4, %ymm4 + vpand %ymm10, %ymm9, %ymm14 + vpor %ymm10, %ymm9, %ymm9 + vpandn %ymm9, %ymm4, %ymm9 + vpor %ymm14, %ymm9, %ymm9 + vpsrlq $63, %ymm9, %ymm9 + vpslldq $8, %ymm9, %ymm9 + vpaddq %ymm9, %ymm4, %ymm4 + vmovdqa %ymm5, %ymm9 + vpaddq %ymm10, %ymm5, %ymm5 + vpand %ymm10, %ymm9, %ymm14 + vpor %ymm10, %ymm9, %ymm9 + vpandn %ymm9, %ymm5, %ymm9 + vpor %ymm14, %ymm9, %ymm9 + vpsrlq $63, %ymm9, %ymm9 + vpslldq $8, %ymm9, %ymm9 + vpaddq %ymm9, %ymm5, %ymm5 + vmovdqa %ymm6, %ymm9 + vpaddq %ymm10, %ymm6, %ymm6 + vpand %ymm10, %ymm9, %ymm14 + vpor %ymm10, %ymm9, %ymm9 + vpandn %ymm9, %ymm6, %ymm9 + vpor %ymm14, %ymm9, %ymm9 + vpsrlq $63, %ymm9, %ymm9 + vpslldq $8, %ymm9, %ymm9 + vpaddq %ymm9, %ymm6, %ymm6 + vmovdqa %ymm7, %ymm9 + vpaddq %ymm10, %ymm7, %ymm7 + vpand %ymm10, %ymm9, %ymm14 + vpor %ymm10, %ymm9, %ymm9 + vpandn %ymm9, %ymm7, %ymm9 + vpor %ymm14, %ymm9, %ymm9 + vpsrlq $63, %ymm9, %ymm9 + vpslldq $8, %ymm9, %ymm9 + vpaddq %ymm9, %ymm7, %ymm7 + # aes_enc_block + vbroadcasti128 (%rcx), %ymm13 + vpxor %ymm13, %ymm0, %ymm0 + vpxor %ymm13, %ymm1, 
%ymm1 + vpxor %ymm13, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vbroadcasti128 16(%rcx), %ymm13 + vaesenc %ymm13, %ymm0, %ymm0 + vaesenc %ymm13, %ymm1, %ymm1 + vaesenc %ymm13, %ymm2, %ymm2 + vaesenc %ymm13, %ymm3, %ymm3 + vbroadcasti128 32(%rcx), %ymm13 + vaesenc %ymm13, %ymm0, %ymm0 + vaesenc %ymm13, %ymm1, %ymm1 + vaesenc %ymm13, %ymm2, %ymm2 + vaesenc %ymm13, %ymm3, %ymm3 + vbroadcasti128 48(%rcx), %ymm13 + vaesenc %ymm13, %ymm0, %ymm0 + vaesenc %ymm13, %ymm1, %ymm1 + vaesenc %ymm13, %ymm2, %ymm2 + vaesenc %ymm13, %ymm3, %ymm3 + vbroadcasti128 64(%rcx), %ymm13 + vaesenc %ymm13, %ymm0, %ymm0 + vaesenc %ymm13, %ymm1, %ymm1 + vaesenc %ymm13, %ymm2, %ymm2 + vaesenc %ymm13, %ymm3, %ymm3 + vbroadcasti128 80(%rcx), %ymm13 + vaesenc %ymm13, %ymm0, %ymm0 + vaesenc %ymm13, %ymm1, %ymm1 + vaesenc %ymm13, %ymm2, %ymm2 + vaesenc %ymm13, %ymm3, %ymm3 + vbroadcasti128 96(%rcx), %ymm13 + vaesenc %ymm13, %ymm0, %ymm0 + vaesenc %ymm13, %ymm1, %ymm1 + vaesenc %ymm13, %ymm2, %ymm2 + vaesenc %ymm13, %ymm3, %ymm3 + vbroadcasti128 112(%rcx), %ymm13 + vaesenc %ymm13, %ymm0, %ymm0 + vaesenc %ymm13, %ymm1, %ymm1 + vaesenc %ymm13, %ymm2, %ymm2 + vaesenc %ymm13, %ymm3, %ymm3 + vbroadcasti128 128(%rcx), %ymm13 + vaesenc %ymm13, %ymm0, %ymm0 + vaesenc %ymm13, %ymm1, %ymm1 + vaesenc %ymm13, %ymm2, %ymm2 + vaesenc %ymm13, %ymm3, %ymm3 + vbroadcasti128 144(%rcx), %ymm13 + vaesenc %ymm13, %ymm0, %ymm0 + vaesenc %ymm13, %ymm1, %ymm1 + vaesenc %ymm13, %ymm2, %ymm2 + vaesenc %ymm13, %ymm3, %ymm3 + cmpl $11, %r8d + vbroadcasti128 160(%rcx), %ymm13 + jl L_AES_CTR_encrypt_vaes_128_aes_enc_block_last + vaesenc %ymm13, %ymm0, %ymm0 + vaesenc %ymm13, %ymm1, %ymm1 + vaesenc %ymm13, %ymm2, %ymm2 + vaesenc %ymm13, %ymm3, %ymm3 + vbroadcasti128 176(%rcx), %ymm13 + vaesenc %ymm13, %ymm0, %ymm0 + vaesenc %ymm13, %ymm1, %ymm1 + vaesenc %ymm13, %ymm2, %ymm2 + vaesenc %ymm13, %ymm3, %ymm3 + cmpl $13, %r8d + vbroadcasti128 192(%rcx), %ymm13 + jl L_AES_CTR_encrypt_vaes_128_aes_enc_block_last + vaesenc %ymm13, 
%ymm0, %ymm0 + vaesenc %ymm13, %ymm1, %ymm1 + vaesenc %ymm13, %ymm2, %ymm2 + vaesenc %ymm13, %ymm3, %ymm3 + vbroadcasti128 208(%rcx), %ymm13 + vaesenc %ymm13, %ymm0, %ymm0 + vaesenc %ymm13, %ymm1, %ymm1 + vaesenc %ymm13, %ymm2, %ymm2 + vaesenc %ymm13, %ymm3, %ymm3 + vbroadcasti128 224(%rcx), %ymm13 +L_AES_CTR_encrypt_vaes_128_aes_enc_block_last: + vaesenclast %ymm13, %ymm0, %ymm0 + vaesenclast %ymm13, %ymm1, %ymm1 + vaesenclast %ymm13, %ymm2, %ymm2 + vaesenclast %ymm13, %ymm3, %ymm3 + vpxor (%r11), %ymm0, %ymm0 + vpxor 32(%r11), %ymm1, %ymm1 + vpxor 64(%r11), %ymm2, %ymm2 + vpxor 96(%r11), %ymm3, %ymm3 + vmovdqu %ymm0, (%rbx) + vmovdqu %ymm1, 32(%rbx) + vmovdqu %ymm2, 64(%rbx) + vmovdqu %ymm3, 96(%rbx) + addl $0x80, %eax + cmpl %r10d, %eax + jl L_AES_CTR_encrypt_vaes_enc_128 + vperm2i128 $0x00, %ymm4, %ymm4, %ymm7 +L_AES_CTR_encrypt_vaes_done_128: + movl %edx, %r10d + andl $0xffffffe0, %r10d + cmpl %r10d, %eax + je L_AES_CTR_encrypt_vaes_done_32 +L_AES_CTR_encrypt_vaes_enc_32: + # 32 bytes of input + # aes_ctr_enc_32 + leaq (%rdi,%rax,1), %r11 + leaq (%rsi,%rax,1), %rbx + vpaddq 0+L_aes_ctr_inc_vaes(%rip), %ymm7, %ymm0 + vmovdqa %ymm7, %ymm9 + vpand 0+L_aes_ctr_inc_vaes(%rip), %ymm9, %ymm14 + vpor 0+L_aes_ctr_inc_vaes(%rip), %ymm9, %ymm9 + vpandn %ymm9, %ymm0, %ymm9 + vpor %ymm14, %ymm9, %ymm9 + vpsrlq $63, %ymm9, %ymm9 + vpslldq $8, %ymm9, %ymm9 + vpaddq %ymm9, %ymm0, %ymm0 + vpshufb %ymm8, %ymm0, %ymm0 + vmovdqa %ymm7, %ymm9 + vpaddq %ymm11, %ymm7, %ymm7 + vpand %ymm11, %ymm9, %ymm14 + vpor %ymm11, %ymm9, %ymm9 + vpandn %ymm9, %ymm7, %ymm9 + vpor %ymm14, %ymm9, %ymm9 + vpsrlq $63, %ymm9, %ymm9 + vpslldq $8, %ymm9, %ymm9 + vpaddq %ymm9, %ymm7, %ymm7 + # aes_enc_block + vbroadcasti128 (%rcx), %ymm13 + vpxor %ymm13, %ymm0, %ymm0 + vbroadcasti128 16(%rcx), %ymm13 + vaesenc %ymm13, %ymm0, %ymm0 + vbroadcasti128 32(%rcx), %ymm13 + vaesenc %ymm13, %ymm0, %ymm0 + vbroadcasti128 48(%rcx), %ymm13 + vaesenc %ymm13, %ymm0, %ymm0 + vbroadcasti128 64(%rcx), %ymm13 + vaesenc 
%ymm13, %ymm0, %ymm0 + vbroadcasti128 80(%rcx), %ymm13 + vaesenc %ymm13, %ymm0, %ymm0 + vbroadcasti128 96(%rcx), %ymm13 + vaesenc %ymm13, %ymm0, %ymm0 + vbroadcasti128 112(%rcx), %ymm13 + vaesenc %ymm13, %ymm0, %ymm0 + vbroadcasti128 128(%rcx), %ymm13 + vaesenc %ymm13, %ymm0, %ymm0 + vbroadcasti128 144(%rcx), %ymm13 + vaesenc %ymm13, %ymm0, %ymm0 + cmpl $11, %r8d + vbroadcasti128 160(%rcx), %ymm13 + jl L_AES_CTR_encrypt_vaes_32_aes_enc_block_last + vaesenc %ymm13, %ymm0, %ymm0 + vbroadcasti128 176(%rcx), %ymm13 + vaesenc %ymm13, %ymm0, %ymm0 + cmpl $13, %r8d + vbroadcasti128 192(%rcx), %ymm13 + jl L_AES_CTR_encrypt_vaes_32_aes_enc_block_last + vaesenc %ymm13, %ymm0, %ymm0 + vbroadcasti128 208(%rcx), %ymm13 + vaesenc %ymm13, %ymm0, %ymm0 + vbroadcasti128 224(%rcx), %ymm13 +L_AES_CTR_encrypt_vaes_32_aes_enc_block_last: + vaesenclast %ymm13, %ymm0, %ymm0 + vpxor (%r11), %ymm0, %ymm0 + vmovdqu %ymm0, (%rbx) + addl $32, %eax + cmpl %r10d, %eax + jl L_AES_CTR_encrypt_vaes_enc_32 +L_AES_CTR_encrypt_vaes_done_32: + cmpl %edx, %eax + movl %edx, %r10d + je L_AES_CTR_encrypt_vaes_done_enc + andl $0xfffffff0, %r10d +L_AES_CTR_encrypt_vaes_enc_16: + # 16 bytes of input + vpshufb %xmm8, %xmm7, %xmm0 + vmovdqa %ymm7, %ymm9 + vpaddq %ymm12, %ymm7, %ymm7 + vpand %ymm12, %ymm9, %ymm14 + vpor %ymm12, %ymm9, %ymm9 + vpandn %ymm9, %ymm7, %ymm9 + vpor %ymm14, %ymm9, %ymm9 + vpsrlq $63, %ymm9, %ymm9 + vpslldq $8, %ymm9, %ymm9 + vpaddq %ymm9, %ymm7, %ymm7 + # aes_enc_block + vpxor (%rcx), %xmm0, %xmm0 + vmovdqu 16(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 32(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 48(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 64(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 80(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 96(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 112(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 128(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 144(%rcx), %xmm5 + vaesenc %xmm5, 
%xmm0, %xmm0 + cmpl $11, %r8d + vmovdqu 160(%rcx), %xmm5 + jl L_AES_CTR_encrypt_vaes_16_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 176(%rcx), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + cmpl $13, %r8d + vmovdqu 192(%rcx), %xmm5 + jl L_AES_CTR_encrypt_vaes_16_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 208(%rcx), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + vmovdqu 224(%rcx), %xmm5 +L_AES_CTR_encrypt_vaes_16_aes_enc_block_last: + vaesenclast %xmm5, %xmm0, %xmm0 + leaq (%rdi,%rax,1), %r11 + vpxor (%r11), %xmm0, %xmm0 + leaq (%rsi,%rax,1), %r11 + vmovdqu %xmm0, (%r11) + addl $16, %eax + cmpl %r10d, %eax + jl L_AES_CTR_encrypt_vaes_enc_16 +L_AES_CTR_encrypt_vaes_done_enc: + vpshufb %xmm8, %xmm7, %xmm0 + vmovdqu %xmm0, (%r9) + popq %rbx + repz retq +#ifndef __APPLE__ +.size AES_CTR_encrypt_vaes,.-AES_CTR_encrypt_vaes +#endif /* __APPLE__ */ +#endif /* HAVE_INTEL_VAES */ +#ifdef HAVE_INTEL_AVX512 +#ifndef __APPLE__ +.text +.globl AES_ECB_encrypt_avx512 +.type AES_ECB_encrypt_avx512,@function +.align 16 +AES_ECB_encrypt_avx512: +#else +.section __TEXT,__text +.globl _AES_ECB_encrypt_avx512 +.p2align 4 +_AES_ECB_encrypt_avx512: +#endif /* __APPLE__ */ + xorl %eax, %eax + cmpl $0x40, %edx + jl L_AES_ECB_encrypt_avx512_done_64 + vbroadcasti32x4 (%rcx), %zmm8 + vbroadcasti32x4 16(%rcx), %zmm9 + vbroadcasti32x4 32(%rcx), %zmm10 + vbroadcasti32x4 48(%rcx), %zmm11 + vbroadcasti32x4 64(%rcx), %zmm12 + vbroadcasti32x4 80(%rcx), %zmm13 + vbroadcasti32x4 96(%rcx), %zmm14 + vbroadcasti32x4 112(%rcx), %zmm15 + vbroadcasti32x4 128(%rcx), %zmm16 + vbroadcasti32x4 144(%rcx), %zmm17 + vbroadcasti32x4 160(%rcx), %zmm18 + cmpl $11, %r8d + jl L_AES_ECB_encrypt_avx512_key_cached + vbroadcasti32x4 176(%rcx), %zmm19 + vbroadcasti32x4 192(%rcx), %zmm20 + cmpl $13, %r8d + jl L_AES_ECB_encrypt_avx512_key_cached + vbroadcasti32x4 208(%rcx), %zmm21 + vbroadcasti32x4 224(%rcx), %zmm22 +L_AES_ECB_encrypt_avx512_key_cached: + cmpl $0x100, %edx + movl %edx, %r9d + jl 
L_AES_ECB_encrypt_avx512_done_256 + andl $0xffffff00, %r9d +L_AES_ECB_encrypt_avx512_enc_256: + # 256 bytes of input + # aes_ecb_enc_256 + leaq (%rdi,%rax,1), %r10 + leaq (%rsi,%rax,1), %r11 + vmovdqu64 (%r10), %zmm0 + vmovdqu64 64(%r10), %zmm1 + vmovdqu64 128(%r10), %zmm2 + vmovdqu64 192(%r10), %zmm3 + # aes_enc_block + vpxorq %zmm8, %zmm0, %zmm0 + vpxorq %zmm8, %zmm1, %zmm1 + vpxorq %zmm8, %zmm2, %zmm2 + vpxorq %zmm8, %zmm3, %zmm3 + vaesenc %zmm9, %zmm0, %zmm0 + vaesenc %zmm9, %zmm1, %zmm1 + vaesenc %zmm9, %zmm2, %zmm2 + vaesenc %zmm9, %zmm3, %zmm3 + vaesenc %zmm10, %zmm0, %zmm0 + vaesenc %zmm10, %zmm1, %zmm1 + vaesenc %zmm10, %zmm2, %zmm2 + vaesenc %zmm10, %zmm3, %zmm3 + vaesenc %zmm11, %zmm0, %zmm0 + vaesenc %zmm11, %zmm1, %zmm1 + vaesenc %zmm11, %zmm2, %zmm2 + vaesenc %zmm11, %zmm3, %zmm3 + vaesenc %zmm12, %zmm0, %zmm0 + vaesenc %zmm12, %zmm1, %zmm1 + vaesenc %zmm12, %zmm2, %zmm2 + vaesenc %zmm12, %zmm3, %zmm3 + vaesenc %zmm13, %zmm0, %zmm0 + vaesenc %zmm13, %zmm1, %zmm1 + vaesenc %zmm13, %zmm2, %zmm2 + vaesenc %zmm13, %zmm3, %zmm3 + vaesenc %zmm14, %zmm0, %zmm0 + vaesenc %zmm14, %zmm1, %zmm1 + vaesenc %zmm14, %zmm2, %zmm2 + vaesenc %zmm14, %zmm3, %zmm3 + vaesenc %zmm15, %zmm0, %zmm0 + vaesenc %zmm15, %zmm1, %zmm1 + vaesenc %zmm15, %zmm2, %zmm2 + vaesenc %zmm15, %zmm3, %zmm3 + vaesenc %zmm16, %zmm0, %zmm0 + vaesenc %zmm16, %zmm1, %zmm1 + vaesenc %zmm16, %zmm2, %zmm2 + vaesenc %zmm16, %zmm3, %zmm3 + vaesenc %zmm17, %zmm0, %zmm0 + vaesenc %zmm17, %zmm1, %zmm1 + vaesenc %zmm17, %zmm2, %zmm2 + vaesenc %zmm17, %zmm3, %zmm3 + cmpl $11, %r8d + vmovdqa64 %zmm18, %zmm7 + jl L_AES_ECB_encrypt_avx512_256_aes_enc_block_last + vaesenc %zmm18, %zmm0, %zmm0 + vaesenc %zmm18, %zmm1, %zmm1 + vaesenc %zmm18, %zmm2, %zmm2 + vaesenc %zmm18, %zmm3, %zmm3 + vaesenc %zmm19, %zmm0, %zmm0 + vaesenc %zmm19, %zmm1, %zmm1 + vaesenc %zmm19, %zmm2, %zmm2 + vaesenc %zmm19, %zmm3, %zmm3 + cmpl $13, %r8d + vmovdqa64 %zmm20, %zmm7 + jl L_AES_ECB_encrypt_avx512_256_aes_enc_block_last + vaesenc 
%zmm20, %zmm0, %zmm0 + vaesenc %zmm20, %zmm1, %zmm1 + vaesenc %zmm20, %zmm2, %zmm2 + vaesenc %zmm20, %zmm3, %zmm3 + vaesenc %zmm21, %zmm0, %zmm0 + vaesenc %zmm21, %zmm1, %zmm1 + vaesenc %zmm21, %zmm2, %zmm2 + vaesenc %zmm21, %zmm3, %zmm3 + vmovdqa64 %zmm22, %zmm7 +L_AES_ECB_encrypt_avx512_256_aes_enc_block_last: + vaesenclast %zmm7, %zmm0, %zmm0 + vaesenclast %zmm7, %zmm1, %zmm1 + vaesenclast %zmm7, %zmm2, %zmm2 + vaesenclast %zmm7, %zmm3, %zmm3 + vmovdqu64 %zmm0, (%r11) + vmovdqu64 %zmm1, 64(%r11) + vmovdqu64 %zmm2, 128(%r11) + vmovdqu64 %zmm3, 192(%r11) + addl $0x100, %eax + cmpl %r9d, %eax + jl L_AES_ECB_encrypt_avx512_enc_256 +L_AES_ECB_encrypt_avx512_done_256: + movl %edx, %r9d + andl $0xffffffc0, %r9d + cmpl %r9d, %eax + je L_AES_ECB_encrypt_avx512_done_64 +L_AES_ECB_encrypt_avx512_enc_64: + # 64 bytes of input + # aes_ecb_enc_64 + leaq (%rdi,%rax,1), %r10 + leaq (%rsi,%rax,1), %r11 + vmovdqu64 (%r10), %zmm0 + # aes_enc_block + vpxorq %zmm8, %zmm0, %zmm0 + vaesenc %zmm9, %zmm0, %zmm0 + vaesenc %zmm10, %zmm0, %zmm0 + vaesenc %zmm11, %zmm0, %zmm0 + vaesenc %zmm12, %zmm0, %zmm0 + vaesenc %zmm13, %zmm0, %zmm0 + vaesenc %zmm14, %zmm0, %zmm0 + vaesenc %zmm15, %zmm0, %zmm0 + vaesenc %zmm16, %zmm0, %zmm0 + vaesenc %zmm17, %zmm0, %zmm0 + cmpl $11, %r8d + vmovdqa64 %zmm18, %zmm7 + jl L_AES_ECB_encrypt_avx512_64_aes_enc_block_last + vaesenc %zmm18, %zmm0, %zmm0 + vaesenc %zmm19, %zmm0, %zmm0 + cmpl $13, %r8d + vmovdqa64 %zmm20, %zmm7 + jl L_AES_ECB_encrypt_avx512_64_aes_enc_block_last + vaesenc %zmm20, %zmm0, %zmm0 + vaesenc %zmm21, %zmm0, %zmm0 + vmovdqa64 %zmm22, %zmm7 +L_AES_ECB_encrypt_avx512_64_aes_enc_block_last: + vaesenclast %zmm7, %zmm0, %zmm0 + vmovdqu64 %zmm0, (%r11) + addl $0x40, %eax + cmpl %r9d, %eax + jl L_AES_ECB_encrypt_avx512_enc_64 +L_AES_ECB_encrypt_avx512_done_64: + cmpl %edx, %eax + movl %edx, %r9d + je L_AES_ECB_encrypt_avx512_done_enc + andl $0xfffffff0, %r9d +L_AES_ECB_encrypt_avx512_enc_16: + # 16 bytes of input + leaq (%rdi,%rax,1), %r10 + 
vmovdqu (%r10), %xmm0 + # aes_enc_block + vpxor (%rcx), %xmm0, %xmm0 + vmovdqu 16(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 32(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 48(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 64(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 80(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 96(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 112(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 128(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 144(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + cmpl $11, %r8d + vmovdqu 160(%rcx), %xmm5 + jl L_AES_ECB_encrypt_avx512_16_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 176(%rcx), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + cmpl $13, %r8d + vmovdqu 192(%rcx), %xmm5 + jl L_AES_ECB_encrypt_avx512_16_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 208(%rcx), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + vmovdqu 224(%rcx), %xmm5 +L_AES_ECB_encrypt_avx512_16_aes_enc_block_last: + vaesenclast %xmm5, %xmm0, %xmm0 + leaq (%rsi,%rax,1), %r10 + vmovdqu %xmm0, (%r10) + addl $16, %eax + cmpl %r9d, %eax + jl L_AES_ECB_encrypt_avx512_enc_16 +L_AES_ECB_encrypt_avx512_done_enc: + repz retq +#ifndef __APPLE__ +.size AES_ECB_encrypt_avx512,.-AES_ECB_encrypt_avx512 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_ECB_decrypt_avx512 +.type AES_ECB_decrypt_avx512,@function +.align 16 +AES_ECB_decrypt_avx512: +#else +.section __TEXT,__text +.globl _AES_ECB_decrypt_avx512 +.p2align 4 +_AES_ECB_decrypt_avx512: +#endif /* __APPLE__ */ + xorl %eax, %eax + cmpl $0x40, %edx + jl L_AES_ECB_decrypt_avx512_done_64 + vbroadcasti32x4 (%rcx), %zmm8 + vbroadcasti32x4 16(%rcx), %zmm9 + vbroadcasti32x4 32(%rcx), %zmm10 + vbroadcasti32x4 48(%rcx), %zmm11 + vbroadcasti32x4 64(%rcx), %zmm12 + vbroadcasti32x4 80(%rcx), %zmm13 + vbroadcasti32x4 96(%rcx), %zmm14 + vbroadcasti32x4 112(%rcx), %zmm15 + vbroadcasti32x4 128(%rcx), %zmm16 + vbroadcasti32x4 144(%rcx), 
%zmm17 + vbroadcasti32x4 160(%rcx), %zmm18 + cmpl $11, %r8d + jl L_AES_ECB_decrypt_avx512_key_cached + vbroadcasti32x4 176(%rcx), %zmm19 + vbroadcasti32x4 192(%rcx), %zmm20 + cmpl $13, %r8d + jl L_AES_ECB_decrypt_avx512_key_cached + vbroadcasti32x4 208(%rcx), %zmm21 + vbroadcasti32x4 224(%rcx), %zmm22 +L_AES_ECB_decrypt_avx512_key_cached: + cmpl $0x100, %edx + movl %edx, %r9d + jl L_AES_ECB_decrypt_avx512_done_256 + andl $0xffffff00, %r9d +L_AES_ECB_decrypt_avx512_dec_256: + # 256 bytes of input + # aes_ecb_dec_256 + leaq (%rdi,%rax,1), %r10 + leaq (%rsi,%rax,1), %r11 + vmovdqu64 (%r10), %zmm0 + vmovdqu64 64(%r10), %zmm1 + vmovdqu64 128(%r10), %zmm2 + vmovdqu64 192(%r10), %zmm3 + # aes_dec_block + vpxorq %zmm8, %zmm0, %zmm0 + vpxorq %zmm8, %zmm1, %zmm1 + vpxorq %zmm8, %zmm2, %zmm2 + vpxorq %zmm8, %zmm3, %zmm3 + vaesdec %zmm9, %zmm0, %zmm0 + vaesdec %zmm9, %zmm1, %zmm1 + vaesdec %zmm9, %zmm2, %zmm2 + vaesdec %zmm9, %zmm3, %zmm3 + vaesdec %zmm10, %zmm0, %zmm0 + vaesdec %zmm10, %zmm1, %zmm1 + vaesdec %zmm10, %zmm2, %zmm2 + vaesdec %zmm10, %zmm3, %zmm3 + vaesdec %zmm11, %zmm0, %zmm0 + vaesdec %zmm11, %zmm1, %zmm1 + vaesdec %zmm11, %zmm2, %zmm2 + vaesdec %zmm11, %zmm3, %zmm3 + vaesdec %zmm12, %zmm0, %zmm0 + vaesdec %zmm12, %zmm1, %zmm1 + vaesdec %zmm12, %zmm2, %zmm2 + vaesdec %zmm12, %zmm3, %zmm3 + vaesdec %zmm13, %zmm0, %zmm0 + vaesdec %zmm13, %zmm1, %zmm1 + vaesdec %zmm13, %zmm2, %zmm2 + vaesdec %zmm13, %zmm3, %zmm3 + vaesdec %zmm14, %zmm0, %zmm0 + vaesdec %zmm14, %zmm1, %zmm1 + vaesdec %zmm14, %zmm2, %zmm2 + vaesdec %zmm14, %zmm3, %zmm3 + vaesdec %zmm15, %zmm0, %zmm0 + vaesdec %zmm15, %zmm1, %zmm1 + vaesdec %zmm15, %zmm2, %zmm2 + vaesdec %zmm15, %zmm3, %zmm3 + vaesdec %zmm16, %zmm0, %zmm0 + vaesdec %zmm16, %zmm1, %zmm1 + vaesdec %zmm16, %zmm2, %zmm2 + vaesdec %zmm16, %zmm3, %zmm3 + vaesdec %zmm17, %zmm0, %zmm0 + vaesdec %zmm17, %zmm1, %zmm1 + vaesdec %zmm17, %zmm2, %zmm2 + vaesdec %zmm17, %zmm3, %zmm3 + cmpl $11, %r8d + vmovdqa64 %zmm18, %zmm7 + jl 
L_AES_ECB_decrypt_avx512_256_aes_dec_block_last + vaesdec %zmm18, %zmm0, %zmm0 + vaesdec %zmm18, %zmm1, %zmm1 + vaesdec %zmm18, %zmm2, %zmm2 + vaesdec %zmm18, %zmm3, %zmm3 + vaesdec %zmm19, %zmm0, %zmm0 + vaesdec %zmm19, %zmm1, %zmm1 + vaesdec %zmm19, %zmm2, %zmm2 + vaesdec %zmm19, %zmm3, %zmm3 + cmpl $13, %r8d + vmovdqa64 %zmm20, %zmm7 + jl L_AES_ECB_decrypt_avx512_256_aes_dec_block_last + vaesdec %zmm20, %zmm0, %zmm0 + vaesdec %zmm20, %zmm1, %zmm1 + vaesdec %zmm20, %zmm2, %zmm2 + vaesdec %zmm20, %zmm3, %zmm3 + vaesdec %zmm21, %zmm0, %zmm0 + vaesdec %zmm21, %zmm1, %zmm1 + vaesdec %zmm21, %zmm2, %zmm2 + vaesdec %zmm21, %zmm3, %zmm3 + vmovdqa64 %zmm22, %zmm7 +L_AES_ECB_decrypt_avx512_256_aes_dec_block_last: + vaesdeclast %zmm7, %zmm0, %zmm0 + vaesdeclast %zmm7, %zmm1, %zmm1 + vaesdeclast %zmm7, %zmm2, %zmm2 + vaesdeclast %zmm7, %zmm3, %zmm3 + vmovdqu64 %zmm0, (%r11) + vmovdqu64 %zmm1, 64(%r11) + vmovdqu64 %zmm2, 128(%r11) + vmovdqu64 %zmm3, 192(%r11) + addl $0x100, %eax + cmpl %r9d, %eax + jl L_AES_ECB_decrypt_avx512_dec_256 +L_AES_ECB_decrypt_avx512_done_256: + movl %edx, %r9d + andl $0xffffffc0, %r9d + cmpl %r9d, %eax + je L_AES_ECB_decrypt_avx512_done_64 +L_AES_ECB_decrypt_avx512_dec_64: + # 64 bytes of input + # aes_ecb_dec_64 + leaq (%rdi,%rax,1), %r10 + leaq (%rsi,%rax,1), %r11 + vmovdqu64 (%r10), %zmm0 + # aes_dec_block + vpxorq %zmm8, %zmm0, %zmm0 + vaesdec %zmm9, %zmm0, %zmm0 + vaesdec %zmm10, %zmm0, %zmm0 + vaesdec %zmm11, %zmm0, %zmm0 + vaesdec %zmm12, %zmm0, %zmm0 + vaesdec %zmm13, %zmm0, %zmm0 + vaesdec %zmm14, %zmm0, %zmm0 + vaesdec %zmm15, %zmm0, %zmm0 + vaesdec %zmm16, %zmm0, %zmm0 + vaesdec %zmm17, %zmm0, %zmm0 + cmpl $11, %r8d + vmovdqa64 %zmm18, %zmm7 + jl L_AES_ECB_decrypt_avx512_64_aes_dec_block_last + vaesdec %zmm18, %zmm0, %zmm0 + vaesdec %zmm19, %zmm0, %zmm0 + cmpl $13, %r8d + vmovdqa64 %zmm20, %zmm7 + jl L_AES_ECB_decrypt_avx512_64_aes_dec_block_last + vaesdec %zmm20, %zmm0, %zmm0 + vaesdec %zmm21, %zmm0, %zmm0 + vmovdqa64 %zmm22, %zmm7 
+L_AES_ECB_decrypt_avx512_64_aes_dec_block_last: + vaesdeclast %zmm7, %zmm0, %zmm0 + vmovdqu64 %zmm0, (%r11) + addl $0x40, %eax + cmpl %r9d, %eax + jl L_AES_ECB_decrypt_avx512_dec_64 +L_AES_ECB_decrypt_avx512_done_64: + cmpl %edx, %eax + movl %edx, %r9d + je L_AES_ECB_decrypt_avx512_done_dec + andl $0xfffffff0, %r9d +L_AES_ECB_decrypt_avx512_dec_16: + # 16 bytes of input + leaq (%rdi,%rax,1), %r10 + vmovdqu (%r10), %xmm0 + # aes_dec_block + vpxor (%rcx), %xmm0, %xmm0 + vmovdqu 16(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 32(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 48(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 64(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 80(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 96(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 112(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 128(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 144(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + cmpl $11, %r8d + vmovdqu 160(%rcx), %xmm5 + jl L_AES_ECB_decrypt_avx512_16_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 176(%rcx), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + cmpl $13, %r8d + vmovdqu 192(%rcx), %xmm5 + jl L_AES_ECB_decrypt_avx512_16_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 208(%rcx), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 224(%rcx), %xmm5 +L_AES_ECB_decrypt_avx512_16_aes_dec_block_last: + vaesdeclast %xmm5, %xmm0, %xmm0 + leaq (%rsi,%rax,1), %r10 + vmovdqu %xmm0, (%r10) + addl $16, %eax + cmpl %r9d, %eax + jl L_AES_ECB_decrypt_avx512_dec_16 +L_AES_ECB_decrypt_avx512_done_dec: + repz retq +#ifndef __APPLE__ +.size AES_ECB_decrypt_avx512,.-AES_ECB_decrypt_avx512 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_CBC_encrypt_avx512 +.type AES_CBC_encrypt_avx512,@function +.align 16 +AES_CBC_encrypt_avx512: +#else +.section __TEXT,__text +.globl _AES_CBC_encrypt_avx512 +.p2align 4 +_AES_CBC_encrypt_avx512: +#endif /* __APPLE__ */ + vmovdqu (%rdx), 
%xmm0 + xorl %eax, %eax + cmpl %ecx, %eax + je L_AES_CBC_encrypt_avx512_done +L_AES_CBC_encrypt_avx512_loop: + # 16 bytes of input + leaq (%rdi,%rax,1), %r10 + vmovdqu (%r10), %xmm1 + vpternlogq $0x96, (%r8), %xmm0, %xmm1 + # aes_enc_block + vmovdqu 16(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 32(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 48(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 64(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 80(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 96(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 112(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 128(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 144(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + cmpl $11, %r9d + vmovdqu 160(%r8), %xmm3 + jl L_AES_CBC_encrypt_avx512_aes_enc_block_last + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 176(%r8), %xmm4 + vaesenc %xmm4, %xmm1, %xmm1 + cmpl $13, %r9d + vmovdqu 192(%r8), %xmm3 + jl L_AES_CBC_encrypt_avx512_aes_enc_block_last + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 208(%r8), %xmm4 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqu 224(%r8), %xmm3 +L_AES_CBC_encrypt_avx512_aes_enc_block_last: + vaesenclast %xmm3, %xmm1, %xmm1 + leaq (%rsi,%rax,1), %r11 + vmovdqu %xmm1, (%r11) + vmovdqa %xmm1, %xmm0 + addl $16, %eax + cmpl %ecx, %eax + jl L_AES_CBC_encrypt_avx512_loop +L_AES_CBC_encrypt_avx512_done: + vmovdqu %xmm0, (%rdx) + repz retq +#ifndef __APPLE__ +.size AES_CBC_encrypt_avx512,.-AES_CBC_encrypt_avx512 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_CBC_decrypt_avx512 +.type AES_CBC_decrypt_avx512,@function +.align 16 +AES_CBC_decrypt_avx512: +#else +.section __TEXT,__text +.globl _AES_CBC_decrypt_avx512 +.p2align 4 +_AES_CBC_decrypt_avx512: +#endif /* __APPLE__ */ + pushq %r12 + vmovdqu (%rdx), %xmm8 + xorl %eax, %eax + cmpl $0x40, %ecx + jl L_AES_CBC_decrypt_avx512_done_64 + vbroadcasti32x4 (%r8), %zmm14 + vbroadcasti32x4 16(%r8), %zmm15 + vbroadcasti32x4 32(%r8), %zmm16 + vbroadcasti32x4 
48(%r8), %zmm17 + vbroadcasti32x4 64(%r8), %zmm18 + vbroadcasti32x4 80(%r8), %zmm19 + vbroadcasti32x4 96(%r8), %zmm20 + vbroadcasti32x4 112(%r8), %zmm21 + vbroadcasti32x4 128(%r8), %zmm22 + vbroadcasti32x4 144(%r8), %zmm23 + vbroadcasti32x4 160(%r8), %zmm24 + cmpl $11, %r9d + jl L_AES_CBC_decrypt_avx512_key_cached + vbroadcasti32x4 176(%r8), %zmm25 + vbroadcasti32x4 192(%r8), %zmm26 + cmpl $13, %r9d + jl L_AES_CBC_decrypt_avx512_key_cached + vbroadcasti32x4 208(%r8), %zmm27 + vbroadcasti32x4 224(%r8), %zmm28 +L_AES_CBC_decrypt_avx512_key_cached: + cmpl $0x100, %ecx + movl %ecx, %r10d + jl L_AES_CBC_decrypt_avx512_done_256 + andl $0xffffff00, %r10d +L_AES_CBC_decrypt_avx512_dec_256: + # 256 bytes of input + # aes_cbc_dec_256 + leaq (%rdi,%rax,1), %r11 + leaq (%rsi,%rax,1), %r12 + vmovdqu64 (%r11), %zmm0 + vmovdqu64 64(%r11), %zmm1 + vmovdqu64 128(%r11), %zmm2 + vmovdqu64 192(%r11), %zmm3 + vshufi64x2 $0x90, %zmm0, %zmm0, %zmm10 + vinserti32x4 $0x00, %xmm8, %zmm10, %zmm10 + vmovdqu64 48(%r11), %zmm11 + vmovdqu64 112(%r11), %zmm12 + vmovdqu64 176(%r11), %zmm13 + vextracti32x4 $3, %zmm3, %xmm8 + # aes_dec_block + vpxorq %zmm14, %zmm0, %zmm0 + vpxorq %zmm14, %zmm1, %zmm1 + vpxorq %zmm14, %zmm2, %zmm2 + vpxorq %zmm14, %zmm3, %zmm3 + vaesdec %zmm15, %zmm0, %zmm0 + vaesdec %zmm15, %zmm1, %zmm1 + vaesdec %zmm15, %zmm2, %zmm2 + vaesdec %zmm15, %zmm3, %zmm3 + vaesdec %zmm16, %zmm0, %zmm0 + vaesdec %zmm16, %zmm1, %zmm1 + vaesdec %zmm16, %zmm2, %zmm2 + vaesdec %zmm16, %zmm3, %zmm3 + vaesdec %zmm17, %zmm0, %zmm0 + vaesdec %zmm17, %zmm1, %zmm1 + vaesdec %zmm17, %zmm2, %zmm2 + vaesdec %zmm17, %zmm3, %zmm3 + vaesdec %zmm18, %zmm0, %zmm0 + vaesdec %zmm18, %zmm1, %zmm1 + vaesdec %zmm18, %zmm2, %zmm2 + vaesdec %zmm18, %zmm3, %zmm3 + vaesdec %zmm19, %zmm0, %zmm0 + vaesdec %zmm19, %zmm1, %zmm1 + vaesdec %zmm19, %zmm2, %zmm2 + vaesdec %zmm19, %zmm3, %zmm3 + vaesdec %zmm20, %zmm0, %zmm0 + vaesdec %zmm20, %zmm1, %zmm1 + vaesdec %zmm20, %zmm2, %zmm2 + vaesdec %zmm20, %zmm3, %zmm3 + vaesdec 
%zmm21, %zmm0, %zmm0 + vaesdec %zmm21, %zmm1, %zmm1 + vaesdec %zmm21, %zmm2, %zmm2 + vaesdec %zmm21, %zmm3, %zmm3 + vaesdec %zmm22, %zmm0, %zmm0 + vaesdec %zmm22, %zmm1, %zmm1 + vaesdec %zmm22, %zmm2, %zmm2 + vaesdec %zmm22, %zmm3, %zmm3 + vaesdec %zmm23, %zmm0, %zmm0 + vaesdec %zmm23, %zmm1, %zmm1 + vaesdec %zmm23, %zmm2, %zmm2 + vaesdec %zmm23, %zmm3, %zmm3 + cmpl $11, %r9d + vmovdqa64 %zmm24, %zmm9 + jl L_AES_CBC_decrypt_avx512_256_aes_dec_block_last + vaesdec %zmm24, %zmm0, %zmm0 + vaesdec %zmm24, %zmm1, %zmm1 + vaesdec %zmm24, %zmm2, %zmm2 + vaesdec %zmm24, %zmm3, %zmm3 + vaesdec %zmm25, %zmm0, %zmm0 + vaesdec %zmm25, %zmm1, %zmm1 + vaesdec %zmm25, %zmm2, %zmm2 + vaesdec %zmm25, %zmm3, %zmm3 + cmpl $13, %r9d + vmovdqa64 %zmm26, %zmm9 + jl L_AES_CBC_decrypt_avx512_256_aes_dec_block_last + vaesdec %zmm26, %zmm0, %zmm0 + vaesdec %zmm26, %zmm1, %zmm1 + vaesdec %zmm26, %zmm2, %zmm2 + vaesdec %zmm26, %zmm3, %zmm3 + vaesdec %zmm27, %zmm0, %zmm0 + vaesdec %zmm27, %zmm1, %zmm1 + vaesdec %zmm27, %zmm2, %zmm2 + vaesdec %zmm27, %zmm3, %zmm3 + vmovdqa64 %zmm28, %zmm9 +L_AES_CBC_decrypt_avx512_256_aes_dec_block_last: + vaesdeclast %zmm9, %zmm0, %zmm0 + vaesdeclast %zmm9, %zmm1, %zmm1 + vaesdeclast %zmm9, %zmm2, %zmm2 + vaesdeclast %zmm9, %zmm3, %zmm3 + vpxorq %zmm10, %zmm0, %zmm0 + vpxorq %zmm11, %zmm1, %zmm1 + vpxorq %zmm12, %zmm2, %zmm2 + vpxorq %zmm13, %zmm3, %zmm3 + vmovdqu64 %zmm0, (%r12) + vmovdqu64 %zmm1, 64(%r12) + vmovdqu64 %zmm2, 128(%r12) + vmovdqu64 %zmm3, 192(%r12) + addl $0x100, %eax + cmpl %r10d, %eax + jl L_AES_CBC_decrypt_avx512_dec_256 +L_AES_CBC_decrypt_avx512_done_256: + movl %ecx, %r10d + andl $0xffffffc0, %r10d + cmpl %r10d, %eax + je L_AES_CBC_decrypt_avx512_done_64 +L_AES_CBC_decrypt_avx512_dec_64: + # 64 bytes of input + # aes_cbc_dec_64 + leaq (%rdi,%rax,1), %r11 + leaq (%rsi,%rax,1), %r12 + vmovdqu64 (%r11), %zmm0 + vshufi64x2 $0x90, %zmm0, %zmm0, %zmm10 + vinserti32x4 $0x00, %xmm8, %zmm10, %zmm10 + vextracti32x4 $3, %zmm0, %xmm8 + # aes_dec_block 
+ vpxorq %zmm14, %zmm0, %zmm0 + vaesdec %zmm15, %zmm0, %zmm0 + vaesdec %zmm16, %zmm0, %zmm0 + vaesdec %zmm17, %zmm0, %zmm0 + vaesdec %zmm18, %zmm0, %zmm0 + vaesdec %zmm19, %zmm0, %zmm0 + vaesdec %zmm20, %zmm0, %zmm0 + vaesdec %zmm21, %zmm0, %zmm0 + vaesdec %zmm22, %zmm0, %zmm0 + vaesdec %zmm23, %zmm0, %zmm0 + cmpl $11, %r9d + vmovdqa64 %zmm24, %zmm9 + jl L_AES_CBC_decrypt_avx512_64_aes_dec_block_last + vaesdec %zmm24, %zmm0, %zmm0 + vaesdec %zmm25, %zmm0, %zmm0 + cmpl $13, %r9d + vmovdqa64 %zmm26, %zmm9 + jl L_AES_CBC_decrypt_avx512_64_aes_dec_block_last + vaesdec %zmm26, %zmm0, %zmm0 + vaesdec %zmm27, %zmm0, %zmm0 + vmovdqa64 %zmm28, %zmm9 +L_AES_CBC_decrypt_avx512_64_aes_dec_block_last: + vaesdeclast %zmm9, %zmm0, %zmm0 + vpxorq %zmm10, %zmm0, %zmm0 + vmovdqu64 %zmm0, (%r12) + addl $0x40, %eax + cmpl %r10d, %eax + jl L_AES_CBC_decrypt_avx512_dec_64 +L_AES_CBC_decrypt_avx512_done_64: + cmpl %ecx, %eax + movl %ecx, %r10d + je L_AES_CBC_decrypt_avx512_done_dec + andl $0xfffffff0, %r10d +L_AES_CBC_decrypt_avx512_dec_16: + # 16 bytes of input + leaq (%rdi,%rax,1), %r11 + vmovdqu (%r11), %xmm0 + vmovdqa %xmm0, %xmm7 + # aes_dec_block + vpxor (%r8), %xmm0, %xmm0 + vmovdqu 16(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + cmpl $11, %r9d + vmovdqu 160(%r8), %xmm5 + jl L_AES_CBC_decrypt_avx512_16_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + cmpl $13, %r9d + vmovdqu 192(%r8), %xmm5 + jl L_AES_CBC_decrypt_avx512_16_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r8), 
%xmm6
+ vaesdec %xmm6, %xmm0, %xmm0
+ vmovdqu 224(%r8), %xmm5
+L_AES_CBC_decrypt_avx512_16_aes_dec_block_last:
+ vaesdeclast %xmm5, %xmm0, %xmm0
+ vpxor %xmm8, %xmm0, %xmm0
+ vmovdqa %xmm7, %xmm8
+ leaq (%rsi,%rax,1), %r11
+ vmovdqu %xmm0, (%r11)
+ addl $16, %eax
+ cmpl %r10d, %eax
+ jl L_AES_CBC_decrypt_avx512_dec_16
+L_AES_CBC_decrypt_avx512_done_dec:
+ vmovdqu %xmm8, (%rdx)
+ popq %r12
+ repz retq
+#ifndef __APPLE__
+.size AES_CBC_decrypt_avx512,.-AES_CBC_decrypt_avx512
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+L_aes_ctr_bswap_avx512:
+.quad 0x08090a0b0c0d0e0f,0x0001020304050607 /* vpshufb mask: output byte i = input byte 15-i (reverses a 16-byte block) */
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 32
+#else
+.p2align 5
+#endif /* __APPLE__ */
+L_aes_ctr_inc_avx512:
+.quad 0x0000000000000000,0x0000000000000000 /* 17 entries of 16 bytes holding the values 0..16 (low quad = value, high quad = 0) */
+.quad 0x0000000000000001,0x0000000000000000
+.quad 0x0000000000000002,0x0000000000000000
+.quad 0x0000000000000003,0x0000000000000000
+.quad 0x0000000000000004,0x0000000000000000
+.quad 0x0000000000000005,0x0000000000000000
+.quad 0x0000000000000006,0x0000000000000000
+.quad 0x0000000000000007,0x0000000000000000
+.quad 0x0000000000000008,0x0000000000000000
+.quad 0x0000000000000009,0x0000000000000000
+.quad 0x000000000000000a,0x0000000000000000
+.quad 0x000000000000000b,0x0000000000000000
+.quad 0x000000000000000c,0x0000000000000000
+.quad 0x000000000000000d,0x0000000000000000
+.quad 0x000000000000000e,0x0000000000000000
+.quad 0x000000000000000f,0x0000000000000000
+.quad 0x0000000000000010,0x0000000000000000
+#ifndef __APPLE__
+.text
+.globl AES_CTR_encrypt_avx512
+.type AES_CTR_encrypt_avx512,@function
+.align 16
+AES_CTR_encrypt_avx512:
+#else
+.section __TEXT,__text
+.globl _AES_CTR_encrypt_avx512
+.p2align 4
+_AES_CTR_encrypt_avx512:
+#endif /* __APPLE__ */
+ pushq %rbx /* rbx is used below as the output pointer; restored before return */
+ vbroadcasti32x4 L_aes_ctr_bswap_avx512(%rip), %zmm8 /* zmm8 = byte-reversal mask in all four 128-bit lanes */
+ vbroadcasti32x4 (%r9), %zmm7 /* zmm7 = the 16-byte counter block at (r9), copied to all four lanes */
+ vpshufb %zmm8, %zmm7, %zmm7 /* byte-reverse each lane so the counter can be advanced with 64-bit adds */
+ vbroadcasti32x4 256+L_aes_ctr_inc_avx512(%rip), %zmm10 /* zmm10 = 16 in every lane: step for the 256-byte loop */
+ vbroadcasti32x4 64+L_aes_ctr_inc_avx512(%rip), %zmm11 /* zmm11 = 4 in every lane: step for the 64-byte loop */
+ vbroadcasti32x4 16+L_aes_ctr_inc_avx512(%rip), %zmm12 /* zmm12 = 1 in every lane: step for the 16-byte loop */
+ xorl %eax, %eax /* eax = byte offset into the buffers at rdi (read) and rsi (written) */
+ cmpl $0x40, %edx /* edx = byte length; below 64 bytes the round-key broadcast is skipped */
+ jl L_AES_CTR_encrypt_avx512_done_64
+ vbroadcasti32x4 (%rcx), %zmm14 /* round keys at (rcx) are replicated into zmm14 and up for the wide loops */
+ vbroadcasti32x4 16(%rcx), %zmm15
+ vbroadcasti32x4 32(%rcx), %zmm16
+ vbroadcasti32x4 48(%rcx), %zmm17
+ vbroadcasti32x4 64(%rcx), %zmm18
+ vbroadcasti32x4 80(%rcx), %zmm19
+ vbroadcasti32x4 96(%rcx), %zmm20
+ vbroadcasti32x4 112(%rcx), %zmm21
+ vbroadcasti32x4 128(%rcx), %zmm22
+ vbroadcasti32x4 144(%rcx), %zmm23
+ vbroadcasti32x4 160(%rcx), %zmm24
+ cmpl $11, %r8d /* r8d below 11: keys at offsets 0..160 are enough (r8d is presumably the round count) */
+ jl L_AES_CTR_encrypt_avx512_key_cached
+ vbroadcasti32x4 176(%rcx), %zmm25
+ vbroadcasti32x4 192(%rcx), %zmm26
+ cmpl $13, %r8d /* r8d below 13: keys at offsets 0..192 are enough */
+ jl L_AES_CTR_encrypt_avx512_key_cached
+ vbroadcasti32x4 208(%rcx), %zmm27
+ vbroadcasti32x4 224(%rcx), %zmm28
+L_AES_CTR_encrypt_avx512_key_cached:
+ cmpl $0x100, %edx
+ movl %edx, %r10d
+ jl L_AES_CTR_encrypt_avx512_done_256
+ andl $0xffffff00, %r10d /* r10d = length rounded down to a multiple of 256 */
+ vmovdqa64 %zmm7, %zmm9 /* the next four groups set zmm4..zmm7 = counter + {0..3}, {4..7}, {8..11}, {12..15} */
+ vpaddq 0+L_aes_ctr_inc_avx512(%rip), %zmm7, %zmm4
+ vpternlogq $0xb2, 0+L_aes_ctr_inc_avx512(%rip), %zmm4, %zmm9 /* bit 63 of each quad = carry out of the 64-bit add (old, sum, addend) */
+ vpsrlq $63, %zmm9, %zmm9 /* keep only that carry bit */
+ vpslldq $8, %zmm9, %zmm9 /* move the low-quad carry up to the high quad of its lane */
+ vpaddq %zmm9, %zmm4, %zmm4 /* add the carry: completes the 128-bit increment */
+ vmovdqa64 %zmm7, %zmm9
+ vpaddq 64+L_aes_ctr_inc_avx512(%rip), %zmm7, %zmm5
+ vpternlogq $0xb2, 64+L_aes_ctr_inc_avx512(%rip), %zmm5, %zmm9
+ vpsrlq $63, %zmm9, %zmm9
+ vpslldq $8, %zmm9, %zmm9
+ vpaddq %zmm9, %zmm5, %zmm5
+ vmovdqa64 %zmm7, %zmm9
+ vpaddq 128+L_aes_ctr_inc_avx512(%rip), %zmm7, %zmm6
+ vpternlogq $0xb2, 128+L_aes_ctr_inc_avx512(%rip), %zmm6, %zmm9
+ vpsrlq $63, %zmm9, %zmm9
+ vpslldq $8, %zmm9, %zmm9
+ vpaddq %zmm9, %zmm6, %zmm6
+ vmovdqa64 %zmm7, %zmm9
+ vpaddq 192+L_aes_ctr_inc_avx512(%rip), %zmm7, %zmm7
+ vpternlogq $0xb2, 192+L_aes_ctr_inc_avx512(%rip), %zmm7, %zmm9
+ vpsrlq $63, %zmm9, %zmm9
+ vpslldq $8, %zmm9, %zmm9
+ vpaddq %zmm9, %zmm7, %zmm7
+L_AES_CTR_encrypt_avx512_enc_256:
+ # 256 bytes of input
+ leaq (%rdi,%rax,1), %r11
+ leaq (%rsi,%rax,1), %rbx
+ vpshufb %zmm8, %zmm4, %zmm0 /* zmm0..zmm3 = the 16 current counters, byte-reversed back, as AES input */
+ vpshufb %zmm8, %zmm5, %zmm1
+ vpshufb %zmm8, %zmm6, %zmm2
+ vpshufb %zmm8, %zmm7, %zmm3
+ vmovdqa64 %zmm4, %zmm9 /* advance zmm4..zmm7 by 16 each, using the same carry sequence as above */
+ vpaddq %zmm10, %zmm4, %zmm4
+ vpternlogq $0xb2, %zmm10, %zmm4, %zmm9
+ vpsrlq $63, %zmm9, %zmm9
+ vpslldq $8, %zmm9, %zmm9
+ vpaddq %zmm9, %zmm4, %zmm4
+ vmovdqa64 %zmm5, %zmm9
+ vpaddq %zmm10, %zmm5, %zmm5
+ vpternlogq $0xb2, %zmm10, %zmm5, %zmm9
+ vpsrlq $63, %zmm9, %zmm9
+ vpslldq $8, %zmm9, %zmm9
+ vpaddq %zmm9, %zmm5, %zmm5
+ vmovdqa64 %zmm6, %zmm9
+ vpaddq %zmm10, %zmm6, %zmm6
+ vpternlogq $0xb2, %zmm10, %zmm6, %zmm9
+ vpsrlq $63, %zmm9, %zmm9
+ vpslldq $8, %zmm9, %zmm9
+ vpaddq %zmm9, %zmm6, %zmm6
+ vmovdqa64 %zmm7, %zmm9
+ vpaddq %zmm10, %zmm7, %zmm7
+ vpternlogq $0xb2, %zmm10, %zmm7, %zmm9
+ vpsrlq $63, %zmm9, %zmm9
+ vpslldq $8, %zmm9, %zmm9
+ vpaddq %zmm9, %zmm7, %zmm7
+ # aes_enc_block
+ vpxorq %zmm14, %zmm0, %zmm0
+ vpxorq %zmm14, %zmm1, %zmm1
+ vpxorq %zmm14, %zmm2, %zmm2
+ vpxorq %zmm14, %zmm3, %zmm3
+ vaesenc %zmm15, %zmm0, %zmm0
+ vaesenc %zmm15, %zmm1, %zmm1
+ vaesenc %zmm15, %zmm2, %zmm2
+ vaesenc %zmm15, %zmm3, %zmm3
+ vaesenc %zmm16, %zmm0, %zmm0
+ vaesenc %zmm16, %zmm1, %zmm1
+ vaesenc %zmm16, %zmm2, %zmm2
+ vaesenc %zmm16, %zmm3, %zmm3
+ vaesenc %zmm17, %zmm0, %zmm0
+ vaesenc %zmm17, %zmm1, %zmm1
+ vaesenc %zmm17, %zmm2, %zmm2
+ vaesenc %zmm17, %zmm3, %zmm3
+ vaesenc %zmm18, %zmm0, %zmm0
+ vaesenc %zmm18, %zmm1, %zmm1
+ vaesenc %zmm18, %zmm2, %zmm2
+ vaesenc %zmm18, %zmm3, %zmm3
+ vaesenc %zmm19, %zmm0, %zmm0
+ vaesenc %zmm19, %zmm1, %zmm1
+ vaesenc %zmm19, %zmm2, %zmm2
+ vaesenc %zmm19, %zmm3, %zmm3
+ vaesenc %zmm20, %zmm0, %zmm0
+ vaesenc %zmm20, %zmm1, %zmm1
+ vaesenc %zmm20, %zmm2, %zmm2
+ vaesenc %zmm20, %zmm3, %zmm3
+ vaesenc %zmm21, %zmm0, %zmm0
+ vaesenc %zmm21, %zmm1, %zmm1
+ vaesenc %zmm21, %zmm2, %zmm2
+ vaesenc %zmm21, %zmm3, %zmm3
+ vaesenc %zmm22, %zmm0, %zmm0
+ vaesenc %zmm22, %zmm1, %zmm1
+ vaesenc %zmm22, %zmm2, %zmm2
+ vaesenc %zmm22, %zmm3, %zmm3
+ vaesenc %zmm23, %zmm0, %zmm0
+ vaesenc %zmm23, %zmm1, %zmm1
+ vaesenc %zmm23, %zmm2, %zmm2
+ vaesenc %zmm23, %zmm3, %zmm3
+ cmpl $11, %r8d
+ vmovdqa64 %zmm24, %zmm13 /* zmm13 holds whichever round key is applied with vaesenclast */
+ jl L_AES_CTR_encrypt_avx512_256_aes_enc_block_last
+ vaesenc %zmm24, %zmm0, %zmm0
+ vaesenc %zmm24, %zmm1, %zmm1
+ vaesenc %zmm24, %zmm2, %zmm2
+ vaesenc %zmm24, %zmm3, %zmm3
+ vaesenc %zmm25, %zmm0, %zmm0
+ vaesenc %zmm25, %zmm1, %zmm1
+ vaesenc %zmm25, %zmm2, %zmm2
+ vaesenc %zmm25, %zmm3, %zmm3
+ cmpl $13, %r8d
+ vmovdqa64 %zmm26, %zmm13
+ jl L_AES_CTR_encrypt_avx512_256_aes_enc_block_last
+ vaesenc %zmm26, %zmm0, %zmm0
+ vaesenc %zmm26, %zmm1, %zmm1
+ vaesenc %zmm26, %zmm2, %zmm2
+ vaesenc %zmm26, %zmm3, %zmm3
+ vaesenc %zmm27, %zmm0, %zmm0
+ vaesenc %zmm27, %zmm1, %zmm1
+ vaesenc %zmm27, %zmm2, %zmm2
+ vaesenc %zmm27, %zmm3, %zmm3
+ vmovdqa64 %zmm28, %zmm13
+L_AES_CTR_encrypt_avx512_256_aes_enc_block_last:
+ vaesenclast %zmm13, %zmm0, %zmm0
+ vaesenclast %zmm13, %zmm1, %zmm1
+ vaesenclast %zmm13, %zmm2, %zmm2
+ vaesenclast %zmm13, %zmm3, %zmm3
+ vpxorq (%r11), %zmm0, %zmm0 /* XOR the encrypted counters with 256 bytes read at rdi+offset */
+ vpxorq 64(%r11), %zmm1, %zmm1
+ vpxorq 128(%r11), %zmm2, %zmm2
+ vpxorq 192(%r11), %zmm3, %zmm3
+ vmovdqu64 %zmm0, (%rbx) /* store the result at rsi+offset */
+ vmovdqu64 %zmm1, 64(%rbx)
+ vmovdqu64 %zmm2, 128(%rbx)
+ vmovdqu64 %zmm3, 192(%rbx)
+ addl $0x100, %eax
+ cmpl %r10d, %eax
+ jl L_AES_CTR_encrypt_avx512_enc_256
+ vshufi64x2 $0x00, %zmm4, %zmm4, %zmm7 /* zmm7 = lane 0 of zmm4 (the next unused counter) copied to all lanes */
+L_AES_CTR_encrypt_avx512_done_256:
+ movl %edx, %r10d
+ andl $0xffffffc0, %r10d /* r10d = length rounded down to a multiple of 64 */
+ cmpl %r10d, %eax
+ je L_AES_CTR_encrypt_avx512_done_64
+L_AES_CTR_encrypt_avx512_enc_64:
+ # 64 bytes of input
+ # aes_ctr_enc_64
+ leaq (%rdi,%rax,1), %r11
+ leaq (%rsi,%rax,1), %rbx
+ vpaddq 0+L_aes_ctr_inc_avx512(%rip), %zmm7, %zmm0 /* zmm0 = counter + {0,1,2,3}, carry handled by the next four instructions */
+ vmovdqa64 %zmm7, %zmm9
+ vpternlogq $0xb2, 0+L_aes_ctr_inc_avx512(%rip), %zmm0, %zmm9
+ vpsrlq $63, %zmm9, %zmm9
+ vpslldq $8, %zmm9, %zmm9
+ vpaddq %zmm9, %zmm0, %zmm0
+ vpshufb %zmm8, %zmm0, %zmm0
+ vmovdqa64 %zmm7, %zmm9 /* advance the running counter in zmm7 by 4 */
+ vpaddq %zmm11, %zmm7, %zmm7
+ vpternlogq $0xb2, %zmm11, %zmm7, %zmm9
+ vpsrlq $63, %zmm9, %zmm9
+ vpslldq $8, %zmm9, %zmm9
+ vpaddq %zmm9, %zmm7, %zmm7
+ # aes_enc_block
+ vpxorq %zmm14, %zmm0, %zmm0
+ vaesenc %zmm15, %zmm0, %zmm0
+ vaesenc %zmm16, %zmm0, %zmm0
+ vaesenc %zmm17, %zmm0, %zmm0
+ vaesenc %zmm18, %zmm0, %zmm0
+ vaesenc %zmm19, %zmm0, %zmm0
+ vaesenc %zmm20, %zmm0, %zmm0
+ vaesenc %zmm21, %zmm0, %zmm0
+ vaesenc %zmm22, %zmm0, %zmm0
+ vaesenc %zmm23, %zmm0, %zmm0
+ cmpl $11, %r8d
+ vmovdqa64 %zmm24, %zmm13
+ jl L_AES_CTR_encrypt_avx512_64_aes_enc_block_last
+ vaesenc %zmm24, %zmm0, %zmm0
+ vaesenc %zmm25, %zmm0, %zmm0
+ cmpl $13, %r8d
+ vmovdqa64 %zmm26, %zmm13
+ jl L_AES_CTR_encrypt_avx512_64_aes_enc_block_last
+ vaesenc %zmm26, %zmm0, %zmm0
+ vaesenc %zmm27, %zmm0, %zmm0
+ vmovdqa64 %zmm28, %zmm13
+L_AES_CTR_encrypt_avx512_64_aes_enc_block_last:
+ vaesenclast %zmm13, %zmm0, %zmm0
+ vpxorq (%r11), %zmm0, %zmm0
+ vmovdqu64 %zmm0, (%rbx)
+ addl $0x40, %eax
+ cmpl %r10d, %eax
+ jl L_AES_CTR_encrypt_avx512_enc_64
+L_AES_CTR_encrypt_avx512_done_64:
+ cmpl %edx, %eax
+ movl %edx, %r10d
+ je L_AES_CTR_encrypt_avx512_done_enc
+ andl $0xfffffff0, %r10d /* r10d = length rounded down to a multiple of 16 */
+L_AES_CTR_encrypt_avx512_enc_16:
+ # 16 bytes of input
+ vpshufb %xmm8, %xmm7, %xmm0 /* xmm0 = current counter, byte-reversed back, as AES input */
+ vmovdqa64 %zmm7, %zmm9 /* advance the running counter in zmm7 by 1 */
+ vpaddq %zmm12, %zmm7, %zmm7
+ vpternlogq $0xb2, %zmm12, %zmm7, %zmm9
+ vpsrlq $63, %zmm9, %zmm9
+ vpslldq $8, %zmm9, %zmm9
+ vpaddq %zmm9, %zmm7, %zmm7
+ # aes_enc_block
+ vpxor (%rcx), %xmm0, %xmm0 /* this loop reads round keys from memory; the zmm copies may not be loaded on this path */
+ vmovdqu 16(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 32(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 48(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 64(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 80(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 96(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 112(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 128(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 144(%rcx), %xmm5
+ vaesenc %xmm5, %xmm0, %xmm0
+ cmpl $11, %r8d
+ vmovdqu 160(%rcx), %xmm5
+ jl L_AES_CTR_encrypt_avx512_16_aes_enc_block_last
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 176(%rcx), %xmm6
+ vaesenc %xmm6, %xmm0, %xmm0
+ cmpl $13, %r8d
+ vmovdqu 192(%rcx), %xmm5
+ jl L_AES_CTR_encrypt_avx512_16_aes_enc_block_last
+ vaesenc %xmm5, %xmm0, %xmm0
+ vmovdqu 208(%rcx), %xmm6
+ vaesenc %xmm6, %xmm0, %xmm0
+ vmovdqu 224(%rcx), %xmm5
+L_AES_CTR_encrypt_avx512_16_aes_enc_block_last:
+ vaesenclast %xmm5, %xmm0, %xmm0
+ leaq (%rdi,%rax,1), %r11
+ vpxor (%r11), %xmm0, %xmm0
+ leaq (%rsi,%rax,1), %r11
+ vmovdqu %xmm0, (%r11)
+ addl $16, %eax
+ cmpl %r10d, %eax
+ jl L_AES_CTR_encrypt_avx512_enc_16
+L_AES_CTR_encrypt_avx512_done_enc:
+ vpshufb %xmm8, %xmm7, %xmm0 /* byte-reverse the running counter back to its stored order */
+ vmovdqu %xmm0, (%r9) /* write the updated counter block back to (r9) */
+ popq %rbx
+ repz retq
+#ifndef __APPLE__
+.size AES_CTR_encrypt_avx512,.-AES_CTR_encrypt_avx512
+#endif /* __APPLE__ */
+#endif /* HAVE_INTEL_AVX512 */
+#endif /* WOLFSSL_X86_64_BUILD */
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/wolfcrypt/src/aes_x86_64_asm.asm b/wolfcrypt/src/aes_x86_64_asm.asm
new file mode 100644
index 00000000000..26ccbb5ee8e
--- /dev/null
+++ b/wolfcrypt/src/aes_x86_64_asm.asm
@@ -0,0 +1,4283 @@
+; /* aes_x86_64_asm.asm */
+; /*
+; * Copyright (C) 2006-2026 wolfSSL Inc.
+; *
+; * This file is part of wolfSSL.
+; *
+; * wolfSSL is free software; you can redistribute it and/or modify
+; * it under the terms of the GNU General Public License as published by
+; * the Free Software Foundation; either version 3 of the License, or
+; * (at your option) any later version.
+; *
+; * wolfSSL is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; * GNU General Public License for more details.
+; *
+; * You should have received a copy of the GNU General Public License
+; * along with this program; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+; */
+
+IF @Version LT 1200
+; AVX2 instructions not recognized by old versions of MASM
+IFNDEF NO_AVX2_SUPPORT
+NO_AVX2_SUPPORT = 1
+ENDIF
+; MOVBE instruction not recognized by old versions of MASM
+IFNDEF NO_MOVBE_SUPPORT
+NO_MOVBE_SUPPORT = 1
+ENDIF
+ENDIF
+
+IFNDEF HAVE_INTEL_AVX1
+HAVE_INTEL_AVX1 = 1
+ENDIF
+IFNDEF NO_AVX2_SUPPORT
+HAVE_INTEL_AVX2 = 1
+ENDIF
+
+IFNDEF _WIN64
+_WIN64 = 1
+ENDIF
+
+_TEXT SEGMENT READONLY PARA
+AES_128_Key_Expansion_AESNI PROC ; reads a 16-byte key at [rcx], writes eleven 16-byte round keys to [rdx]..[rdx+160]
+ movdqu xmm0, OWORD PTR [rcx]
+ movdqu OWORD PTR [rdx], xmm0 ; round key 0 is the key itself
+ aeskeygenassist xmm1, xmm0, 1 ; immediate = round constant; the sequence below is 1,2,4,8,16,32,64,128,27,54
+ pshufd xmm1, xmm1, 255 ; replicate dword 3 of the assist result into all four dwords
+ movdqa xmm2, xmm0
+ pslldq xmm2, 4 ; three shift/xor pairs: xmm0 ^= xmm0<<32 ^ xmm0<<64 ^ xmm0<<96
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pxor xmm0, xmm1 ; fold in the assist word: xmm0 is now the next round key
+ movdqu OWORD PTR [rdx+16], xmm0
+ aeskeygenassist xmm1, xmm0, 2
+ pshufd xmm1, xmm1, 255
+ movdqa xmm2, xmm0
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pxor xmm0, xmm1
+ movdqu OWORD PTR [rdx+32], xmm0
+ aeskeygenassist xmm1, xmm0, 4
+ pshufd xmm1, xmm1, 255
+ movdqa xmm2, xmm0
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pxor xmm0, xmm1
+ movdqu OWORD PTR [rdx+48], xmm0
+ aeskeygenassist xmm1, xmm0, 8
+ pshufd xmm1, xmm1, 255
+ movdqa xmm2, xmm0
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pxor xmm0, xmm1
+ movdqu OWORD PTR [rdx+64], xmm0
+ aeskeygenassist xmm1, xmm0, 16
+ pshufd xmm1, xmm1, 255
+ movdqa xmm2, xmm0
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pxor xmm0, xmm1
+ movdqu OWORD PTR [rdx+80], xmm0
+ aeskeygenassist xmm1, xmm0, 32
+ pshufd xmm1, xmm1, 255
+ movdqa xmm2, xmm0
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pxor xmm0, xmm1
+ movdqu OWORD PTR [rdx+96], xmm0
+ aeskeygenassist xmm1, xmm0, 64
+ pshufd xmm1, xmm1, 255
+ movdqa xmm2, xmm0
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pxor xmm0, xmm1
+ movdqu OWORD PTR [rdx+112], xmm0
+ aeskeygenassist xmm1, xmm0, 128
+ pshufd xmm1, xmm1, 255
+ movdqa xmm2, xmm0
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pxor xmm0, xmm1
+ movdqu OWORD PTR [rdx+128], xmm0
+ aeskeygenassist xmm1, xmm0, 27
+ pshufd xmm1, xmm1, 255
+ movdqa xmm2, xmm0
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pxor xmm0, xmm1
+ movdqu OWORD PTR [rdx+144], xmm0
+ aeskeygenassist xmm1, xmm0, 54
+ pshufd xmm1, xmm1, 255
+ movdqa xmm2, xmm0
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pslldq xmm2, 4
+ pxor xmm0, xmm2
+ pxor xmm0, xmm1
+ movdqu OWORD PTR [rdx+160], xmm0
+ ret
+AES_128_Key_Expansion_AESNI ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_192_Key_Expansion_AESNI PROC ; reads a 24-byte key at [rcx], writes 224 bytes of key schedule to [rdx]..[rdx+223]
+ movdqu xmm0, OWORD PTR [rcx] ; xmm0 = key bytes 0..15
+ pxor xmm1, xmm1
+ pinsrq xmm1, QWORD PTR [rcx+16], 0 ; xmm1 = key bytes 16..23 in the low quad, high quad zero
+ movdqu OWORD PTR [rdx], xmm0
+ movdqa xmm4, xmm1 ; keep the previous 8-byte tail; it is merged into the next stored 16 bytes
+ aeskeygenassist xmm2, xmm1, 1 ; round constants used below: 1,2,4,8,16,32,64,128
+ pshufd xmm2, xmm2, 85 ; replicate dword 1 of the assist result
+ movdqa xmm3, xmm0
+ pslldq xmm3, 4 ; three shift/xor pairs: xmm0 ^= xmm0<<32 ^ xmm0<<64 ^ xmm0<<96
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pxor xmm0, xmm2
+ pshufd xmm2, xmm0, 255 ; replicate dword 3 of the new xmm0 to extend the 8-byte tail
+ movdqa xmm3, xmm1
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+ shufpd xmm4, xmm0, 0 ; xmm4 = low quad of old tail : low quad of xmm0
+ movdqu OWORD PTR [rdx+16], xmm4
+ movdqa xmm5, xmm0
+ shufpd xmm5, xmm1, 1 ; xmm5 = high quad of xmm0 : low quad of new tail
+ movdqu OWORD PTR [rdx+32], xmm5
+ aeskeygenassist xmm2, xmm1, 2
+ pshufd xmm2, xmm2, 85
+ movdqa xmm3, xmm0
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pxor xmm0, xmm2
+ pshufd xmm2, xmm0, 255
+ movdqa xmm3, xmm1
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+ movdqu OWORD PTR [rdx+48], xmm0 ; on every second step xmm0 is stored directly, 16-byte aligned in the schedule
+ movdqa xmm4, xmm1
+ aeskeygenassist xmm2, xmm1, 4
+ pshufd xmm2, xmm2, 85
+ movdqa xmm3, xmm0
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pxor xmm0, xmm2
+ pshufd xmm2, xmm0, 255
+ movdqa xmm3, xmm1
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+ shufpd xmm4, xmm0, 0
+ movdqu OWORD PTR [rdx+64], xmm4
+ movdqa xmm5, xmm0
+ shufpd xmm5, xmm1, 1
+ movdqu OWORD PTR [rdx+80], xmm5
+ aeskeygenassist xmm2, xmm1, 8
+ pshufd xmm2, xmm2, 85
+ movdqa xmm3, xmm0
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pxor xmm0, xmm2
+ pshufd xmm2, xmm0, 255
+ movdqa xmm3, xmm1
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+ movdqu OWORD PTR [rdx+96], xmm0
+ movdqa xmm4, xmm1
+ aeskeygenassist xmm2, xmm1, 16
+ pshufd xmm2, xmm2, 85
+ movdqa xmm3, xmm0
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pxor xmm0, xmm2
+ pshufd xmm2, xmm0, 255
+ movdqa xmm3, xmm1
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+ shufpd xmm4, xmm0, 0
+ movdqu OWORD PTR [rdx+112], xmm4
+ movdqa xmm5, xmm0
+ shufpd xmm5, xmm1, 1
+ movdqu OWORD PTR [rdx+128], xmm5
+ aeskeygenassist xmm2, xmm1, 32
+ pshufd xmm2, xmm2, 85
+ movdqa xmm3, xmm0
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pxor xmm0, xmm2
+ pshufd xmm2, xmm0, 255
+ movdqa xmm3, xmm1
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+ movdqu OWORD PTR [rdx+144], xmm0
+ movdqa xmm4, xmm1
+ aeskeygenassist xmm2, xmm1, 64
+ pshufd xmm2, xmm2, 85
+ movdqa xmm3, xmm0
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pxor xmm0, xmm2
+ pshufd xmm2, xmm0, 255
+ movdqa xmm3, xmm1
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+ shufpd xmm4, xmm0, 0
+ movdqu OWORD PTR [rdx+160], xmm4
+ movdqa xmm5, xmm0
+ shufpd xmm5, xmm1, 1
+ movdqu OWORD PTR [rdx+176], xmm5
+ aeskeygenassist xmm2, xmm1, 128
+ pshufd xmm2, xmm2, 85
+ movdqa xmm3, xmm0
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pxor xmm0, xmm2
+ pshufd xmm2, xmm0, 255
+ movdqa xmm3, xmm1
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+ movdqu OWORD PTR [rdx+192], xmm0
+ movdqu OWORD PTR [rdx+208], xmm1 ; final 16-byte store covers schedule bytes 208..223
+ ret
+AES_192_Key_Expansion_AESNI ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_256_Key_Expansion_AESNI PROC ; reads a 32-byte key at [rcx], writes fifteen 16-byte round keys to [rdx]..[rdx+224]
+ movdqu xmm0, OWORD PTR [rcx] ; xmm0 = key bytes 0..15
+ movdqu xmm1, OWORD PTR [rcx+16] ; xmm1 = key bytes 16..31
+ movdqu OWORD PTR [rdx], xmm0
+ movdqu OWORD PTR [rdx+16], xmm1
+ aeskeygenassist xmm2, xmm1, 1 ; steps that update xmm0 use round constants 1,2,4,8,16,32,64
+ pshufd xmm2, xmm2, 255 ; replicate dword 3 of the assist result
+ movdqa xmm3, xmm0
+ pslldq xmm3, 4 ; three shift/xor pairs: xmm0 ^= xmm0<<32 ^ xmm0<<64 ^ xmm0<<96
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pxor xmm0, xmm2
+ movdqu OWORD PTR [rdx+32], xmm0
+ aeskeygenassist xmm2, xmm0, 0 ; steps that update xmm1 pass 0 as the round constant
+ pshufd xmm2, xmm2, 170 ; replicate dword 2 of the assist result
+ movdqa xmm3, xmm1
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+ movdqu OWORD PTR [rdx+48], xmm1
+ aeskeygenassist xmm2, xmm1, 2
+ pshufd xmm2, xmm2, 255
+ movdqa xmm3, xmm0
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pxor xmm0, xmm2
+ movdqu OWORD PTR [rdx+64], xmm0
+ aeskeygenassist xmm2, xmm0, 0
+ pshufd xmm2, xmm2, 170
+ movdqa xmm3, xmm1
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+ movdqu OWORD PTR [rdx+80], xmm1
+ aeskeygenassist xmm2, xmm1, 4
+ pshufd xmm2, xmm2, 255
+ movdqa xmm3, xmm0
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pxor xmm0, xmm2
+ movdqu OWORD PTR [rdx+96], xmm0
+ aeskeygenassist xmm2, xmm0, 0
+ pshufd xmm2, xmm2, 170
+ movdqa xmm3, xmm1
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+ movdqu OWORD PTR [rdx+112], xmm1
+ aeskeygenassist xmm2, xmm1, 8
+ pshufd xmm2, xmm2, 255
+ movdqa xmm3, xmm0
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pxor xmm0, xmm2
+ movdqu OWORD PTR [rdx+128], xmm0
+ aeskeygenassist xmm2, xmm0, 0
+ pshufd xmm2, xmm2, 170
+ movdqa xmm3, xmm1
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+ movdqu OWORD PTR [rdx+144], xmm1
+ aeskeygenassist xmm2, xmm1, 16
+ pshufd xmm2, xmm2, 255
+ movdqa xmm3, xmm0
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pxor xmm0, xmm2
+ movdqu OWORD PTR [rdx+160], xmm0
+ aeskeygenassist xmm2, xmm0, 0
+ pshufd xmm2, xmm2, 170
+ movdqa xmm3, xmm1
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+ movdqu OWORD PTR [rdx+176], xmm1
+ aeskeygenassist xmm2, xmm1, 32
+ pshufd xmm2, xmm2, 255
+ movdqa xmm3, xmm0
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pxor xmm0, xmm2
+ movdqu OWORD PTR [rdx+192], xmm0
+ aeskeygenassist xmm2, xmm0, 0
+ pshufd xmm2, xmm2, 170
+ movdqa xmm3, xmm1
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pslldq xmm3, 4
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+ movdqu OWORD PTR [rdx+208], xmm1
+ aeskeygenassist xmm2, xmm1, 64
+ pshufd xmm2, xmm2, 255
+ movdqa xmm3, xmm0
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pslldq xmm3, 4
+ pxor xmm0, xmm3
+ pxor xmm0, xmm2
+ movdqu OWORD PTR [rdx+224], xmm0
+ ret
+AES_256_Key_Expansion_AESNI ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_ECB_encrypt_AESNI PROC
+ mov eax, DWORD PTR [rsp+40] ; NOTE(review): fifth argument is loaded into eax, but eax is cleared again before any use
+ sub rsp, 16
+ movdqu OWORD PTR [rsp],
xmm6
+ ; ECB-encrypt r8d bytes from [rcx] to [rdx]; r9 = round keys, 5th argument = nr (presumably the round count).
+ ; eax = byte offset, r10d = nr, r11d = loop limit: r9 and nr must not be reused as scratch.
+ mov r10d, DWORD PTR [rsp+56] ; nr: [rsp+40] at entry, plus the 16 bytes reserved above for xmm6
+ xor eax, eax ; offset = 0 (also zero-extends rax for the addressing below)
+ cmp r8d, 64
+ mov r11d, r8d
+ jl L_AES_ECB_encrypt_AESNI_done_64
+ and r11d, 4294967232 ; limit = len rounded down to a multiple of 64
+L_AES_ECB_encrypt_AESNI_enc_64:
+ ; 64 bytes of input
+ ; aes_ecb_enc_64
+ movdqu xmm0, OWORD PTR [rcx+rax]
+ movdqu xmm1, OWORD PTR [rcx+rax+16]
+ movdqu xmm2, OWORD PTR [rcx+rax+32]
+ movdqu xmm3, OWORD PTR [rcx+rax+48]
+ ; aes_enc_block
+ movdqu xmm4, OWORD PTR [r9]
+ pxor xmm0, xmm4
+ pxor xmm1, xmm4
+ pxor xmm2, xmm4
+ pxor xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+16]
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+32]
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+48]
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+64]
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+80]
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+96]
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+112]
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+128]
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+144]
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ cmp r10d, 11 ; nr below 11: the key at offset 160 is the final round key
+ movdqu xmm4, OWORD PTR [r9+160]
+ jl L_AES_ECB_encrypt_AESNI_64_aes_enc_block_last
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+176]
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ cmp r10d, 13 ; nr below 13: the key at offset 192 is the final round key
+ movdqu xmm4, OWORD PTR [r9+192]
+ jl L_AES_ECB_encrypt_AESNI_64_aes_enc_block_last
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+208]
+ aesenc xmm0, xmm4
+ aesenc xmm1, xmm4
+ aesenc xmm2, xmm4
+ aesenc xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+224]
+L_AES_ECB_encrypt_AESNI_64_aes_enc_block_last:
+ aesenclast xmm0, xmm4
+ aesenclast xmm1, xmm4
+ aesenclast xmm2, xmm4
+ aesenclast xmm3, xmm4
+ movdqu OWORD PTR [rdx+rax], xmm0
+ movdqu OWORD PTR [rdx+rax+16], xmm1
+ movdqu OWORD PTR [rdx+rax+32], xmm2
+ movdqu OWORD PTR [rdx+rax+48], xmm3
+ add eax, 64
+ cmp eax, r11d
+ jl L_AES_ECB_encrypt_AESNI_enc_64
+L_AES_ECB_encrypt_AESNI_done_64:
+ cmp eax, r8d
+ mov r11d, r8d
+ je L_AES_ECB_encrypt_AESNI_done_enc
+ and r11d, 4294967280 ; limit = len rounded down to a multiple of 16
+L_AES_ECB_encrypt_AESNI_enc_16:
+ ; 16 bytes of input
+ movdqu xmm0, OWORD PTR [rcx+rax]
+ ; aes_enc_block
+ movdqu xmm5, OWORD PTR [r9] ; unaligned load: pxor with a memory operand would require a 16-byte aligned key
+ pxor xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+16]
+ aesenc xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+32]
+ aesenc xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+48]
+ aesenc xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+64]
+ aesenc xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+80]
+ aesenc xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+96]
+ aesenc xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+112]
+ aesenc xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+128]
+ aesenc xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+144]
+ aesenc xmm0, xmm5
+ cmp r10d, 11
+ movdqu xmm5, OWORD PTR [r9+160]
+ jl L_AES_ECB_encrypt_AESNI_16_aes_enc_block_last
+ aesenc xmm0, xmm5
+ movdqu xmm6, OWORD PTR [r9+176]
+ aesenc xmm0, xmm6
+ cmp r10d, 13
+ movdqu xmm5, OWORD PTR [r9+192]
+ jl L_AES_ECB_encrypt_AESNI_16_aes_enc_block_last
+ aesenc xmm0, xmm5
+ movdqu xmm6, OWORD PTR [r9+208]
+ aesenc xmm0, xmm6
+ movdqu xmm5, OWORD PTR [r9+224]
+L_AES_ECB_encrypt_AESNI_16_aes_enc_block_last:
+ aesenclast xmm0, xmm5
+ movdqu OWORD PTR [rdx+rax], xmm0
+ add eax, 16
+ cmp eax, r11d
+ jl L_AES_ECB_encrypt_AESNI_enc_16
+L_AES_ECB_encrypt_AESNI_done_enc:
+ movdqu xmm6, OWORD PTR [rsp] ; restore the non-volatile xmm6 saved at entry
+ add rsp, 16
+ ret
+AES_ECB_encrypt_AESNI ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_ECB_decrypt_AESNI PROC
+ ; ECB-decrypt r8d bytes from [rcx] to [rdx]; r9 = round keys, 5th argument = nr (presumably the round count).
+ ; eax = byte offset, r10d = nr, r11d = loop limit: r9 and nr must not be reused as scratch.
+ ; Input and output are addressed directly as [rcx+rax] / [rdx+rax], so no pointer temporaries are needed.
+ mov r10d, DWORD PTR [rsp+40] ; nr (5th argument, read before rsp is adjusted)
+ sub rsp, 16
+ movdqu OWORD PTR [rsp], xmm6 ; xmm6 is non-volatile in the Win64 ABI and is used by the 16-byte loop
+ xor eax, eax ; offset = 0 (also zero-extends rax for the addressing below)
+ cmp r8d, 64
+ mov r11d, r8d
+ jl L_AES_ECB_decrypt_AESNI_done_64
+ and r11d, 4294967232 ; limit = len rounded down to a multiple of 64
+L_AES_ECB_decrypt_AESNI_dec_64:
+ ; 64 bytes of input
+ ; aes_ecb_dec_64
+ movdqu xmm0, OWORD PTR [rcx+rax]
+ movdqu xmm1, OWORD PTR [rcx+rax+16]
+ movdqu xmm2, OWORD PTR [rcx+rax+32]
+ movdqu xmm3, OWORD PTR [rcx+rax+48]
+ ; aes_dec_block
+ movdqu xmm4, OWORD PTR [r9]
+ pxor xmm0, xmm4
+ pxor xmm1, xmm4
+ pxor xmm2, xmm4
+ pxor xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+16]
+ aesdec xmm0, xmm4
+ aesdec xmm1, xmm4
+ aesdec xmm2, xmm4
+ aesdec xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+32]
+ aesdec xmm0, xmm4
+ aesdec xmm1, xmm4
+ aesdec xmm2, xmm4
+ aesdec xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+48]
+ aesdec xmm0, xmm4
+ aesdec xmm1, xmm4
+ aesdec xmm2, xmm4
+ aesdec xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+64]
+ aesdec xmm0, xmm4
+ aesdec xmm1, xmm4
+ aesdec xmm2, xmm4
+ aesdec xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+80]
+ aesdec xmm0, xmm4
+ aesdec xmm1, xmm4
+ aesdec xmm2, xmm4
+ aesdec xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+96]
+ aesdec xmm0, xmm4
+ aesdec xmm1, xmm4
+ aesdec xmm2, xmm4
+ aesdec xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+112]
+ aesdec xmm0, xmm4
+ aesdec xmm1, xmm4
+ aesdec xmm2, xmm4
+ aesdec xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+128]
+ aesdec xmm0, xmm4
+ aesdec xmm1, xmm4
+ aesdec xmm2, xmm4
+ aesdec xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+144]
+ aesdec xmm0, xmm4
+ aesdec xmm1, xmm4
+ aesdec xmm2, xmm4
+ aesdec xmm3, xmm4
+ cmp r10d, 11 ; nr below 11: the key at offset 160 is the final round key
+ movdqu xmm4, OWORD PTR [r9+160]
+ jl L_AES_ECB_decrypt_AESNI_64_aes_dec_block_last
+ aesdec xmm0, xmm4
+ aesdec xmm1, xmm4
+ aesdec xmm2, xmm4
+ aesdec xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+176]
+ aesdec xmm0, xmm4
+ aesdec xmm1, xmm4
+ aesdec xmm2, xmm4
+ aesdec xmm3, xmm4
+ cmp r10d, 13 ; nr below 13: the key at offset 192 is the final round key
+ movdqu xmm4, OWORD PTR [r9+192]
+ jl L_AES_ECB_decrypt_AESNI_64_aes_dec_block_last
+ aesdec xmm0, xmm4
+ aesdec xmm1, xmm4
+ aesdec xmm2, xmm4
+ aesdec xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+208]
+ aesdec xmm0, xmm4
+ aesdec xmm1, xmm4
+ aesdec xmm2, xmm4
+ aesdec xmm3, xmm4
+ movdqu xmm4, OWORD PTR [r9+224]
+L_AES_ECB_decrypt_AESNI_64_aes_dec_block_last:
+ aesdeclast xmm0, xmm4
+ aesdeclast xmm1, xmm4
+ aesdeclast xmm2, xmm4
+ aesdeclast xmm3, xmm4
+ movdqu OWORD PTR [rdx+rax], xmm0
+ movdqu OWORD PTR [rdx+rax+16], xmm1
+ movdqu OWORD PTR [rdx+rax+32], xmm2
+ movdqu OWORD PTR [rdx+rax+48], xmm3
+ add eax, 64
+ cmp eax, r11d
+ jl L_AES_ECB_decrypt_AESNI_dec_64
+L_AES_ECB_decrypt_AESNI_done_64:
+ cmp eax, r8d
+ mov r11d, r8d
+ je L_AES_ECB_decrypt_AESNI_done_dec
+ and r11d, 4294967280 ; limit = len rounded down to a multiple of 16
+L_AES_ECB_decrypt_AESNI_dec_16:
+ ; 16 bytes of input
+ movdqu xmm0, OWORD PTR [rcx+rax]
+ ; aes_dec_block
+ movdqu xmm5, OWORD PTR [r9] ; unaligned load: pxor with a memory operand would require a 16-byte aligned key
+ pxor xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+16]
+ aesdec xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+32]
+ aesdec xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+48]
+ aesdec xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+64]
+ aesdec xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+80]
+ aesdec xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+96]
+ aesdec xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+112]
+ aesdec xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+128]
+ aesdec xmm0, xmm5
+ movdqu xmm5, OWORD PTR [r9+144]
+ aesdec xmm0, xmm5
+ cmp r10d, 11
+ movdqu xmm5, OWORD PTR [r9+160]
+ jl L_AES_ECB_decrypt_AESNI_16_aes_dec_block_last
+ aesdec xmm0, xmm5
+ movdqu xmm6, OWORD PTR [r9+176]
+ aesdec xmm0, xmm6
+ cmp r10d, 13
+ movdqu xmm5, OWORD PTR [r9+192]
+ jl L_AES_ECB_decrypt_AESNI_16_aes_dec_block_last
+ aesdec xmm0, xmm5
+ movdqu xmm6, OWORD PTR [r9+208]
+ aesdec xmm0, xmm6
+ movdqu xmm5, OWORD PTR [r9+224]
+L_AES_ECB_decrypt_AESNI_16_aes_dec_block_last:
+ aesdeclast xmm0, xmm5
+ movdqu OWORD PTR [rdx+rax], xmm0
+ add eax, 16
+ cmp eax, r11d
+ jl L_AES_ECB_decrypt_AESNI_dec_16
+L_AES_ECB_decrypt_AESNI_done_dec:
+ movdqu xmm6, OWORD PTR [rsp] ; restore the non-volatile xmm6 saved at entry
+ add rsp, 16
+ ret
+AES_ECB_decrypt_AESNI ENDP
+_TEXT ENDS +_TEXT SEGMENT READONLY PARA +AES_CBC_encrypt_AESNI PROC + mov rax, QWORD PTR [rsp+40] + mov r10d, DWORD PTR [rsp+48] + movdqu xmm0, OWORD PTR [r8] + xor eax, eax + cmp eax, r9d + je L_AES_CBC_encrypt_AESNI_done +L_AES_CBC_encrypt_AESNI_loop: + ; 16 bytes of input + lea r10, QWORD PTR [rcx+rax] + movdqu xmm1, OWORD PTR [r10] + pxor xmm1, xmm0 + ; aes_enc_block + pxor xmm1, [rax] + movdqu xmm3, OWORD PTR [rax+16] + aesenc xmm1, xmm3 + movdqu xmm3, OWORD PTR [rax+32] + aesenc xmm1, xmm3 + movdqu xmm3, OWORD PTR [rax+48] + aesenc xmm1, xmm3 + movdqu xmm3, OWORD PTR [rax+64] + aesenc xmm1, xmm3 + movdqu xmm3, OWORD PTR [rax+80] + aesenc xmm1, xmm3 + movdqu xmm3, OWORD PTR [rax+96] + aesenc xmm1, xmm3 + movdqu xmm3, OWORD PTR [rax+112] + aesenc xmm1, xmm3 + movdqu xmm3, OWORD PTR [rax+128] + aesenc xmm1, xmm3 + movdqu xmm3, OWORD PTR [rax+144] + aesenc xmm1, xmm3 + cmp r10d, 11 + movdqu xmm3, OWORD PTR [rax+160] + jl L_AES_CBC_encrypt_AESNI_aes_enc_block_last + aesenc xmm1, xmm3 + movdqu xmm4, OWORD PTR [rax+176] + aesenc xmm1, xmm4 + cmp r10d, 13 + movdqu xmm3, OWORD PTR [rax+192] + jl L_AES_CBC_encrypt_AESNI_aes_enc_block_last + aesenc xmm1, xmm3 + movdqu xmm4, OWORD PTR [rax+208] + aesenc xmm1, xmm4 + movdqu xmm3, OWORD PTR [rax+224] +L_AES_CBC_encrypt_AESNI_aes_enc_block_last: + aesenclast xmm1, xmm3 + lea r11, QWORD PTR [rdx+rax] + movdqu OWORD PTR [r11], xmm1 + movdqa xmm0, xmm1 + add eax, 16 + cmp eax, r9d + jl L_AES_CBC_encrypt_AESNI_loop +L_AES_CBC_encrypt_AESNI_done: + movdqu OWORD PTR [r8], xmm0 + ret +AES_CBC_encrypt_AESNI ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +AES_CBC_decrypt_AESNI PROC + push r12 + mov rax, QWORD PTR [rsp+48] + mov r10d, DWORD PTR [rsp+56] + sub rsp, 48 + movdqu OWORD PTR [rsp], xmm6 + movdqu OWORD PTR [rsp+16], xmm7 + movdqu OWORD PTR [rsp+32], xmm8 + movdqu xmm4, OWORD PTR [r8] + xor eax, eax + cmp r9d, 64 + mov r10d, r9d + jl L_AES_CBC_decrypt_AESNI_done_64 + and r10d, 4294967232 +L_AES_CBC_decrypt_AESNI_dec_64: + ; 
64 bytes of input + ; aes_cbc_dec_64 + lea r11, QWORD PTR [rcx+rax] + lea r12, QWORD PTR [rdx+rax] + movdqu xmm0, OWORD PTR [r11] + movdqu xmm1, OWORD PTR [r11+16] + movdqu xmm2, OWORD PTR [r11+32] + movdqu xmm3, OWORD PTR [r11+48] + ; aes_dec_block + movdqu xmm5, OWORD PTR [rax] + pxor xmm0, xmm5 + pxor xmm1, xmm5 + pxor xmm2, xmm5 + pxor xmm3, xmm5 + movdqu xmm5, OWORD PTR [rax+16] + aesdec xmm0, xmm5 + aesdec xmm1, xmm5 + aesdec xmm2, xmm5 + aesdec xmm3, xmm5 + movdqu xmm5, OWORD PTR [rax+32] + aesdec xmm0, xmm5 + aesdec xmm1, xmm5 + aesdec xmm2, xmm5 + aesdec xmm3, xmm5 + movdqu xmm5, OWORD PTR [rax+48] + aesdec xmm0, xmm5 + aesdec xmm1, xmm5 + aesdec xmm2, xmm5 + aesdec xmm3, xmm5 + movdqu xmm5, OWORD PTR [rax+64] + aesdec xmm0, xmm5 + aesdec xmm1, xmm5 + aesdec xmm2, xmm5 + aesdec xmm3, xmm5 + movdqu xmm5, OWORD PTR [rax+80] + aesdec xmm0, xmm5 + aesdec xmm1, xmm5 + aesdec xmm2, xmm5 + aesdec xmm3, xmm5 + movdqu xmm5, OWORD PTR [rax+96] + aesdec xmm0, xmm5 + aesdec xmm1, xmm5 + aesdec xmm2, xmm5 + aesdec xmm3, xmm5 + movdqu xmm5, OWORD PTR [rax+112] + aesdec xmm0, xmm5 + aesdec xmm1, xmm5 + aesdec xmm2, xmm5 + aesdec xmm3, xmm5 + movdqu xmm5, OWORD PTR [rax+128] + aesdec xmm0, xmm5 + aesdec xmm1, xmm5 + aesdec xmm2, xmm5 + aesdec xmm3, xmm5 + movdqu xmm5, OWORD PTR [rax+144] + aesdec xmm0, xmm5 + aesdec xmm1, xmm5 + aesdec xmm2, xmm5 + aesdec xmm3, xmm5 + cmp r10d, 11 + movdqu xmm5, OWORD PTR [rax+160] + jl L_AES_CBC_decrypt_AESNI_64_aes_dec_block_last + aesdec xmm0, xmm5 + aesdec xmm1, xmm5 + aesdec xmm2, xmm5 + aesdec xmm3, xmm5 + movdqu xmm5, OWORD PTR [rax+176] + aesdec xmm0, xmm5 + aesdec xmm1, xmm5 + aesdec xmm2, xmm5 + aesdec xmm3, xmm5 + cmp r10d, 13 + movdqu xmm5, OWORD PTR [rax+192] + jl L_AES_CBC_decrypt_AESNI_64_aes_dec_block_last + aesdec xmm0, xmm5 + aesdec xmm1, xmm5 + aesdec xmm2, xmm5 + aesdec xmm3, xmm5 + movdqu xmm5, OWORD PTR [rax+208] + aesdec xmm0, xmm5 + aesdec xmm1, xmm5 + aesdec xmm2, xmm5 + aesdec xmm3, xmm5 + movdqu xmm5, OWORD PTR 
[rax+224] +L_AES_CBC_decrypt_AESNI_64_aes_dec_block_last: + aesdeclast xmm0, xmm5 + aesdeclast xmm1, xmm5 + aesdeclast xmm2, xmm5 + aesdeclast xmm3, xmm5 + pxor xmm0, xmm4 + movdqu xmm5, OWORD PTR [r11] + pxor xmm1, xmm5 + movdqu xmm5, OWORD PTR [r11+16] + pxor xmm2, xmm5 + movdqu xmm5, OWORD PTR [r11+32] + pxor xmm3, xmm5 + movdqu xmm4, OWORD PTR [r11+48] + movdqu OWORD PTR [r12], xmm0 + movdqu OWORD PTR [r12+16], xmm1 + movdqu OWORD PTR [r12+32], xmm2 + movdqu OWORD PTR [r12+48], xmm3 + add eax, 64 + cmp eax, r10d + jl L_AES_CBC_decrypt_AESNI_dec_64 +L_AES_CBC_decrypt_AESNI_done_64: + cmp eax, r9d + mov r10d, r9d + je L_AES_CBC_decrypt_AESNI_done_dec + and r10d, 4294967280 +L_AES_CBC_decrypt_AESNI_dec_16: + ; 16 bytes of input + lea r11, QWORD PTR [rcx+rax] + movdqu xmm0, OWORD PTR [r11] + movdqa xmm8, xmm0 + ; aes_dec_block + pxor xmm0, [rax] + movdqu xmm6, OWORD PTR [rax+16] + aesdec xmm0, xmm6 + movdqu xmm6, OWORD PTR [rax+32] + aesdec xmm0, xmm6 + movdqu xmm6, OWORD PTR [rax+48] + aesdec xmm0, xmm6 + movdqu xmm6, OWORD PTR [rax+64] + aesdec xmm0, xmm6 + movdqu xmm6, OWORD PTR [rax+80] + aesdec xmm0, xmm6 + movdqu xmm6, OWORD PTR [rax+96] + aesdec xmm0, xmm6 + movdqu xmm6, OWORD PTR [rax+112] + aesdec xmm0, xmm6 + movdqu xmm6, OWORD PTR [rax+128] + aesdec xmm0, xmm6 + movdqu xmm6, OWORD PTR [rax+144] + aesdec xmm0, xmm6 + cmp r10d, 11 + movdqu xmm6, OWORD PTR [rax+160] + jl L_AES_CBC_decrypt_AESNI_16_aes_dec_block_last + aesdec xmm0, xmm6 + movdqu xmm7, OWORD PTR [rax+176] + aesdec xmm0, xmm7 + cmp r10d, 13 + movdqu xmm6, OWORD PTR [rax+192] + jl L_AES_CBC_decrypt_AESNI_16_aes_dec_block_last + aesdec xmm0, xmm6 + movdqu xmm7, OWORD PTR [rax+208] + aesdec xmm0, xmm7 + movdqu xmm6, OWORD PTR [rax+224] +L_AES_CBC_decrypt_AESNI_16_aes_dec_block_last: + aesdeclast xmm0, xmm6 + pxor xmm0, xmm4 + movdqa xmm4, xmm8 + lea r11, QWORD PTR [rdx+rax] + movdqu OWORD PTR [r11], xmm0 + add eax, 16 + cmp eax, r10d + jl L_AES_CBC_decrypt_AESNI_dec_16 
+L_AES_CBC_decrypt_AESNI_done_dec: + movdqu OWORD PTR [r8], xmm4 + movdqu xmm6, OWORD PTR [rsp] + movdqu xmm7, OWORD PTR [rsp+16] + movdqu xmm8, OWORD PTR [rsp+32] + add rsp, 48 + pop r12 + ret +AES_CBC_decrypt_AESNI ENDP +_TEXT ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_ctr_aesni_bswap QWORD \ + 08090a0b0c0d0e0fh, 0001020304050607h +ptr_L_aes_ctr_aesni_bswap QWORD L_aes_ctr_aesni_bswap +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_ctr_aesni_one QWORD \ + 0000000000000001h, 0000000000000000h +ptr_L_aes_ctr_aesni_one QWORD L_aes_ctr_aesni_one +_DATA ENDS +_TEXT SEGMENT READONLY PARA +AES_CTR_encrypt_AESNI PROC + push rbx + mov eax, DWORD PTR [rsp+48] + mov r10, QWORD PTR [rsp+56] + sub rsp, 96 + movdqu OWORD PTR [rsp], xmm6 + movdqu OWORD PTR [rsp+16], xmm7 + movdqu OWORD PTR [rsp+32], xmm8 + movdqu OWORD PTR [rsp+48], xmm9 + movdqu OWORD PTR [rsp+64], xmm10 + movdqu OWORD PTR [rsp+80], xmm11 + movdqu xmm8, OWORD PTR L_aes_ctr_aesni_bswap + movdqu xmm9, OWORD PTR L_aes_ctr_aesni_one + pxor xmm10, xmm10 + movdqu xmm7, OWORD PTR [r10] + pshufb xmm7, xmm8 + xor eax, eax + cmp r8d, 64 + mov r10d, r8d + jl L_AES_CTR_encrypt_AESNI_done_64 + and r10d, 4294967232 +L_AES_CTR_encrypt_AESNI_enc_64: + ; 64 bytes of input + ; aes_ctr_enc_64 + lea r11, QWORD PTR [rcx+rax] + lea rbx, QWORD PTR [rdx+rax] + movdqa xmm0, xmm7 + pshufb xmm0, xmm8 + paddq xmm7, xmm9 + movdqa xmm11, xmm7 + pcmpeqq xmm11, xmm10 + pslldq xmm11, 8 + psrlq xmm11, 63 + paddq xmm7, xmm11 + movdqa xmm1, xmm7 + pshufb xmm1, xmm8 + paddq xmm7, xmm9 + movdqa xmm11, xmm7 + pcmpeqq xmm11, xmm10 + pslldq xmm11, 8 + psrlq xmm11, 63 + paddq xmm7, xmm11 + movdqa xmm2, xmm7 + pshufb xmm2, xmm8 + paddq xmm7, xmm9 + movdqa xmm11, xmm7 + pcmpeqq xmm11, xmm10 + pslldq xmm11, 8 + psrlq xmm11, 63 + paddq xmm7, xmm11 + movdqa xmm3, xmm7 + pshufb xmm3, xmm8 + paddq xmm7, xmm9 + movdqa xmm11, xmm7 + pcmpeqq xmm11, xmm10 + pslldq xmm11, 8 + psrlq xmm11, 63 + paddq xmm7, xmm11 + ; aes_enc_block + movdqu xmm4, OWORD PTR [r9] + pxor xmm0, 
xmm4 + pxor xmm1, xmm4 + pxor xmm2, xmm4 + pxor xmm3, xmm4 + movdqu xmm4, OWORD PTR [r9+16] + aesenc xmm0, xmm4 + aesenc xmm1, xmm4 + aesenc xmm2, xmm4 + aesenc xmm3, xmm4 + movdqu xmm4, OWORD PTR [r9+32] + aesenc xmm0, xmm4 + aesenc xmm1, xmm4 + aesenc xmm2, xmm4 + aesenc xmm3, xmm4 + movdqu xmm4, OWORD PTR [r9+48] + aesenc xmm0, xmm4 + aesenc xmm1, xmm4 + aesenc xmm2, xmm4 + aesenc xmm3, xmm4 + movdqu xmm4, OWORD PTR [r9+64] + aesenc xmm0, xmm4 + aesenc xmm1, xmm4 + aesenc xmm2, xmm4 + aesenc xmm3, xmm4 + movdqu xmm4, OWORD PTR [r9+80] + aesenc xmm0, xmm4 + aesenc xmm1, xmm4 + aesenc xmm2, xmm4 + aesenc xmm3, xmm4 + movdqu xmm4, OWORD PTR [r9+96] + aesenc xmm0, xmm4 + aesenc xmm1, xmm4 + aesenc xmm2, xmm4 + aesenc xmm3, xmm4 + movdqu xmm4, OWORD PTR [r9+112] + aesenc xmm0, xmm4 + aesenc xmm1, xmm4 + aesenc xmm2, xmm4 + aesenc xmm3, xmm4 + movdqu xmm4, OWORD PTR [r9+128] + aesenc xmm0, xmm4 + aesenc xmm1, xmm4 + aesenc xmm2, xmm4 + aesenc xmm3, xmm4 + movdqu xmm4, OWORD PTR [r9+144] + aesenc xmm0, xmm4 + aesenc xmm1, xmm4 + aesenc xmm2, xmm4 + aesenc xmm3, xmm4 + cmp eax, 11 + movdqu xmm4, OWORD PTR [r9+160] + jl L_AES_CTR_encrypt_AESNI_64_aes_enc_block_last + aesenc xmm0, xmm4 + aesenc xmm1, xmm4 + aesenc xmm2, xmm4 + aesenc xmm3, xmm4 + movdqu xmm4, OWORD PTR [r9+176] + aesenc xmm0, xmm4 + aesenc xmm1, xmm4 + aesenc xmm2, xmm4 + aesenc xmm3, xmm4 + cmp eax, 13 + movdqu xmm4, OWORD PTR [r9+192] + jl L_AES_CTR_encrypt_AESNI_64_aes_enc_block_last + aesenc xmm0, xmm4 + aesenc xmm1, xmm4 + aesenc xmm2, xmm4 + aesenc xmm3, xmm4 + movdqu xmm4, OWORD PTR [r9+208] + aesenc xmm0, xmm4 + aesenc xmm1, xmm4 + aesenc xmm2, xmm4 + aesenc xmm3, xmm4 + movdqu xmm4, OWORD PTR [r9+224] +L_AES_CTR_encrypt_AESNI_64_aes_enc_block_last: + aesenclast xmm0, xmm4 + aesenclast xmm1, xmm4 + aesenclast xmm2, xmm4 + aesenclast xmm3, xmm4 + movdqu xmm4, OWORD PTR [r11] + pxor xmm0, xmm4 + movdqu xmm4, OWORD PTR [r11+16] + pxor xmm1, xmm4 + movdqu xmm4, OWORD PTR [r11+32] + pxor xmm2, xmm4 + 
movdqu xmm4, OWORD PTR [r11+48] + pxor xmm3, xmm4 + movdqu OWORD PTR [rbx], xmm0 + movdqu OWORD PTR [rbx+16], xmm1 + movdqu OWORD PTR [rbx+32], xmm2 + movdqu OWORD PTR [rbx+48], xmm3 + add eax, 64 + cmp eax, r10d + jl L_AES_CTR_encrypt_AESNI_enc_64 +L_AES_CTR_encrypt_AESNI_done_64: + cmp eax, r8d + mov r10d, r8d + je L_AES_CTR_encrypt_AESNI_done_enc + and r10d, 4294967280 +L_AES_CTR_encrypt_AESNI_enc_16: + ; 16 bytes of input + movdqa xmm0, xmm7 + pshufb xmm0, xmm8 + paddq xmm7, xmm9 + movdqa xmm11, xmm7 + pcmpeqq xmm11, xmm10 + pslldq xmm11, 8 + psrlq xmm11, 63 + paddq xmm7, xmm11 + ; aes_enc_block + pxor xmm0, [r9] + movdqu xmm5, OWORD PTR [r9+16] + aesenc xmm0, xmm5 + movdqu xmm5, OWORD PTR [r9+32] + aesenc xmm0, xmm5 + movdqu xmm5, OWORD PTR [r9+48] + aesenc xmm0, xmm5 + movdqu xmm5, OWORD PTR [r9+64] + aesenc xmm0, xmm5 + movdqu xmm5, OWORD PTR [r9+80] + aesenc xmm0, xmm5 + movdqu xmm5, OWORD PTR [r9+96] + aesenc xmm0, xmm5 + movdqu xmm5, OWORD PTR [r9+112] + aesenc xmm0, xmm5 + movdqu xmm5, OWORD PTR [r9+128] + aesenc xmm0, xmm5 + movdqu xmm5, OWORD PTR [r9+144] + aesenc xmm0, xmm5 + cmp eax, 11 + movdqu xmm5, OWORD PTR [r9+160] + jl L_AES_CTR_encrypt_AESNI_16_aes_enc_block_last + aesenc xmm0, xmm5 + movdqu xmm6, OWORD PTR [r9+176] + aesenc xmm0, xmm6 + cmp eax, 13 + movdqu xmm5, OWORD PTR [r9+192] + jl L_AES_CTR_encrypt_AESNI_16_aes_enc_block_last + aesenc xmm0, xmm5 + movdqu xmm6, OWORD PTR [r9+208] + aesenc xmm0, xmm6 + movdqu xmm5, OWORD PTR [r9+224] +L_AES_CTR_encrypt_AESNI_16_aes_enc_block_last: + aesenclast xmm0, xmm5 + lea r11, QWORD PTR [rcx+rax] + movdqu xmm4, OWORD PTR [r11] + pxor xmm0, xmm4 + lea r11, QWORD PTR [rdx+rax] + movdqu OWORD PTR [r11], xmm0 + add eax, 16 + cmp eax, r10d + jl L_AES_CTR_encrypt_AESNI_enc_16 +L_AES_CTR_encrypt_AESNI_done_enc: + pshufb xmm7, xmm8 + movdqu OWORD PTR [r10], xmm7 + movdqu xmm6, OWORD PTR [rsp] + movdqu xmm7, OWORD PTR [rsp+16] + movdqu xmm8, OWORD PTR [rsp+32] + movdqu xmm9, OWORD PTR [rsp+48] + movdqu xmm10, 
OWORD PTR [rsp+64] + movdqu xmm11, OWORD PTR [rsp+80] + add rsp, 96 + pop rbx + ret +AES_CTR_encrypt_AESNI ENDP +_TEXT ENDS +IFDEF HAVE_INTEL_AVX1 +_TEXT SEGMENT READONLY PARA +AES_ECB_encrypt_avx1 PROC + mov eax, DWORD PTR [rsp+40] + sub rsp, 16 + vmovdqu OWORD PTR [rsp], xmm6 + xor eax, eax + cmp r8d, 64 + mov r9d, r8d + jl L_AES_ECB_encrypt_avx1_done_64 + and r9d, 4294967232 +L_AES_ECB_encrypt_avx1_enc_64: + ; 64 bytes of input + ; aes_ecb_enc_64 + lea r10, QWORD PTR [rcx+rax] + lea r11, QWORD PTR [rdx+rax] + vmovdqu xmm0, OWORD PTR [r10] + vmovdqu xmm1, OWORD PTR [r10+16] + vmovdqu xmm2, OWORD PTR [r10+32] + vmovdqu xmm3, OWORD PTR [r10+48] + ; aes_enc_block + vmovdqu xmm4, OWORD PTR [r9] + vpxor xmm0, xmm0, xmm4 + vpxor xmm1, xmm1, xmm4 + vpxor xmm2, xmm2, xmm4 + vpxor xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+16] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+32] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+48] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+64] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+80] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+96] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+112] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+128] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+144] + vaesenc xmm0, xmm0, 
xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + cmp eax, 11 + vmovdqu xmm4, OWORD PTR [r9+160] + jl L_AES_ECB_encrypt_avx1_64_aes_enc_block_last + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+176] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + cmp eax, 13 + vmovdqu xmm4, OWORD PTR [r9+192] + jl L_AES_ECB_encrypt_avx1_64_aes_enc_block_last + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+208] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+224] +L_AES_ECB_encrypt_avx1_64_aes_enc_block_last: + vaesenclast xmm0, xmm0, xmm4 + vaesenclast xmm1, xmm1, xmm4 + vaesenclast xmm2, xmm2, xmm4 + vaesenclast xmm3, xmm3, xmm4 + vmovdqu OWORD PTR [r11], xmm0 + vmovdqu OWORD PTR [r11+16], xmm1 + vmovdqu OWORD PTR [r11+32], xmm2 + vmovdqu OWORD PTR [r11+48], xmm3 + add eax, 64 + cmp eax, r9d + jl L_AES_ECB_encrypt_avx1_enc_64 +L_AES_ECB_encrypt_avx1_done_64: + cmp eax, r8d + mov r9d, r8d + je L_AES_ECB_encrypt_avx1_done_enc + and r9d, 4294967280 +L_AES_ECB_encrypt_avx1_enc_16: + ; 16 bytes of input + lea r10, QWORD PTR [rcx+rax] + vmovdqu xmm0, OWORD PTR [r10] + ; aes_enc_block + vpxor xmm0, xmm0, [r9] + vmovdqu xmm5, OWORD PTR [r9+16] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+32] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+48] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+64] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+80] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+96] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+112] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+128] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+144] 
+ vaesenc xmm0, xmm0, xmm5 + cmp eax, 11 + vmovdqu xmm5, OWORD PTR [r9+160] + jl L_AES_ECB_encrypt_avx1_16_aes_enc_block_last + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r9+176] + vaesenc xmm0, xmm0, xmm6 + cmp eax, 13 + vmovdqu xmm5, OWORD PTR [r9+192] + jl L_AES_ECB_encrypt_avx1_16_aes_enc_block_last + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r9+208] + vaesenc xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r9+224] +L_AES_ECB_encrypt_avx1_16_aes_enc_block_last: + vaesenclast xmm0, xmm0, xmm5 + lea r10, QWORD PTR [rdx+rax] + vmovdqu OWORD PTR [r10], xmm0 + add eax, 16 + cmp eax, r9d + jl L_AES_ECB_encrypt_avx1_enc_16 +L_AES_ECB_encrypt_avx1_done_enc: + vmovdqu xmm6, OWORD PTR [rsp] + add rsp, 16 + ret +AES_ECB_encrypt_avx1 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +AES_ECB_decrypt_avx1 PROC + mov eax, DWORD PTR [rsp+40] + sub rsp, 16 + vmovdqu OWORD PTR [rsp], xmm6 + xor eax, eax + cmp r8d, 64 + mov r9d, r8d + jl L_AES_ECB_decrypt_avx1_done_64 + and r9d, 4294967232 +L_AES_ECB_decrypt_avx1_dec_64: + ; 64 bytes of input + ; aes_ecb_dec_64 + lea r10, QWORD PTR [rcx+rax] + lea r11, QWORD PTR [rdx+rax] + vmovdqu xmm0, OWORD PTR [r10] + vmovdqu xmm1, OWORD PTR [r10+16] + vmovdqu xmm2, OWORD PTR [r10+32] + vmovdqu xmm3, OWORD PTR [r10+48] + ; aes_dec_block + vmovdqu xmm4, OWORD PTR [r9] + vpxor xmm0, xmm0, xmm4 + vpxor xmm1, xmm1, xmm4 + vpxor xmm2, xmm2, xmm4 + vpxor xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+16] + vaesdec xmm0, xmm0, xmm4 + vaesdec xmm1, xmm1, xmm4 + vaesdec xmm2, xmm2, xmm4 + vaesdec xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+32] + vaesdec xmm0, xmm0, xmm4 + vaesdec xmm1, xmm1, xmm4 + vaesdec xmm2, xmm2, xmm4 + vaesdec xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+48] + vaesdec xmm0, xmm0, xmm4 + vaesdec xmm1, xmm1, xmm4 + vaesdec xmm2, xmm2, xmm4 + vaesdec xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+64] + vaesdec xmm0, xmm0, xmm4 + vaesdec xmm1, xmm1, xmm4 + vaesdec xmm2, xmm2, xmm4 + vaesdec xmm3, xmm3, xmm4 + 
vmovdqu xmm4, OWORD PTR [r9+80] + vaesdec xmm0, xmm0, xmm4 + vaesdec xmm1, xmm1, xmm4 + vaesdec xmm2, xmm2, xmm4 + vaesdec xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+96] + vaesdec xmm0, xmm0, xmm4 + vaesdec xmm1, xmm1, xmm4 + vaesdec xmm2, xmm2, xmm4 + vaesdec xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+112] + vaesdec xmm0, xmm0, xmm4 + vaesdec xmm1, xmm1, xmm4 + vaesdec xmm2, xmm2, xmm4 + vaesdec xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+128] + vaesdec xmm0, xmm0, xmm4 + vaesdec xmm1, xmm1, xmm4 + vaesdec xmm2, xmm2, xmm4 + vaesdec xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+144] + vaesdec xmm0, xmm0, xmm4 + vaesdec xmm1, xmm1, xmm4 + vaesdec xmm2, xmm2, xmm4 + vaesdec xmm3, xmm3, xmm4 + cmp eax, 11 + vmovdqu xmm4, OWORD PTR [r9+160] + jl L_AES_ECB_decrypt_avx1_64_aes_dec_block_last + vaesdec xmm0, xmm0, xmm4 + vaesdec xmm1, xmm1, xmm4 + vaesdec xmm2, xmm2, xmm4 + vaesdec xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+176] + vaesdec xmm0, xmm0, xmm4 + vaesdec xmm1, xmm1, xmm4 + vaesdec xmm2, xmm2, xmm4 + vaesdec xmm3, xmm3, xmm4 + cmp eax, 13 + vmovdqu xmm4, OWORD PTR [r9+192] + jl L_AES_ECB_decrypt_avx1_64_aes_dec_block_last + vaesdec xmm0, xmm0, xmm4 + vaesdec xmm1, xmm1, xmm4 + vaesdec xmm2, xmm2, xmm4 + vaesdec xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+208] + vaesdec xmm0, xmm0, xmm4 + vaesdec xmm1, xmm1, xmm4 + vaesdec xmm2, xmm2, xmm4 + vaesdec xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+224] +L_AES_ECB_decrypt_avx1_64_aes_dec_block_last: + vaesdeclast xmm0, xmm0, xmm4 + vaesdeclast xmm1, xmm1, xmm4 + vaesdeclast xmm2, xmm2, xmm4 + vaesdeclast xmm3, xmm3, xmm4 + vmovdqu OWORD PTR [r11], xmm0 + vmovdqu OWORD PTR [r11+16], xmm1 + vmovdqu OWORD PTR [r11+32], xmm2 + vmovdqu OWORD PTR [r11+48], xmm3 + add eax, 64 + cmp eax, r9d + jl L_AES_ECB_decrypt_avx1_dec_64 +L_AES_ECB_decrypt_avx1_done_64: + cmp eax, r8d + mov r9d, r8d + je L_AES_ECB_decrypt_avx1_done_dec + and r9d, 4294967280 +L_AES_ECB_decrypt_avx1_dec_16: + ; 16 bytes of input + lea 
r10, QWORD PTR [rcx+rax] + vmovdqu xmm0, OWORD PTR [r10] + ; aes_dec_block + vpxor xmm0, xmm0, [r9] + vmovdqu xmm5, OWORD PTR [r9+16] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+32] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+48] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+64] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+80] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+96] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+112] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+128] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+144] + vaesdec xmm0, xmm0, xmm5 + cmp eax, 11 + vmovdqu xmm5, OWORD PTR [r9+160] + jl L_AES_ECB_decrypt_avx1_16_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r9+176] + vaesdec xmm0, xmm0, xmm6 + cmp eax, 13 + vmovdqu xmm5, OWORD PTR [r9+192] + jl L_AES_ECB_decrypt_avx1_16_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r9+208] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r9+224] +L_AES_ECB_decrypt_avx1_16_aes_dec_block_last: + vaesdeclast xmm0, xmm0, xmm5 + lea r10, QWORD PTR [rdx+rax] + vmovdqu OWORD PTR [r10], xmm0 + add eax, 16 + cmp eax, r9d + jl L_AES_ECB_decrypt_avx1_dec_16 +L_AES_ECB_decrypt_avx1_done_dec: + vmovdqu xmm6, OWORD PTR [rsp] + add rsp, 16 + ret +AES_ECB_decrypt_avx1 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +AES_CBC_encrypt_avx1 PROC + mov rax, QWORD PTR [rsp+40] + mov r10d, DWORD PTR [rsp+48] + vmovdqu xmm0, OWORD PTR [r8] + xor eax, eax + cmp eax, r9d + je L_AES_CBC_encrypt_avx1_done +L_AES_CBC_encrypt_avx1_loop: + ; 16 bytes of input + lea r10, QWORD PTR [rcx+rax] + vmovdqu xmm1, OWORD PTR [r10] + vpxor xmm1, xmm1, xmm0 + ; aes_enc_block + vpxor xmm1, xmm1, [rax] + vmovdqu xmm3, OWORD PTR [rax+16] + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm3, OWORD PTR [rax+32] + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm3, OWORD PTR [rax+48] + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm3, OWORD PTR 
[rax+64] + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm3, OWORD PTR [rax+80] + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm3, OWORD PTR [rax+96] + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm3, OWORD PTR [rax+112] + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm3, OWORD PTR [rax+128] + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm3, OWORD PTR [rax+144] + vaesenc xmm1, xmm1, xmm3 + cmp r10d, 11 + vmovdqu xmm3, OWORD PTR [rax+160] + jl L_AES_CBC_encrypt_avx1_aes_enc_block_last + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm4, OWORD PTR [rax+176] + vaesenc xmm1, xmm1, xmm4 + cmp r10d, 13 + vmovdqu xmm3, OWORD PTR [rax+192] + jl L_AES_CBC_encrypt_avx1_aes_enc_block_last + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm4, OWORD PTR [rax+208] + vaesenc xmm1, xmm1, xmm4 + vmovdqu xmm3, OWORD PTR [rax+224] +L_AES_CBC_encrypt_avx1_aes_enc_block_last: + vaesenclast xmm1, xmm1, xmm3 + lea r11, QWORD PTR [rdx+rax] + vmovdqu OWORD PTR [r11], xmm1 + vmovdqa xmm0, xmm1 + add eax, 16 + cmp eax, r9d + jl L_AES_CBC_encrypt_avx1_loop +L_AES_CBC_encrypt_avx1_done: + vmovdqu OWORD PTR [r8], xmm0 + ret +AES_CBC_encrypt_avx1 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +AES_CBC_decrypt_avx1 PROC + push r12 + mov rax, QWORD PTR [rsp+48] + mov r10d, DWORD PTR [rsp+56] + sub rsp, 48 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu xmm4, OWORD PTR [r8] + xor eax, eax + cmp r9d, 64 + mov r10d, r9d + jl L_AES_CBC_decrypt_avx1_done_64 + and r10d, 4294967232 +L_AES_CBC_decrypt_avx1_dec_64: + ; 64 bytes of input + ; aes_cbc_dec_64 + lea r11, QWORD PTR [rcx+rax] + lea r12, QWORD PTR [rdx+rax] + vmovdqu xmm0, OWORD PTR [r11] + vmovdqu xmm1, OWORD PTR [r11+16] + vmovdqu xmm2, OWORD PTR [r11+32] + vmovdqu xmm3, OWORD PTR [r11+48] + ; aes_dec_block + vmovdqu xmm5, OWORD PTR [rax] + vpxor xmm0, xmm0, xmm5 + vpxor xmm1, xmm1, xmm5 + vpxor xmm2, xmm2, xmm5 + vpxor xmm3, xmm3, xmm5 + vmovdqu xmm5, OWORD PTR [rax+16] + vaesdec xmm0, xmm0, xmm5 + vaesdec xmm1, xmm1, xmm5 + vaesdec xmm2, xmm2, 
xmm5 + vaesdec xmm3, xmm3, xmm5 + vmovdqu xmm5, OWORD PTR [rax+32] + vaesdec xmm0, xmm0, xmm5 + vaesdec xmm1, xmm1, xmm5 + vaesdec xmm2, xmm2, xmm5 + vaesdec xmm3, xmm3, xmm5 + vmovdqu xmm5, OWORD PTR [rax+48] + vaesdec xmm0, xmm0, xmm5 + vaesdec xmm1, xmm1, xmm5 + vaesdec xmm2, xmm2, xmm5 + vaesdec xmm3, xmm3, xmm5 + vmovdqu xmm5, OWORD PTR [rax+64] + vaesdec xmm0, xmm0, xmm5 + vaesdec xmm1, xmm1, xmm5 + vaesdec xmm2, xmm2, xmm5 + vaesdec xmm3, xmm3, xmm5 + vmovdqu xmm5, OWORD PTR [rax+80] + vaesdec xmm0, xmm0, xmm5 + vaesdec xmm1, xmm1, xmm5 + vaesdec xmm2, xmm2, xmm5 + vaesdec xmm3, xmm3, xmm5 + vmovdqu xmm5, OWORD PTR [rax+96] + vaesdec xmm0, xmm0, xmm5 + vaesdec xmm1, xmm1, xmm5 + vaesdec xmm2, xmm2, xmm5 + vaesdec xmm3, xmm3, xmm5 + vmovdqu xmm5, OWORD PTR [rax+112] + vaesdec xmm0, xmm0, xmm5 + vaesdec xmm1, xmm1, xmm5 + vaesdec xmm2, xmm2, xmm5 + vaesdec xmm3, xmm3, xmm5 + vmovdqu xmm5, OWORD PTR [rax+128] + vaesdec xmm0, xmm0, xmm5 + vaesdec xmm1, xmm1, xmm5 + vaesdec xmm2, xmm2, xmm5 + vaesdec xmm3, xmm3, xmm5 + vmovdqu xmm5, OWORD PTR [rax+144] + vaesdec xmm0, xmm0, xmm5 + vaesdec xmm1, xmm1, xmm5 + vaesdec xmm2, xmm2, xmm5 + vaesdec xmm3, xmm3, xmm5 + cmp r10d, 11 + vmovdqu xmm5, OWORD PTR [rax+160] + jl L_AES_CBC_decrypt_avx1_64_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vaesdec xmm1, xmm1, xmm5 + vaesdec xmm2, xmm2, xmm5 + vaesdec xmm3, xmm3, xmm5 + vmovdqu xmm5, OWORD PTR [rax+176] + vaesdec xmm0, xmm0, xmm5 + vaesdec xmm1, xmm1, xmm5 + vaesdec xmm2, xmm2, xmm5 + vaesdec xmm3, xmm3, xmm5 + cmp r10d, 13 + vmovdqu xmm5, OWORD PTR [rax+192] + jl L_AES_CBC_decrypt_avx1_64_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vaesdec xmm1, xmm1, xmm5 + vaesdec xmm2, xmm2, xmm5 + vaesdec xmm3, xmm3, xmm5 + vmovdqu xmm5, OWORD PTR [rax+208] + vaesdec xmm0, xmm0, xmm5 + vaesdec xmm1, xmm1, xmm5 + vaesdec xmm2, xmm2, xmm5 + vaesdec xmm3, xmm3, xmm5 + vmovdqu xmm5, OWORD PTR [rax+224] +L_AES_CBC_decrypt_avx1_64_aes_dec_block_last: + vaesdeclast xmm0, xmm0, xmm5 
+ vaesdeclast xmm1, xmm1, xmm5 + vaesdeclast xmm2, xmm2, xmm5 + vaesdeclast xmm3, xmm3, xmm5 + vpxor xmm0, xmm0, xmm4 + vpxor xmm1, xmm1, [r11] + vpxor xmm2, xmm2, [r11+16] + vpxor xmm3, xmm3, [r11+32] + vmovdqu xmm4, OWORD PTR [r11+48] + vmovdqu OWORD PTR [r12], xmm0 + vmovdqu OWORD PTR [r12+16], xmm1 + vmovdqu OWORD PTR [r12+32], xmm2 + vmovdqu OWORD PTR [r12+48], xmm3 + add eax, 64 + cmp eax, r10d + jl L_AES_CBC_decrypt_avx1_dec_64 +L_AES_CBC_decrypt_avx1_done_64: + cmp eax, r9d + mov r10d, r9d + je L_AES_CBC_decrypt_avx1_done_dec + and r10d, 4294967280 +L_AES_CBC_decrypt_avx1_dec_16: + ; 16 bytes of input + lea r11, QWORD PTR [rcx+rax] + vmovdqu xmm0, OWORD PTR [r11] + vmovdqa xmm8, xmm0 + ; aes_dec_block + vpxor xmm0, xmm0, [rax] + vmovdqu xmm6, OWORD PTR [rax+16] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm6, OWORD PTR [rax+32] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm6, OWORD PTR [rax+48] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm6, OWORD PTR [rax+64] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm6, OWORD PTR [rax+80] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm6, OWORD PTR [rax+96] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm6, OWORD PTR [rax+112] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm6, OWORD PTR [rax+128] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm6, OWORD PTR [rax+144] + vaesdec xmm0, xmm0, xmm6 + cmp r10d, 11 + vmovdqu xmm6, OWORD PTR [rax+160] + jl L_AES_CBC_decrypt_avx1_16_aes_dec_block_last + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm7, OWORD PTR [rax+176] + vaesdec xmm0, xmm0, xmm7 + cmp r10d, 13 + vmovdqu xmm6, OWORD PTR [rax+192] + jl L_AES_CBC_decrypt_avx1_16_aes_dec_block_last + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm7, OWORD PTR [rax+208] + vaesdec xmm0, xmm0, xmm7 + vmovdqu xmm6, OWORD PTR [rax+224] +L_AES_CBC_decrypt_avx1_16_aes_dec_block_last: + vaesdeclast xmm0, xmm0, xmm6 + vpxor xmm0, xmm0, xmm4 + vmovdqa xmm4, xmm8 + lea r11, QWORD PTR [rdx+rax] + vmovdqu OWORD PTR [r11], xmm0 + add eax, 16 + cmp eax, r10d + jl L_AES_CBC_decrypt_avx1_dec_16 
+L_AES_CBC_decrypt_avx1_done_dec: + vmovdqu OWORD PTR [r8], xmm4 + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + add rsp, 48 + pop r12 + ret +AES_CBC_decrypt_avx1 ENDP +_TEXT ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_ctr_avx1_bswap QWORD \ + 08090a0b0c0d0e0fh, 0001020304050607h +ptr_L_aes_ctr_avx1_bswap QWORD L_aes_ctr_avx1_bswap +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_ctr_avx1_one QWORD \ + 0000000000000001h, 0000000000000000h +ptr_L_aes_ctr_avx1_one QWORD L_aes_ctr_avx1_one +_DATA ENDS +_TEXT SEGMENT READONLY PARA +AES_CTR_encrypt_avx1 PROC + push rbx + mov eax, DWORD PTR [rsp+48] + mov r10, QWORD PTR [rsp+56] + sub rsp, 96 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu xmm8, OWORD PTR L_aes_ctr_avx1_bswap + vmovdqu xmm9, OWORD PTR L_aes_ctr_avx1_one + vpxor xmm10, xmm10, xmm10 + vmovdqu xmm7, OWORD PTR [r10] + vpshufb xmm7, xmm7, xmm8 + xor eax, eax + cmp r8d, 64 + mov r10d, r8d + jl L_AES_CTR_encrypt_avx1_done_64 + and r10d, 4294967232 +L_AES_CTR_encrypt_avx1_enc_64: + ; 64 bytes of input + ; aes_ctr_enc_64 + lea r11, QWORD PTR [rcx+rax] + lea rbx, QWORD PTR [rdx+rax] + vpshufb xmm0, xmm7, xmm8 + vpaddq xmm7, xmm7, xmm9 + vpcmpeqq xmm11, xmm7, xmm10 + vpslldq xmm11, xmm11, 8 + vpsrlq xmm11, xmm11, 63 + vpaddq xmm7, xmm7, xmm11 + vpshufb xmm1, xmm7, xmm8 + vpaddq xmm7, xmm7, xmm9 + vpcmpeqq xmm11, xmm7, xmm10 + vpslldq xmm11, xmm11, 8 + vpsrlq xmm11, xmm11, 63 + vpaddq xmm7, xmm7, xmm11 + vpshufb xmm2, xmm7, xmm8 + vpaddq xmm7, xmm7, xmm9 + vpcmpeqq xmm11, xmm7, xmm10 + vpslldq xmm11, xmm11, 8 + vpsrlq xmm11, xmm11, 63 + vpaddq xmm7, xmm7, xmm11 + vpshufb xmm3, xmm7, xmm8 + vpaddq xmm7, xmm7, xmm9 + vpcmpeqq xmm11, xmm7, xmm10 + vpslldq xmm11, xmm11, 8 + vpsrlq xmm11, xmm11, 63 + vpaddq xmm7, xmm7, xmm11 + ; aes_enc_block + vmovdqu xmm4, 
OWORD PTR [r9] + vpxor xmm0, xmm0, xmm4 + vpxor xmm1, xmm1, xmm4 + vpxor xmm2, xmm2, xmm4 + vpxor xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+16] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+32] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+48] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+64] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+80] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+96] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+112] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+128] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+144] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + cmp eax, 11 + vmovdqu xmm4, OWORD PTR [r9+160] + jl L_AES_CTR_encrypt_avx1_64_aes_enc_block_last + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+176] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + cmp eax, 13 + vmovdqu xmm4, OWORD PTR [r9+192] + jl L_AES_CTR_encrypt_avx1_64_aes_enc_block_last + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+208] + vaesenc 
xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+224] +L_AES_CTR_encrypt_avx1_64_aes_enc_block_last: + vaesenclast xmm0, xmm0, xmm4 + vaesenclast xmm1, xmm1, xmm4 + vaesenclast xmm2, xmm2, xmm4 + vaesenclast xmm3, xmm3, xmm4 + vpxor xmm0, xmm0, [r11] + vpxor xmm1, xmm1, [r11+16] + vpxor xmm2, xmm2, [r11+32] + vpxor xmm3, xmm3, [r11+48] + vmovdqu OWORD PTR [rbx], xmm0 + vmovdqu OWORD PTR [rbx+16], xmm1 + vmovdqu OWORD PTR [rbx+32], xmm2 + vmovdqu OWORD PTR [rbx+48], xmm3 + add eax, 64 + cmp eax, r10d + jl L_AES_CTR_encrypt_avx1_enc_64 +L_AES_CTR_encrypt_avx1_done_64: + cmp eax, r8d + mov r10d, r8d + je L_AES_CTR_encrypt_avx1_done_enc + and r10d, 4294967280 +L_AES_CTR_encrypt_avx1_enc_16: + ; 16 bytes of input + vpshufb xmm0, xmm7, xmm8 + vpaddq xmm7, xmm7, xmm9 + vpcmpeqq xmm11, xmm7, xmm10 + vpslldq xmm11, xmm11, 8 + vpsrlq xmm11, xmm11, 63 + vpaddq xmm7, xmm7, xmm11 + ; aes_enc_block + vpxor xmm0, xmm0, [r9] + vmovdqu xmm5, OWORD PTR [r9+16] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+32] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+48] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+64] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+80] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+96] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+112] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+128] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+144] + vaesenc xmm0, xmm0, xmm5 + cmp eax, 11 + vmovdqu xmm5, OWORD PTR [r9+160] + jl L_AES_CTR_encrypt_avx1_16_aes_enc_block_last + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r9+176] + vaesenc xmm0, xmm0, xmm6 + cmp eax, 13 + vmovdqu xmm5, OWORD PTR [r9+192] + jl L_AES_CTR_encrypt_avx1_16_aes_enc_block_last + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r9+208] + vaesenc xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r9+224] 
+L_AES_CTR_encrypt_avx1_16_aes_enc_block_last:
+        vaesenclast xmm0, xmm0, xmm5
+        lea r11, QWORD PTR [rcx+rax]
+        vpxor xmm0, xmm0, [r11]
+        lea r11, QWORD PTR [rdx+rax]
+        vmovdqu OWORD PTR [r11], xmm0
+        add eax, 16
+        cmp eax, r10d
+        jl L_AES_CTR_encrypt_avx1_enc_16
+L_AES_CTR_encrypt_avx1_done_enc:
+        vpshufb xmm7, xmm7, xmm8
+        vmovdqu OWORD PTR [r10], xmm7
+        vmovdqu xmm6, OWORD PTR [rsp]
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        vmovdqu xmm8, OWORD PTR [rsp+32]
+        vmovdqu xmm9, OWORD PTR [rsp+48]
+        vmovdqu xmm10, OWORD PTR [rsp+64]
+        vmovdqu xmm11, OWORD PTR [rsp+80]
+        add rsp, 96
+        pop rbx
+        ret
+AES_CTR_encrypt_avx1 ENDP
+_TEXT ENDS
+ENDIF
+IFDEF HAVE_INTEL_VAES
+_TEXT SEGMENT READONLY PARA
+; AES_ECB_encrypt_vaes - AES-ECB encryption using VAES on 256-bit registers.
+;
+; Register use (Windows x64 argument registers):
+;   rcx        input buffer (read at [rcx+offset])
+;   rdx        output buffer (written at [rdx+offset])
+;   r8d        length in bytes
+;   r9         round-key schedule, 16 bytes per round key at [r9+16*i]
+;   [rsp+40]   value compared against 11 and 13 to choose how many rounds
+;              to run (presumably the AES round count 10/12/14)
+;
+; Each register below has exactly one role for the whole routine so that the
+; key pointer and the round count stay live across all three loops:
+;   eax  = byte offset of the next block
+;   r10d = round count
+;   r11d = loop bound for the current pass
+;
+; Processes 128 bytes per iteration, then 32 bytes, then 16 bytes.
+; xmm6/xmm7 are saved and restored because ymm7/xmm6 are used as round-key
+; temporaries.
+AES_ECB_encrypt_vaes PROC
+        mov r10d, DWORD PTR [rsp+40]
+        sub rsp, 32
+        vmovdqu OWORD PTR [rsp], xmm6
+        vmovdqu OWORD PTR [rsp+16], xmm7
+        xor eax, eax
+        cmp r8d, 128
+        mov r11d, r8d
+        jl L_AES_ECB_encrypt_vaes_done_128
+        ; bound = len rounded down to a multiple of 128
+        and r11d, 4294967168
+L_AES_ECB_encrypt_vaes_enc_128:
+        ; 128 bytes of input
+        ; aes_ecb_enc_128
+        vmovdqu ymm0, YMMWORD PTR [rcx+rax]
+        vmovdqu ymm1, YMMWORD PTR [rcx+rax+32]
+        vmovdqu ymm2, YMMWORD PTR [rcx+rax+64]
+        vmovdqu ymm3, YMMWORD PTR [rcx+rax+96]
+        ; aes_enc_block
+        vbroadcasti128 ymm7, [r9]
+        vpxor ymm0, ymm0, ymm7
+        vpxor ymm1, ymm1, ymm7
+        vpxor ymm2, ymm2, ymm7
+        vpxor ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+16]
+        vaesenc ymm0, ymm0, ymm7
+        vaesenc ymm1, ymm1, ymm7
+        vaesenc ymm2, ymm2, ymm7
+        vaesenc ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+32]
+        vaesenc ymm0, ymm0, ymm7
+        vaesenc ymm1, ymm1, ymm7
+        vaesenc ymm2, ymm2, ymm7
+        vaesenc ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+48]
+        vaesenc ymm0, ymm0, ymm7
+        vaesenc ymm1, ymm1, ymm7
+        vaesenc ymm2, ymm2, ymm7
+        vaesenc ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+64]
+        vaesenc ymm0, ymm0, ymm7
+        vaesenc ymm1, ymm1, ymm7
+        vaesenc ymm2, ymm2, ymm7
+        vaesenc ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+80]
+        vaesenc ymm0, ymm0, ymm7
+        vaesenc ymm1, ymm1, ymm7
+        vaesenc ymm2, ymm2, ymm7
+        vaesenc ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+96]
+        vaesenc ymm0, ymm0, ymm7
+        vaesenc ymm1, ymm1, ymm7
+        vaesenc ymm2, ymm2, ymm7
+        vaesenc ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+112]
+        vaesenc ymm0, ymm0, ymm7
+        vaesenc ymm1, ymm1, ymm7
+        vaesenc ymm2, ymm2, ymm7
+        vaesenc ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+128]
+        vaesenc ymm0, ymm0, ymm7
+        vaesenc ymm1, ymm1, ymm7
+        vaesenc ymm2, ymm2, ymm7
+        vaesenc ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+144]
+        vaesenc ymm0, ymm0, ymm7
+        vaesenc ymm1, ymm1, ymm7
+        vaesenc ymm2, ymm2, ymm7
+        vaesenc ymm3, ymm3, ymm7
+        ; fewer than 11 rounds: key at +160 is the last round key
+        cmp r10d, 11
+        vbroadcasti128 ymm7, [r9+160]
+        jl L_AES_ECB_encrypt_vaes_128_aes_enc_block_last
+        vaesenc ymm0, ymm0, ymm7
+        vaesenc ymm1, ymm1, ymm7
+        vaesenc ymm2, ymm2, ymm7
+        vaesenc ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+176]
+        vaesenc ymm0, ymm0, ymm7
+        vaesenc ymm1, ymm1, ymm7
+        vaesenc ymm2, ymm2, ymm7
+        vaesenc ymm3, ymm3, ymm7
+        ; fewer than 13 rounds: key at +192 is the last round key
+        cmp r10d, 13
+        vbroadcasti128 ymm7, [r9+192]
+        jl L_AES_ECB_encrypt_vaes_128_aes_enc_block_last
+        vaesenc ymm0, ymm0, ymm7
+        vaesenc ymm1, ymm1, ymm7
+        vaesenc ymm2, ymm2, ymm7
+        vaesenc ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+208]
+        vaesenc ymm0, ymm0, ymm7
+        vaesenc ymm1, ymm1, ymm7
+        vaesenc ymm2, ymm2, ymm7
+        vaesenc ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+224]
+L_AES_ECB_encrypt_vaes_128_aes_enc_block_last:
+        vaesenclast ymm0, ymm0, ymm7
+        vaesenclast ymm1, ymm1, ymm7
+        vaesenclast ymm2, ymm2, ymm7
+        vaesenclast ymm3, ymm3, ymm7
+        vmovdqu YMMWORD PTR [rdx+rax], ymm0
+        vmovdqu YMMWORD PTR [rdx+rax+32], ymm1
+        vmovdqu YMMWORD PTR [rdx+rax+64], ymm2
+        vmovdqu YMMWORD PTR [rdx+rax+96], ymm3
+        add eax, 128
+        cmp eax, r11d
+        jl L_AES_ECB_encrypt_vaes_enc_128
+L_AES_ECB_encrypt_vaes_done_128:
+        ; bound = len rounded down to a multiple of 32
+        mov r11d, r8d
+        and r11d, 4294967264
+        cmp eax, r11d
+        je L_AES_ECB_encrypt_vaes_done_32
+L_AES_ECB_encrypt_vaes_enc_32:
+        ; 32 bytes of input
+        ; aes_ecb_enc_32
+        vmovdqu ymm0, YMMWORD PTR [rcx+rax]
+        ; aes_enc_block
+        vbroadcasti128 ymm7, [r9]
+        vpxor ymm0, ymm0, ymm7
+        vbroadcasti128 ymm7, [r9+16]
+        vaesenc ymm0, ymm0, ymm7
+        vbroadcasti128 ymm7, [r9+32]
+        vaesenc ymm0, ymm0, ymm7
+        vbroadcasti128 ymm7, [r9+48]
+        vaesenc ymm0, ymm0, ymm7
+        vbroadcasti128 ymm7, [r9+64]
+        vaesenc ymm0, ymm0, ymm7
+        vbroadcasti128 ymm7, [r9+80]
+        vaesenc ymm0, ymm0, ymm7
+        vbroadcasti128 ymm7, [r9+96]
+        vaesenc ymm0, ymm0, ymm7
+        vbroadcasti128 ymm7, [r9+112]
+        vaesenc ymm0, ymm0, ymm7
+        vbroadcasti128 ymm7, [r9+128]
+        vaesenc ymm0, ymm0, ymm7
+        vbroadcasti128 ymm7, [r9+144]
+        vaesenc ymm0, ymm0, ymm7
+        cmp r10d, 11
+        vbroadcasti128 ymm7, [r9+160]
+        jl L_AES_ECB_encrypt_vaes_32_aes_enc_block_last
+        vaesenc ymm0, ymm0, ymm7
+        vbroadcasti128 ymm7, [r9+176]
+        vaesenc ymm0, ymm0, ymm7
+        cmp r10d, 13
+        vbroadcasti128 ymm7, [r9+192]
+        jl L_AES_ECB_encrypt_vaes_32_aes_enc_block_last
+        vaesenc ymm0, ymm0, ymm7
+        vbroadcasti128 ymm7, [r9+208]
+        vaesenc ymm0, ymm0, ymm7
+        vbroadcasti128 ymm7, [r9+224]
+L_AES_ECB_encrypt_vaes_32_aes_enc_block_last:
+        vaesenclast ymm0, ymm0, ymm7
+        vmovdqu YMMWORD PTR [rdx+rax], ymm0
+        add eax, 32
+        cmp eax, r11d
+        jl L_AES_ECB_encrypt_vaes_enc_32
+L_AES_ECB_encrypt_vaes_done_32:
+        cmp eax, r8d
+        mov r11d, r8d
+        je L_AES_ECB_encrypt_vaes_done_enc
+        ; bound = len rounded down to a multiple of 16
+        and r11d, 4294967280
+L_AES_ECB_encrypt_vaes_enc_16:
+        ; 16 bytes of input
+        vmovdqu xmm0, OWORD PTR [rcx+rax]
+        ; aes_enc_block
+        vpxor xmm0, xmm0, [r9]
+        vmovdqu xmm5, OWORD PTR [r9+16]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+32]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+48]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+64]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+80]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+96]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+112]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+128]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+144]
+        vaesenc xmm0, xmm0, xmm5
+        cmp r10d, 11
+        vmovdqu xmm5, OWORD PTR [r9+160]
+        jl L_AES_ECB_encrypt_vaes_16_aes_enc_block_last
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [r9+176]
+        vaesenc xmm0, xmm0, xmm6
+        cmp r10d, 13
+        vmovdqu xmm5, OWORD PTR [r9+192]
+        jl L_AES_ECB_encrypt_vaes_16_aes_enc_block_last
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [r9+208]
+        vaesenc xmm0, xmm0, xmm6
+        vmovdqu xmm5, OWORD PTR [r9+224]
+L_AES_ECB_encrypt_vaes_16_aes_enc_block_last:
+        vaesenclast xmm0, xmm0, xmm5
+        vmovdqu OWORD PTR [rdx+rax], xmm0
+        add eax, 16
+        cmp eax, r11d
+        jl L_AES_ECB_encrypt_vaes_enc_16
+L_AES_ECB_encrypt_vaes_done_enc:
+        vmovdqu xmm6, OWORD PTR [rsp]
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        add rsp, 32
+        ret
+AES_ECB_encrypt_vaes ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+; NOTE(review): in AES_ECB_decrypt_vaes the value loaded from [rsp+40] into
+; eax is zeroed by the following `xor eax, eax` (eax is then the byte offset
+; tested by `cmp eax, 11` / `cmp eax, 13`), and r9 - used as the round-key
+; base in every `[r9+N]` load - is overwritten by `mov r9d, r8d`. The round
+; count and the loop bound need registers of their own (e.g. r10d / r11d).
+AES_ECB_decrypt_vaes PROC
+        mov eax, DWORD PTR [rsp+40]
+        sub rsp, 32
+        vmovdqu OWORD PTR [rsp], xmm6
+        vmovdqu OWORD PTR [rsp+16], xmm7
+        xor eax, eax
+        cmp r8d, 128
+        mov r9d, r8d
+        jl L_AES_ECB_decrypt_vaes_done_128
+        and r9d, 4294967168
+L_AES_ECB_decrypt_vaes_dec_128:
+        ; 128 bytes of input
+        ; aes_ecb_dec_128
+        lea r10, QWORD PTR [rcx+rax]
+        lea r11, QWORD PTR [rdx+rax]
+        vmovdqu ymm0, YMMWORD PTR [r10]
+        vmovdqu ymm1, YMMWORD PTR [r10+32]
+        vmovdqu ymm2, YMMWORD PTR [r10+64]
+        vmovdqu ymm3, YMMWORD PTR [r10+96]
+        ; aes_dec_block
+        vbroadcasti128 ymm7, [r9]
+        vpxor ymm0, ymm0, ymm7
+        vpxor ymm1, ymm1, ymm7
+        vpxor ymm2, ymm2, ymm7
+        vpxor ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+16]
+        vaesdec ymm0, ymm0, ymm7
+        vaesdec ymm1, ymm1, ymm7
+        vaesdec ymm2, ymm2, ymm7
+        vaesdec ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+32]
+        vaesdec ymm0, ymm0, ymm7
+        vaesdec ymm1, ymm1, ymm7
+        vaesdec ymm2, ymm2, ymm7
+        vaesdec ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+48]
+        vaesdec ymm0, ymm0, ymm7
+        vaesdec ymm1, ymm1, ymm7
+        vaesdec ymm2, ymm2, ymm7
+        vaesdec ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+64]
+        vaesdec ymm0, ymm0, ymm7
+        vaesdec ymm1, ymm1,
ymm7 + vaesdec ymm2, ymm2, ymm7 + vaesdec ymm3, ymm3, ymm7 + vbroadcasti128 ymm7, [r9+80] + vaesdec ymm0, ymm0, ymm7 + vaesdec ymm1, ymm1, ymm7 + vaesdec ymm2, ymm2, ymm7 + vaesdec ymm3, ymm3, ymm7 + vbroadcasti128 ymm7, [r9+96] + vaesdec ymm0, ymm0, ymm7 + vaesdec ymm1, ymm1, ymm7 + vaesdec ymm2, ymm2, ymm7 + vaesdec ymm3, ymm3, ymm7 + vbroadcasti128 ymm7, [r9+112] + vaesdec ymm0, ymm0, ymm7 + vaesdec ymm1, ymm1, ymm7 + vaesdec ymm2, ymm2, ymm7 + vaesdec ymm3, ymm3, ymm7 + vbroadcasti128 ymm7, [r9+128] + vaesdec ymm0, ymm0, ymm7 + vaesdec ymm1, ymm1, ymm7 + vaesdec ymm2, ymm2, ymm7 + vaesdec ymm3, ymm3, ymm7 + vbroadcasti128 ymm7, [r9+144] + vaesdec ymm0, ymm0, ymm7 + vaesdec ymm1, ymm1, ymm7 + vaesdec ymm2, ymm2, ymm7 + vaesdec ymm3, ymm3, ymm7 + cmp eax, 11 + vbroadcasti128 ymm7, [r9+160] + jl L_AES_ECB_decrypt_vaes_128_aes_dec_block_last + vaesdec ymm0, ymm0, ymm7 + vaesdec ymm1, ymm1, ymm7 + vaesdec ymm2, ymm2, ymm7 + vaesdec ymm3, ymm3, ymm7 + vbroadcasti128 ymm7, [r9+176] + vaesdec ymm0, ymm0, ymm7 + vaesdec ymm1, ymm1, ymm7 + vaesdec ymm2, ymm2, ymm7 + vaesdec ymm3, ymm3, ymm7 + cmp eax, 13 + vbroadcasti128 ymm7, [r9+192] + jl L_AES_ECB_decrypt_vaes_128_aes_dec_block_last + vaesdec ymm0, ymm0, ymm7 + vaesdec ymm1, ymm1, ymm7 + vaesdec ymm2, ymm2, ymm7 + vaesdec ymm3, ymm3, ymm7 + vbroadcasti128 ymm7, [r9+208] + vaesdec ymm0, ymm0, ymm7 + vaesdec ymm1, ymm1, ymm7 + vaesdec ymm2, ymm2, ymm7 + vaesdec ymm3, ymm3, ymm7 + vbroadcasti128 ymm7, [r9+224] +L_AES_ECB_decrypt_vaes_128_aes_dec_block_last: + vaesdeclast ymm0, ymm0, ymm7 + vaesdeclast ymm1, ymm1, ymm7 + vaesdeclast ymm2, ymm2, ymm7 + vaesdeclast ymm3, ymm3, ymm7 + vmovdqu YMMWORD PTR [r11], ymm0 + vmovdqu YMMWORD PTR [r11+32], ymm1 + vmovdqu YMMWORD PTR [r11+64], ymm2 + vmovdqu YMMWORD PTR [r11+96], ymm3 + add eax, 128 + cmp eax, r9d + jl L_AES_ECB_decrypt_vaes_dec_128 +L_AES_ECB_decrypt_vaes_done_128: + mov r9d, r8d + and r9d, 4294967264 + cmp eax, r9d + je L_AES_ECB_decrypt_vaes_done_32 
+L_AES_ECB_decrypt_vaes_dec_32: + ; 32 bytes of input + ; aes_ecb_dec_32 + lea r10, QWORD PTR [rcx+rax] + lea r11, QWORD PTR [rdx+rax] + vmovdqu ymm0, YMMWORD PTR [r10] + ; aes_dec_block + vbroadcasti128 ymm7, [r9] + vpxor ymm0, ymm0, ymm7 + vbroadcasti128 ymm7, [r9+16] + vaesdec ymm0, ymm0, ymm7 + vbroadcasti128 ymm7, [r9+32] + vaesdec ymm0, ymm0, ymm7 + vbroadcasti128 ymm7, [r9+48] + vaesdec ymm0, ymm0, ymm7 + vbroadcasti128 ymm7, [r9+64] + vaesdec ymm0, ymm0, ymm7 + vbroadcasti128 ymm7, [r9+80] + vaesdec ymm0, ymm0, ymm7 + vbroadcasti128 ymm7, [r9+96] + vaesdec ymm0, ymm0, ymm7 + vbroadcasti128 ymm7, [r9+112] + vaesdec ymm0, ymm0, ymm7 + vbroadcasti128 ymm7, [r9+128] + vaesdec ymm0, ymm0, ymm7 + vbroadcasti128 ymm7, [r9+144] + vaesdec ymm0, ymm0, ymm7 + cmp eax, 11 + vbroadcasti128 ymm7, [r9+160] + jl L_AES_ECB_decrypt_vaes_32_aes_dec_block_last + vaesdec ymm0, ymm0, ymm7 + vbroadcasti128 ymm7, [r9+176] + vaesdec ymm0, ymm0, ymm7 + cmp eax, 13 + vbroadcasti128 ymm7, [r9+192] + jl L_AES_ECB_decrypt_vaes_32_aes_dec_block_last + vaesdec ymm0, ymm0, ymm7 + vbroadcasti128 ymm7, [r9+208] + vaesdec ymm0, ymm0, ymm7 + vbroadcasti128 ymm7, [r9+224] +L_AES_ECB_decrypt_vaes_32_aes_dec_block_last: + vaesdeclast ymm0, ymm0, ymm7 + vmovdqu YMMWORD PTR [r11], ymm0 + add eax, 32 + cmp eax, r9d + jl L_AES_ECB_decrypt_vaes_dec_32 +L_AES_ECB_decrypt_vaes_done_32: + cmp eax, r8d + mov r9d, r8d + je L_AES_ECB_decrypt_vaes_done_dec + and r9d, 4294967280 +L_AES_ECB_decrypt_vaes_dec_16: + ; 16 bytes of input + lea r10, QWORD PTR [rcx+rax] + vmovdqu xmm0, OWORD PTR [r10] + ; aes_dec_block + vpxor xmm0, xmm0, [r9] + vmovdqu xmm5, OWORD PTR [r9+16] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+32] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+48] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+64] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+80] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+96] + vaesdec xmm0, xmm0, xmm5 + vmovdqu 
xmm5, OWORD PTR [r9+112]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+128]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+144]
+        vaesdec xmm0, xmm0, xmm5
+        cmp eax, 11
+        vmovdqu xmm5, OWORD PTR [r9+160]
+        jl L_AES_ECB_decrypt_vaes_16_aes_dec_block_last
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [r9+176]
+        vaesdec xmm0, xmm0, xmm6
+        cmp eax, 13
+        vmovdqu xmm5, OWORD PTR [r9+192]
+        jl L_AES_ECB_decrypt_vaes_16_aes_dec_block_last
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [r9+208]
+        vaesdec xmm0, xmm0, xmm6
+        vmovdqu xmm5, OWORD PTR [r9+224]
+L_AES_ECB_decrypt_vaes_16_aes_dec_block_last:
+        vaesdeclast xmm0, xmm0, xmm5
+        lea r10, QWORD PTR [rdx+rax]
+        vmovdqu OWORD PTR [r10], xmm0
+        add eax, 16
+        cmp eax, r9d
+        jl L_AES_ECB_decrypt_vaes_dec_16
+L_AES_ECB_decrypt_vaes_done_dec:
+        vmovdqu xmm6, OWORD PTR [rsp]
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        add rsp, 32
+        ret
+AES_ECB_decrypt_vaes ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+; AES_CBC_encrypt_vaes - AES-CBC encryption, one 16-byte block at a time
+; (each block depends on the previous ciphertext block, so it is serial).
+;
+; Register use (Windows x64 argument registers):
+;   rcx        input buffer (read at [rcx+offset])
+;   rdx        output buffer (written at [rdx+offset])
+;   r8         16-byte chaining value: loaded on entry, and the last
+;              ciphertext block is stored back on exit
+;   r9d        length in bytes
+;   [rsp+40]   round-key schedule, 16 bytes per round key
+;   [rsp+48]   value compared against 11 and 13 to choose how many rounds
+;              to run (presumably the AES round count 10/12/14)
+;
+; The two stack arguments are kept in r10 / r11d for the whole routine and
+; eax is used only as the byte offset, so neither is overwritten in the loop.
+; Only volatile registers (rax, r10, r11, xmm0-xmm4) are modified.
+AES_CBC_encrypt_vaes PROC
+        mov r10, QWORD PTR [rsp+40]
+        mov r11d, DWORD PTR [rsp+48]
+        vmovdqu xmm0, OWORD PTR [r8]
+        xor eax, eax
+        cmp eax, r9d
+        je L_AES_CBC_encrypt_vaes_done
+L_AES_CBC_encrypt_vaes_loop:
+        ; 16 bytes of input
+        vmovdqu xmm1, OWORD PTR [rcx+rax]
+        ; XOR plaintext with previous ciphertext block (or the initial value)
+        vpxor xmm1, xmm1, xmm0
+        ; aes_enc_block
+        vpxor xmm1, xmm1, [r10]
+        vmovdqu xmm3, OWORD PTR [r10+16]
+        vaesenc xmm1, xmm1, xmm3
+        vmovdqu xmm3, OWORD PTR [r10+32]
+        vaesenc xmm1, xmm1, xmm3
+        vmovdqu xmm3, OWORD PTR [r10+48]
+        vaesenc xmm1, xmm1, xmm3
+        vmovdqu xmm3, OWORD PTR [r10+64]
+        vaesenc xmm1, xmm1, xmm3
+        vmovdqu xmm3, OWORD PTR [r10+80]
+        vaesenc xmm1, xmm1, xmm3
+        vmovdqu xmm3, OWORD PTR [r10+96]
+        vaesenc xmm1, xmm1, xmm3
+        vmovdqu xmm3, OWORD PTR [r10+112]
+        vaesenc xmm1, xmm1, xmm3
+        vmovdqu xmm3, OWORD PTR [r10+128]
+        vaesenc xmm1, xmm1, xmm3
+        vmovdqu xmm3, OWORD PTR [r10+144]
+        vaesenc xmm1, xmm1, xmm3
+        ; fewer than 11 rounds: key at +160 is the last round key
+        cmp r11d, 11
+        vmovdqu xmm3, OWORD PTR [r10+160]
+        jl L_AES_CBC_encrypt_vaes_aes_enc_block_last
+        vaesenc xmm1, xmm1, xmm3
+        vmovdqu xmm4, OWORD PTR [r10+176]
+        vaesenc xmm1, xmm1, xmm4
+        ; fewer than 13 rounds: key at +192 is the last round key
+        cmp r11d, 13
+        vmovdqu xmm3, OWORD PTR [r10+192]
+        jl L_AES_CBC_encrypt_vaes_aes_enc_block_last
+        vaesenc xmm1, xmm1, xmm3
+        vmovdqu xmm4, OWORD PTR [r10+208]
+        vaesenc xmm1, xmm1, xmm4
+        vmovdqu xmm3, OWORD PTR [r10+224]
+L_AES_CBC_encrypt_vaes_aes_enc_block_last:
+        vaesenclast xmm1, xmm1, xmm3
+        vmovdqu OWORD PTR [rdx+rax], xmm1
+        ; ciphertext becomes the chaining value for the next block
+        vmovdqa xmm0, xmm1
+        add eax, 16
+        cmp eax, r9d
+        jl L_AES_CBC_encrypt_vaes_loop
+L_AES_CBC_encrypt_vaes_done:
+        vmovdqu OWORD PTR [r8], xmm0
+        ret
+AES_CBC_encrypt_vaes ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+; NOTE(review): in AES_CBC_decrypt_vaes the pointer loaded from [rsp+48] into
+; rax is zeroed by the following `xor eax, eax`, yet round keys are read from
+; `[rax+N]` (rax is the byte offset by then); the value loaded from [rsp+56]
+; into r10d is overwritten by `mov r10d, r9d` and r10d is then both the loop
+; bound and the operand of `cmp r10d, 11` / `cmp r10d, 13`. The key pointer
+; and round count need registers that are not reused.
+AES_CBC_decrypt_vaes PROC
+        push r12
+        mov rax, QWORD PTR [rsp+48]
+        mov r10d, DWORD PTR [rsp+56]
+        sub rsp, 128
+        vmovdqu OWORD PTR [rsp], xmm6
+        vmovdqu OWORD PTR [rsp+16], xmm7
+        vmovdqu OWORD PTR [rsp+32], xmm8
+        vmovdqu OWORD PTR [rsp+48], xmm9
+        vmovdqu OWORD PTR [rsp+64], xmm10
+        vmovdqu OWORD PTR [rsp+80], xmm11
+        vmovdqu OWORD PTR [rsp+96], xmm12
+        vmovdqu OWORD PTR [rsp+112], xmm13
+        vmovdqu xmm8, OWORD PTR [r8]
+        xor eax, eax
+        cmp r9d, 128
+        mov r10d, r9d
+        jl L_AES_CBC_decrypt_vaes_done_128
+        and r10d, 4294967168
+L_AES_CBC_decrypt_vaes_dec_128:
+        ; 128 bytes of input
+        ; aes_cbc_dec_128
+        lea r11, QWORD PTR [rcx+rax]
+        lea r12, QWORD PTR [rdx+rax]
+        vmovdqu ymm0, YMMWORD PTR [r11]
+        vmovdqu ymm1, YMMWORD PTR [r11+32]
+        vmovdqu ymm2, YMMWORD PTR [r11+64]
+        vmovdqu ymm3, YMMWORD PTR [r11+96]
+        vinserti128 ymm10, ymm8, xmm0, 1
+        vmovdqu ymm11, YMMWORD PTR [r11+16]
+        vmovdqu ymm12, YMMWORD PTR [r11+48]
+        vmovdqu ymm13, YMMWORD PTR [r11+80]
+        vextracti128 xmm8, ymm3, 1
+        ; aes_dec_block
+        vbroadcasti128 ymm9, [rax]
+        vpxor ymm0, ymm0, ymm9
+        vpxor ymm1, ymm1, ymm9
+        vpxor ymm2, ymm2, ymm9
+        vpxor ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [rax+16]
+        vaesdec ymm0, ymm0, ymm9
+        vaesdec ymm1, ymm1, ymm9
+        vaesdec ymm2, ymm2, ymm9
+        vaesdec ymm3, ymm3, ymm9
+
vbroadcasti128 ymm9, [rax+32] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [rax+48] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [rax+64] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [rax+80] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [rax+96] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [rax+112] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [rax+128] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [rax+144] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + cmp r10d, 11 + vbroadcasti128 ymm9, [rax+160] + jl L_AES_CBC_decrypt_vaes_128_aes_dec_block_last + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [rax+176] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + cmp r10d, 13 + vbroadcasti128 ymm9, [rax+192] + jl L_AES_CBC_decrypt_vaes_128_aes_dec_block_last + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [rax+208] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [rax+224] +L_AES_CBC_decrypt_vaes_128_aes_dec_block_last: + vaesdeclast ymm0, ymm0, ymm9 + vaesdeclast ymm1, ymm1, ymm9 + vaesdeclast ymm2, ymm2, ymm9 + 
vaesdeclast ymm3, ymm3, ymm9 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vmovdqu YMMWORD PTR [r12], ymm0 + vmovdqu YMMWORD PTR [r12+32], ymm1 + vmovdqu YMMWORD PTR [r12+64], ymm2 + vmovdqu YMMWORD PTR [r12+96], ymm3 + add eax, 128 + cmp eax, r10d + jl L_AES_CBC_decrypt_vaes_dec_128 +L_AES_CBC_decrypt_vaes_done_128: + mov r10d, r9d + and r10d, 4294967264 + cmp eax, r10d + je L_AES_CBC_decrypt_vaes_done_32 +L_AES_CBC_decrypt_vaes_dec_32: + ; 32 bytes of input + ; aes_cbc_dec_32 + lea r11, QWORD PTR [rcx+rax] + lea r12, QWORD PTR [rdx+rax] + vmovdqu ymm0, YMMWORD PTR [r11] + vinserti128 ymm10, ymm8, xmm0, 1 + vextracti128 xmm8, ymm0, 1 + ; aes_dec_block + vbroadcasti128 ymm9, [rax] + vpxor ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [rax+16] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [rax+32] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [rax+48] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [rax+64] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [rax+80] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [rax+96] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [rax+112] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [rax+128] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [rax+144] + vaesdec ymm0, ymm0, ymm9 + cmp r10d, 11 + vbroadcasti128 ymm9, [rax+160] + jl L_AES_CBC_decrypt_vaes_32_aes_dec_block_last + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [rax+176] + vaesdec ymm0, ymm0, ymm9 + cmp r10d, 13 + vbroadcasti128 ymm9, [rax+192] + jl L_AES_CBC_decrypt_vaes_32_aes_dec_block_last + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [rax+208] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [rax+224] +L_AES_CBC_decrypt_vaes_32_aes_dec_block_last: + vaesdeclast ymm0, ymm0, ymm9 + vpxor ymm0, ymm0, ymm10 + vmovdqu YMMWORD PTR [r12], ymm0 + add eax, 32 + cmp eax, r10d + jl L_AES_CBC_decrypt_vaes_dec_32 +L_AES_CBC_decrypt_vaes_done_32: + cmp eax, r9d + mov r10d, r9d + je 
L_AES_CBC_decrypt_vaes_done_dec
+        and r10d, 4294967280
+L_AES_CBC_decrypt_vaes_dec_16:
+        ; 16 bytes of input
+        lea r11, QWORD PTR [rcx+rax]
+        vmovdqu xmm0, OWORD PTR [r11]
+        vmovdqa xmm7, xmm0
+        ; aes_dec_block
+        vpxor xmm0, xmm0, [rax]
+        vmovdqu xmm5, OWORD PTR [rax+16]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [rax+32]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [rax+48]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [rax+64]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [rax+80]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [rax+96]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [rax+112]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [rax+128]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [rax+144]
+        vaesdec xmm0, xmm0, xmm5
+        cmp r10d, 11
+        vmovdqu xmm5, OWORD PTR [rax+160]
+        jl L_AES_CBC_decrypt_vaes_16_aes_dec_block_last
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [rax+176]
+        vaesdec xmm0, xmm0, xmm6
+        cmp r10d, 13
+        vmovdqu xmm5, OWORD PTR [rax+192]
+        jl L_AES_CBC_decrypt_vaes_16_aes_dec_block_last
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [rax+208]
+        vaesdec xmm0, xmm0, xmm6
+        vmovdqu xmm5, OWORD PTR [rax+224]
+L_AES_CBC_decrypt_vaes_16_aes_dec_block_last:
+        vaesdeclast xmm0, xmm0, xmm5
+        vpxor xmm0, xmm0, xmm8
+        vmovdqa xmm8, xmm7
+        lea r11, QWORD PTR [rdx+rax]
+        vmovdqu OWORD PTR [r11], xmm0
+        add eax, 16
+        cmp eax, r10d
+        jl L_AES_CBC_decrypt_vaes_dec_16
+L_AES_CBC_decrypt_vaes_done_dec:
+        vmovdqu OWORD PTR [r8], xmm8
+        vmovdqu xmm6, OWORD PTR [rsp]
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        vmovdqu xmm8, OWORD PTR [rsp+32]
+        vmovdqu xmm9, OWORD PTR [rsp+48]
+        vmovdqu xmm10, OWORD PTR [rsp+64]
+        vmovdqu xmm11, OWORD PTR [rsp+80]
+        vmovdqu xmm12, OWORD PTR [rsp+96]
+        vmovdqu xmm13, OWORD PTR [rsp+112]
+        add rsp, 128
+        pop r12
+        ret
+AES_CBC_decrypt_vaes ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+; vpshufb mask that reverses the 16 bytes of each 128-bit lane.
+L_aes_ctr_bswap_vaes QWORD \
+        08090a0b0c0d0e0fh, 0001020304050607h
+ptr_L_aes_ctr_bswap_vaes QWORD L_aes_ctr_bswap_vaes
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+; 128-bit increment constants 0..16; entry k is at byte offset 16*k.
+; A 32-byte load at offset 32*j yields the pair (2j, 2j+1) for the two
+; lanes of a ymm register.
+L_aes_ctr_inc_vaes QWORD \
+        0000000000000000h, 0000000000000000h,
+        0000000000000001h, 0000000000000000h,
+        0000000000000002h, 0000000000000000h,
+        0000000000000003h, 0000000000000000h,
+        0000000000000004h, 0000000000000000h,
+        0000000000000005h, 0000000000000000h,
+        0000000000000006h, 0000000000000000h,
+        0000000000000007h, 0000000000000000h,
+        0000000000000008h, 0000000000000000h,
+        0000000000000009h, 0000000000000000h,
+        000000000000000ah, 0000000000000000h,
+        000000000000000bh, 0000000000000000h,
+        000000000000000ch, 0000000000000000h,
+        000000000000000dh, 0000000000000000h,
+        000000000000000eh, 0000000000000000h,
+        000000000000000fh, 0000000000000000h,
+        0000000000000010h, 0000000000000000h
+ptr_L_aes_ctr_inc_vaes QWORD L_aes_ctr_inc_vaes
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+; AES_CTR_encrypt_vaes - AES-CTR keystream generation and XOR using VAES on
+; 256-bit registers.
+;
+; Register use (Windows x64 argument registers):
+;   rcx        input buffer (read at [rcx+offset])
+;   rdx        output buffer (written at [rdx+offset])
+;   r8d        length in bytes
+;   r9         round-key schedule, 16 bytes per round key at [r9+16*i]
+;   5th arg    value compared against 11 and 13 to choose how many rounds
+;              to run (presumably the AES round count 10/12/14)
+;   6th arg    pointer to the 16-byte counter block: read on entry, and the
+;              next counter value is stored back on exit
+;
+; Each register below has exactly one role for the whole routine:
+;   eax  = byte offset of the next block
+;   ebx  = round count (rbx is callee-saved, hence the push/pop)
+;   r10d = loop bound for the current pass
+;   r11  = counter-block pointer
+;
+; The counter is byte-reversed once (ymm8 mask) so it can be incremented
+; with 64-bit adds; the carry out of the low qword is recovered from the top
+; bit of (a AND b) OR ((a OR b) AND NOT sum) and added to the high qword.
+; Constants are read from the L_aes_ctr_* tables themselves, not from the
+; 8-byte ptr_L_* pointer variables.
+AES_CTR_encrypt_vaes PROC
+        push rbx
+        mov ebx, DWORD PTR [rsp+48]
+        mov r11, QWORD PTR [rsp+56]
+        sub rsp, 144
+        vmovdqu OWORD PTR [rsp], xmm6
+        vmovdqu OWORD PTR [rsp+16], xmm7
+        vmovdqu OWORD PTR [rsp+32], xmm8
+        vmovdqu OWORD PTR [rsp+48], xmm9
+        vmovdqu OWORD PTR [rsp+64], xmm10
+        vmovdqu OWORD PTR [rsp+80], xmm11
+        vmovdqu OWORD PTR [rsp+96], xmm12
+        vmovdqu OWORD PTR [rsp+112], xmm13
+        vmovdqu OWORD PTR [rsp+128], xmm14
+        vbroadcasti128 ymm8, OWORD PTR [L_aes_ctr_bswap_vaes]
+        vbroadcasti128 ymm7, [r11]
+        vpshufb ymm7, ymm7, ymm8
+        ; ymm10 = +8, ymm11 = +2, ymm12 = +1 in every 128-bit lane
+        vbroadcasti128 ymm10, OWORD PTR [L_aes_ctr_inc_vaes+128]
+        vbroadcasti128 ymm11, OWORD PTR [L_aes_ctr_inc_vaes+32]
+        vbroadcasti128 ymm12, OWORD PTR [L_aes_ctr_inc_vaes+16]
+        xor eax, eax
+        cmp r8d, 128
+        mov r10d, r8d
+        jl L_AES_CTR_encrypt_vaes_done_128
+        ; bound = len rounded down to a multiple of 128
+        and r10d, 4294967168
+        ; ymm4..ymm7 = counter + (0,1), (2,3), (4,5), (6,7)
+        vmovdqa ymm9, ymm7
+        vpaddq ymm4, ymm7, YMMWORD PTR [L_aes_ctr_inc_vaes]
+        vpand ymm14, ymm9, YMMWORD PTR [L_aes_ctr_inc_vaes]
+        vpor ymm9, ymm9, YMMWORD PTR [L_aes_ctr_inc_vaes]
+        vpandn ymm9, ymm4, ymm9
+        vpor ymm9, ymm9, ymm14
+        vpsrlq ymm9, ymm9, 63
+        vpslldq ymm9, ymm9, 8
+        vpaddq ymm4, ymm4, ymm9
+        vmovdqa ymm9, ymm7
+        vpaddq ymm5, ymm7, YMMWORD PTR [L_aes_ctr_inc_vaes+32]
+        vpand ymm14, ymm9, YMMWORD PTR [L_aes_ctr_inc_vaes+32]
+        vpor ymm9, ymm9, YMMWORD PTR [L_aes_ctr_inc_vaes+32]
+        vpandn ymm9, ymm5, ymm9
+        vpor ymm9, ymm9, ymm14
+        vpsrlq ymm9, ymm9, 63
+        vpslldq ymm9, ymm9, 8
+        vpaddq ymm5, ymm5, ymm9
+        vmovdqa ymm9, ymm7
+        vpaddq ymm6, ymm7, YMMWORD PTR [L_aes_ctr_inc_vaes+64]
+        vpand ymm14, ymm9, YMMWORD PTR [L_aes_ctr_inc_vaes+64]
+        vpor ymm9, ymm9, YMMWORD PTR [L_aes_ctr_inc_vaes+64]
+        vpandn ymm9, ymm6, ymm9
+        vpor ymm9, ymm9, ymm14
+        vpsrlq ymm9, ymm9, 63
+        vpslldq ymm9, ymm9, 8
+        vpaddq ymm6, ymm6, ymm9
+        vmovdqa ymm9, ymm7
+        vpaddq ymm7, ymm7, YMMWORD PTR [L_aes_ctr_inc_vaes+96]
+        vpand ymm14, ymm9, YMMWORD PTR [L_aes_ctr_inc_vaes+96]
+        vpor ymm9, ymm9, YMMWORD PTR [L_aes_ctr_inc_vaes+96]
+        vpandn ymm9, ymm7, ymm9
+        vpor ymm9, ymm9, ymm14
+        vpsrlq ymm9, ymm9, 63
+        vpslldq ymm9, ymm9, 8
+        vpaddq ymm7, ymm7, ymm9
+L_AES_CTR_encrypt_vaes_enc_128:
+        ; 128 bytes of input
+        vpshufb ymm0, ymm4, ymm8
+        vpshufb ymm1, ymm5, ymm8
+        vpshufb ymm2, ymm6, ymm8
+        vpshufb ymm3, ymm7, ymm8
+        ; advance all eight counters by 8 for the next iteration
+        vmovdqa ymm9, ymm4
+        vpaddq ymm4, ymm4, ymm10
+        vpand ymm14, ymm9, ymm10
+        vpor ymm9, ymm9, ymm10
+        vpandn ymm9, ymm4, ymm9
+        vpor ymm9, ymm9, ymm14
+        vpsrlq ymm9, ymm9, 63
+        vpslldq ymm9, ymm9, 8
+        vpaddq ymm4, ymm4, ymm9
+        vmovdqa ymm9, ymm5
+        vpaddq ymm5, ymm5, ymm10
+        vpand ymm14, ymm9, ymm10
+        vpor ymm9, ymm9, ymm10
+        vpandn ymm9, ymm5, ymm9
+        vpor ymm9, ymm9, ymm14
+        vpsrlq ymm9, ymm9, 63
+        vpslldq ymm9, ymm9, 8
+        vpaddq ymm5, ymm5, ymm9
+        vmovdqa ymm9, ymm6
+        vpaddq ymm6, ymm6, ymm10
+        vpand ymm14, ymm9, ymm10
+        vpor ymm9, ymm9, ymm10
+        vpandn ymm9, ymm6, ymm9
+        vpor ymm9, ymm9, ymm14
+        vpsrlq ymm9, ymm9, 63
+        vpslldq ymm9, ymm9, 8
+        vpaddq ymm6, ymm6, ymm9
+        vmovdqa ymm9, ymm7
+        vpaddq ymm7, ymm7, ymm10
+        vpand ymm14, ymm9, ymm10
+        vpor ymm9, ymm9, ymm10
+        vpandn ymm9, ymm7, ymm9
+        vpor ymm9, ymm9, ymm14
+        vpsrlq ymm9, ymm9, 63
+        vpslldq ymm9, ymm9, 8
+        vpaddq ymm7, ymm7, ymm9
+        ; aes_enc_block
+        vbroadcasti128 ymm13, [r9]
+        vpxor ymm0, ymm0, ymm13
+        vpxor ymm1, ymm1, ymm13
+        vpxor ymm2, ymm2, ymm13
+        vpxor ymm3, ymm3, ymm13
+        vbroadcasti128 ymm13, [r9+16]
+        vaesenc ymm0, ymm0, ymm13
+        vaesenc ymm1, ymm1, ymm13
+        vaesenc ymm2, ymm2, ymm13
+        vaesenc ymm3, ymm3, ymm13
+        vbroadcasti128 ymm13, [r9+32]
+        vaesenc ymm0, ymm0, ymm13
+        vaesenc ymm1, ymm1, ymm13
+        vaesenc ymm2, ymm2, ymm13
+        vaesenc ymm3, ymm3, ymm13
+        vbroadcasti128 ymm13, [r9+48]
+        vaesenc ymm0, ymm0, ymm13
+        vaesenc ymm1, ymm1, ymm13
+        vaesenc ymm2, ymm2, ymm13
+        vaesenc ymm3, ymm3, ymm13
+        vbroadcasti128 ymm13, [r9+64]
+        vaesenc ymm0, ymm0, ymm13
+        vaesenc ymm1, ymm1, ymm13
+        vaesenc ymm2, ymm2, ymm13
+        vaesenc ymm3, ymm3, ymm13
+        vbroadcasti128 ymm13, [r9+80]
+        vaesenc ymm0, ymm0, ymm13
+        vaesenc ymm1, ymm1, ymm13
+        vaesenc ymm2, ymm2, ymm13
+        vaesenc ymm3, ymm3, ymm13
+        vbroadcasti128 ymm13, [r9+96]
+        vaesenc ymm0, ymm0, ymm13
+        vaesenc ymm1, ymm1, ymm13
+        vaesenc ymm2, ymm2, ymm13
+        vaesenc ymm3, ymm3, ymm13
+        vbroadcasti128 ymm13, [r9+112]
+        vaesenc ymm0, ymm0, ymm13
+        vaesenc ymm1, ymm1, ymm13
+        vaesenc ymm2, ymm2, ymm13
+        vaesenc ymm3, ymm3, ymm13
+        vbroadcasti128 ymm13, [r9+128]
+        vaesenc ymm0, ymm0, ymm13
+        vaesenc ymm1, ymm1, ymm13
+        vaesenc ymm2, ymm2, ymm13
+        vaesenc ymm3, ymm3, ymm13
+        vbroadcasti128 ymm13, [r9+144]
+        vaesenc ymm0, ymm0, ymm13
+        vaesenc ymm1, ymm1, ymm13
+        vaesenc ymm2, ymm2, ymm13
+        vaesenc ymm3, ymm3, ymm13
+        ; fewer than 11 rounds: key at +160 is the last round key
+        cmp ebx, 11
+        vbroadcasti128 ymm13, [r9+160]
+        jl L_AES_CTR_encrypt_vaes_128_aes_enc_block_last
+        vaesenc ymm0, ymm0, ymm13
+        vaesenc ymm1, ymm1, ymm13
+        vaesenc ymm2, ymm2, ymm13
+        vaesenc ymm3, ymm3, ymm13
+        vbroadcasti128 ymm13, [r9+176]
+        vaesenc ymm0, ymm0, ymm13
+        vaesenc ymm1, ymm1, ymm13
+        vaesenc ymm2, ymm2, ymm13
+        vaesenc ymm3, ymm3, ymm13
+        ; fewer than 13 rounds: key at +192 is the last round key
+        cmp ebx, 13
+        vbroadcasti128 ymm13, [r9+192]
+        jl L_AES_CTR_encrypt_vaes_128_aes_enc_block_last
+        vaesenc ymm0, ymm0, ymm13
+        vaesenc ymm1, ymm1, ymm13
+        vaesenc ymm2, ymm2, ymm13
+        vaesenc ymm3, ymm3, ymm13
+        vbroadcasti128 ymm13, [r9+208]
+        vaesenc ymm0, ymm0, ymm13
+        vaesenc ymm1, ymm1, ymm13
+        vaesenc ymm2, ymm2, ymm13
+        vaesenc ymm3, ymm3, ymm13
+        vbroadcasti128 ymm13, [r9+224]
+L_AES_CTR_encrypt_vaes_128_aes_enc_block_last:
+        vaesenclast ymm0, ymm0, ymm13
+        vaesenclast ymm1, ymm1, ymm13
+        vaesenclast ymm2, ymm2, ymm13
+        vaesenclast ymm3, ymm3, ymm13
+        vpxor ymm0, ymm0, YMMWORD PTR [rcx+rax]
+        vpxor ymm1, ymm1, YMMWORD PTR [rcx+rax+32]
+        vpxor ymm2, ymm2, YMMWORD PTR [rcx+rax+64]
+        vpxor ymm3, ymm3, YMMWORD PTR [rcx+rax+96]
+        vmovdqu YMMWORD PTR [rdx+rax], ymm0
+        vmovdqu YMMWORD PTR [rdx+rax+32], ymm1
+        vmovdqu YMMWORD PTR [rdx+rax+64], ymm2
+        vmovdqu YMMWORD PTR [rdx+rax+96], ymm3
+        add eax, 128
+        cmp eax, r10d
+        jl L_AES_CTR_encrypt_vaes_enc_128
+        ; ymm7 = next unused counter replicated in both lanes
+        vperm2i128 ymm7, ymm4, ymm4, 0
+L_AES_CTR_encrypt_vaes_done_128:
+        ; bound = len rounded down to a multiple of 32
+        mov r10d, r8d
+        and r10d, 4294967264
+        cmp eax, r10d
+        je L_AES_CTR_encrypt_vaes_done_32
+L_AES_CTR_encrypt_vaes_enc_32:
+        ; 32 bytes of input
+        ; aes_ctr_enc_32
+        ; ymm0 = counter + (0,1)
+        vpaddq ymm0, ymm7, YMMWORD PTR [L_aes_ctr_inc_vaes]
+        vmovdqa ymm9, ymm7
+        vpand ymm14, ymm9, YMMWORD PTR [L_aes_ctr_inc_vaes]
+        vpor ymm9, ymm9, YMMWORD PTR [L_aes_ctr_inc_vaes]
+        vpandn ymm9, ymm0, ymm9
+        vpor ymm9, ymm9, ymm14
+        vpsrlq ymm9, ymm9, 63
+        vpslldq ymm9, ymm9, 8
+        vpaddq ymm0, ymm0, ymm9
+        vpshufb ymm0, ymm0, ymm8
+        ; counter += 2
+        vmovdqa ymm9, ymm7
+        vpaddq ymm7, ymm7, ymm11
+        vpand ymm14, ymm9, ymm11
+        vpor ymm9, ymm9, ymm11
+        vpandn ymm9, ymm7, ymm9
+        vpor ymm9, ymm9, ymm14
+        vpsrlq ymm9, ymm9, 63
+        vpslldq ymm9, ymm9, 8
+        vpaddq ymm7, ymm7, ymm9
+        ; aes_enc_block
+        vbroadcasti128 ymm13, [r9]
+        vpxor ymm0, ymm0, ymm13
+        vbroadcasti128 ymm13, [r9+16]
+        vaesenc ymm0, ymm0, ymm13
+        vbroadcasti128 ymm13, [r9+32]
+        vaesenc ymm0, ymm0, ymm13
+        vbroadcasti128 ymm13, [r9+48]
+        vaesenc ymm0, ymm0, ymm13
+        vbroadcasti128 ymm13, [r9+64]
+        vaesenc ymm0, ymm0, ymm13
+        vbroadcasti128 ymm13, [r9+80]
+        vaesenc ymm0, ymm0, ymm13
+        vbroadcasti128 ymm13, [r9+96]
+        vaesenc ymm0, ymm0, ymm13
+        vbroadcasti128 ymm13, [r9+112]
+        vaesenc ymm0, ymm0, ymm13
+        vbroadcasti128 ymm13, [r9+128]
+        vaesenc ymm0, ymm0, ymm13
+        vbroadcasti128 ymm13, [r9+144]
+        vaesenc ymm0, ymm0, ymm13
+        cmp ebx, 11
+        vbroadcasti128 ymm13, [r9+160]
+        jl L_AES_CTR_encrypt_vaes_32_aes_enc_block_last
+        vaesenc ymm0, ymm0, ymm13
+        vbroadcasti128 ymm13, [r9+176]
+        vaesenc ymm0, ymm0, ymm13
+        cmp ebx, 13
+        vbroadcasti128 ymm13, [r9+192]
+        jl L_AES_CTR_encrypt_vaes_32_aes_enc_block_last
+        vaesenc ymm0, ymm0, ymm13
+        vbroadcasti128 ymm13, [r9+208]
+        vaesenc ymm0, ymm0, ymm13
+        vbroadcasti128 ymm13, [r9+224]
+L_AES_CTR_encrypt_vaes_32_aes_enc_block_last:
+        vaesenclast ymm0, ymm0, ymm13
+        vpxor ymm0, ymm0, YMMWORD PTR [rcx+rax]
+        vmovdqu YMMWORD PTR [rdx+rax], ymm0
+        add eax, 32
+        cmp eax, r10d
+        jl L_AES_CTR_encrypt_vaes_enc_32
+L_AES_CTR_encrypt_vaes_done_32:
+        cmp eax, r8d
+        mov r10d, r8d
+        je L_AES_CTR_encrypt_vaes_done_enc
+        ; bound = len rounded down to a multiple of 16
+        and r10d, 4294967280
+L_AES_CTR_encrypt_vaes_enc_16:
+        ; 16 bytes of input
+        vpshufb xmm0, xmm7, xmm8
+        ; counter += 1
+        vmovdqa ymm9, ymm7
+        vpaddq ymm7, ymm7, ymm12
+        vpand ymm14, ymm9, ymm12
+        vpor ymm9, ymm9, ymm12
+        vpandn ymm9, ymm7, ymm9
+        vpor ymm9, ymm9, ymm14
+        vpsrlq ymm9, ymm9, 63
+        vpslldq ymm9, ymm9, 8
+        vpaddq ymm7, ymm7, ymm9
+        ; aes_enc_block
+        vpxor xmm0, xmm0, [r9]
+        vmovdqu xmm5, OWORD PTR [r9+16]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+32]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+48]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+64]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+80]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+96]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+112]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+128]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+144]
+        vaesenc xmm0, xmm0, xmm5
+        cmp ebx, 11
+        vmovdqu xmm5, OWORD PTR [r9+160]
+        jl L_AES_CTR_encrypt_vaes_16_aes_enc_block_last
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [r9+176]
+        vaesenc xmm0, xmm0, xmm6
+        cmp ebx, 13
+        vmovdqu xmm5, OWORD PTR [r9+192]
+        jl L_AES_CTR_encrypt_vaes_16_aes_enc_block_last
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [r9+208]
+        vaesenc xmm0, xmm0, xmm6
+        vmovdqu xmm5, OWORD PTR [r9+224]
+L_AES_CTR_encrypt_vaes_16_aes_enc_block_last:
+        vaesenclast xmm0, xmm0, xmm5
+        vpxor xmm0, xmm0, OWORD PTR [rcx+rax]
+        vmovdqu OWORD PTR [rdx+rax], xmm0
+        add eax, 16
+        cmp eax, r10d
+        jl L_AES_CTR_encrypt_vaes_enc_16
+L_AES_CTR_encrypt_vaes_done_enc:
+        ; write the next counter value back in its original byte order
+        vpshufb xmm0, xmm7, xmm8
+        vmovdqu OWORD PTR [r11], xmm0
+        vmovdqu xmm6, OWORD PTR [rsp]
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        vmovdqu xmm8, OWORD PTR [rsp+32]
+        vmovdqu xmm9, OWORD PTR [rsp+48]
+        vmovdqu xmm10, OWORD PTR [rsp+64]
+        vmovdqu xmm11, OWORD PTR [rsp+80]
+        vmovdqu xmm12, OWORD PTR [rsp+96]
+        vmovdqu xmm13, OWORD PTR [rsp+112]
+        vmovdqu xmm14, OWORD PTR [rsp+128]
+        add rsp, 144
+        pop rbx
+        ret
+AES_CTR_encrypt_vaes ENDP
+_TEXT ENDS
+ENDIF
+IFDEF HAVE_INTEL_AVX512
+_TEXT SEGMENT READONLY PARA
+; NOTE(review): in AES_ECB_encrypt_avx512 the value loaded from [rsp+40] into
+; eax is zeroed by `xor eax, eax` before the `cmp eax, 11` / `cmp eax, 13`
+; tests below, so those tests always see 0 and the round keys at +176..+224
+; are never cached. The round count needs a register other than eax.
+AES_ECB_encrypt_avx512 PROC
+        mov eax, DWORD PTR [rsp+40]
+        sub rsp, 160
+        vmovdqu OWORD PTR [rsp], xmm6
+        vmovdqu OWORD PTR [rsp+16], xmm7
+        vmovdqu OWORD PTR [rsp+32], xmm8
+        vmovdqu OWORD PTR [rsp+48], xmm9
+        vmovdqu OWORD PTR [rsp+64], xmm10
+        vmovdqu OWORD PTR [rsp+80], xmm11
+        vmovdqu OWORD PTR [rsp+96], xmm12
+        vmovdqu OWORD PTR [rsp+112], xmm13
+        vmovdqu OWORD PTR [rsp+128], xmm14
+        vmovdqu OWORD PTR [rsp+144], xmm15
+        xor eax, eax
+        cmp r8d, 64
+        jl L_AES_ECB_encrypt_avx512_done_64
+        vbroadcasti32x4 zmm8, [r9]
+        vbroadcasti32x4 zmm9, [r9+16]
+        vbroadcasti32x4 zmm10, [r9+32]
+        vbroadcasti32x4 zmm11, [r9+48]
+        vbroadcasti32x4 zmm12, [r9+64]
+        vbroadcasti32x4 zmm13, [r9+80]
+        vbroadcasti32x4 zmm14, [r9+96]
+        vbroadcasti32x4 zmm15, [r9+112]
+        vbroadcasti32x4 zmm16, [r9+128]
+        vbroadcasti32x4 zmm17, [r9+144]
+        vbroadcasti32x4 zmm18, [r9+160]
+        cmp eax, 11
+        jl L_AES_ECB_encrypt_avx512_key_cached
+        vbroadcasti32x4 zmm19, [r9+176]
+        vbroadcasti32x4 zmm20, [r9+192]
+        cmp eax, 13
+        jl
L_AES_ECB_encrypt_avx512_key_cached + vbroadcasti32x4 zmm21, [r9+208] + vbroadcasti32x4 zmm22, [r9+224] +L_AES_ECB_encrypt_avx512_key_cached: + cmp r8d, 256 + mov r9d, r8d + jl L_AES_ECB_encrypt_avx512_done_256 + and r9d, 4294967040 +L_AES_ECB_encrypt_avx512_enc_256: + ; 256 bytes of input + ; aes_ecb_enc_256 + lea r10, QWORD PTR [rcx+rax] + lea r11, QWORD PTR [rdx+rax] + vmovdqu64 zmm0, [r10] + vmovdqu64 zmm1, [r10+64] + vmovdqu64 zmm2, [r10+128] + vmovdqu64 zmm3, [r10+192] + ; aes_enc_block + vpxorq zmm0, zmm0, zmm8 + vpxorq zmm1, zmm1, zmm8 + vpxorq zmm2, zmm2, zmm8 + vpxorq zmm3, zmm3, zmm8 + vaesenc zmm0, zmm0, zmm9 + vaesenc zmm1, zmm1, zmm9 + vaesenc zmm2, zmm2, zmm9 + vaesenc zmm3, zmm3, zmm9 + vaesenc zmm0, zmm0, zmm10 + vaesenc zmm1, zmm1, zmm10 + vaesenc zmm2, zmm2, zmm10 + vaesenc zmm3, zmm3, zmm10 + vaesenc zmm0, zmm0, zmm11 + vaesenc zmm1, zmm1, zmm11 + vaesenc zmm2, zmm2, zmm11 + vaesenc zmm3, zmm3, zmm11 + vaesenc zmm0, zmm0, zmm12 + vaesenc zmm1, zmm1, zmm12 + vaesenc zmm2, zmm2, zmm12 + vaesenc zmm3, zmm3, zmm12 + vaesenc zmm0, zmm0, zmm13 + vaesenc zmm1, zmm1, zmm13 + vaesenc zmm2, zmm2, zmm13 + vaesenc zmm3, zmm3, zmm13 + vaesenc zmm0, zmm0, zmm14 + vaesenc zmm1, zmm1, zmm14 + vaesenc zmm2, zmm2, zmm14 + vaesenc zmm3, zmm3, zmm14 + vaesenc zmm0, zmm0, zmm15 + vaesenc zmm1, zmm1, zmm15 + vaesenc zmm2, zmm2, zmm15 + vaesenc zmm3, zmm3, zmm15 + vaesenc zmm0, zmm0, zmm16 + vaesenc zmm1, zmm1, zmm16 + vaesenc zmm2, zmm2, zmm16 + vaesenc zmm3, zmm3, zmm16 + vaesenc zmm0, zmm0, zmm17 + vaesenc zmm1, zmm1, zmm17 + vaesenc zmm2, zmm2, zmm17 + vaesenc zmm3, zmm3, zmm17 + cmp eax, 11 + vmovdqa64 zmm7, zmm18 + jl L_AES_ECB_encrypt_avx512_256_aes_enc_block_last + vaesenc zmm0, zmm0, zmm18 + vaesenc zmm1, zmm1, zmm18 + vaesenc zmm2, zmm2, zmm18 + vaesenc zmm3, zmm3, zmm18 + vaesenc zmm0, zmm0, zmm19 + vaesenc zmm1, zmm1, zmm19 + vaesenc zmm2, zmm2, zmm19 + vaesenc zmm3, zmm3, zmm19 + cmp eax, 13 + vmovdqa64 zmm7, zmm20 + jl 
L_AES_ECB_encrypt_avx512_256_aes_enc_block_last + vaesenc zmm0, zmm0, zmm20 + vaesenc zmm1, zmm1, zmm20 + vaesenc zmm2, zmm2, zmm20 + vaesenc zmm3, zmm3, zmm20 + vaesenc zmm0, zmm0, zmm21 + vaesenc zmm1, zmm1, zmm21 + vaesenc zmm2, zmm2, zmm21 + vaesenc zmm3, zmm3, zmm21 + vmovdqa64 zmm7, zmm22 +L_AES_ECB_encrypt_avx512_256_aes_enc_block_last: + vaesenclast zmm0, zmm0, zmm7 + vaesenclast zmm1, zmm1, zmm7 + vaesenclast zmm2, zmm2, zmm7 + vaesenclast zmm3, zmm3, zmm7 + vmovdqu64 [r11], zmm0 + vmovdqu64 [r11+64], zmm1 + vmovdqu64 [r11+128], zmm2 + vmovdqu64 [r11+192], zmm3 + add eax, 256 + cmp eax, r9d + jl L_AES_ECB_encrypt_avx512_enc_256 +L_AES_ECB_encrypt_avx512_done_256: + mov r9d, r8d + and r9d, 4294967232 + cmp eax, r9d + je L_AES_ECB_encrypt_avx512_done_64 +L_AES_ECB_encrypt_avx512_enc_64: + ; 64 bytes of input + ; aes_ecb_enc_64 + lea r10, QWORD PTR [rcx+rax] + lea r11, QWORD PTR [rdx+rax] + vmovdqu64 zmm0, [r10] + ; aes_enc_block + vpxorq zmm0, zmm0, zmm8 + vaesenc zmm0, zmm0, zmm9 + vaesenc zmm0, zmm0, zmm10 + vaesenc zmm0, zmm0, zmm11 + vaesenc zmm0, zmm0, zmm12 + vaesenc zmm0, zmm0, zmm13 + vaesenc zmm0, zmm0, zmm14 + vaesenc zmm0, zmm0, zmm15 + vaesenc zmm0, zmm0, zmm16 + vaesenc zmm0, zmm0, zmm17 + cmp eax, 11 + vmovdqa64 zmm7, zmm18 + jl L_AES_ECB_encrypt_avx512_64_aes_enc_block_last + vaesenc zmm0, zmm0, zmm18 + vaesenc zmm0, zmm0, zmm19 + cmp eax, 13 + vmovdqa64 zmm7, zmm20 + jl L_AES_ECB_encrypt_avx512_64_aes_enc_block_last + vaesenc zmm0, zmm0, zmm20 + vaesenc zmm0, zmm0, zmm21 + vmovdqa64 zmm7, zmm22 +L_AES_ECB_encrypt_avx512_64_aes_enc_block_last: + vaesenclast zmm0, zmm0, zmm7 + vmovdqu64 [r11], zmm0 + add eax, 64 + cmp eax, r9d + jl L_AES_ECB_encrypt_avx512_enc_64 +L_AES_ECB_encrypt_avx512_done_64: + cmp eax, r8d + mov r9d, r8d + je L_AES_ECB_encrypt_avx512_done_enc + and r9d, 4294967280 +L_AES_ECB_encrypt_avx512_enc_16: + ; 16 bytes of input + lea r10, QWORD PTR [rcx+rax] + vmovdqu xmm0, OWORD PTR [r10] + ; aes_enc_block + vpxor xmm0, xmm0, 
[r9] + vmovdqu xmm5, OWORD PTR [r9+16] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+32] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+48] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+64] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+80] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+96] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+112] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+128] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+144] + vaesenc xmm0, xmm0, xmm5 + cmp eax, 11 + vmovdqu xmm5, OWORD PTR [r9+160] + jl L_AES_ECB_encrypt_avx512_16_aes_enc_block_last + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r9+176] + vaesenc xmm0, xmm0, xmm6 + cmp eax, 13 + vmovdqu xmm5, OWORD PTR [r9+192] + jl L_AES_ECB_encrypt_avx512_16_aes_enc_block_last + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r9+208] + vaesenc xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r9+224] +L_AES_ECB_encrypt_avx512_16_aes_enc_block_last: + vaesenclast xmm0, xmm0, xmm5 + lea r10, QWORD PTR [rdx+rax] + vmovdqu OWORD PTR [r10], xmm0 + add eax, 16 + cmp eax, r9d + jl L_AES_ECB_encrypt_avx512_enc_16 +L_AES_ECB_encrypt_avx512_done_enc: + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + vmovdqu xmm12, OWORD PTR [rsp+96] + vmovdqu xmm13, OWORD PTR [rsp+112] + vmovdqu xmm14, OWORD PTR [rsp+128] + vmovdqu xmm15, OWORD PTR [rsp+144] + add rsp, 160 + ret +AES_ECB_encrypt_avx512 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +AES_ECB_decrypt_avx512 PROC + mov eax, DWORD PTR [rsp+40] + sub rsp, 160 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu OWORD PTR [rsp+96], xmm12 + vmovdqu OWORD 
PTR [rsp+112], xmm13 + vmovdqu OWORD PTR [rsp+128], xmm14 + vmovdqu OWORD PTR [rsp+144], xmm15 + ; AES-ECB decryption using 512-bit VAES (Windows x64 calling convention). + ; rcx = input, rdx = output, r8d = length in bytes, r9 = round keys. + ; The round count is the fifth argument. The prologue's copy in eax is + ; destroyed by the offset reset below, so it is reloaded here into r10d; + ; with the 160-byte frame in place it sits at [rsp+200]. + ; Register use: eax = byte offset, r10d = round count, r11d = end offset + ; of the loop currently running. r9 is never written, so the 16-byte loop + ; can keep reading round keys straight from memory. + mov r10d, DWORD PTR [rsp+200] + xor eax, eax + cmp r8d, 64 + jl L_AES_ECB_decrypt_avx512_done_64 + ; cache the round keys, each broadcast to all four 128-bit lanes + vbroadcasti32x4 zmm8, [r9] + vbroadcasti32x4 zmm9, [r9+16] + vbroadcasti32x4 zmm10, [r9+32] + vbroadcasti32x4 zmm11, [r9+48] + vbroadcasti32x4 zmm12, [r9+64] + vbroadcasti32x4 zmm13, [r9+80] + vbroadcasti32x4 zmm14, [r9+96] + vbroadcasti32x4 zmm15, [r9+112] + vbroadcasti32x4 zmm16, [r9+128] + vbroadcasti32x4 zmm17, [r9+144] + vbroadcasti32x4 zmm18, [r9+160] + cmp r10d, 11 + jl L_AES_ECB_decrypt_avx512_key_cached + vbroadcasti32x4 zmm19, [r9+176] + vbroadcasti32x4 zmm20, [r9+192] + cmp r10d, 13 + jl L_AES_ECB_decrypt_avx512_key_cached + vbroadcasti32x4 zmm21, [r9+208] + vbroadcasti32x4 zmm22, [r9+224] +L_AES_ECB_decrypt_avx512_key_cached: + cmp r8d, 256 + mov r11d, r8d + jl L_AES_ECB_decrypt_avx512_done_256 + and r11d, 4294967040 +L_AES_ECB_decrypt_avx512_dec_256: + ; 256 bytes of input + ; aes_ecb_dec_256 + vmovdqu64 zmm0, [rcx+rax] + vmovdqu64 zmm1, [rcx+rax+64] + vmovdqu64 zmm2, [rcx+rax+128] + vmovdqu64 zmm3, [rcx+rax+192] + ; aes_dec_block + vpxorq zmm0, zmm0, zmm8 + vpxorq zmm1, zmm1, zmm8 + vpxorq zmm2, zmm2, zmm8 + vpxorq zmm3, zmm3, zmm8 + vaesdec zmm0, zmm0, zmm9 + vaesdec zmm1, zmm1, zmm9 + vaesdec zmm2, zmm2, zmm9 + vaesdec zmm3, zmm3, zmm9 + vaesdec zmm0, zmm0, zmm10 + vaesdec zmm1, zmm1, zmm10 + vaesdec zmm2, zmm2, zmm10 + vaesdec zmm3, zmm3, zmm10 + vaesdec zmm0, zmm0, zmm11 + vaesdec zmm1, zmm1, zmm11 + vaesdec zmm2, zmm2, zmm11 + vaesdec zmm3, zmm3, zmm11 + vaesdec zmm0, zmm0, zmm12 + vaesdec zmm1, zmm1, zmm12 + vaesdec zmm2, zmm2, zmm12 + vaesdec zmm3, zmm3, zmm12 + vaesdec zmm0, zmm0, zmm13 + vaesdec zmm1, zmm1, zmm13 + vaesdec zmm2, zmm2, zmm13 + vaesdec zmm3, zmm3, zmm13 + vaesdec zmm0, zmm0, zmm14 + vaesdec zmm1, zmm1, zmm14 + vaesdec zmm2, zmm2, zmm14 + vaesdec zmm3, zmm3, zmm14 + vaesdec zmm0, zmm0, zmm15 + vaesdec zmm1, zmm1, zmm15 +
vaesdec zmm2, zmm2, zmm15 + vaesdec zmm3, zmm3, zmm15 + vaesdec zmm0, zmm0, zmm16 + vaesdec zmm1, zmm1, zmm16 + vaesdec zmm2, zmm2, zmm16 + vaesdec zmm3, zmm3, zmm16 + vaesdec zmm0, zmm0, zmm17 + vaesdec zmm1, zmm1, zmm17 + vaesdec zmm2, zmm2, zmm17 + vaesdec zmm3, zmm3, zmm17 + ; zmm7 receives whichever round key is the final one for this key size + cmp r10d, 11 + vmovdqa64 zmm7, zmm18 + jl L_AES_ECB_decrypt_avx512_256_aes_dec_block_last + vaesdec zmm0, zmm0, zmm18 + vaesdec zmm1, zmm1, zmm18 + vaesdec zmm2, zmm2, zmm18 + vaesdec zmm3, zmm3, zmm18 + vaesdec zmm0, zmm0, zmm19 + vaesdec zmm1, zmm1, zmm19 + vaesdec zmm2, zmm2, zmm19 + vaesdec zmm3, zmm3, zmm19 + cmp r10d, 13 + vmovdqa64 zmm7, zmm20 + jl L_AES_ECB_decrypt_avx512_256_aes_dec_block_last + vaesdec zmm0, zmm0, zmm20 + vaesdec zmm1, zmm1, zmm20 + vaesdec zmm2, zmm2, zmm20 + vaesdec zmm3, zmm3, zmm20 + vaesdec zmm0, zmm0, zmm21 + vaesdec zmm1, zmm1, zmm21 + vaesdec zmm2, zmm2, zmm21 + vaesdec zmm3, zmm3, zmm21 + vmovdqa64 zmm7, zmm22 +L_AES_ECB_decrypt_avx512_256_aes_dec_block_last: + vaesdeclast zmm0, zmm0, zmm7 + vaesdeclast zmm1, zmm1, zmm7 + vaesdeclast zmm2, zmm2, zmm7 + vaesdeclast zmm3, zmm3, zmm7 + vmovdqu64 [rdx+rax], zmm0 + vmovdqu64 [rdx+rax+64], zmm1 + vmovdqu64 [rdx+rax+128], zmm2 + vmovdqu64 [rdx+rax+192], zmm3 + add eax, 256 + cmp eax, r11d + jl L_AES_ECB_decrypt_avx512_dec_256 +L_AES_ECB_decrypt_avx512_done_256: + mov r11d, r8d + and r11d, 4294967232 + cmp eax, r11d + je L_AES_ECB_decrypt_avx512_done_64 +L_AES_ECB_decrypt_avx512_dec_64: + ; 64 bytes of input + ; aes_ecb_dec_64 + vmovdqu64 zmm0, [rcx+rax] + ; aes_dec_block + vpxorq zmm0, zmm0, zmm8 + vaesdec zmm0, zmm0, zmm9 + vaesdec zmm0, zmm0, zmm10 + vaesdec zmm0, zmm0, zmm11 + vaesdec zmm0, zmm0, zmm12 + vaesdec zmm0, zmm0, zmm13 + vaesdec zmm0, zmm0, zmm14 + vaesdec zmm0, zmm0, zmm15 + vaesdec zmm0, zmm0, zmm16 + vaesdec zmm0, zmm0, zmm17 + cmp r10d, 11 + vmovdqa64 zmm7, zmm18 + jl L_AES_ECB_decrypt_avx512_64_aes_dec_block_last + vaesdec zmm0, zmm0, zmm18 + vaesdec zmm0,
zmm0, zmm19 + cmp r10d, 13 + vmovdqa64 zmm7, zmm20 + jl L_AES_ECB_decrypt_avx512_64_aes_dec_block_last + vaesdec zmm0, zmm0, zmm20 + vaesdec zmm0, zmm0, zmm21 + vmovdqa64 zmm7, zmm22 +L_AES_ECB_decrypt_avx512_64_aes_dec_block_last: + vaesdeclast zmm0, zmm0, zmm7 + vmovdqu64 [rdx+rax], zmm0 + add eax, 64 + cmp eax, r11d + jl L_AES_ECB_decrypt_avx512_dec_64 +L_AES_ECB_decrypt_avx512_done_64: + cmp eax, r8d + mov r11d, r8d + je L_AES_ECB_decrypt_avx512_done_dec + and r11d, 4294967280 +L_AES_ECB_decrypt_avx512_dec_16: + ; 16 bytes of input + vmovdqu xmm0, OWORD PTR [rcx+rax] + ; aes_dec_block + vpxor xmm0, xmm0, [r9] + vmovdqu xmm5, OWORD PTR [r9+16] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+32] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+48] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+64] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+80] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+96] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+112] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+128] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+144] + vaesdec xmm0, xmm0, xmm5 + cmp r10d, 11 + vmovdqu xmm5, OWORD PTR [r9+160] + jl L_AES_ECB_decrypt_avx512_16_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r9+176] + vaesdec xmm0, xmm0, xmm6 + cmp r10d, 13 + vmovdqu xmm5, OWORD PTR [r9+192] + jl L_AES_ECB_decrypt_avx512_16_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r9+208] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r9+224] +L_AES_ECB_decrypt_avx512_16_aes_dec_block_last: + vaesdeclast xmm0, xmm0, xmm5 + vmovdqu OWORD PTR [rdx+rax], xmm0 + add eax, 16 + cmp eax, r11d + jl L_AES_ECB_decrypt_avx512_dec_16 +L_AES_ECB_decrypt_avx512_done_dec: + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu
xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + vmovdqu xmm12, OWORD PTR [rsp+96] + vmovdqu xmm13, OWORD PTR [rsp+112] + vmovdqu xmm14, OWORD PTR [rsp+128] + vmovdqu xmm15, OWORD PTR [rsp+144] + add rsp, 160 + ret +AES_ECB_decrypt_avx512 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +AES_CBC_encrypt_avx512 PROC + ; AES-CBC encryption, one 16-byte block per iteration (Windows x64). + ; rcx = input, rdx = output, r8 = 16-byte IV (read on entry, overwritten + ; with the last ciphertext block on exit), r9d = length in bytes, + ; [rsp+40] = round keys (rax), [rsp+48] = round count (r10d). + ; r11 holds the byte offset so that rax and r10d stay valid for the whole + ; loop; input and output are addressed as [rcx+r11] and [rdx+r11]. + mov rax, QWORD PTR [rsp+40] + mov r10d, DWORD PTR [rsp+48] + vmovdqu xmm0, OWORD PTR [r8] + xor r11d, r11d + cmp r11d, r9d + je L_AES_CBC_encrypt_avx512_done +L_AES_CBC_encrypt_avx512_loop: + ; 16 bytes of input + vmovdqu xmm1, OWORD PTR [rcx+r11] + ; three-way XOR: plaintext ^ previous ciphertext (or IV) ^ round key 0 + vpternlogq xmm1, xmm0, [rax], 150 + ; aes_enc_block + vmovdqu xmm3, OWORD PTR [rax+16] + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm3, OWORD PTR [rax+32] + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm3, OWORD PTR [rax+48] + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm3, OWORD PTR [rax+64] + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm3, OWORD PTR [rax+80] + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm3, OWORD PTR [rax+96] + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm3, OWORD PTR [rax+112] + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm3, OWORD PTR [rax+128] + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm3, OWORD PTR [rax+144] + vaesenc xmm1, xmm1, xmm3 + cmp r10d, 11 + vmovdqu xmm3, OWORD PTR [rax+160] + jl L_AES_CBC_encrypt_avx512_aes_enc_block_last + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm4, OWORD PTR [rax+176] + vaesenc xmm1, xmm1, xmm4 + cmp r10d, 13 + vmovdqu xmm3, OWORD PTR [rax+192] + jl L_AES_CBC_encrypt_avx512_aes_enc_block_last + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm4, OWORD PTR [rax+208] + vaesenc xmm1, xmm1, xmm4 + vmovdqu xmm3, OWORD PTR [rax+224] +L_AES_CBC_encrypt_avx512_aes_enc_block_last: + vaesenclast xmm1, xmm1, xmm3 + vmovdqu OWORD PTR [rdx+r11], xmm1 + ; this ciphertext block is chained into the next block + vmovdqa xmm0, xmm1 + add r11d, 16 + cmp r11d, r9d + jl L_AES_CBC_encrypt_avx512_loop +L_AES_CBC_encrypt_avx512_done: + vmovdqu OWORD PTR [r8], xmm0 + ret +AES_CBC_encrypt_avx512 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA
+AES_CBC_decrypt_avx512 PROC + ; AES-CBC decryption using 512-bit VAES (Windows x64 calling convention). + ; rcx = input, rdx = output, r8 = 16-byte IV (read on entry, overwritten + ; with the last ciphertext block on exit), r9d = length in bytes, + ; stack arguments: round keys (rax) and round count (r10d). + ; Register use: r11 = byte offset, r12d = end offset of the loop currently + ; running. rax and r10d are never written after the loads below, so the + ; 16-byte loop can read round keys from [rax] and test the round count. + ; xmm8 carries the previous ciphertext block between iterations. + push r12 + mov rax, QWORD PTR [rsp+48] + mov r10d, DWORD PTR [rsp+56] + sub rsp, 160 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu OWORD PTR [rsp+96], xmm12 + vmovdqu OWORD PTR [rsp+112], xmm13 + vmovdqu OWORD PTR [rsp+128], xmm14 + vmovdqu OWORD PTR [rsp+144], xmm15 + vmovdqu xmm8, OWORD PTR [r8] + xor r11d, r11d + cmp r9d, 64 + jl L_AES_CBC_decrypt_avx512_done_64 + ; cache the round keys, each broadcast to all four 128-bit lanes + vbroadcasti32x4 zmm14, [rax] + vbroadcasti32x4 zmm15, [rax+16] + vbroadcasti32x4 zmm16, [rax+32] + vbroadcasti32x4 zmm17, [rax+48] + vbroadcasti32x4 zmm18, [rax+64] + vbroadcasti32x4 zmm19, [rax+80] + vbroadcasti32x4 zmm20, [rax+96] + vbroadcasti32x4 zmm21, [rax+112] + vbroadcasti32x4 zmm22, [rax+128] + vbroadcasti32x4 zmm23, [rax+144] + vbroadcasti32x4 zmm24, [rax+160] + cmp r10d, 11 + jl L_AES_CBC_decrypt_avx512_key_cached + vbroadcasti32x4 zmm25, [rax+176] + vbroadcasti32x4 zmm26, [rax+192] + cmp r10d, 13 + jl L_AES_CBC_decrypt_avx512_key_cached + vbroadcasti32x4 zmm27, [rax+208] + vbroadcasti32x4 zmm28, [rax+224] +L_AES_CBC_decrypt_avx512_key_cached: + cmp r9d, 256 + mov r12d, r9d + jl L_AES_CBC_decrypt_avx512_done_256 + and r12d, 4294967040 +L_AES_CBC_decrypt_avx512_dec_256: + ; 256 bytes of input + ; aes_cbc_dec_256 + vmovdqu64 zmm0, [rcx+r11] + vmovdqu64 zmm1, [rcx+r11+64] + vmovdqu64 zmm2, [rcx+r11+128] + vmovdqu64 zmm3, [rcx+r11+192] + ; zmm10-zmm13 = the ciphertext block preceding each block in zmm0-zmm3: + ; zmm10 is zmm0 shifted up one lane with xmm8 in lane 0, the others are + ; the input re-read 16 bytes earlier + vshufi64x2 zmm10, zmm0, zmm0, 144 + vinserti32x4 zmm10, zmm10, xmm8, 0 + vmovdqu64 zmm11, [rcx+r11+48] + vmovdqu64 zmm12, [rcx+r11+112] + vmovdqu64 zmm13, [rcx+r11+176] + vextracti32x4 xmm8, zmm3, 3 + ; aes_dec_block + vpxorq zmm0, zmm0, zmm14 + vpxorq zmm1, zmm1, zmm14 + vpxorq zmm2, zmm2, zmm14 + vpxorq zmm3, zmm3, zmm14 + vaesdec zmm0, zmm0, zmm15 + vaesdec zmm1, zmm1, zmm15 + vaesdec zmm2, zmm2, zmm15 + vaesdec zmm3, zmm3,
zmm15 + vaesdec zmm0, zmm0, zmm16 + vaesdec zmm1, zmm1, zmm16 + vaesdec zmm2, zmm2, zmm16 + vaesdec zmm3, zmm3, zmm16 + vaesdec zmm0, zmm0, zmm17 + vaesdec zmm1, zmm1, zmm17 + vaesdec zmm2, zmm2, zmm17 + vaesdec zmm3, zmm3, zmm17 + vaesdec zmm0, zmm0, zmm18 + vaesdec zmm1, zmm1, zmm18 + vaesdec zmm2, zmm2, zmm18 + vaesdec zmm3, zmm3, zmm18 + vaesdec zmm0, zmm0, zmm19 + vaesdec zmm1, zmm1, zmm19 + vaesdec zmm2, zmm2, zmm19 + vaesdec zmm3, zmm3, zmm19 + vaesdec zmm0, zmm0, zmm20 + vaesdec zmm1, zmm1, zmm20 + vaesdec zmm2, zmm2, zmm20 + vaesdec zmm3, zmm3, zmm20 + vaesdec zmm0, zmm0, zmm21 + vaesdec zmm1, zmm1, zmm21 + vaesdec zmm2, zmm2, zmm21 + vaesdec zmm3, zmm3, zmm21 + vaesdec zmm0, zmm0, zmm22 + vaesdec zmm1, zmm1, zmm22 + vaesdec zmm2, zmm2, zmm22 + vaesdec zmm3, zmm3, zmm22 + vaesdec zmm0, zmm0, zmm23 + vaesdec zmm1, zmm1, zmm23 + vaesdec zmm2, zmm2, zmm23 + vaesdec zmm3, zmm3, zmm23 + ; zmm9 receives whichever round key is the final one for this key size + cmp r10d, 11 + vmovdqa64 zmm9, zmm24 + jl L_AES_CBC_decrypt_avx512_256_aes_dec_block_last + vaesdec zmm0, zmm0, zmm24 + vaesdec zmm1, zmm1, zmm24 + vaesdec zmm2, zmm2, zmm24 + vaesdec zmm3, zmm3, zmm24 + vaesdec zmm0, zmm0, zmm25 + vaesdec zmm1, zmm1, zmm25 + vaesdec zmm2, zmm2, zmm25 + vaesdec zmm3, zmm3, zmm25 + cmp r10d, 13 + vmovdqa64 zmm9, zmm26 + jl L_AES_CBC_decrypt_avx512_256_aes_dec_block_last + vaesdec zmm0, zmm0, zmm26 + vaesdec zmm1, zmm1, zmm26 + vaesdec zmm2, zmm2, zmm26 + vaesdec zmm3, zmm3, zmm26 + vaesdec zmm0, zmm0, zmm27 + vaesdec zmm1, zmm1, zmm27 + vaesdec zmm2, zmm2, zmm27 + vaesdec zmm3, zmm3, zmm27 + vmovdqa64 zmm9, zmm28 +L_AES_CBC_decrypt_avx512_256_aes_dec_block_last: + vaesdeclast zmm0, zmm0, zmm9 + vaesdeclast zmm1, zmm1, zmm9 + vaesdeclast zmm2, zmm2, zmm9 + vaesdeclast zmm3, zmm3, zmm9 + vpxorq zmm0, zmm0, zmm10 + vpxorq zmm1, zmm1, zmm11 + vpxorq zmm2, zmm2, zmm12 + vpxorq zmm3, zmm3, zmm13 + vmovdqu64 [rdx+r11], zmm0 + vmovdqu64 [rdx+r11+64], zmm1 + vmovdqu64 [rdx+r11+128], zmm2 + vmovdqu64 [rdx+r11+192], zmm3 + add r11d, 256 + cmp r11d, r12d + jl
L_AES_CBC_decrypt_avx512_dec_256 +L_AES_CBC_decrypt_avx512_done_256: + mov r12d, r9d + and r12d, 4294967232 + cmp r11d, r12d + je L_AES_CBC_decrypt_avx512_done_64 +L_AES_CBC_decrypt_avx512_dec_64: + ; 64 bytes of input + ; aes_cbc_dec_64 + vmovdqu64 zmm0, [rcx+r11] + vshufi64x2 zmm10, zmm0, zmm0, 144 + vinserti32x4 zmm10, zmm10, xmm8, 0 + vextracti32x4 xmm8, zmm0, 3 + ; aes_dec_block + vpxorq zmm0, zmm0, zmm14 + vaesdec zmm0, zmm0, zmm15 + vaesdec zmm0, zmm0, zmm16 + vaesdec zmm0, zmm0, zmm17 + vaesdec zmm0, zmm0, zmm18 + vaesdec zmm0, zmm0, zmm19 + vaesdec zmm0, zmm0, zmm20 + vaesdec zmm0, zmm0, zmm21 + vaesdec zmm0, zmm0, zmm22 + vaesdec zmm0, zmm0, zmm23 + cmp r10d, 11 + vmovdqa64 zmm9, zmm24 + jl L_AES_CBC_decrypt_avx512_64_aes_dec_block_last + vaesdec zmm0, zmm0, zmm24 + vaesdec zmm0, zmm0, zmm25 + cmp r10d, 13 + vmovdqa64 zmm9, zmm26 + jl L_AES_CBC_decrypt_avx512_64_aes_dec_block_last + vaesdec zmm0, zmm0, zmm26 + vaesdec zmm0, zmm0, zmm27 + vmovdqa64 zmm9, zmm28 +L_AES_CBC_decrypt_avx512_64_aes_dec_block_last: + vaesdeclast zmm0, zmm0, zmm9 + vpxorq zmm0, zmm0, zmm10 + vmovdqu64 [rdx+r11], zmm0 + add r11d, 64 + cmp r11d, r12d + jl L_AES_CBC_decrypt_avx512_dec_64 +L_AES_CBC_decrypt_avx512_done_64: + cmp r11d, r9d + mov r12d, r9d + je L_AES_CBC_decrypt_avx512_done_dec + and r12d, 4294967280 +L_AES_CBC_decrypt_avx512_dec_16: + ; 16 bytes of input + vmovdqu xmm0, OWORD PTR [rcx+r11] + ; keep the ciphertext block; it becomes the chaining value afterwards + vmovdqa xmm7, xmm0 + ; aes_dec_block + vpxor xmm0, xmm0, [rax] + vmovdqu xmm5, OWORD PTR [rax+16] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [rax+32] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [rax+48] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [rax+64] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [rax+80] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [rax+96] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [rax+112] + vaesdec xmm0, xmm0, xmm5 + vmovdqu
xmm5, OWORD PTR [rax+128] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [rax+144] + vaesdec xmm0, xmm0, xmm5 + cmp r10d, 11 + vmovdqu xmm5, OWORD PTR [rax+160] + jl L_AES_CBC_decrypt_avx512_16_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [rax+176] + vaesdec xmm0, xmm0, xmm6 + cmp r10d, 13 + vmovdqu xmm5, OWORD PTR [rax+192] + jl L_AES_CBC_decrypt_avx512_16_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [rax+208] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [rax+224] +L_AES_CBC_decrypt_avx512_16_aes_dec_block_last: + vaesdeclast xmm0, xmm0, xmm5 + vpxor xmm0, xmm0, xmm8 + vmovdqa xmm8, xmm7 + vmovdqu OWORD PTR [rdx+r11], xmm0 + add r11d, 16 + cmp r11d, r12d + jl L_AES_CBC_decrypt_avx512_dec_16 +L_AES_CBC_decrypt_avx512_done_dec: + ; write the last ciphertext block back as the next IV + vmovdqu OWORD PTR [r8], xmm8 + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + vmovdqu xmm12, OWORD PTR [rsp+96] + vmovdqu xmm13, OWORD PTR [rsp+112] + vmovdqu xmm14, OWORD PTR [rsp+128] + vmovdqu xmm15, OWORD PTR [rsp+144] + add rsp, 160 + pop r12 + ret +AES_CBC_decrypt_avx512 ENDP +_TEXT ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_ctr_bswap_avx512 QWORD \ + 08090a0b0c0d0e0fh, 0001020304050607h +ptr_L_aes_ctr_bswap_avx512 QWORD L_aes_ctr_bswap_avx512 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_ctr_inc_avx512 QWORD \ + 0000000000000000h, 0000000000000000h, + 0000000000000001h, 0000000000000000h, + 0000000000000002h, 0000000000000000h, + 0000000000000003h, 0000000000000000h, + 0000000000000004h, 0000000000000000h, + 0000000000000005h, 0000000000000000h, + 0000000000000006h, 0000000000000000h, + 0000000000000007h, 0000000000000000h, + 0000000000000008h, 0000000000000000h, + 0000000000000009h, 0000000000000000h, + 000000000000000ah, 0000000000000000h, + 000000000000000bh, 0000000000000000h, +
000000000000000ch, 0000000000000000h, + 000000000000000dh, 0000000000000000h, + 000000000000000eh, 0000000000000000h, + 000000000000000fh, 0000000000000000h, + 0000000000000010h, 0000000000000000h +ptr_L_aes_ctr_inc_avx512 QWORD L_aes_ctr_inc_avx512 +_DATA ENDS +_TEXT SEGMENT READONLY PARA +AES_CTR_encrypt_avx512 PROC + ; AES-CTR encryption using 512-bit VAES (Windows x64 calling convention). + ; rcx = input, rdx = output, r8d = length in bytes, r9 = round keys, + ; stack arguments: round count (r11d) and 16-byte counter block (r10), + ; which is read on entry and overwritten with the next counter on exit. + ; Register use: eax = byte offset, ebx = end offset of the loop currently + ; running. r9, r10 and r11d are never written after the loads below. + ; The counter is byte-reversed with L_aes_ctr_bswap_avx512, incremented + ; with 64-bit adds, and reversed again before it is encrypted. + ; Constants are read from the L_aes_ctr_* tables themselves; the ptr_L_* + ; variables only hold the tables' addresses. + push rbx + mov r11d, DWORD PTR [rsp+48] + mov r10, QWORD PTR [rsp+56] + sub rsp, 160 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu OWORD PTR [rsp+96], xmm12 + vmovdqu OWORD PTR [rsp+112], xmm13 + vmovdqu OWORD PTR [rsp+128], xmm14 + vmovdqu OWORD PTR [rsp+144], xmm15 + vbroadcasti32x4 zmm8, OWORD PTR L_aes_ctr_bswap_avx512 + vbroadcasti32x4 zmm7, [r10] + vpshufb zmm7, zmm7, zmm8 + ; zmm10/zmm11/zmm12 = counter step of 16, 4 and 1 in every lane + vbroadcasti32x4 zmm10, OWORD PTR [L_aes_ctr_inc_avx512+256] + vbroadcasti32x4 zmm11, OWORD PTR [L_aes_ctr_inc_avx512+64] + vbroadcasti32x4 zmm12, OWORD PTR [L_aes_ctr_inc_avx512+16] + xor eax, eax + cmp r8d, 64 + jl L_AES_CTR_encrypt_avx512_done_64 + ; cache the round keys, each broadcast to all four 128-bit lanes + vbroadcasti32x4 zmm14, [r9] + vbroadcasti32x4 zmm15, [r9+16] + vbroadcasti32x4 zmm16, [r9+32] + vbroadcasti32x4 zmm17, [r9+48] + vbroadcasti32x4 zmm18, [r9+64] + vbroadcasti32x4 zmm19, [r9+80] + vbroadcasti32x4 zmm20, [r9+96] + vbroadcasti32x4 zmm21, [r9+112] + vbroadcasti32x4 zmm22, [r9+128] + vbroadcasti32x4 zmm23, [r9+144] + vbroadcasti32x4 zmm24, [r9+160] + cmp r11d, 11 + jl L_AES_CTR_encrypt_avx512_key_cached + vbroadcasti32x4 zmm25, [r9+176] + vbroadcasti32x4 zmm26, [r9+192] + cmp r11d, 13 + jl L_AES_CTR_encrypt_avx512_key_cached + vbroadcasti32x4 zmm27, [r9+208] + vbroadcasti32x4 zmm28, [r9+224] +L_AES_CTR_encrypt_avx512_key_cached: + cmp r8d, 256 + mov ebx, r8d + jl L_AES_CTR_encrypt_avx512_done_256 + and ebx, 4294967040 + ; zmm4-zmm7 = counter + 0..3, 4..7, 8..11, 12..15. + ; vpternlogq with 178 leaves the carry out of the low 64-bit add in bit 63; + ; the shifts move it to bit 0 of the high qword, where it is added in. + vmovdqa64 zmm9, zmm7 + vpaddq zmm4, zmm7, ZMMWORD PTR [L_aes_ctr_inc_avx512] + vpternlogq zmm9, zmm4, ZMMWORD PTR [L_aes_ctr_inc_avx512], 178 + vpsrlq zmm9, zmm9, 63 +
vpslldq zmm9, zmm9, 8 + vpaddq zmm4, zmm4, zmm9 + vmovdqa64 zmm9, zmm7 + vpaddq zmm5, zmm7, ZMMWORD PTR [L_aes_ctr_inc_avx512+64] + vpternlogq zmm9, zmm5, ZMMWORD PTR [L_aes_ctr_inc_avx512+64], 178 + vpsrlq zmm9, zmm9, 63 + vpslldq zmm9, zmm9, 8 + vpaddq zmm5, zmm5, zmm9 + vmovdqa64 zmm9, zmm7 + vpaddq zmm6, zmm7, ZMMWORD PTR [L_aes_ctr_inc_avx512+128] + vpternlogq zmm9, zmm6, ZMMWORD PTR [L_aes_ctr_inc_avx512+128], 178 + vpsrlq zmm9, zmm9, 63 + vpslldq zmm9, zmm9, 8 + vpaddq zmm6, zmm6, zmm9 + vmovdqa64 zmm9, zmm7 + vpaddq zmm7, zmm7, ZMMWORD PTR [L_aes_ctr_inc_avx512+192] + vpternlogq zmm9, zmm7, ZMMWORD PTR [L_aes_ctr_inc_avx512+192], 178 + vpsrlq zmm9, zmm9, 63 + vpslldq zmm9, zmm9, 8 + vpaddq zmm7, zmm7, zmm9 +L_AES_CTR_encrypt_avx512_enc_256: + ; 256 bytes of input + vpshufb zmm0, zmm4, zmm8 + vpshufb zmm1, zmm5, zmm8 + vpshufb zmm2, zmm6, zmm8 + vpshufb zmm3, zmm7, zmm8 + ; advance all sixteen counters by 16 for the next iteration + vmovdqa64 zmm9, zmm4 + vpaddq zmm4, zmm4, zmm10 + vpternlogq zmm9, zmm4, zmm10, 178 + vpsrlq zmm9, zmm9, 63 + vpslldq zmm9, zmm9, 8 + vpaddq zmm4, zmm4, zmm9 + vmovdqa64 zmm9, zmm5 + vpaddq zmm5, zmm5, zmm10 + vpternlogq zmm9, zmm5, zmm10, 178 + vpsrlq zmm9, zmm9, 63 + vpslldq zmm9, zmm9, 8 + vpaddq zmm5, zmm5, zmm9 + vmovdqa64 zmm9, zmm6 + vpaddq zmm6, zmm6, zmm10 + vpternlogq zmm9, zmm6, zmm10, 178 + vpsrlq zmm9, zmm9, 63 + vpslldq zmm9, zmm9, 8 + vpaddq zmm6, zmm6, zmm9 + vmovdqa64 zmm9, zmm7 + vpaddq zmm7, zmm7, zmm10 + vpternlogq zmm9, zmm7, zmm10, 178 + vpsrlq zmm9, zmm9, 63 + vpslldq zmm9, zmm9, 8 + vpaddq zmm7, zmm7, zmm9 + ; aes_enc_block + vpxorq zmm0, zmm0, zmm14 + vpxorq zmm1, zmm1, zmm14 + vpxorq zmm2, zmm2, zmm14 + vpxorq zmm3, zmm3, zmm14 + vaesenc zmm0, zmm0, zmm15 + vaesenc zmm1, zmm1, zmm15 + vaesenc zmm2, zmm2, zmm15 + vaesenc zmm3, zmm3, zmm15 + vaesenc zmm0, zmm0, zmm16 + vaesenc zmm1, zmm1, zmm16 + vaesenc zmm2, zmm2, zmm16 + vaesenc zmm3, zmm3, zmm16 + vaesenc zmm0, zmm0, zmm17 + vaesenc zmm1, zmm1, zmm17 + vaesenc zmm2, zmm2, zmm17 + vaesenc zmm3, zmm3,
zmm17 + vaesenc zmm0, zmm0, zmm18 + vaesenc zmm1, zmm1, zmm18 + vaesenc zmm2, zmm2, zmm18 + vaesenc zmm3, zmm3, zmm18 + vaesenc zmm0, zmm0, zmm19 + vaesenc zmm1, zmm1, zmm19 + vaesenc zmm2, zmm2, zmm19 + vaesenc zmm3, zmm3, zmm19 + vaesenc zmm0, zmm0, zmm20 + vaesenc zmm1, zmm1, zmm20 + vaesenc zmm2, zmm2, zmm20 + vaesenc zmm3, zmm3, zmm20 + vaesenc zmm0, zmm0, zmm21 + vaesenc zmm1, zmm1, zmm21 + vaesenc zmm2, zmm2, zmm21 + vaesenc zmm3, zmm3, zmm21 + vaesenc zmm0, zmm0, zmm22 + vaesenc zmm1, zmm1, zmm22 + vaesenc zmm2, zmm2, zmm22 + vaesenc zmm3, zmm3, zmm22 + vaesenc zmm0, zmm0, zmm23 + vaesenc zmm1, zmm1, zmm23 + vaesenc zmm2, zmm2, zmm23 + vaesenc zmm3, zmm3, zmm23 + ; zmm13 receives whichever round key is the final one for this key size + cmp r11d, 11 + vmovdqa64 zmm13, zmm24 + jl L_AES_CTR_encrypt_avx512_256_aes_enc_block_last + vaesenc zmm0, zmm0, zmm24 + vaesenc zmm1, zmm1, zmm24 + vaesenc zmm2, zmm2, zmm24 + vaesenc zmm3, zmm3, zmm24 + vaesenc zmm0, zmm0, zmm25 + vaesenc zmm1, zmm1, zmm25 + vaesenc zmm2, zmm2, zmm25 + vaesenc zmm3, zmm3, zmm25 + cmp r11d, 13 + vmovdqa64 zmm13, zmm26 + jl L_AES_CTR_encrypt_avx512_256_aes_enc_block_last + vaesenc zmm0, zmm0, zmm26 + vaesenc zmm1, zmm1, zmm26 + vaesenc zmm2, zmm2, zmm26 + vaesenc zmm3, zmm3, zmm26 + vaesenc zmm0, zmm0, zmm27 + vaesenc zmm1, zmm1, zmm27 + vaesenc zmm2, zmm2, zmm27 + vaesenc zmm3, zmm3, zmm27 + vmovdqa64 zmm13, zmm28 +L_AES_CTR_encrypt_avx512_256_aes_enc_block_last: + vaesenclast zmm0, zmm0, zmm13 + vaesenclast zmm1, zmm1, zmm13 + vaesenclast zmm2, zmm2, zmm13 + vaesenclast zmm3, zmm3, zmm13 + vpxorq zmm0, zmm0, [rcx+rax] + vpxorq zmm1, zmm1, [rcx+rax+64] + vpxorq zmm2, zmm2, [rcx+rax+128] + vpxorq zmm3, zmm3, [rcx+rax+192] + vmovdqu64 [rdx+rax], zmm0 + vmovdqu64 [rdx+rax+64], zmm1 + vmovdqu64 [rdx+rax+128], zmm2 + vmovdqu64 [rdx+rax+192], zmm3 + add eax, 256 + cmp eax, ebx + jl L_AES_CTR_encrypt_avx512_enc_256 + ; lane 0 of zmm4 is the next unused counter; copy it to every lane of zmm7 + vshufi64x2 zmm7, zmm4, zmm4, 0 +L_AES_CTR_encrypt_avx512_done_256: + mov ebx, r8d + and ebx, 4294967232 + cmp eax, ebx + je L_AES_CTR_encrypt_avx512_done_64
+L_AES_CTR_encrypt_avx512_enc_64: + ; 64 bytes of input + ; aes_ctr_enc_64 + vpaddq zmm0, zmm7, ZMMWORD PTR [L_aes_ctr_inc_avx512] + vmovdqa64 zmm9, zmm7 + vpternlogq zmm9, zmm0, ZMMWORD PTR [L_aes_ctr_inc_avx512], 178 + vpsrlq zmm9, zmm9, 63 + vpslldq zmm9, zmm9, 8 + vpaddq zmm0, zmm0, zmm9 + vpshufb zmm0, zmm0, zmm8 + vmovdqa64 zmm9, zmm7 + vpaddq zmm7, zmm7, zmm11 + vpternlogq zmm9, zmm7, zmm11, 178 + vpsrlq zmm9, zmm9, 63 + vpslldq zmm9, zmm9, 8 + vpaddq zmm7, zmm7, zmm9 + ; aes_enc_block + vpxorq zmm0, zmm0, zmm14 + vaesenc zmm0, zmm0, zmm15 + vaesenc zmm0, zmm0, zmm16 + vaesenc zmm0, zmm0, zmm17 + vaesenc zmm0, zmm0, zmm18 + vaesenc zmm0, zmm0, zmm19 + vaesenc zmm0, zmm0, zmm20 + vaesenc zmm0, zmm0, zmm21 + vaesenc zmm0, zmm0, zmm22 + vaesenc zmm0, zmm0, zmm23 + cmp r11d, 11 + vmovdqa64 zmm13, zmm24 + jl L_AES_CTR_encrypt_avx512_64_aes_enc_block_last + vaesenc zmm0, zmm0, zmm24 + vaesenc zmm0, zmm0, zmm25 + cmp r11d, 13 + vmovdqa64 zmm13, zmm26 + jl L_AES_CTR_encrypt_avx512_64_aes_enc_block_last + vaesenc zmm0, zmm0, zmm26 + vaesenc zmm0, zmm0, zmm27 + vmovdqa64 zmm13, zmm28 +L_AES_CTR_encrypt_avx512_64_aes_enc_block_last: + vaesenclast zmm0, zmm0, zmm13 + vpxorq zmm0, zmm0, [rcx+rax] + vmovdqu64 [rdx+rax], zmm0 + add eax, 64 + cmp eax, ebx + jl L_AES_CTR_encrypt_avx512_enc_64 +L_AES_CTR_encrypt_avx512_done_64: + cmp eax, r8d + mov ebx, r8d + je L_AES_CTR_encrypt_avx512_done_enc + and ebx, 4294967280 +L_AES_CTR_encrypt_avx512_enc_16: + ; 16 bytes of input + vpshufb xmm0, xmm7, xmm8 + vmovdqa64 zmm9, zmm7 + vpaddq zmm7, zmm7, zmm12 + vpternlogq zmm9, zmm7, zmm12, 178 + vpsrlq zmm9, zmm9, 63 + vpslldq zmm9, zmm9, 8 + vpaddq zmm7, zmm7, zmm9 + ; aes_enc_block + vpxor xmm0, xmm0, [r9] + vmovdqu xmm5, OWORD PTR [r9+16] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+32] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+48] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+64] + vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r9+80] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+96] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+112] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+128] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+144] + vaesenc xmm0, xmm0, xmm5 + cmp r11d, 11 + vmovdqu xmm5, OWORD PTR [r9+160] + jl L_AES_CTR_encrypt_avx512_16_aes_enc_block_last + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r9+176] + vaesenc xmm0, xmm0, xmm6 + cmp r11d, 13 + vmovdqu xmm5, OWORD PTR [r9+192] + jl L_AES_CTR_encrypt_avx512_16_aes_enc_block_last + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r9+208] + vaesenc xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r9+224] +L_AES_CTR_encrypt_avx512_16_aes_enc_block_last: + vaesenclast xmm0, xmm0, xmm5 + vpxor xmm0, xmm0, [rcx+rax] + vmovdqu OWORD PTR [rdx+rax], xmm0 + add eax, 16 + cmp eax, ebx + jl L_AES_CTR_encrypt_avx512_enc_16 +L_AES_CTR_encrypt_avx512_done_enc: + ; store the next counter, byte-reversed again, into the caller's block + vpshufb xmm0, xmm7, xmm8 + vmovdqu OWORD PTR [r10], xmm0 + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + vmovdqu xmm12, OWORD PTR [rsp+96] + vmovdqu xmm13, OWORD PTR [rsp+112] + vmovdqu xmm14, OWORD PTR [rsp+128] + vmovdqu xmm15, OWORD PTR [rsp+144] + add rsp, 160 + pop rbx + ret +AES_CTR_encrypt_avx512 ENDP +_TEXT ENDS +ENDIF +END diff --git a/wolfcrypt/src/aes_xts_asm.S b/wolfcrypt/src/aes_xts_asm.S index 09045c6d8f7..29f3a0174b4 100644 --- a/wolfcrypt/src/aes_xts_asm.S +++ b/wolfcrypt/src/aes_xts_asm.S @@ -46,6 +46,16 @@ #define HAVE_INTEL_AVX2 #endif /* HAVE_INTEL_AVX2 */ #endif /* NO_AVX2_SUPPORT */ +#ifndef NO_VAES_SUPPORT +#ifndef HAVE_INTEL_VAES +#define HAVE_INTEL_VAES +#endif /* HAVE_INTEL_VAES */ +#endif /* NO_VAES_SUPPORT */ +#ifndef NO_AVX512_SUPPORT +#ifndef HAVE_INTEL_AVX512
+#define HAVE_INTEL_AVX512 +#endif /* HAVE_INTEL_AVX512 */ +#endif /* NO_AVX512_SUPPORT */ #ifdef WOLFSSL_AES_XTS #ifdef WOLFSSL_X86_64_BUILD @@ -2785,6 +2795,4408 @@ L_AES_XTS_decrypt_update_avx1_done_dec: .size AES_XTS_decrypt_update_avx1,.-AES_XTS_decrypt_update_avx1 #endif /* __APPLE__ */ #endif /* HAVE_INTEL_AVX1 */ +#ifdef HAVE_INTEL_VAES /* AES_XTS_init_vaes: AES-encrypts the 16-byte block at (%rdi) in place with the round keys at (%rsi); %edx is compared against 11 and 13 to run 10, 12 or 14 rounds. Presumably called as (tweak, key schedule, rounds) under the System V AMD64 ABI - confirm against the C prototype. */ +#ifndef __APPLE__ +.text +.globl AES_XTS_init_vaes +.type AES_XTS_init_vaes,@function +.align 16 +AES_XTS_init_vaes: +#else +.section __TEXT,__text +.globl _AES_XTS_init_vaes +.p2align 4 +_AES_XTS_init_vaes: +#endif /* __APPLE__ */ + vmovdqu (%rdi), %xmm0 /* load the block to encrypt */ + # aes_enc_block + vpxor (%rsi), %xmm0, %xmm0 /* XOR in round key 0 */ + vmovdqu 16(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 32(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 48(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 64(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 80(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 96(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 112(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 128(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 144(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + cmpl $11, %edx /* fewer than 11 rounds: key 10 is the last round key */ + vmovdqu 160(%rsi), %xmm2 + jl L_AES_XTS_init_vaes_tweak_aes_enc_block_last + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 176(%rsi), %xmm3 + vaesenc %xmm3, %xmm0, %xmm0 + cmpl $13, %edx /* fewer than 13 rounds: key 12 is the last round key */ + vmovdqu 192(%rsi), %xmm2 + jl L_AES_XTS_init_vaes_tweak_aes_enc_block_last + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 208(%rsi), %xmm3 + vaesenc %xmm3, %xmm0, %xmm0 + vmovdqu 224(%rsi), %xmm2 +L_AES_XTS_init_vaes_tweak_aes_enc_block_last: + vaesenclast %xmm2, %xmm0, %xmm0 /* xmm2 holds the last round key on every path */ + vmovdqu %xmm0, (%rdi) /* write the encrypted block back over the input */ + repz retq +#ifndef __APPLE__ +.size AES_XTS_init_vaes,.-AES_XTS_init_vaes +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_vaes_aes_xts_gc_xts: +.long 0x00000087,0x00000000,0x00000001,0x00000000 +#ifndef __APPLE__
+.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_vaes_aes_xts_poly: +.long 0x00000087,0x00000000,0x00000000,0x00000000 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_vaes_aes_xts_shl: +.long 0x00000000,0x00000000,0x00000000,0x00000000 +.long 0x00000001,0x00000000,0x00000001,0x00000000 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_vaes_aes_xts_shr: +.long 0x00000040,0x00000000,0x00000040,0x00000000 +.long 0x0000003f,0x00000000,0x0000003f,0x00000000 +#ifndef __APPLE__ +.text +.globl AES_XTS_encrypt_vaes +.type AES_XTS_encrypt_vaes,@function +.align 16 +AES_XTS_encrypt_vaes: +#else +.section __TEXT,__text +.globl _AES_XTS_encrypt_vaes +.p2align 4 +_AES_XTS_encrypt_vaes: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + movq %rdx, %rax + movq %rcx, %r12 + movl 24(%rsp), %r10d + subq $0x40, %rsp + vmovdqu L_vaes_aes_xts_gc_xts(%rip), %xmm12 + vbroadcasti128 L_vaes_aes_xts_poly(%rip), %ymm13 + vmovdqu L_vaes_aes_xts_shl(%rip), %ymm14 + vmovdqu L_vaes_aes_xts_shr(%rip), %ymm15 + vmovdqu (%r12), %xmm8 + # aes_enc_block + vpxor (%r9), %xmm8, %xmm8 + vmovdqu 16(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 32(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 48(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 64(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 80(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 96(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 112(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 128(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 144(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + cmpl $11, %r10d + vmovdqu 160(%r9), %xmm5 + jl L_AES_XTS_encrypt_vaes_tweak_aes_enc_block_last + vaesenc %xmm5, %xmm8, %xmm8 + 
vmovdqu 176(%r9), %xmm6 + vaesenc %xmm6, %xmm8, %xmm8 + cmpl $13, %r10d + vmovdqu 192(%r9), %xmm5 + jl L_AES_XTS_encrypt_vaes_tweak_aes_enc_block_last + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 208(%r9), %xmm6 + vaesenc %xmm6, %xmm8, %xmm8 + vmovdqu 224(%r9), %xmm5 +L_AES_XTS_encrypt_vaes_tweak_aes_enc_block_last: + vaesenclast %xmm5, %xmm8, %xmm8 + xorl %r13d, %r13d + cmpl $32, %eax + jl L_AES_XTS_encrypt_vaes_done_128 + cmpl $0x80, %eax + movl %eax, %r11d + jl L_AES_XTS_encrypt_vaes_done_128 + andl $0xffffff80, %r11d + vperm2i128 $0x00, %ymm8, %ymm8, %ymm5 + vpsrlvq %ymm15, %ymm5, %ymm6 + vpclmulqdq $0x01, %ymm13, %ymm6, %ymm7 + vpslldq $8, %ymm6, %ymm6 + vpsllvq %ymm14, %ymm5, %ymm4 + vpxor %ymm7, %ymm4, %ymm4 + vpxor %ymm6, %ymm4, %ymm4 + vpsrlq $62, %ymm4, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $2, %ymm4, %ymm5 + vpxor %ymm10, %ymm5, %ymm5 + vpxor %ymm9, %ymm5, %ymm5 + vpsrlq $62, %ymm5, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $2, %ymm5, %ymm6 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm9, %ymm6, %ymm6 + vpsrlq $62, %ymm6, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $2, %ymm6, %ymm7 + vpxor %ymm10, %ymm7, %ymm7 + vpxor %ymm9, %ymm7, %ymm7 +L_AES_XTS_encrypt_vaes_enc_128: + # 128 bytes of input + # aes_enc_128 + leaq (%rdi,%r13,1), %rcx + leaq (%rsi,%r13,1), %rdx + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + # aes_enc_block + vbroadcasti128 (%r8), %ymm9 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm9, %ymm0, %ymm0 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm9, %ymm1, %ymm1 + vpxor %ymm6, %ymm2, %ymm2 + vpxor %ymm9, %ymm2, %ymm2 + vpxor %ymm7, %ymm3, %ymm3 + vpxor %ymm9, %ymm3, %ymm3 + vbroadcasti128 16(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 32(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 
+ vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 48(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 64(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 80(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 96(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 112(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 128(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 144(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + cmpl $11, %r10d + vbroadcasti128 160(%r8), %ymm9 + jl L_AES_XTS_encrypt_vaes_aes_enc_128_aes_enc_block_last + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 176(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + cmpl $13, %r10d + vbroadcasti128 192(%r8), %ymm9 + jl L_AES_XTS_encrypt_vaes_aes_enc_128_aes_enc_block_last + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 208(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 224(%r8), %ymm9 
+L_AES_XTS_encrypt_vaes_aes_enc_128_aes_enc_block_last: + vaesenclast %ymm9, %ymm0, %ymm0 + vaesenclast %ymm9, %ymm1, %ymm1 + vaesenclast %ymm9, %ymm2, %ymm2 + vaesenclast %ymm9, %ymm3, %ymm3 + vpxor %ymm4, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vpsrlq $56, %ymm4, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $8, %ymm4, %ymm4 + vpxor %ymm10, %ymm4, %ymm4 + vpxor %ymm9, %ymm4, %ymm4 + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vpsrlq $56, %ymm5, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $8, %ymm5, %ymm5 + vpxor %ymm10, %ymm5, %ymm5 + vpxor %ymm9, %ymm5, %ymm5 + vpxor %ymm6, %ymm2, %ymm2 + vmovdqu %ymm2, 64(%rdx) + vpsrlq $56, %ymm6, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $8, %ymm6, %ymm6 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm9, %ymm6, %ymm6 + vpxor %ymm7, %ymm3, %ymm3 + vmovdqu %ymm3, 96(%rdx) + vpsrlq $56, %ymm7, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $8, %ymm7, %ymm7 + vpxor %ymm10, %ymm7, %ymm7 + vpxor %ymm9, %ymm7, %ymm7 + addl $0x80, %r13d + cmpl %r11d, %r13d + jl L_AES_XTS_encrypt_vaes_enc_128 + vextracti128 $0x00, %ymm4, %xmm8 +L_AES_XTS_encrypt_vaes_done_128: + movl %eax, %r11d + andl $0xffffffc0, %r11d + cmpl %r11d, %r13d + je L_AES_XTS_encrypt_vaes_done_64 + # 64 bytes of input + # aes_enc_64 + leaq (%rdi,%r13,1), %rcx + leaq (%rsi,%r13,1), %rdx + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vperm2i128 $0x00, %ymm8, %ymm8, %ymm5 + vpsrlvq %ymm15, %ymm5, %ymm6 + vpclmulqdq $0x01, %ymm13, %ymm6, %ymm7 + vpslldq $8, %ymm6, %ymm6 + vpsllvq %ymm14, %ymm5, %ymm4 + vpxor %ymm7, %ymm4, %ymm4 + vpxor %ymm6, %ymm4, %ymm4 + vpsrlq $62, %ymm4, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $2, %ymm4, %ymm5 + vpxor %ymm10, %ymm5, %ymm5 + vpxor %ymm9, %ymm5, %ymm5 + # aes_enc_block + vbroadcasti128 (%r8), %ymm9 + vpxor %ymm4, %ymm0, %ymm0 + vpxor 
%ymm9, %ymm0, %ymm0 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm9, %ymm1, %ymm1 + vbroadcasti128 16(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 32(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 48(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 64(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 80(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 96(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 112(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 128(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 144(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + cmpl $11, %r10d + vbroadcasti128 160(%r8), %ymm9 + jl L_AES_XTS_encrypt_vaes_aes_enc_64_aes_enc_block_last + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 176(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + cmpl $13, %r10d + vbroadcasti128 192(%r8), %ymm9 + jl L_AES_XTS_encrypt_vaes_aes_enc_64_aes_enc_block_last + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 208(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 224(%r8), %ymm9 +L_AES_XTS_encrypt_vaes_aes_enc_64_aes_enc_block_last: + vaesenclast %ymm9, %ymm0, %ymm0 + vaesenclast %ymm9, %ymm1, %ymm1 + vpxor %ymm4, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vextracti128 $0x01, %ymm5, %xmm8 + vpshufd $19, %xmm8, %xmm9 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm9, %xmm9 + vpand %xmm12, %xmm9, %xmm9 + vpxor %xmm9, %xmm8, %xmm8 + addl $0x40, %r13d +L_AES_XTS_encrypt_vaes_done_64: + movl %eax, %r11d + andl $0xffffffe0, %r11d + cmpl %r11d, %r13d + je 
L_AES_XTS_encrypt_vaes_done_32 + # 32 bytes of input + # aes_enc_32 + leaq (%rdi,%r13,1), %rcx + leaq (%rsi,%r13,1), %rdx + vmovdqu (%rcx), %ymm0 + vperm2i128 $0x00, %ymm8, %ymm8, %ymm5 + vpsrlvq %ymm15, %ymm5, %ymm6 + vpclmulqdq $0x01, %ymm13, %ymm6, %ymm7 + vpslldq $8, %ymm6, %ymm6 + vpsllvq %ymm14, %ymm5, %ymm4 + vpxor %ymm7, %ymm4, %ymm4 + vpxor %ymm6, %ymm4, %ymm4 + # aes_enc_block + vbroadcasti128 (%r8), %ymm9 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm9, %ymm0, %ymm0 + vbroadcasti128 16(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 32(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 48(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 64(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 80(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 96(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 112(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 128(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 144(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + cmpl $11, %r10d + vbroadcasti128 160(%r8), %ymm9 + jl L_AES_XTS_encrypt_vaes_aes_enc_32_aes_enc_block_last + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 176(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + cmpl $13, %r10d + vbroadcasti128 192(%r8), %ymm9 + jl L_AES_XTS_encrypt_vaes_aes_enc_32_aes_enc_block_last + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 208(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 224(%r8), %ymm9 +L_AES_XTS_encrypt_vaes_aes_enc_32_aes_enc_block_last: + vaesenclast %ymm9, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vextracti128 $0x01, %ymm4, %xmm8 + vpshufd $19, %xmm8, %xmm9 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm9, %xmm9 + vpand %xmm12, %xmm9, %xmm9 + vpxor %xmm9, %xmm8, %xmm8 + addl $32, %r13d +L_AES_XTS_encrypt_vaes_done_32: + cmpl %eax, %r13d + movl %eax, %r11d + je L_AES_XTS_encrypt_vaes_done_enc + subl %r13d, %r11d + cmpl $16, %r11d + movl %eax, %r11d + jl 
L_AES_XTS_encrypt_vaes_last_15 + andl $0xfffffff0, %r11d + # 16 bytes of input +L_AES_XTS_encrypt_vaes_enc_16: + leaq (%rdi,%r13,1), %rcx + vmovdqu (%rcx), %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + # aes_enc_block + vpxor (%r8), %xmm0, %xmm0 + vmovdqu 16(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + cmpl $11, %r10d + vmovdqu 160(%r8), %xmm5 + jl L_AES_XTS_encrypt_vaes_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r8), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + cmpl $13, %r10d + vmovdqu 192(%r8), %xmm5 + jl L_AES_XTS_encrypt_vaes_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r8), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r8), %xmm5 +L_AES_XTS_encrypt_vaes_aes_enc_block_last: + vaesenclast %xmm5, %xmm0, %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + leaq (%rsi,%r13,1), %rcx + vmovdqu %xmm0, (%rcx) + vpshufd $19, %xmm8, %xmm4 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm4, %xmm4 + vpand %xmm12, %xmm4, %xmm4 + vpxor %xmm4, %xmm8, %xmm8 + addl $16, %r13d + cmpl %r11d, %r13d + jl L_AES_XTS_encrypt_vaes_enc_16 + cmpl %eax, %r13d + je L_AES_XTS_encrypt_vaes_done_enc +L_AES_XTS_encrypt_vaes_last_15: + subq $16, %r13 + leaq (%rsi,%r13,1), %rcx + vmovdqu (%rcx), %xmm0 + addq $16, %r13 + vmovdqu %xmm0, (%rsp) + xorq %rdx, %rdx +L_AES_XTS_encrypt_vaes_last_15_byte_loop: + movb (%rsp,%rdx,1), %r11b + movb (%rdi,%r13,1), %cl + movb %r11b, (%rsi,%r13,1) + movb %cl, (%rsp,%rdx,1) + incl %r13d + incl %edx + cmpl %eax, %r13d + jl L_AES_XTS_encrypt_vaes_last_15_byte_loop + subq %rdx, %r13 + vmovdqu (%rsp), %xmm0 + subq $16, 
%r13 + vpxor %xmm8, %xmm0, %xmm0 + # aes_enc_block + vpxor (%r8), %xmm0, %xmm0 + vmovdqu 16(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + cmpl $11, %r10d + vmovdqu 160(%r8), %xmm5 + jl L_AES_XTS_encrypt_vaes_last_15_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r8), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + cmpl $13, %r10d + vmovdqu 192(%r8), %xmm5 + jl L_AES_XTS_encrypt_vaes_last_15_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r8), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r8), %xmm5 +L_AES_XTS_encrypt_vaes_last_15_aes_enc_block_last: + vaesenclast %xmm5, %xmm0, %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + leaq (%rsi,%r13,1), %rcx + vmovdqu %xmm0, (%rcx) +L_AES_XTS_encrypt_vaes_done_enc: + addq $0x40, %rsp + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size AES_XTS_encrypt_vaes,.-AES_XTS_encrypt_vaes +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_XTS_encrypt_update_vaes +.type AES_XTS_encrypt_update_vaes,@function +.align 16 +AES_XTS_encrypt_update_vaes: +#else +.section __TEXT,__text +.globl _AES_XTS_encrypt_update_vaes +.p2align 4 +_AES_XTS_encrypt_update_vaes: +#endif /* __APPLE__ */ + pushq %r12 + movq %rdx, %rax + movq %rcx, %r10 + subq $0x40, %rsp + vmovdqu L_vaes_aes_xts_gc_xts(%rip), %xmm12 + vbroadcasti128 L_vaes_aes_xts_poly(%rip), %ymm13 + vmovdqu L_vaes_aes_xts_shl(%rip), %ymm14 + vmovdqu L_vaes_aes_xts_shr(%rip), %ymm15 + vmovdqu (%r8), %xmm8 + xorl %r12d, %r12d + cmpl $32, %eax + jl L_AES_XTS_encrypt_update_vaes_done_128 + cmpl $0x80, %eax + movl %eax, %r11d + 
jl L_AES_XTS_encrypt_update_vaes_done_128 + andl $0xffffff80, %r11d + vperm2i128 $0x00, %ymm8, %ymm8, %ymm5 + vpsrlvq %ymm15, %ymm5, %ymm6 + vpclmulqdq $0x01, %ymm13, %ymm6, %ymm7 + vpslldq $8, %ymm6, %ymm6 + vpsllvq %ymm14, %ymm5, %ymm4 + vpxor %ymm7, %ymm4, %ymm4 + vpxor %ymm6, %ymm4, %ymm4 + vpsrlq $62, %ymm4, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $2, %ymm4, %ymm5 + vpxor %ymm10, %ymm5, %ymm5 + vpxor %ymm9, %ymm5, %ymm5 + vpsrlq $62, %ymm5, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $2, %ymm5, %ymm6 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm9, %ymm6, %ymm6 + vpsrlq $62, %ymm6, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $2, %ymm6, %ymm7 + vpxor %ymm10, %ymm7, %ymm7 + vpxor %ymm9, %ymm7, %ymm7 +L_AES_XTS_encrypt_update_vaes_enc_128: + # 128 bytes of input + # aes_enc_128 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + # aes_enc_block + vbroadcasti128 (%r10), %ymm9 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm9, %ymm0, %ymm0 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm9, %ymm1, %ymm1 + vpxor %ymm6, %ymm2, %ymm2 + vpxor %ymm9, %ymm2, %ymm2 + vpxor %ymm7, %ymm3, %ymm3 + vpxor %ymm9, %ymm3, %ymm3 + vbroadcasti128 16(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 32(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 48(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 64(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 80(%r10), %ymm9 + vaesenc 
%ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 96(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 112(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 128(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 144(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + cmpl $11, %r9d + vbroadcasti128 160(%r10), %ymm9 + jl L_AES_XTS_encrypt_update_vaes_aes_enc_128_aes_enc_block_last + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 176(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + cmpl $13, %r9d + vbroadcasti128 192(%r10), %ymm9 + jl L_AES_XTS_encrypt_update_vaes_aes_enc_128_aes_enc_block_last + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 208(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 224(%r10), %ymm9 +L_AES_XTS_encrypt_update_vaes_aes_enc_128_aes_enc_block_last: + vaesenclast %ymm9, %ymm0, %ymm0 + vaesenclast %ymm9, %ymm1, %ymm1 + vaesenclast %ymm9, %ymm2, %ymm2 + vaesenclast %ymm9, %ymm3, %ymm3 + vpxor %ymm4, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vpsrlq $56, %ymm4, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $8, %ymm4, %ymm4 + vpxor %ymm10, %ymm4, %ymm4 + vpxor %ymm9, %ymm4, %ymm4 + vpxor %ymm5, %ymm1, %ymm1 
+ vmovdqu %ymm1, 32(%rdx) + vpsrlq $56, %ymm5, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $8, %ymm5, %ymm5 + vpxor %ymm10, %ymm5, %ymm5 + vpxor %ymm9, %ymm5, %ymm5 + vpxor %ymm6, %ymm2, %ymm2 + vmovdqu %ymm2, 64(%rdx) + vpsrlq $56, %ymm6, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $8, %ymm6, %ymm6 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm9, %ymm6, %ymm6 + vpxor %ymm7, %ymm3, %ymm3 + vmovdqu %ymm3, 96(%rdx) + vpsrlq $56, %ymm7, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $8, %ymm7, %ymm7 + vpxor %ymm10, %ymm7, %ymm7 + vpxor %ymm9, %ymm7, %ymm7 + addl $0x80, %r12d + cmpl %r11d, %r12d + jl L_AES_XTS_encrypt_update_vaes_enc_128 + vextracti128 $0x00, %ymm4, %xmm8 +L_AES_XTS_encrypt_update_vaes_done_128: + movl %eax, %r11d + andl $0xffffffc0, %r11d + cmpl %r11d, %r12d + je L_AES_XTS_encrypt_update_vaes_done_64 + # 64 bytes of input + # aes_enc_64 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vperm2i128 $0x00, %ymm8, %ymm8, %ymm5 + vpsrlvq %ymm15, %ymm5, %ymm6 + vpclmulqdq $0x01, %ymm13, %ymm6, %ymm7 + vpslldq $8, %ymm6, %ymm6 + vpsllvq %ymm14, %ymm5, %ymm4 + vpxor %ymm7, %ymm4, %ymm4 + vpxor %ymm6, %ymm4, %ymm4 + vpsrlq $62, %ymm4, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $2, %ymm4, %ymm5 + vpxor %ymm10, %ymm5, %ymm5 + vpxor %ymm9, %ymm5, %ymm5 + # aes_enc_block + vbroadcasti128 (%r10), %ymm9 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm9, %ymm0, %ymm0 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm9, %ymm1, %ymm1 + vbroadcasti128 16(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 32(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 48(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 64(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc 
%ymm9, %ymm1, %ymm1 + vbroadcasti128 80(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 96(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 112(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 128(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 144(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + cmpl $11, %r9d + vbroadcasti128 160(%r10), %ymm9 + jl L_AES_XTS_encrypt_update_vaes_aes_enc_64_aes_enc_block_last + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 176(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + cmpl $13, %r9d + vbroadcasti128 192(%r10), %ymm9 + jl L_AES_XTS_encrypt_update_vaes_aes_enc_64_aes_enc_block_last + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 208(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 224(%r10), %ymm9 +L_AES_XTS_encrypt_update_vaes_aes_enc_64_aes_enc_block_last: + vaesenclast %ymm9, %ymm0, %ymm0 + vaesenclast %ymm9, %ymm1, %ymm1 + vpxor %ymm4, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vextracti128 $0x01, %ymm5, %xmm8 + vpshufd $19, %xmm8, %xmm9 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm9, %xmm9 + vpand %xmm12, %xmm9, %xmm9 + vpxor %xmm9, %xmm8, %xmm8 + addl $0x40, %r12d +L_AES_XTS_encrypt_update_vaes_done_64: + movl %eax, %r11d + andl $0xffffffe0, %r11d + cmpl %r11d, %r12d + je L_AES_XTS_encrypt_update_vaes_done_32 + # 32 bytes of input + # aes_enc_32 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu (%rcx), %ymm0 + vperm2i128 $0x00, %ymm8, %ymm8, %ymm5 + vpsrlvq %ymm15, %ymm5, %ymm6 + vpclmulqdq $0x01, %ymm13, %ymm6, %ymm7 + vpslldq $8, %ymm6, %ymm6 + vpsllvq %ymm14, %ymm5, %ymm4 + vpxor %ymm7, %ymm4, %ymm4 + vpxor %ymm6, %ymm4, %ymm4 + # 
aes_enc_block + vbroadcasti128 (%r10), %ymm9 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm9, %ymm0, %ymm0 + vbroadcasti128 16(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 32(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 48(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 64(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 80(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 96(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 112(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 128(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 144(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + cmpl $11, %r9d + vbroadcasti128 160(%r10), %ymm9 + jl L_AES_XTS_encrypt_update_vaes_aes_enc_32_aes_enc_block_last + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 176(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + cmpl $13, %r9d + vbroadcasti128 192(%r10), %ymm9 + jl L_AES_XTS_encrypt_update_vaes_aes_enc_32_aes_enc_block_last + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 208(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 224(%r10), %ymm9 +L_AES_XTS_encrypt_update_vaes_aes_enc_32_aes_enc_block_last: + vaesenclast %ymm9, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vextracti128 $0x01, %ymm4, %xmm8 + vpshufd $19, %xmm8, %xmm9 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm9, %xmm9 + vpand %xmm12, %xmm9, %xmm9 + vpxor %xmm9, %xmm8, %xmm8 + addl $32, %r12d +L_AES_XTS_encrypt_update_vaes_done_32: + cmpl %eax, %r12d + movl %eax, %r11d + je L_AES_XTS_encrypt_update_vaes_done_enc + subl %r12d, %r11d + cmpl $16, %r11d + movl %eax, %r11d + jl L_AES_XTS_encrypt_update_vaes_last_15 + andl $0xfffffff0, %r11d + # 16 bytes of input +L_AES_XTS_encrypt_update_vaes_enc_16: + leaq (%rdi,%r12,1), %rcx + vmovdqu (%rcx), %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + # aes_enc_block + vpxor (%r10), %xmm0, %xmm0 + vmovdqu 16(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r10), %xmm5 + vaesenc 
%xmm5, %xmm0, %xmm0 + vmovdqu 48(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + cmpl $11, %r9d + vmovdqu 160(%r10), %xmm5 + jl L_AES_XTS_encrypt_update_vaes_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r10), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + cmpl $13, %r9d + vmovdqu 192(%r10), %xmm5 + jl L_AES_XTS_encrypt_update_vaes_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r10), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r10), %xmm5 +L_AES_XTS_encrypt_update_vaes_aes_enc_block_last: + vaesenclast %xmm5, %xmm0, %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + leaq (%rsi,%r12,1), %rcx + vmovdqu %xmm0, (%rcx) + vpshufd $19, %xmm8, %xmm4 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm4, %xmm4 + vpand %xmm12, %xmm4, %xmm4 + vpxor %xmm4, %xmm8, %xmm8 + addl $16, %r12d + cmpl %r11d, %r12d + jl L_AES_XTS_encrypt_update_vaes_enc_16 + cmpl %eax, %r12d + je L_AES_XTS_encrypt_update_vaes_done_enc +L_AES_XTS_encrypt_update_vaes_last_15: + subq $16, %r12 + leaq (%rsi,%r12,1), %rcx + vmovdqu (%rcx), %xmm0 + addq $16, %r12 + vmovdqu %xmm0, (%rsp) + xorq %rdx, %rdx +L_AES_XTS_encrypt_update_vaes_last_15_byte_loop: + movb (%rsp,%rdx,1), %r11b + movb (%rdi,%r12,1), %cl + movb %r11b, (%rsi,%r12,1) + movb %cl, (%rsp,%rdx,1) + incl %r12d + incl %edx + cmpl %eax, %r12d + jl L_AES_XTS_encrypt_update_vaes_last_15_byte_loop + subq %rdx, %r12 + vmovdqu (%rsp), %xmm0 + subq $16, %r12 + vpxor %xmm8, %xmm0, %xmm0 + # aes_enc_block + vpxor (%r10), %xmm0, %xmm0 + vmovdqu 16(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 
64(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + cmpl $11, %r9d + vmovdqu 160(%r10), %xmm5 + jl L_AES_XTS_encrypt_update_vaes_last_15_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r10), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + cmpl $13, %r9d + vmovdqu 192(%r10), %xmm5 + jl L_AES_XTS_encrypt_update_vaes_last_15_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r10), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r10), %xmm5 +L_AES_XTS_encrypt_update_vaes_last_15_aes_enc_block_last: + vaesenclast %xmm5, %xmm0, %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + leaq (%rsi,%r12,1), %rcx + vmovdqu %xmm0, (%rcx) +L_AES_XTS_encrypt_update_vaes_done_enc: + vmovdqu %xmm8, (%r8) + addq $0x40, %rsp + popq %r12 + repz retq +#ifndef __APPLE__ +.size AES_XTS_encrypt_update_vaes,.-AES_XTS_encrypt_update_vaes +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_XTS_decrypt_vaes +.type AES_XTS_decrypt_vaes,@function +.align 16 +AES_XTS_decrypt_vaes: +#else +.section __TEXT,__text +.globl _AES_XTS_decrypt_vaes +.p2align 4 +_AES_XTS_decrypt_vaes: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + movq %rdx, %rax + movq %rcx, %r12 + movl 24(%rsp), %r10d + subq $0x40, %rsp + vmovdqu L_vaes_aes_xts_gc_xts(%rip), %xmm12 + vbroadcasti128 L_vaes_aes_xts_poly(%rip), %ymm13 + vmovdqu L_vaes_aes_xts_shl(%rip), %ymm14 + vmovdqu L_vaes_aes_xts_shr(%rip), %ymm15 + vmovdqu (%r12), %xmm8 + # aes_enc_block + vpxor (%r9), %xmm8, %xmm8 + vmovdqu 16(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 32(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 48(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 64(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 80(%r9), %xmm5 + vaesenc %xmm5, 
%xmm8, %xmm8 + vmovdqu 96(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 112(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 128(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 144(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + cmpl $11, %r10d + vmovdqu 160(%r9), %xmm5 + jl L_AES_XTS_decrypt_vaes_tweak_aes_enc_block_last + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 176(%r9), %xmm6 + vaesenc %xmm6, %xmm8, %xmm8 + cmpl $13, %r10d + vmovdqu 192(%r9), %xmm5 + jl L_AES_XTS_decrypt_vaes_tweak_aes_enc_block_last + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 208(%r9), %xmm6 + vaesenc %xmm6, %xmm8, %xmm8 + vmovdqu 224(%r9), %xmm5 +L_AES_XTS_decrypt_vaes_tweak_aes_enc_block_last: + vaesenclast %xmm5, %xmm8, %xmm8 + xorl %r13d, %r13d + movl %eax, %r11d + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_vaes_mul16_128 + subl $16, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_vaes_last_31_start +L_AES_XTS_decrypt_vaes_mul16_128: + cmpl $32, %r11d + jl L_AES_XTS_decrypt_vaes_done_128 + cmpl $0x80, %r11d + jl L_AES_XTS_decrypt_vaes_done_128 + andl $0xffffff80, %r11d + vperm2i128 $0x00, %ymm8, %ymm8, %ymm5 + vpsrlvq %ymm15, %ymm5, %ymm6 + vpclmulqdq $0x01, %ymm13, %ymm6, %ymm7 + vpslldq $8, %ymm6, %ymm6 + vpsllvq %ymm14, %ymm5, %ymm4 + vpxor %ymm7, %ymm4, %ymm4 + vpxor %ymm6, %ymm4, %ymm4 + vpsrlq $62, %ymm4, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $2, %ymm4, %ymm5 + vpxor %ymm10, %ymm5, %ymm5 + vpxor %ymm9, %ymm5, %ymm5 + vpsrlq $62, %ymm5, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $2, %ymm5, %ymm6 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm9, %ymm6, %ymm6 + vpsrlq $62, %ymm6, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $2, %ymm6, %ymm7 + vpxor %ymm10, %ymm7, %ymm7 + vpxor %ymm9, %ymm7, %ymm7 +L_AES_XTS_decrypt_vaes_dec_128: + # 128 bytes of input + # aes_dec_128 + leaq (%rdi,%r13,1), %rcx + leaq (%rsi,%r13,1), %rdx + vmovdqu (%rcx), %ymm0 + 
vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + # aes_dec_block + vbroadcasti128 (%r8), %ymm9 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm9, %ymm0, %ymm0 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm9, %ymm1, %ymm1 + vpxor %ymm6, %ymm2, %ymm2 + vpxor %ymm9, %ymm2, %ymm2 + vpxor %ymm7, %ymm3, %ymm3 + vpxor %ymm9, %ymm3, %ymm3 + vbroadcasti128 16(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 32(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 48(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 64(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 80(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 96(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 112(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 128(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 144(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + cmpl $11, %r10d + vbroadcasti128 160(%r8), %ymm9 + jl L_AES_XTS_decrypt_vaes_aes_dec_128_aes_dec_block_last + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 176(%r8), %ymm9 + vaesdec %ymm9, 
%ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + cmpl $13, %r10d + vbroadcasti128 192(%r8), %ymm9 + jl L_AES_XTS_decrypt_vaes_aes_dec_128_aes_dec_block_last + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 208(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 224(%r8), %ymm9 +L_AES_XTS_decrypt_vaes_aes_dec_128_aes_dec_block_last: + vaesdeclast %ymm9, %ymm0, %ymm0 + vaesdeclast %ymm9, %ymm1, %ymm1 + vaesdeclast %ymm9, %ymm2, %ymm2 + vaesdeclast %ymm9, %ymm3, %ymm3 + vpxor %ymm4, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vpsrlq $56, %ymm4, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $8, %ymm4, %ymm4 + vpxor %ymm10, %ymm4, %ymm4 + vpxor %ymm9, %ymm4, %ymm4 + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vpsrlq $56, %ymm5, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $8, %ymm5, %ymm5 + vpxor %ymm10, %ymm5, %ymm5 + vpxor %ymm9, %ymm5, %ymm5 + vpxor %ymm6, %ymm2, %ymm2 + vmovdqu %ymm2, 64(%rdx) + vpsrlq $56, %ymm6, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $8, %ymm6, %ymm6 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm9, %ymm6, %ymm6 + vpxor %ymm7, %ymm3, %ymm3 + vmovdqu %ymm3, 96(%rdx) + vpsrlq $56, %ymm7, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $8, %ymm7, %ymm7 + vpxor %ymm10, %ymm7, %ymm7 + vpxor %ymm9, %ymm7, %ymm7 + addl $0x80, %r13d + cmpl %r11d, %r13d + jl L_AES_XTS_decrypt_vaes_dec_128 + vextracti128 $0x00, %ymm4, %xmm8 +L_AES_XTS_decrypt_vaes_done_128: + cmpl %eax, %r13d + movl %eax, %r11d + je L_AES_XTS_decrypt_vaes_done_dec + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_vaes_mul16_64 + subl $16, %r11d + subl %r13d, %r11d + cmpl $16, %r11d 
+ jl L_AES_XTS_decrypt_vaes_last_31_start + addl %r13d, %r11d +L_AES_XTS_decrypt_vaes_mul16_64: + andl $0xffffffc0, %r11d + cmpl %r11d, %r13d + je L_AES_XTS_decrypt_vaes_done_64 + # 64 bytes of input + # aes_dec_64 + leaq (%rdi,%r13,1), %rcx + leaq (%rsi,%r13,1), %rdx + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vperm2i128 $0x00, %ymm8, %ymm8, %ymm5 + vpsrlvq %ymm15, %ymm5, %ymm6 + vpclmulqdq $0x01, %ymm13, %ymm6, %ymm7 + vpslldq $8, %ymm6, %ymm6 + vpsllvq %ymm14, %ymm5, %ymm4 + vpxor %ymm7, %ymm4, %ymm4 + vpxor %ymm6, %ymm4, %ymm4 + vpsrlq $62, %ymm4, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $2, %ymm4, %ymm5 + vpxor %ymm10, %ymm5, %ymm5 + vpxor %ymm9, %ymm5, %ymm5 + # aes_dec_block + vbroadcasti128 (%r8), %ymm9 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm9, %ymm0, %ymm0 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm9, %ymm1, %ymm1 + vbroadcasti128 16(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 32(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 48(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 64(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 80(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 96(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 112(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 128(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 144(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + cmpl $11, %r10d + vbroadcasti128 160(%r8), %ymm9 + jl L_AES_XTS_decrypt_vaes_aes_dec_64_aes_dec_block_last + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 176(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + cmpl $13, %r10d + 
vbroadcasti128 192(%r8), %ymm9 + jl L_AES_XTS_decrypt_vaes_aes_dec_64_aes_dec_block_last + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 208(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 224(%r8), %ymm9 +L_AES_XTS_decrypt_vaes_aes_dec_64_aes_dec_block_last: + vaesdeclast %ymm9, %ymm0, %ymm0 + vaesdeclast %ymm9, %ymm1, %ymm1 + vpxor %ymm4, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vextracti128 $0x01, %ymm5, %xmm8 + vpshufd $19, %xmm8, %xmm9 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm9, %xmm9 + vpand %xmm12, %xmm9, %xmm9 + vpxor %xmm9, %xmm8, %xmm8 + addl $0x40, %r13d +L_AES_XTS_decrypt_vaes_done_64: + cmpl %eax, %r13d + movl %eax, %r11d + je L_AES_XTS_decrypt_vaes_done_dec + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_vaes_mul16_32 + subl $16, %r11d + subl %r13d, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_vaes_last_31_start + addl %r13d, %r11d +L_AES_XTS_decrypt_vaes_mul16_32: + andl $0xffffffe0, %r11d + cmpl %r11d, %r13d + je L_AES_XTS_decrypt_vaes_done_32 + # 32 bytes of input + # aes_dec_32 + leaq (%rdi,%r13,1), %rcx + leaq (%rsi,%r13,1), %rdx + vmovdqu (%rcx), %ymm0 + vperm2i128 $0x00, %ymm8, %ymm8, %ymm5 + vpsrlvq %ymm15, %ymm5, %ymm6 + vpclmulqdq $0x01, %ymm13, %ymm6, %ymm7 + vpslldq $8, %ymm6, %ymm6 + vpsllvq %ymm14, %ymm5, %ymm4 + vpxor %ymm7, %ymm4, %ymm4 + vpxor %ymm6, %ymm4, %ymm4 + # aes_dec_block + vbroadcasti128 (%r8), %ymm9 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm9, %ymm0, %ymm0 + vbroadcasti128 16(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 32(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 48(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 64(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 80(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 96(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 112(%r8), %ymm9 + vaesdec %ymm9, %ymm0, 
%ymm0 + vbroadcasti128 128(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 144(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + cmpl $11, %r10d + vbroadcasti128 160(%r8), %ymm9 + jl L_AES_XTS_decrypt_vaes_aes_dec_32_aes_dec_block_last + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 176(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + cmpl $13, %r10d + vbroadcasti128 192(%r8), %ymm9 + jl L_AES_XTS_decrypt_vaes_aes_dec_32_aes_dec_block_last + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 208(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 224(%r8), %ymm9 +L_AES_XTS_decrypt_vaes_aes_dec_32_aes_dec_block_last: + vaesdeclast %ymm9, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vextracti128 $0x01, %ymm4, %xmm8 + vpshufd $19, %xmm8, %xmm9 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm9, %xmm9 + vpand %xmm12, %xmm9, %xmm9 + vpxor %xmm9, %xmm8, %xmm8 + addl $32, %r13d +L_AES_XTS_decrypt_vaes_done_32: + cmpl %eax, %r13d + movl %eax, %r11d + je L_AES_XTS_decrypt_vaes_done_dec + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_vaes_mul16 + subl $16, %r11d + subl %r13d, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_vaes_last_31_start + addl %r13d, %r11d +L_AES_XTS_decrypt_vaes_mul16: +L_AES_XTS_decrypt_vaes_dec_16: + # 16 bytes of input + leaq (%rdi,%r13,1), %rcx + vmovdqu (%rcx), %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + # aes_dec_block + vpxor (%r8), %xmm0, %xmm0 + vmovdqu 16(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + cmpl $11, %r10d + vmovdqu 160(%r8), %xmm5 + jl 
L_AES_XTS_decrypt_vaes_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + cmpl $13, %r10d + vmovdqu 192(%r8), %xmm5 + jl L_AES_XTS_decrypt_vaes_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r8), %xmm5 +L_AES_XTS_decrypt_vaes_aes_dec_block_last: + vaesdeclast %xmm5, %xmm0, %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + leaq (%rsi,%r13,1), %rcx + vmovdqu %xmm0, (%rcx) + vpshufd $19, %xmm8, %xmm4 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm4, %xmm4 + vpand %xmm12, %xmm4, %xmm4 + vpxor %xmm4, %xmm8, %xmm8 + addl $16, %r13d + cmpl %r11d, %r13d + jl L_AES_XTS_decrypt_vaes_dec_16 + cmpl %eax, %r13d + je L_AES_XTS_decrypt_vaes_done_dec +L_AES_XTS_decrypt_vaes_last_31_start: + vpshufd $19, %xmm8, %xmm4 + vpaddq %xmm8, %xmm8, %xmm7 + vpsrad $31, %xmm4, %xmm4 + vpand %xmm12, %xmm4, %xmm4 + vpxor %xmm4, %xmm7, %xmm7 + leaq (%rdi,%r13,1), %rcx + vmovdqu (%rcx), %xmm0 + vpxor %xmm7, %xmm0, %xmm0 + # aes_dec_block + vpxor (%r8), %xmm0, %xmm0 + vmovdqu 16(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + cmpl $11, %r10d + vmovdqu 160(%r8), %xmm5 + jl L_AES_XTS_decrypt_vaes_last_31_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + cmpl $13, %r10d + vmovdqu 192(%r8), %xmm5 + jl L_AES_XTS_decrypt_vaes_last_31_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r8), %xmm5 
+L_AES_XTS_decrypt_vaes_last_31_aes_dec_block_last: + vaesdeclast %xmm5, %xmm0, %xmm0 + vpxor %xmm7, %xmm0, %xmm0 + vmovdqu %xmm0, (%rsp) + addq $16, %r13 + xorq %rdx, %rdx +L_AES_XTS_decrypt_vaes_last_31_byte_loop: + movb (%rsp,%rdx,1), %r11b + movb (%rdi,%r13,1), %cl + movb %r11b, (%rsi,%r13,1) + movb %cl, (%rsp,%rdx,1) + incl %r13d + incl %edx + cmpl %eax, %r13d + jl L_AES_XTS_decrypt_vaes_last_31_byte_loop + subq %rdx, %r13 + vmovdqu (%rsp), %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + # aes_dec_block + vpxor (%r8), %xmm0, %xmm0 + vmovdqu 16(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + cmpl $11, %r10d + vmovdqu 160(%r8), %xmm5 + jl L_AES_XTS_decrypt_vaes_last_31_2_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + cmpl $13, %r10d + vmovdqu 192(%r8), %xmm5 + jl L_AES_XTS_decrypt_vaes_last_31_2_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r8), %xmm5 +L_AES_XTS_decrypt_vaes_last_31_2_aes_dec_block_last: + vaesdeclast %xmm5, %xmm0, %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + subq $16, %r13 + leaq (%rsi,%r13,1), %rcx + vmovdqu %xmm0, (%rcx) +L_AES_XTS_decrypt_vaes_done_dec: + addq $0x40, %rsp + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size AES_XTS_decrypt_vaes,.-AES_XTS_decrypt_vaes +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_XTS_decrypt_update_vaes +.type AES_XTS_decrypt_update_vaes,@function +.align 16 +AES_XTS_decrypt_update_vaes: +#else +.section __TEXT,__text +.globl
_AES_XTS_decrypt_update_vaes +.p2align 4 +_AES_XTS_decrypt_update_vaes: +#endif /* __APPLE__ */ + pushq %r12 /* entry of AES_XTS_decrypt_update_vaes: r12 is saved here and popped before the return */ + movq %rdx, %rax /* eax is the end offset every later cmpl compares r12d against; presumably the input length in bytes (confirm against the C prototype) */ + movq %rcx, %r10 /* r10 is the base of the round keys read at offsets 0 to 224 */ + subq $0x40, %rsp /* 64 bytes of stack; (%rsp) is the scratch block of the last-31 path */ + vmovdqu L_vaes_aes_xts_gc_xts(%rip), %xmm12 + vbroadcasti128 L_vaes_aes_xts_poly(%rip), %ymm13 + vmovdqu L_vaes_aes_xts_shl(%rip), %ymm14 + vmovdqu L_vaes_aes_xts_shr(%rip), %ymm15 + vmovdqu (%r8), %xmm8 /* xmm8 is loaded from (%r8) and stored back there at done_dec; presumably the running XTS tweak (confirm against caller) */ + xorl %r12d, %r12d /* r12 is the byte offset applied to both %rdi (loads) and %rsi (stores) */ + movl %eax, %r11d + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_update_vaes_mul16_128 + subl $16, %r11d /* eax is not a multiple of 16: keep one more 16-byte block out of the bulk loops */ + cmpl $16, %r11d + jl L_AES_XTS_decrypt_update_vaes_last_31_start +L_AES_XTS_decrypt_update_vaes_mul16_128: + cmpl $32, %r11d + jl L_AES_XTS_decrypt_update_vaes_done_128 + cmpl $0x80, %r11d + jl L_AES_XTS_decrypt_update_vaes_done_128 + andl $0xffffff80, %r11d /* r11d is the end offset of the 128-byte loop */ + vperm2i128 $0x00, %ymm8, %ymm8, %ymm5 /* low lane of ymm8 copied to both lanes; ymm4 to ymm7 built below are XORed into the data before and after the AES rounds */ + vpsrlvq %ymm15, %ymm5, %ymm6 + vpclmulqdq $0x01, %ymm13, %ymm6, %ymm7 + vpslldq $8, %ymm6, %ymm6 + vpsllvq %ymm14, %ymm5, %ymm4 + vpxor %ymm7, %ymm4, %ymm4 + vpxor %ymm6, %ymm4, %ymm4 + vpsrlq $62, %ymm4, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $2, %ymm4, %ymm5 + vpxor %ymm10, %ymm5, %ymm5 + vpxor %ymm9, %ymm5, %ymm5 + vpsrlq $62, %ymm5, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $2, %ymm5, %ymm6 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm9, %ymm6, %ymm6 + vpsrlq $62, %ymm6, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $2, %ymm6, %ymm7 + vpxor %ymm10, %ymm7, %ymm7 + vpxor %ymm9, %ymm7, %ymm7 +L_AES_XTS_decrypt_update_vaes_dec_128: + # 128 bytes of input + # aes_dec_128 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + # aes_dec_block + vbroadcasti128 (%r10), %ymm9 /* round key 0 in both 128-bit lanes */ + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm9, %ymm0, %ymm0 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm9, %ymm1, %ymm1 + vpxor %ymm6, %ymm2, %ymm2 + vpxor %ymm9, %ymm2, %ymm2 +
vpxor %ymm7, %ymm3, %ymm3 + vpxor %ymm9, %ymm3, %ymm3 + vbroadcasti128 16(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 32(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 48(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 64(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 80(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 96(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 112(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 128(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 144(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + cmpl $11, %r9d /* r9d below 11: the key at offset 160 loaded next is used by vaesdeclast; presumably r9d is the AES round count */ + vbroadcasti128 160(%r10), %ymm9 + jl L_AES_XTS_decrypt_update_vaes_aes_dec_128_aes_dec_block_last + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 176(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + cmpl $13, %r9d /* r9d below 13: the key at offset 192 is used by vaesdeclast */ + vbroadcasti128 192(%r10), %ymm9 + jl L_AES_XTS_decrypt_update_vaes_aes_dec_128_aes_dec_block_last + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1,
%ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 208(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 224(%r10), %ymm9 /* otherwise the key at offset 224 is used by vaesdeclast */ +L_AES_XTS_decrypt_update_vaes_aes_dec_128_aes_dec_block_last: + vaesdeclast %ymm9, %ymm0, %ymm0 + vaesdeclast %ymm9, %ymm1, %ymm1 + vaesdeclast %ymm9, %ymm2, %ymm2 + vaesdeclast %ymm9, %ymm3, %ymm3 + vpxor %ymm4, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vpsrlq $56, %ymm4, %ymm9 /* advance ymm4 for the next pass: each qword shifted left by 8 bits, shifted-out bits folded back in via vpslldq and vpclmulqdq with ymm13; repeated for ymm5 to ymm7 */ + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $8, %ymm4, %ymm4 + vpxor %ymm10, %ymm4, %ymm4 + vpxor %ymm9, %ymm4, %ymm4 + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vpsrlq $56, %ymm5, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $8, %ymm5, %ymm5 + vpxor %ymm10, %ymm5, %ymm5 + vpxor %ymm9, %ymm5, %ymm5 + vpxor %ymm6, %ymm2, %ymm2 + vmovdqu %ymm2, 64(%rdx) + vpsrlq $56, %ymm6, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $8, %ymm6, %ymm6 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm9, %ymm6, %ymm6 + vpxor %ymm7, %ymm3, %ymm3 + vmovdqu %ymm3, 96(%rdx) + vpsrlq $56, %ymm7, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $8, %ymm7, %ymm7 + vpxor %ymm10, %ymm7, %ymm7 + vpxor %ymm9, %ymm7, %ymm7 + addl $0x80, %r12d /* 128 bytes handled */ + cmpl %r11d, %r12d + jl L_AES_XTS_decrypt_update_vaes_dec_128 + vextracti128 $0x00, %ymm4, %xmm8 /* low lane of ymm4 becomes xmm8 for the code that follows */ +L_AES_XTS_decrypt_update_vaes_done_128: + cmpl %eax, %r12d /* flags from this compare feed the je two instructions later; movl does not change flags */ + movl %eax, %r11d + je L_AES_XTS_decrypt_update_vaes_done_dec + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_update_vaes_mul16_64 + subl $16, %r11d + subl %r12d, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_update_vaes_last_31_start + addl %r12d, %r11d +L_AES_XTS_decrypt_update_vaes_mul16_64: + andl $0xffffffc0, %r11d /* round the bound down to a multiple of 64 */ + cmpl %r11d, %r12d + je L_AES_XTS_decrypt_update_vaes_done_64 + # 64 bytes of input + #
aes_dec_64 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 /* 64-byte pass: two 32-byte vectors; ymm4 and ymm5 are rebuilt from xmm8 below */ + vperm2i128 $0x00, %ymm8, %ymm8, %ymm5 + vpsrlvq %ymm15, %ymm5, %ymm6 + vpclmulqdq $0x01, %ymm13, %ymm6, %ymm7 + vpslldq $8, %ymm6, %ymm6 + vpsllvq %ymm14, %ymm5, %ymm4 + vpxor %ymm7, %ymm4, %ymm4 + vpxor %ymm6, %ymm4, %ymm4 + vpsrlq $62, %ymm4, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $2, %ymm4, %ymm5 + vpxor %ymm10, %ymm5, %ymm5 + vpxor %ymm9, %ymm5, %ymm5 + # aes_dec_block + vbroadcasti128 (%r10), %ymm9 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm9, %ymm0, %ymm0 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm9, %ymm1, %ymm1 + vbroadcasti128 16(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 32(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 48(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 64(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 80(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 96(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 112(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 128(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 144(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + cmpl $11, %r9d + vbroadcasti128 160(%r10), %ymm9 + jl L_AES_XTS_decrypt_update_vaes_aes_dec_64_aes_dec_block_last + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 176(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + cmpl $13, %r9d + vbroadcasti128 192(%r10), %ymm9 + jl L_AES_XTS_decrypt_update_vaes_aes_dec_64_aes_dec_block_last + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 208(%r10),
%ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 224(%r10), %ymm9 +L_AES_XTS_decrypt_update_vaes_aes_dec_64_aes_dec_block_last: + vaesdeclast %ymm9, %ymm0, %ymm0 + vaesdeclast %ymm9, %ymm1, %ymm1 + vpxor %ymm4, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vextracti128 $0x01, %ymm5, %xmm8 /* high lane of ymm5, followed by one vpshufd/vpaddq/vpsrad/vpand/vpxor step with xmm12, gives the next xmm8 */ + vpshufd $19, %xmm8, %xmm9 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm9, %xmm9 + vpand %xmm12, %xmm9, %xmm9 + vpxor %xmm9, %xmm8, %xmm8 + addl $0x40, %r12d /* 64 bytes handled */ +L_AES_XTS_decrypt_update_vaes_done_64: + cmpl %eax, %r12d + movl %eax, %r11d + je L_AES_XTS_decrypt_update_vaes_done_dec + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_update_vaes_mul16_32 + subl $16, %r11d + subl %r12d, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_update_vaes_last_31_start + addl %r12d, %r11d +L_AES_XTS_decrypt_update_vaes_mul16_32: + andl $0xffffffe0, %r11d /* round the bound down to a multiple of 32 */ + cmpl %r11d, %r12d + je L_AES_XTS_decrypt_update_vaes_done_32 + # 32 bytes of input + # aes_dec_32 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu (%rcx), %ymm0 + vperm2i128 $0x00, %ymm8, %ymm8, %ymm5 + vpsrlvq %ymm15, %ymm5, %ymm6 + vpclmulqdq $0x01, %ymm13, %ymm6, %ymm7 + vpslldq $8, %ymm6, %ymm6 + vpsllvq %ymm14, %ymm5, %ymm4 + vpxor %ymm7, %ymm4, %ymm4 + vpxor %ymm6, %ymm4, %ymm4 + # aes_dec_block + vbroadcasti128 (%r10), %ymm9 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm9, %ymm0, %ymm0 + vbroadcasti128 16(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 32(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 48(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 64(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 80(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 96(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 112(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 128(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 144(%r10), %ymm9 + vaesdec
%ymm9, %ymm0, %ymm0 + cmpl $11, %r9d + vbroadcasti128 160(%r10), %ymm9 + jl L_AES_XTS_decrypt_update_vaes_aes_dec_32_aes_dec_block_last + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 176(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + cmpl $13, %r9d + vbroadcasti128 192(%r10), %ymm9 + jl L_AES_XTS_decrypt_update_vaes_aes_dec_32_aes_dec_block_last + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 208(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 224(%r10), %ymm9 +L_AES_XTS_decrypt_update_vaes_aes_dec_32_aes_dec_block_last: + vaesdeclast %ymm9, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vextracti128 $0x01, %ymm4, %xmm8 + vpshufd $19, %xmm8, %xmm9 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm9, %xmm9 + vpand %xmm12, %xmm9, %xmm9 + vpxor %xmm9, %xmm8, %xmm8 + addl $32, %r12d /* 32 bytes handled */ +L_AES_XTS_decrypt_update_vaes_done_32: + cmpl %eax, %r12d + movl %eax, %r11d + je L_AES_XTS_decrypt_update_vaes_done_dec + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_update_vaes_mul16 + subl $16, %r11d + subl %r12d, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_update_vaes_last_31_start + addl %r12d, %r11d +L_AES_XTS_decrypt_update_vaes_mul16: +L_AES_XTS_decrypt_update_vaes_dec_16: + # 16 bytes of input + leaq (%rdi,%r12,1), %rcx + vmovdqu (%rcx), %xmm0 + vpxor %xmm8, %xmm0, %xmm0 /* single 16-byte block: XOR with xmm8 before the rounds */ + # aes_dec_block + vpxor (%r10), %xmm0, %xmm0 + vmovdqu 16(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + cmpl $11, %r9d + vmovdqu 160(%r10), %xmm5 + jl L_AES_XTS_decrypt_update_vaes_aes_dec_block_last + vaesdec %xmm5,
%xmm0, %xmm0 + vmovdqu 176(%r10), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + cmpl $13, %r9d + vmovdqu 192(%r10), %xmm5 + jl L_AES_XTS_decrypt_update_vaes_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r10), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r10), %xmm5 +L_AES_XTS_decrypt_update_vaes_aes_dec_block_last: + vaesdeclast %xmm5, %xmm0, %xmm0 + vpxor %xmm8, %xmm0, %xmm0 /* XOR with xmm8 again after the final round */ + leaq (%rsi,%r12,1), %rcx + vmovdqu %xmm0, (%rcx) + vpshufd $19, %xmm8, %xmm4 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm4, %xmm4 + vpand %xmm12, %xmm4, %xmm4 + vpxor %xmm4, %xmm8, %xmm8 /* xmm8 updated in place for the next block */ + addl $16, %r12d + cmpl %r11d, %r12d + jl L_AES_XTS_decrypt_update_vaes_dec_16 /* loop while r12d is below the bound in r11d */ + cmpl %eax, %r12d + je L_AES_XTS_decrypt_update_vaes_done_dec +L_AES_XTS_decrypt_update_vaes_last_31_start: + vpshufd $19, %xmm8, %xmm4 + vpaddq %xmm8, %xmm8, %xmm7 /* same update step, but written to xmm7 so that xmm8 stays available for the second decryption below */ + vpsrad $31, %xmm4, %xmm4 + vpand %xmm12, %xmm4, %xmm4 + vpxor %xmm4, %xmm7, %xmm7 + leaq (%rdi,%r12,1), %rcx + vmovdqu (%rcx), %xmm0 + vpxor %xmm7, %xmm0, %xmm0 + # aes_dec_block + vpxor (%r10), %xmm0, %xmm0 + vmovdqu 16(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + cmpl $11, %r9d + vmovdqu 160(%r10), %xmm5 + jl L_AES_XTS_decrypt_update_vaes_last_31_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r10), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + cmpl $13, %r9d + vmovdqu 192(%r10), %xmm5 + jl L_AES_XTS_decrypt_update_vaes_last_31_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r10), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r10), %xmm5
+L_AES_XTS_decrypt_update_vaes_last_31_aes_dec_block_last: + vaesdeclast %xmm5, %xmm0, %xmm0 + vpxor %xmm7, %xmm0, %xmm0 + vmovdqu %xmm0, (%rsp) /* first decrypted block parked in the stack scratch */ + addq $16, %r12 + xorq %rdx, %rdx +L_AES_XTS_decrypt_update_vaes_last_31_byte_loop: + movb (%rsp,%rdx,1), %r11b /* byte loop: the scratch byte goes to the output tail and the input tail byte replaces it in the scratch */ + movb (%rdi,%r12,1), %cl + movb %r11b, (%rsi,%r12,1) + movb %cl, (%rsp,%rdx,1) + incl %r12d + incl %edx + cmpl %eax, %r12d + jl L_AES_XTS_decrypt_update_vaes_last_31_byte_loop + subq %rdx, %r12 /* undo the loop increments of r12 */ + vmovdqu (%rsp), %xmm0 /* the scratch block, now partly input tail bytes, is decrypted with xmm8; this looks like XTS ciphertext stealing (confirm) */ + vpxor %xmm8, %xmm0, %xmm0 + # aes_dec_block + vpxor (%r10), %xmm0, %xmm0 + vmovdqu 16(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + cmpl $11, %r9d + vmovdqu 160(%r10), %xmm5 + jl L_AES_XTS_decrypt_update_vaes_last_31_2_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r10), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + cmpl $13, %r9d + vmovdqu 192(%r10), %xmm5 + jl L_AES_XTS_decrypt_update_vaes_last_31_2_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r10), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r10), %xmm5 +L_AES_XTS_decrypt_update_vaes_last_31_2_aes_dec_block_last: + vaesdeclast %xmm5, %xmm0, %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + subq $16, %r12 /* back to the offset of the block read at last_31_start */ + leaq (%rsi,%r12,1), %rcx + vmovdqu %xmm0, (%rcx) +L_AES_XTS_decrypt_update_vaes_done_dec: + vmovdqu %xmm8, (%r8) /* xmm8 written back to (%r8); this is the only exit path */ + addq $0x40, %rsp /* release the stack scratch */ + popq %r12 + repz retq +#ifndef __APPLE__ +.size AES_XTS_decrypt_update_vaes,.-AES_XTS_decrypt_update_vaes +#endif /* __APPLE__ */ +#endif /* HAVE_INTEL_VAES */ +#ifdef HAVE_INTEL_AVX512 +#ifndef __APPLE__ +.text +.globl AES_XTS_init_avx512 +.type
AES_XTS_init_avx512,@function +.align 16 +AES_XTS_init_avx512: +#else +.section __TEXT,__text +.globl _AES_XTS_init_avx512 +.p2align 4 +_AES_XTS_init_avx512: +#endif /* __APPLE__ */ + vmovdqu (%rdi), %xmm0 /* AES_XTS_init_avx512: load the 16-byte block at (%rdi); it is AES-encrypted in place with the round keys at (%rsi) */ + # aes_enc_block + vpxor (%rsi), %xmm0, %xmm0 + vmovdqu 16(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 32(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 48(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 64(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 80(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 96(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 112(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 128(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 144(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + cmpl $11, %edx /* edx below 11: the key at offset 160 is used by vaesenclast; presumably edx is the AES round count */ + vmovdqu 160(%rsi), %xmm2 + jl L_AES_XTS_init_avx512_tweak_aes_enc_block_last + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 176(%rsi), %xmm3 + vaesenc %xmm3, %xmm0, %xmm0 + cmpl $13, %edx /* edx below 13: the key at offset 192 is used by vaesenclast, otherwise the key at offset 224 */ + vmovdqu 192(%rsi), %xmm2 + jl L_AES_XTS_init_avx512_tweak_aes_enc_block_last + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 208(%rsi), %xmm3 + vaesenc %xmm3, %xmm0, %xmm0 + vmovdqu 224(%rsi), %xmm2 +L_AES_XTS_init_avx512_tweak_aes_enc_block_last: + vaesenclast %xmm2, %xmm0, %xmm0 + vmovdqu %xmm0, (%rdi) /* result overwrites the input block */ + repz retq +#ifndef __APPLE__ +.size AES_XTS_init_avx512,.-AES_XTS_init_avx512 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_avx512_aes_xts_gc_xts: +.long 0x00000087,0x00000000,0x00000001,0x00000000 /* loaded into xmm12 by AES_XTS_encrypt_avx512 */ +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_avx512_aes_xts_poly: +.long 0x00000087,0x00000000,0x00000000,0x00000000 /* broadcast into zmm13 by AES_XTS_encrypt_avx512 */ +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */
+L_avx512_aes_xts_shl: +.long 0x00000000,0x00000000,0x00000000,0x00000000 +.long 0x00000001,0x00000000,0x00000001,0x00000000 +.long 0x00000002,0x00000000,0x00000002,0x00000000 +.long 0x00000003,0x00000000,0x00000003,0x00000000 /* qword shift counts 0,0,1,1,2,2,3,3; loaded into zmm14 by AES_XTS_encrypt_avx512 */ +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_avx512_aes_xts_shr: +.long 0x00000040,0x00000000,0x00000040,0x00000000 +.long 0x0000003f,0x00000000,0x0000003f,0x00000000 +.long 0x0000003e,0x00000000,0x0000003e,0x00000000 +.long 0x0000003d,0x00000000,0x0000003d,0x00000000 /* qword shift counts 64,64,63,63,62,62,61,61; loaded into zmm15 by AES_XTS_encrypt_avx512 */ +#ifndef __APPLE__ +.text +.globl AES_XTS_encrypt_avx512 +.type AES_XTS_encrypt_avx512,@function +.align 16 +AES_XTS_encrypt_avx512: +#else +.section __TEXT,__text +.globl _AES_XTS_encrypt_avx512 +.p2align 4 +_AES_XTS_encrypt_avx512: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + movq %rdx, %rax + movq %rcx, %r12 + movl 24(%rsp), %r10d + subq $0x40, %rsp + vmovdqu L_avx512_aes_xts_gc_xts(%rip), %xmm12 + vbroadcasti32x4 L_avx512_aes_xts_poly(%rip), %zmm13 + vmovdqu64 L_avx512_aes_xts_shl(%rip), %zmm14 + vmovdqu64 L_avx512_aes_xts_shr(%rip), %zmm15 + vmovdqu (%r12), %xmm8 + # aes_enc_block + vpxor (%r9), %xmm8, %xmm8 + vmovdqu 16(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 32(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 48(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 64(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 80(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 96(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 112(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 128(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 144(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + cmpl $11, %r10d + vmovdqu 160(%r9), %xmm5 + jl L_AES_XTS_encrypt_avx512_tweak_aes_enc_block_last + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 176(%r9), %xmm6 + vaesenc %xmm6, %xmm8, %xmm8 + cmpl $13, %r10d + vmovdqu 192(%r9), %xmm5 + jl
L_AES_XTS_encrypt_avx512_tweak_aes_enc_block_last + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 208(%r9), %xmm6 + vaesenc %xmm6, %xmm8, %xmm8 + vmovdqu 224(%r9), %xmm5 +L_AES_XTS_encrypt_avx512_tweak_aes_enc_block_last: + vaesenclast %xmm5, %xmm8, %xmm8 + xorl %r13d, %r13d + cmpl $32, %eax + jl L_AES_XTS_encrypt_avx512_done_128 + vbroadcasti32x4 (%r8), %zmm16 + vbroadcasti32x4 16(%r8), %zmm17 + vbroadcasti32x4 32(%r8), %zmm18 + vbroadcasti32x4 48(%r8), %zmm19 + vbroadcasti32x4 64(%r8), %zmm20 + vbroadcasti32x4 80(%r8), %zmm21 + vbroadcasti32x4 96(%r8), %zmm22 + vbroadcasti32x4 112(%r8), %zmm23 + vbroadcasti32x4 128(%r8), %zmm24 + vbroadcasti32x4 144(%r8), %zmm25 + vbroadcasti32x4 160(%r8), %zmm26 + cmpl $11, %r10d + jl L_AES_XTS_encrypt_avx512_key_cached + vbroadcasti32x4 176(%r8), %zmm27 + vbroadcasti32x4 192(%r8), %zmm28 + cmpl $13, %r10d + jl L_AES_XTS_encrypt_avx512_key_cached + vbroadcasti32x4 208(%r8), %zmm29 + vbroadcasti32x4 224(%r8), %zmm30 +L_AES_XTS_encrypt_avx512_key_cached: + cmpl $0x100, %eax + movl %eax, %r11d + jl L_AES_XTS_encrypt_avx512_done_256 + andl $0xffffff00, %r11d + vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 + vpsrlvq %zmm15, %zmm5, %zmm6 + vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7 + vpslldq $8, %zmm6, %zmm6 + vpsllvq %zmm14, %zmm5, %zmm4 + vpternlogq $0x96, %zmm6, %zmm7, %zmm4 + vpsrlq $60, %zmm4, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $4, %zmm4, %zmm5 + vpternlogq $0x96, %zmm9, %zmm10, %zmm5 + vpsrlq $60, %zmm5, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $4, %zmm5, %zmm6 + vpternlogq $0x96, %zmm9, %zmm10, %zmm6 + vpsrlq $60, %zmm6, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $4, %zmm6, %zmm7 + vpternlogq $0x96, %zmm9, %zmm10, %zmm7 +L_AES_XTS_encrypt_avx512_enc_256: + # 256 bytes of input + # aes_enc_256 + leaq (%rdi,%r13,1), %rcx + leaq (%rsi,%r13,1), %rdx + vmovdqu64 (%rcx), %zmm0 + vmovdqu64 64(%rcx), %zmm1 + vmovdqu64 
128(%rcx), %zmm2 + vmovdqu64 192(%rcx), %zmm3 + # aes_enc_block + vpternlogq $0x96, %zmm4, %zmm16, %zmm0 + vpternlogq $0x96, %zmm5, %zmm16, %zmm1 + vpternlogq $0x96, %zmm6, %zmm16, %zmm2 + vpternlogq $0x96, %zmm7, %zmm16, %zmm3 + vaesenc %zmm17, %zmm0, %zmm0 + vaesenc %zmm17, %zmm1, %zmm1 + vaesenc %zmm17, %zmm2, %zmm2 + vaesenc %zmm17, %zmm3, %zmm3 + vaesenc %zmm18, %zmm0, %zmm0 + vaesenc %zmm18, %zmm1, %zmm1 + vaesenc %zmm18, %zmm2, %zmm2 + vaesenc %zmm18, %zmm3, %zmm3 + vaesenc %zmm19, %zmm0, %zmm0 + vaesenc %zmm19, %zmm1, %zmm1 + vaesenc %zmm19, %zmm2, %zmm2 + vaesenc %zmm19, %zmm3, %zmm3 + vaesenc %zmm20, %zmm0, %zmm0 + vaesenc %zmm20, %zmm1, %zmm1 + vaesenc %zmm20, %zmm2, %zmm2 + vaesenc %zmm20, %zmm3, %zmm3 + vaesenc %zmm21, %zmm0, %zmm0 + vaesenc %zmm21, %zmm1, %zmm1 + vaesenc %zmm21, %zmm2, %zmm2 + vaesenc %zmm21, %zmm3, %zmm3 + vaesenc %zmm22, %zmm0, %zmm0 + vaesenc %zmm22, %zmm1, %zmm1 + vaesenc %zmm22, %zmm2, %zmm2 + vaesenc %zmm22, %zmm3, %zmm3 + vaesenc %zmm23, %zmm0, %zmm0 + vaesenc %zmm23, %zmm1, %zmm1 + vaesenc %zmm23, %zmm2, %zmm2 + vaesenc %zmm23, %zmm3, %zmm3 + vaesenc %zmm24, %zmm0, %zmm0 + vaesenc %zmm24, %zmm1, %zmm1 + vaesenc %zmm24, %zmm2, %zmm2 + vaesenc %zmm24, %zmm3, %zmm3 + vaesenc %zmm25, %zmm0, %zmm0 + vaesenc %zmm25, %zmm1, %zmm1 + vaesenc %zmm25, %zmm2, %zmm2 + vaesenc %zmm25, %zmm3, %zmm3 + cmpl $11, %r10d + vmovdqa64 %zmm26, %zmm9 + jl L_AES_XTS_encrypt_avx512_aes_enc_256_aes_enc_block_last + vaesenc %zmm26, %zmm0, %zmm0 + vaesenc %zmm26, %zmm1, %zmm1 + vaesenc %zmm26, %zmm2, %zmm2 + vaesenc %zmm26, %zmm3, %zmm3 + vaesenc %zmm27, %zmm0, %zmm0 + vaesenc %zmm27, %zmm1, %zmm1 + vaesenc %zmm27, %zmm2, %zmm2 + vaesenc %zmm27, %zmm3, %zmm3 + cmpl $13, %r10d + vmovdqa64 %zmm28, %zmm9 + jl L_AES_XTS_encrypt_avx512_aes_enc_256_aes_enc_block_last + vaesenc %zmm28, %zmm0, %zmm0 + vaesenc %zmm28, %zmm1, %zmm1 + vaesenc %zmm28, %zmm2, %zmm2 + vaesenc %zmm28, %zmm3, %zmm3 + vaesenc %zmm29, %zmm0, %zmm0 + vaesenc %zmm29, %zmm1, %zmm1 + vaesenc 
%zmm29, %zmm2, %zmm2 + vaesenc %zmm29, %zmm3, %zmm3 + vmovdqa64 %zmm30, %zmm9 +L_AES_XTS_encrypt_avx512_aes_enc_256_aes_enc_block_last: + vaesenclast %zmm9, %zmm0, %zmm0 + vaesenclast %zmm9, %zmm1, %zmm1 + vaesenclast %zmm9, %zmm2, %zmm2 + vaesenclast %zmm9, %zmm3, %zmm3 + vpxorq %zmm4, %zmm0, %zmm0 + vmovdqu64 %zmm0, (%rdx) + vpsrlq $48, %zmm4, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $16, %zmm4, %zmm4 + vpternlogq $0x96, %zmm9, %zmm10, %zmm4 + vpxorq %zmm5, %zmm1, %zmm1 + vmovdqu64 %zmm1, 64(%rdx) + vpsrlq $48, %zmm5, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $16, %zmm5, %zmm5 + vpternlogq $0x96, %zmm9, %zmm10, %zmm5 + vpxorq %zmm6, %zmm2, %zmm2 + vmovdqu64 %zmm2, 128(%rdx) + vpsrlq $48, %zmm6, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $16, %zmm6, %zmm6 + vpternlogq $0x96, %zmm9, %zmm10, %zmm6 + vpxorq %zmm7, %zmm3, %zmm3 + vmovdqu64 %zmm3, 192(%rdx) + vpsrlq $48, %zmm7, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $16, %zmm7, %zmm7 + vpternlogq $0x96, %zmm9, %zmm10, %zmm7 + addl $0x100, %r13d + cmpl %r11d, %r13d + jl L_AES_XTS_encrypt_avx512_enc_256 + vextracti32x4 $0x00, %zmm4, %xmm8 +L_AES_XTS_encrypt_avx512_done_256: + movl %eax, %r11d + andl $0xffffff80, %r11d + cmpl %r11d, %r13d + je L_AES_XTS_encrypt_avx512_done_128 + # 128 bytes of input + # aes_enc_128 + leaq (%rdi,%r13,1), %rcx + leaq (%rsi,%r13,1), %rdx + vmovdqu64 (%rcx), %zmm0 + vmovdqu64 64(%rcx), %zmm1 + vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 + vpsrlvq %zmm15, %zmm5, %zmm6 + vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7 + vpslldq $8, %zmm6, %zmm6 + vpsllvq %zmm14, %zmm5, %zmm4 + vpternlogq $0x96, %zmm6, %zmm7, %zmm4 + vpsrlq $60, %zmm4, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $4, %zmm4, %zmm5 + vpternlogq $0x96, %zmm9, %zmm10, %zmm5 + # aes_enc_block + vpternlogq $0x96, %zmm4, %zmm16, %zmm0 + 
vpternlogq $0x96, %zmm5, %zmm16, %zmm1 + vaesenc %zmm17, %zmm0, %zmm0 + vaesenc %zmm17, %zmm1, %zmm1 + vaesenc %zmm18, %zmm0, %zmm0 + vaesenc %zmm18, %zmm1, %zmm1 + vaesenc %zmm19, %zmm0, %zmm0 + vaesenc %zmm19, %zmm1, %zmm1 + vaesenc %zmm20, %zmm0, %zmm0 + vaesenc %zmm20, %zmm1, %zmm1 + vaesenc %zmm21, %zmm0, %zmm0 + vaesenc %zmm21, %zmm1, %zmm1 + vaesenc %zmm22, %zmm0, %zmm0 + vaesenc %zmm22, %zmm1, %zmm1 + vaesenc %zmm23, %zmm0, %zmm0 + vaesenc %zmm23, %zmm1, %zmm1 + vaesenc %zmm24, %zmm0, %zmm0 + vaesenc %zmm24, %zmm1, %zmm1 + vaesenc %zmm25, %zmm0, %zmm0 + vaesenc %zmm25, %zmm1, %zmm1 + cmpl $11, %r10d + vmovdqa64 %zmm26, %zmm9 + jl L_AES_XTS_encrypt_avx512_aes_enc_128_aes_enc_block_last + vaesenc %zmm26, %zmm0, %zmm0 + vaesenc %zmm26, %zmm1, %zmm1 + vaesenc %zmm27, %zmm0, %zmm0 + vaesenc %zmm27, %zmm1, %zmm1 + cmpl $13, %r10d + vmovdqa64 %zmm28, %zmm9 + jl L_AES_XTS_encrypt_avx512_aes_enc_128_aes_enc_block_last + vaesenc %zmm28, %zmm0, %zmm0 + vaesenc %zmm28, %zmm1, %zmm1 + vaesenc %zmm29, %zmm0, %zmm0 + vaesenc %zmm29, %zmm1, %zmm1 + vmovdqa64 %zmm30, %zmm9 +L_AES_XTS_encrypt_avx512_aes_enc_128_aes_enc_block_last: + vaesenclast %zmm9, %zmm0, %zmm0 + vaesenclast %zmm9, %zmm1, %zmm1 + vpxorq %zmm4, %zmm0, %zmm0 + vmovdqu64 %zmm0, (%rdx) + vpxorq %zmm5, %zmm1, %zmm1 + vmovdqu64 %zmm1, 64(%rdx) + vextracti32x4 $3, %zmm5, %xmm8 + vpshufd $19, %xmm8, %xmm9 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm9, %xmm9 + vpternlogd $0x78, %xmm12, %xmm9, %xmm8 + addl $0x80, %r13d +L_AES_XTS_encrypt_avx512_done_128: + movl %eax, %r11d + andl $0xffffffc0, %r11d + cmpl %r11d, %r13d + je L_AES_XTS_encrypt_avx512_done_64 + # 64 bytes of input + # aes_enc_64 + leaq (%rdi,%r13,1), %rcx + leaq (%rsi,%r13,1), %rdx + vmovdqu64 (%rcx), %zmm0 + vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 + vpsrlvq %zmm15, %zmm5, %zmm6 + vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7 + vpslldq $8, %zmm6, %zmm6 + vpsllvq %zmm14, %zmm5, %zmm4 + vpternlogq $0x96, %zmm6, %zmm7, %zmm4 + # aes_enc_block + vpternlogq 
$0x96, %zmm4, %zmm16, %zmm0 + vaesenc %zmm17, %zmm0, %zmm0 + vaesenc %zmm18, %zmm0, %zmm0 + vaesenc %zmm19, %zmm0, %zmm0 + vaesenc %zmm20, %zmm0, %zmm0 + vaesenc %zmm21, %zmm0, %zmm0 + vaesenc %zmm22, %zmm0, %zmm0 + vaesenc %zmm23, %zmm0, %zmm0 + vaesenc %zmm24, %zmm0, %zmm0 + vaesenc %zmm25, %zmm0, %zmm0 + cmpl $11, %r10d + vmovdqa64 %zmm26, %zmm9 + jl L_AES_XTS_encrypt_avx512_aes_enc_64_aes_enc_block_last + vaesenc %zmm26, %zmm0, %zmm0 + vaesenc %zmm27, %zmm0, %zmm0 + cmpl $13, %r10d + vmovdqa64 %zmm28, %zmm9 + jl L_AES_XTS_encrypt_avx512_aes_enc_64_aes_enc_block_last + vaesenc %zmm28, %zmm0, %zmm0 + vaesenc %zmm29, %zmm0, %zmm0 + vmovdqa64 %zmm30, %zmm9 +L_AES_XTS_encrypt_avx512_aes_enc_64_aes_enc_block_last: + vaesenclast %zmm9, %zmm0, %zmm0 + vpxorq %zmm4, %zmm0, %zmm0 + vmovdqu64 %zmm0, (%rdx) + vextracti32x4 $3, %zmm4, %xmm8 + vpshufd $19, %xmm8, %xmm9 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm9, %xmm9 + vpternlogd $0x78, %xmm12, %xmm9, %xmm8 + addl $0x40, %r13d +L_AES_XTS_encrypt_avx512_done_64: + movl %eax, %r11d + andl $0xffffffe0, %r11d + cmpl %r11d, %r13d + je L_AES_XTS_encrypt_avx512_done_32 + # 32 bytes of input + # aes_enc_32 + leaq (%rdi,%r13,1), %rcx + leaq (%rsi,%r13,1), %rdx + vmovdqu64 (%rcx), %ymm0 + vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 + vpsrlvq %zmm15, %zmm5, %zmm6 + vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7 + vpslldq $8, %zmm6, %zmm6 + vpsllvq %zmm14, %zmm5, %zmm4 + vpternlogq $0x96, %zmm6, %zmm7, %zmm4 + # aes_enc_block + vpternlogq $0x96, %ymm4, %ymm16, %ymm0 + vaesenc %ymm17, %ymm0, %ymm0 + vaesenc %ymm18, %ymm0, %ymm0 + vaesenc %ymm19, %ymm0, %ymm0 + vaesenc %ymm20, %ymm0, %ymm0 + vaesenc %ymm21, %ymm0, %ymm0 + vaesenc %ymm22, %ymm0, %ymm0 + vaesenc %ymm23, %ymm0, %ymm0 + vaesenc %ymm24, %ymm0, %ymm0 + vaesenc %ymm25, %ymm0, %ymm0 + cmpl $11, %r10d + vmovdqa64 %ymm26, %ymm9 + jl L_AES_XTS_encrypt_avx512_aes_enc_32_aes_enc_block_last + vaesenc %ymm26, %ymm0, %ymm0 + vaesenc %ymm27, %ymm0, %ymm0 + cmpl $13, %r10d + vmovdqa64 %ymm28, 
%ymm9 + jl L_AES_XTS_encrypt_avx512_aes_enc_32_aes_enc_block_last + vaesenc %ymm28, %ymm0, %ymm0 + vaesenc %ymm29, %ymm0, %ymm0 + vmovdqa64 %ymm30, %ymm9 +L_AES_XTS_encrypt_avx512_aes_enc_32_aes_enc_block_last: + vaesenclast %ymm9, %ymm0, %ymm0 + vpxorq %ymm4, %ymm0, %ymm0 + vmovdqu64 %ymm0, (%rdx) + vextracti32x4 $2, %zmm4, %xmm8 + addl $32, %r13d +L_AES_XTS_encrypt_avx512_done_32: + cmpl %eax, %r13d + movl %eax, %r11d + je L_AES_XTS_encrypt_avx512_done_enc + subl %r13d, %r11d + cmpl $16, %r11d + movl %eax, %r11d + jl L_AES_XTS_encrypt_avx512_last_15 + andl $0xfffffff0, %r11d + # 16 bytes of input +L_AES_XTS_encrypt_avx512_enc_16: + leaq (%rdi,%r13,1), %rcx + vmovdqu (%rcx), %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + # aes_enc_block + vpxor (%r8), %xmm0, %xmm0 + vmovdqu 16(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + cmpl $11, %r10d + vmovdqu 160(%r8), %xmm5 + jl L_AES_XTS_encrypt_avx512_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r8), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + cmpl $13, %r10d + vmovdqu 192(%r8), %xmm5 + jl L_AES_XTS_encrypt_avx512_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r8), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r8), %xmm5 +L_AES_XTS_encrypt_avx512_aes_enc_block_last: + vaesenclast %xmm5, %xmm0, %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + leaq (%rsi,%r13,1), %rcx + vmovdqu %xmm0, (%rcx) + vpshufd $19, %xmm8, %xmm4 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm4, %xmm4 + vpternlogd $0x78, %xmm12, %xmm4, %xmm8 + addl $16, %r13d + cmpl %r11d, %r13d + jl 
L_AES_XTS_encrypt_avx512_enc_16 + cmpl %eax, %r13d + je L_AES_XTS_encrypt_avx512_done_enc +L_AES_XTS_encrypt_avx512_last_15: + subq $16, %r13 + leaq (%rsi,%r13,1), %rcx + vmovdqu (%rcx), %xmm0 + addq $16, %r13 + vmovdqu %xmm0, (%rsp) + xorq %rdx, %rdx +L_AES_XTS_encrypt_avx512_last_15_byte_loop: + movb (%rsp,%rdx,1), %r11b + movb (%rdi,%r13,1), %cl + movb %r11b, (%rsi,%r13,1) + movb %cl, (%rsp,%rdx,1) + incl %r13d + incl %edx + cmpl %eax, %r13d + jl L_AES_XTS_encrypt_avx512_last_15_byte_loop + subq %rdx, %r13 + vmovdqu (%rsp), %xmm0 + subq $16, %r13 + vpxor %xmm8, %xmm0, %xmm0 + # aes_enc_block + vpxor (%r8), %xmm0, %xmm0 + vmovdqu 16(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + cmpl $11, %r10d + vmovdqu 160(%r8), %xmm5 + jl L_AES_XTS_encrypt_avx512_last_15_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r8), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + cmpl $13, %r10d + vmovdqu 192(%r8), %xmm5 + jl L_AES_XTS_encrypt_avx512_last_15_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r8), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r8), %xmm5 +L_AES_XTS_encrypt_avx512_last_15_aes_enc_block_last: + vaesenclast %xmm5, %xmm0, %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + leaq (%rsi,%r13,1), %rcx + vmovdqu %xmm0, (%rcx) +L_AES_XTS_encrypt_avx512_done_enc: + addq $0x40, %rsp + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size AES_XTS_encrypt_avx512,.-AES_XTS_encrypt_avx512 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_XTS_encrypt_update_avx512 +.type AES_XTS_encrypt_update_avx512,@function 
+.align 16 +AES_XTS_encrypt_update_avx512: +#else +.section __TEXT,__text +.globl _AES_XTS_encrypt_update_avx512 +.p2align 4 +_AES_XTS_encrypt_update_avx512: +#endif /* __APPLE__ */ + pushq %r12 /* Streaming XTS encrypt. Register use below: rdi = input, rsi = output, edx = byte count, rcx = round keys for the data, r8 = 16-byte running tweak (read here, written back at exit), r9d = value compared with 11 and 13 to choose the number of rounds */ + movq %rdx, %rax /* eax = byte count for the rest of the routine */ + movq %rcx, %r10 + subq $0x40, %rsp /* scratch area; only the first 16 bytes are used, by the ciphertext-stealing tail */ + vmovdqu L_avx512_aes_xts_gc_xts(%rip), %xmm12 + vbroadcasti32x4 L_avx512_aes_xts_poly(%rip), %zmm13 + vmovdqu64 L_avx512_aes_xts_shl(%rip), %zmm14 + vmovdqu64 L_avx512_aes_xts_shr(%rip), %zmm15 + vmovdqu (%r8), %xmm8 /* tweak is used exactly as stored at (r8); this routine does not encrypt it */ + xorl %r12d, %r12d /* r12 = byte offset already processed */ + cmpl $32, %eax + jl L_AES_XTS_encrypt_update_avx512_done_128 /* under 32 bytes: skip the key broadcast and every wide path */ + vbroadcasti32x4 (%r10), %zmm16 /* replicate each round key into all four lanes of zmm16 onward */ + vbroadcasti32x4 16(%r10), %zmm17 + vbroadcasti32x4 32(%r10), %zmm18 + vbroadcasti32x4 48(%r10), %zmm19 + vbroadcasti32x4 64(%r10), %zmm20 + vbroadcasti32x4 80(%r10), %zmm21 + vbroadcasti32x4 96(%r10), %zmm22 + vbroadcasti32x4 112(%r10), %zmm23 + vbroadcasti32x4 128(%r10), %zmm24 + vbroadcasti32x4 144(%r10), %zmm25 + vbroadcasti32x4 160(%r10), %zmm26 + cmpl $11, %r9d + jl L_AES_XTS_encrypt_update_avx512_key_cached + vbroadcasti32x4 176(%r10), %zmm27 + vbroadcasti32x4 192(%r10), %zmm28 + cmpl $13, %r9d + jl L_AES_XTS_encrypt_update_avx512_key_cached + vbroadcasti32x4 208(%r10), %zmm29 + vbroadcasti32x4 224(%r10), %zmm30 +L_AES_XTS_encrypt_update_avx512_key_cached: + cmpl $0x100, %eax + movl %eax, %r11d + jl L_AES_XTS_encrypt_update_avx512_done_256 + andl $0xffffff00, %r11d /* r11d = byte count rounded down to a multiple of 256 */ + vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 /* copy the current tweak into all four lanes */ + vpsrlvq %zmm15, %zmm5, %zmm6 /* bits that the per-lane left shift will push out of each qword */ + vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7 /* overflow from the high qword times the low qword of the poly constant */ + vpslldq $8, %zmm6, %zmm6 /* overflow from the low qword moves into the high qword */ + vpsllvq %zmm14, %zmm5, %zmm4 + vpternlogq $0x96, %zmm6, %zmm7, %zmm4 /* 0x96 = three-way XOR; zmm4 = tweaks for blocks 0..3 */ + vpsrlq $60, %zmm4, %zmm9 /* same steps with a fixed shift of 4 bits: each lane moves on by four blocks */ + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $4, %zmm4, %zmm5 + vpternlogq $0x96, %zmm9, %zmm10, %zmm5 /* zmm5 = tweaks for blocks 4..7 */ + vpsrlq $60, %zmm5, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $4, %zmm5, %zmm6 + vpternlogq $0x96, %zmm9, %zmm10, %zmm6 /* zmm6 = tweaks for blocks 8..11 */ + vpsrlq $60, %zmm6, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9,
%zmm9 + vpsllq $4, %zmm6, %zmm7 + vpternlogq $0x96, %zmm9, %zmm10, %zmm7 /* zmm7 = tweaks for blocks 12..15 */ +L_AES_XTS_encrypt_update_avx512_enc_256: + # 256 bytes of input + # aes_enc_256 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu64 (%rcx), %zmm0 + vmovdqu64 64(%rcx), %zmm1 + vmovdqu64 128(%rcx), %zmm2 + vmovdqu64 192(%rcx), %zmm3 + # aes_enc_block + vpternlogq $0x96, %zmm4, %zmm16, %zmm0 /* input XOR tweak XOR first round key in one instruction */ + vpternlogq $0x96, %zmm5, %zmm16, %zmm1 + vpternlogq $0x96, %zmm6, %zmm16, %zmm2 + vpternlogq $0x96, %zmm7, %zmm16, %zmm3 + vaesenc %zmm17, %zmm0, %zmm0 + vaesenc %zmm17, %zmm1, %zmm1 + vaesenc %zmm17, %zmm2, %zmm2 + vaesenc %zmm17, %zmm3, %zmm3 + vaesenc %zmm18, %zmm0, %zmm0 + vaesenc %zmm18, %zmm1, %zmm1 + vaesenc %zmm18, %zmm2, %zmm2 + vaesenc %zmm18, %zmm3, %zmm3 + vaesenc %zmm19, %zmm0, %zmm0 + vaesenc %zmm19, %zmm1, %zmm1 + vaesenc %zmm19, %zmm2, %zmm2 + vaesenc %zmm19, %zmm3, %zmm3 + vaesenc %zmm20, %zmm0, %zmm0 + vaesenc %zmm20, %zmm1, %zmm1 + vaesenc %zmm20, %zmm2, %zmm2 + vaesenc %zmm20, %zmm3, %zmm3 + vaesenc %zmm21, %zmm0, %zmm0 + vaesenc %zmm21, %zmm1, %zmm1 + vaesenc %zmm21, %zmm2, %zmm2 + vaesenc %zmm21, %zmm3, %zmm3 + vaesenc %zmm22, %zmm0, %zmm0 + vaesenc %zmm22, %zmm1, %zmm1 + vaesenc %zmm22, %zmm2, %zmm2 + vaesenc %zmm22, %zmm3, %zmm3 + vaesenc %zmm23, %zmm0, %zmm0 + vaesenc %zmm23, %zmm1, %zmm1 + vaesenc %zmm23, %zmm2, %zmm2 + vaesenc %zmm23, %zmm3, %zmm3 + vaesenc %zmm24, %zmm0, %zmm0 + vaesenc %zmm24, %zmm1, %zmm1 + vaesenc %zmm24, %zmm2, %zmm2 + vaesenc %zmm24, %zmm3, %zmm3 + vaesenc %zmm25, %zmm0, %zmm0 + vaesenc %zmm25, %zmm1, %zmm1 + vaesenc %zmm25, %zmm2, %zmm2 + vaesenc %zmm25, %zmm3, %zmm3 + cmpl $11, %r9d + vmovdqa64 %zmm26, %zmm9 /* zmm9 carries whichever key ends up being the last round key */ + jl L_AES_XTS_encrypt_update_avx512_aes_enc_256_aes_enc_block_last + vaesenc %zmm26, %zmm0, %zmm0 + vaesenc %zmm26, %zmm1, %zmm1 + vaesenc %zmm26, %zmm2, %zmm2 + vaesenc %zmm26, %zmm3, %zmm3 + vaesenc %zmm27, %zmm0, %zmm0 + vaesenc %zmm27, %zmm1, %zmm1 + vaesenc %zmm27, %zmm2, %zmm2 + vaesenc %zmm27, %zmm3, %zmm3 + cmpl $13, %r9d +
vmovdqa64 %zmm28, %zmm9 + jl L_AES_XTS_encrypt_update_avx512_aes_enc_256_aes_enc_block_last + vaesenc %zmm28, %zmm0, %zmm0 + vaesenc %zmm28, %zmm1, %zmm1 + vaesenc %zmm28, %zmm2, %zmm2 + vaesenc %zmm28, %zmm3, %zmm3 + vaesenc %zmm29, %zmm0, %zmm0 + vaesenc %zmm29, %zmm1, %zmm1 + vaesenc %zmm29, %zmm2, %zmm2 + vaesenc %zmm29, %zmm3, %zmm3 + vmovdqa64 %zmm30, %zmm9 +L_AES_XTS_encrypt_update_avx512_aes_enc_256_aes_enc_block_last: + vaesenclast %zmm9, %zmm0, %zmm0 + vaesenclast %zmm9, %zmm1, %zmm1 + vaesenclast %zmm9, %zmm2, %zmm2 + vaesenclast %zmm9, %zmm3, %zmm3 + vpxorq %zmm4, %zmm0, %zmm0 /* XOR the tweak in again after the cipher */ + vmovdqu64 %zmm0, (%rdx) + vpsrlq $48, %zmm4, %zmm9 /* then move these four tweaks on by 16 blocks (shift of 16 bits with the same carry and poly handling) for the next iteration */ + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $16, %zmm4, %zmm4 + vpternlogq $0x96, %zmm9, %zmm10, %zmm4 + vpxorq %zmm5, %zmm1, %zmm1 + vmovdqu64 %zmm1, 64(%rdx) + vpsrlq $48, %zmm5, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $16, %zmm5, %zmm5 + vpternlogq $0x96, %zmm9, %zmm10, %zmm5 + vpxorq %zmm6, %zmm2, %zmm2 + vmovdqu64 %zmm2, 128(%rdx) + vpsrlq $48, %zmm6, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $16, %zmm6, %zmm6 + vpternlogq $0x96, %zmm9, %zmm10, %zmm6 + vpxorq %zmm7, %zmm3, %zmm3 + vmovdqu64 %zmm3, 192(%rdx) + vpsrlq $48, %zmm7, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $16, %zmm7, %zmm7 + vpternlogq $0x96, %zmm9, %zmm10, %zmm7 + addl $0x100, %r12d + cmpl %r11d, %r12d + jl L_AES_XTS_encrypt_update_avx512_enc_256 + vextracti32x4 $0x00, %zmm4, %xmm8 /* lane 0 of the advanced zmm4 is the first unused tweak */ +L_AES_XTS_encrypt_update_avx512_done_256: + movl %eax, %r11d + andl $0xffffff80, %r11d + cmpl %r11d, %r12d + je L_AES_XTS_encrypt_update_avx512_done_128 /* no whole 128-byte piece left */ + # 128 bytes of input + # aes_enc_128 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu64 (%rcx), %zmm0 + vmovdqu64 64(%rcx), %zmm1 + vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 /* rebuild eight tweaks from xmm8: zmm4 for blocks 0..3, zmm5 for blocks 4..7 */ + vpsrlvq %zmm15, %zmm5, %zmm6 + vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7 + vpslldq $8,
%zmm6, %zmm6 + vpsllvq %zmm14, %zmm5, %zmm4 + vpternlogq $0x96, %zmm6, %zmm7, %zmm4 + vpsrlq $60, %zmm4, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $4, %zmm4, %zmm5 + vpternlogq $0x96, %zmm9, %zmm10, %zmm5 + # aes_enc_block + vpternlogq $0x96, %zmm4, %zmm16, %zmm0 + vpternlogq $0x96, %zmm5, %zmm16, %zmm1 + vaesenc %zmm17, %zmm0, %zmm0 + vaesenc %zmm17, %zmm1, %zmm1 + vaesenc %zmm18, %zmm0, %zmm0 + vaesenc %zmm18, %zmm1, %zmm1 + vaesenc %zmm19, %zmm0, %zmm0 + vaesenc %zmm19, %zmm1, %zmm1 + vaesenc %zmm20, %zmm0, %zmm0 + vaesenc %zmm20, %zmm1, %zmm1 + vaesenc %zmm21, %zmm0, %zmm0 + vaesenc %zmm21, %zmm1, %zmm1 + vaesenc %zmm22, %zmm0, %zmm0 + vaesenc %zmm22, %zmm1, %zmm1 + vaesenc %zmm23, %zmm0, %zmm0 + vaesenc %zmm23, %zmm1, %zmm1 + vaesenc %zmm24, %zmm0, %zmm0 + vaesenc %zmm24, %zmm1, %zmm1 + vaesenc %zmm25, %zmm0, %zmm0 + vaesenc %zmm25, %zmm1, %zmm1 + cmpl $11, %r9d + vmovdqa64 %zmm26, %zmm9 + jl L_AES_XTS_encrypt_update_avx512_aes_enc_128_aes_enc_block_last + vaesenc %zmm26, %zmm0, %zmm0 + vaesenc %zmm26, %zmm1, %zmm1 + vaesenc %zmm27, %zmm0, %zmm0 + vaesenc %zmm27, %zmm1, %zmm1 + cmpl $13, %r9d + vmovdqa64 %zmm28, %zmm9 + jl L_AES_XTS_encrypt_update_avx512_aes_enc_128_aes_enc_block_last + vaesenc %zmm28, %zmm0, %zmm0 + vaesenc %zmm28, %zmm1, %zmm1 + vaesenc %zmm29, %zmm0, %zmm0 + vaesenc %zmm29, %zmm1, %zmm1 + vmovdqa64 %zmm30, %zmm9 +L_AES_XTS_encrypt_update_avx512_aes_enc_128_aes_enc_block_last: + vaesenclast %zmm9, %zmm0, %zmm0 + vaesenclast %zmm9, %zmm1, %zmm1 + vpxorq %zmm4, %zmm0, %zmm0 + vmovdqu64 %zmm0, (%rdx) + vpxorq %zmm5, %zmm1, %zmm1 + vmovdqu64 %zmm1, 64(%rdx) + vextracti32x4 $3, %zmm5, %xmm8 /* last tweak used (block 7) ... */ + vpshufd $19, %xmm8, %xmm9 + vpaddq %xmm8, %xmm8, %xmm8 /* ... shifted left one bit per qword ... */ + vpsrad $31, %xmm9, %xmm9 /* ... with the old top bits spread into dword masks ... */ + vpternlogd $0x78, %xmm12, %xmm9, %xmm8 /* ... and xmm8 ^= (xmm9 & xmm12), giving the next tweak */ + addl $0x80, %r12d +L_AES_XTS_encrypt_update_avx512_done_128: + movl %eax, %r11d + andl $0xffffffc0, %r11d + cmpl %r11d, %r12d + je L_AES_XTS_encrypt_update_avx512_done_64 /* no whole 64-byte piece left */ + # 64 bytes of input
+ # aes_enc_64 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu64 (%rcx), %zmm0 + vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 + vpsrlvq %zmm15, %zmm5, %zmm6 + vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7 + vpslldq $8, %zmm6, %zmm6 + vpsllvq %zmm14, %zmm5, %zmm4 + vpternlogq $0x96, %zmm6, %zmm7, %zmm4 + # aes_enc_block + vpternlogq $0x96, %zmm4, %zmm16, %zmm0 + vaesenc %zmm17, %zmm0, %zmm0 + vaesenc %zmm18, %zmm0, %zmm0 + vaesenc %zmm19, %zmm0, %zmm0 + vaesenc %zmm20, %zmm0, %zmm0 + vaesenc %zmm21, %zmm0, %zmm0 + vaesenc %zmm22, %zmm0, %zmm0 + vaesenc %zmm23, %zmm0, %zmm0 + vaesenc %zmm24, %zmm0, %zmm0 + vaesenc %zmm25, %zmm0, %zmm0 + cmpl $11, %r9d + vmovdqa64 %zmm26, %zmm9 + jl L_AES_XTS_encrypt_update_avx512_aes_enc_64_aes_enc_block_last + vaesenc %zmm26, %zmm0, %zmm0 + vaesenc %zmm27, %zmm0, %zmm0 + cmpl $13, %r9d + vmovdqa64 %zmm28, %zmm9 + jl L_AES_XTS_encrypt_update_avx512_aes_enc_64_aes_enc_block_last + vaesenc %zmm28, %zmm0, %zmm0 + vaesenc %zmm29, %zmm0, %zmm0 + vmovdqa64 %zmm30, %zmm9 +L_AES_XTS_encrypt_update_avx512_aes_enc_64_aes_enc_block_last: + vaesenclast %zmm9, %zmm0, %zmm0 + vpxorq %zmm4, %zmm0, %zmm0 + vmovdqu64 %zmm0, (%rdx) + vextracti32x4 $3, %zmm4, %xmm8 /* last tweak used (block 3); the next four instructions step it on by one block */ + vpshufd $19, %xmm8, %xmm9 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm9, %xmm9 + vpternlogd $0x78, %xmm12, %xmm9, %xmm8 + addl $0x40, %r12d +L_AES_XTS_encrypt_update_avx512_done_64: + movl %eax, %r11d + andl $0xffffffe0, %r11d + cmpl %r11d, %r12d + je L_AES_XTS_encrypt_update_avx512_done_32 /* no whole 32-byte piece left */ + # 32 bytes of input + # aes_enc_32 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu64 (%rcx), %ymm0 + vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 + vpsrlvq %zmm15, %zmm5, %zmm6 + vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7 + vpslldq $8, %zmm6, %zmm6 + vpsllvq %zmm14, %zmm5, %zmm4 + vpternlogq $0x96, %zmm6, %zmm7, %zmm4 /* four tweaks are still computed; only the low two are consumed here */ + # aes_enc_block + vpternlogq $0x96, %ymm4, %ymm16, %ymm0 + vaesenc %ymm17, %ymm0, %ymm0 + vaesenc %ymm18, %ymm0, %ymm0 + vaesenc %ymm19, %ymm0, %ymm0 + vaesenc %ymm20, %ymm0,
%ymm0 + vaesenc %ymm21, %ymm0, %ymm0 + vaesenc %ymm22, %ymm0, %ymm0 + vaesenc %ymm23, %ymm0, %ymm0 + vaesenc %ymm24, %ymm0, %ymm0 + vaesenc %ymm25, %ymm0, %ymm0 + cmpl $11, %r9d + vmovdqa64 %ymm26, %ymm9 + jl L_AES_XTS_encrypt_update_avx512_aes_enc_32_aes_enc_block_last + vaesenc %ymm26, %ymm0, %ymm0 + vaesenc %ymm27, %ymm0, %ymm0 + cmpl $13, %r9d + vmovdqa64 %ymm28, %ymm9 + jl L_AES_XTS_encrypt_update_avx512_aes_enc_32_aes_enc_block_last + vaesenc %ymm28, %ymm0, %ymm0 + vaesenc %ymm29, %ymm0, %ymm0 + vmovdqa64 %ymm30, %ymm9 +L_AES_XTS_encrypt_update_avx512_aes_enc_32_aes_enc_block_last: + vaesenclast %ymm9, %ymm0, %ymm0 + vpxorq %ymm4, %ymm0, %ymm0 + vmovdqu64 %ymm0, (%rdx) + vextracti32x4 $2, %zmm4, %xmm8 /* lane 2 already holds the tweak for the next block */ + addl $32, %r12d +L_AES_XTS_encrypt_update_avx512_done_32: + cmpl %eax, %r12d + movl %eax, %r11d + je L_AES_XTS_encrypt_update_avx512_done_enc /* everything consumed */ + subl %r12d, %r11d + cmpl $16, %r11d + movl %eax, %r11d /* mov leaves the flags of the cmpl above intact for the jl */ + jl L_AES_XTS_encrypt_update_avx512_last_15 /* fewer than 16 bytes remain: only the stolen tail is left */ + andl $0xfffffff0, %r11d /* r11d = byte count rounded down to a multiple of 16 */ + # 16 bytes of input +L_AES_XTS_encrypt_update_avx512_enc_16: + leaq (%rdi,%r12,1), %rcx + vmovdqu (%rcx), %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + # aes_enc_block + vpxor (%r10), %xmm0, %xmm0 + vmovdqu 16(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + cmpl $11, %r9d + vmovdqu 160(%r10), %xmm5 + jl L_AES_XTS_encrypt_update_avx512_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r10), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + cmpl $13, %r9d + vmovdqu 192(%r10), %xmm5 + jl L_AES_XTS_encrypt_update_avx512_aes_enc_block_last + vaesenc %xmm5, %xmm0,
%xmm0 + vmovdqu 208(%r10), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r10), %xmm5 +L_AES_XTS_encrypt_update_avx512_aes_enc_block_last: + vaesenclast %xmm5, %xmm0, %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + leaq (%rsi,%r12,1), %rcx + vmovdqu %xmm0, (%rcx) + vpshufd $19, %xmm8, %xmm4 /* step the tweak on by one block, same four-instruction sequence as in the wide paths */ + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm4, %xmm4 + vpternlogd $0x78, %xmm12, %xmm4, %xmm8 + addl $16, %r12d + cmpl %r11d, %r12d + jl L_AES_XTS_encrypt_update_avx512_enc_16 + cmpl %eax, %r12d + je L_AES_XTS_encrypt_update_avx512_done_enc +L_AES_XTS_encrypt_update_avx512_last_15: + subq $16, %r12 /* ciphertext stealing: fetch the last full block already written to the output */ + leaq (%rsi,%r12,1), %rcx + vmovdqu (%rcx), %xmm0 + addq $16, %r12 + vmovdqu %xmm0, (%rsp) /* and park it in the stack scratch area */ + xorq %rdx, %rdx +L_AES_XTS_encrypt_update_avx512_last_15_byte_loop: + movb (%rsp,%rdx,1), %r11b /* byte of that ciphertext block becomes the short final output */ + movb (%rdi,%r12,1), %cl /* remaining input byte takes its place in the scratch block */ + movb %r11b, (%rsi,%r12,1) + movb %cl, (%rsp,%rdx,1) + incl %r12d + incl %edx + cmpl %eax, %r12d + jl L_AES_XTS_encrypt_update_avx512_last_15_byte_loop + subq %rdx, %r12 + vmovdqu (%rsp), %xmm0 + subq $16, %r12 /* r12 = offset of the last full block again */ + vpxor %xmm8, %xmm0, %xmm0 + # aes_enc_block + vpxor (%r10), %xmm0, %xmm0 + vmovdqu 16(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + cmpl $11, %r9d + vmovdqu 160(%r10), %xmm5 + jl L_AES_XTS_encrypt_update_avx512_last_15_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r10), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + cmpl $13, %r9d + vmovdqu 192(%r10), %xmm5 + jl L_AES_XTS_encrypt_update_avx512_last_15_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r10), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + vmovdqu
224(%r10), %xmm5 +L_AES_XTS_encrypt_update_avx512_last_15_aes_enc_block_last: + vaesenclast %xmm5, %xmm0, %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + leaq (%rsi,%r12,1), %rcx + vmovdqu %xmm0, (%rcx) /* overwrite the last full output block with the re-encrypted mixed block */ +L_AES_XTS_encrypt_update_avx512_done_enc: + vmovdqu %xmm8, (%r8) /* store the running tweak back to (r8) */ + addq $0x40, %rsp + popq %r12 + repz retq +#ifndef __APPLE__ +.size AES_XTS_encrypt_update_avx512,.-AES_XTS_encrypt_update_avx512 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_XTS_decrypt_avx512 +.type AES_XTS_decrypt_avx512,@function +.align 16 +AES_XTS_decrypt_avx512: +#else +.section __TEXT,__text +.globl _AES_XTS_decrypt_avx512 +.p2align 4 +_AES_XTS_decrypt_avx512: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + movq %rdx, %rax + movq %rcx, %r12 + movl 24(%rsp), %r10d + subq $0x40, %rsp + vmovdqu L_avx512_aes_xts_gc_xts(%rip), %xmm12 + vbroadcasti32x4 L_avx512_aes_xts_poly(%rip), %zmm13 + vmovdqu64 L_avx512_aes_xts_shl(%rip), %zmm14 + vmovdqu64 L_avx512_aes_xts_shr(%rip), %zmm15 + vmovdqu (%r12), %xmm8 + # aes_enc_block + vpxor (%r9), %xmm8, %xmm8 + vmovdqu 16(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 32(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 48(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 64(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 80(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 96(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 112(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 128(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 144(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + cmpl $11, %r10d + vmovdqu 160(%r9), %xmm5 + jl L_AES_XTS_decrypt_avx512_tweak_aes_enc_block_last + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 176(%r9), %xmm6 + vaesenc %xmm6, %xmm8, %xmm8 + cmpl $13, %r10d + vmovdqu 192(%r9), %xmm5 + jl L_AES_XTS_decrypt_avx512_tweak_aes_enc_block_last + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 208(%r9), %xmm6 + vaesenc %xmm6, %xmm8, %xmm8 + vmovdqu 224(%r9), %xmm5 +L_AES_XTS_decrypt_avx512_tweak_aes_enc_block_last: + vaesenclast
%xmm5, %xmm8, %xmm8 + xorl %r13d, %r13d + movl %eax, %r11d + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_avx512_mul16_256 + subl $16, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_avx512_last_31_start +L_AES_XTS_decrypt_avx512_mul16_256: + cmpl $32, %r11d + jl L_AES_XTS_decrypt_avx512_done_128 + vbroadcasti32x4 (%r8), %zmm16 + vbroadcasti32x4 16(%r8), %zmm17 + vbroadcasti32x4 32(%r8), %zmm18 + vbroadcasti32x4 48(%r8), %zmm19 + vbroadcasti32x4 64(%r8), %zmm20 + vbroadcasti32x4 80(%r8), %zmm21 + vbroadcasti32x4 96(%r8), %zmm22 + vbroadcasti32x4 112(%r8), %zmm23 + vbroadcasti32x4 128(%r8), %zmm24 + vbroadcasti32x4 144(%r8), %zmm25 + vbroadcasti32x4 160(%r8), %zmm26 + cmpl $11, %r10d + jl L_AES_XTS_decrypt_avx512_key_cached + vbroadcasti32x4 176(%r8), %zmm27 + vbroadcasti32x4 192(%r8), %zmm28 + cmpl $13, %r10d + jl L_AES_XTS_decrypt_avx512_key_cached + vbroadcasti32x4 208(%r8), %zmm29 + vbroadcasti32x4 224(%r8), %zmm30 +L_AES_XTS_decrypt_avx512_key_cached: + cmpl $0x100, %r11d + jl L_AES_XTS_decrypt_avx512_done_256 + andl $0xffffff00, %r11d + vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 + vpsrlvq %zmm15, %zmm5, %zmm6 + vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7 + vpslldq $8, %zmm6, %zmm6 + vpsllvq %zmm14, %zmm5, %zmm4 + vpternlogq $0x96, %zmm6, %zmm7, %zmm4 + vpsrlq $60, %zmm4, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $4, %zmm4, %zmm5 + vpternlogq $0x96, %zmm9, %zmm10, %zmm5 + vpsrlq $60, %zmm5, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $4, %zmm5, %zmm6 + vpternlogq $0x96, %zmm9, %zmm10, %zmm6 + vpsrlq $60, %zmm6, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $4, %zmm6, %zmm7 + vpternlogq $0x96, %zmm9, %zmm10, %zmm7 +L_AES_XTS_decrypt_avx512_dec_256: + # 256 bytes of input + # aes_dec_256 + leaq (%rdi,%r13,1), %rcx + leaq (%rsi,%r13,1), %rdx + vmovdqu64 (%rcx), %zmm0 + vmovdqu64 64(%rcx), %zmm1 + vmovdqu64 128(%rcx), %zmm2 + vmovdqu64 
192(%rcx), %zmm3 + # aes_dec_block + vpternlogq $0x96, %zmm4, %zmm16, %zmm0 + vpternlogq $0x96, %zmm5, %zmm16, %zmm1 + vpternlogq $0x96, %zmm6, %zmm16, %zmm2 + vpternlogq $0x96, %zmm7, %zmm16, %zmm3 + vaesdec %zmm17, %zmm0, %zmm0 + vaesdec %zmm17, %zmm1, %zmm1 + vaesdec %zmm17, %zmm2, %zmm2 + vaesdec %zmm17, %zmm3, %zmm3 + vaesdec %zmm18, %zmm0, %zmm0 + vaesdec %zmm18, %zmm1, %zmm1 + vaesdec %zmm18, %zmm2, %zmm2 + vaesdec %zmm18, %zmm3, %zmm3 + vaesdec %zmm19, %zmm0, %zmm0 + vaesdec %zmm19, %zmm1, %zmm1 + vaesdec %zmm19, %zmm2, %zmm2 + vaesdec %zmm19, %zmm3, %zmm3 + vaesdec %zmm20, %zmm0, %zmm0 + vaesdec %zmm20, %zmm1, %zmm1 + vaesdec %zmm20, %zmm2, %zmm2 + vaesdec %zmm20, %zmm3, %zmm3 + vaesdec %zmm21, %zmm0, %zmm0 + vaesdec %zmm21, %zmm1, %zmm1 + vaesdec %zmm21, %zmm2, %zmm2 + vaesdec %zmm21, %zmm3, %zmm3 + vaesdec %zmm22, %zmm0, %zmm0 + vaesdec %zmm22, %zmm1, %zmm1 + vaesdec %zmm22, %zmm2, %zmm2 + vaesdec %zmm22, %zmm3, %zmm3 + vaesdec %zmm23, %zmm0, %zmm0 + vaesdec %zmm23, %zmm1, %zmm1 + vaesdec %zmm23, %zmm2, %zmm2 + vaesdec %zmm23, %zmm3, %zmm3 + vaesdec %zmm24, %zmm0, %zmm0 + vaesdec %zmm24, %zmm1, %zmm1 + vaesdec %zmm24, %zmm2, %zmm2 + vaesdec %zmm24, %zmm3, %zmm3 + vaesdec %zmm25, %zmm0, %zmm0 + vaesdec %zmm25, %zmm1, %zmm1 + vaesdec %zmm25, %zmm2, %zmm2 + vaesdec %zmm25, %zmm3, %zmm3 + cmpl $11, %r10d + vmovdqa64 %zmm26, %zmm9 + jl L_AES_XTS_decrypt_avx512_aes_dec_256_aes_dec_block_last + vaesdec %zmm26, %zmm0, %zmm0 + vaesdec %zmm26, %zmm1, %zmm1 + vaesdec %zmm26, %zmm2, %zmm2 + vaesdec %zmm26, %zmm3, %zmm3 + vaesdec %zmm27, %zmm0, %zmm0 + vaesdec %zmm27, %zmm1, %zmm1 + vaesdec %zmm27, %zmm2, %zmm2 + vaesdec %zmm27, %zmm3, %zmm3 + cmpl $13, %r10d + vmovdqa64 %zmm28, %zmm9 + jl L_AES_XTS_decrypt_avx512_aes_dec_256_aes_dec_block_last + vaesdec %zmm28, %zmm0, %zmm0 + vaesdec %zmm28, %zmm1, %zmm1 + vaesdec %zmm28, %zmm2, %zmm2 + vaesdec %zmm28, %zmm3, %zmm3 + vaesdec %zmm29, %zmm0, %zmm0 + vaesdec %zmm29, %zmm1, %zmm1 + vaesdec %zmm29, %zmm2, %zmm2 + vaesdec 
%zmm29, %zmm3, %zmm3 + vmovdqa64 %zmm30, %zmm9 +L_AES_XTS_decrypt_avx512_aes_dec_256_aes_dec_block_last: + vaesdeclast %zmm9, %zmm0, %zmm0 + vaesdeclast %zmm9, %zmm1, %zmm1 + vaesdeclast %zmm9, %zmm2, %zmm2 + vaesdeclast %zmm9, %zmm3, %zmm3 + vpxorq %zmm4, %zmm0, %zmm0 + vmovdqu64 %zmm0, (%rdx) + vpsrlq $48, %zmm4, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $16, %zmm4, %zmm4 + vpternlogq $0x96, %zmm9, %zmm10, %zmm4 + vpxorq %zmm5, %zmm1, %zmm1 + vmovdqu64 %zmm1, 64(%rdx) + vpsrlq $48, %zmm5, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $16, %zmm5, %zmm5 + vpternlogq $0x96, %zmm9, %zmm10, %zmm5 + vpxorq %zmm6, %zmm2, %zmm2 + vmovdqu64 %zmm2, 128(%rdx) + vpsrlq $48, %zmm6, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $16, %zmm6, %zmm6 + vpternlogq $0x96, %zmm9, %zmm10, %zmm6 + vpxorq %zmm7, %zmm3, %zmm3 + vmovdqu64 %zmm3, 192(%rdx) + vpsrlq $48, %zmm7, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $16, %zmm7, %zmm7 + vpternlogq $0x96, %zmm9, %zmm10, %zmm7 + addl $0x100, %r13d + cmpl %r11d, %r13d + jl L_AES_XTS_decrypt_avx512_dec_256 + vextracti32x4 $0x00, %zmm4, %xmm8 +L_AES_XTS_decrypt_avx512_done_256: + cmpl %eax, %r13d + movl %eax, %r11d + je L_AES_XTS_decrypt_avx512_done_dec + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_avx512_mul16_128 + subl $16, %r11d + subl %r13d, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_avx512_last_31_start + addl %r13d, %r11d +L_AES_XTS_decrypt_avx512_mul16_128: + andl $0xffffff80, %r11d + cmpl %r11d, %r13d + je L_AES_XTS_decrypt_avx512_done_128 + # 128 bytes of input + # aes_dec_128 + leaq (%rdi,%r13,1), %rcx + leaq (%rsi,%r13,1), %rdx + vmovdqu64 (%rcx), %zmm0 + vmovdqu64 64(%rcx), %zmm1 + vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 + vpsrlvq %zmm15, %zmm5, %zmm6 + vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7 + vpslldq $8, %zmm6, %zmm6 + vpsllvq %zmm14, %zmm5, %zmm4 + 
vpternlogq $0x96, %zmm6, %zmm7, %zmm4 + vpsrlq $60, %zmm4, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $4, %zmm4, %zmm5 + vpternlogq $0x96, %zmm9, %zmm10, %zmm5 + # aes_dec_block + vpternlogq $0x96, %zmm4, %zmm16, %zmm0 + vpternlogq $0x96, %zmm5, %zmm16, %zmm1 + vaesdec %zmm17, %zmm0, %zmm0 + vaesdec %zmm17, %zmm1, %zmm1 + vaesdec %zmm18, %zmm0, %zmm0 + vaesdec %zmm18, %zmm1, %zmm1 + vaesdec %zmm19, %zmm0, %zmm0 + vaesdec %zmm19, %zmm1, %zmm1 + vaesdec %zmm20, %zmm0, %zmm0 + vaesdec %zmm20, %zmm1, %zmm1 + vaesdec %zmm21, %zmm0, %zmm0 + vaesdec %zmm21, %zmm1, %zmm1 + vaesdec %zmm22, %zmm0, %zmm0 + vaesdec %zmm22, %zmm1, %zmm1 + vaesdec %zmm23, %zmm0, %zmm0 + vaesdec %zmm23, %zmm1, %zmm1 + vaesdec %zmm24, %zmm0, %zmm0 + vaesdec %zmm24, %zmm1, %zmm1 + vaesdec %zmm25, %zmm0, %zmm0 + vaesdec %zmm25, %zmm1, %zmm1 + cmpl $11, %r10d + vmovdqa64 %zmm26, %zmm9 + jl L_AES_XTS_decrypt_avx512_aes_dec_128_aes_dec_block_last + vaesdec %zmm26, %zmm0, %zmm0 + vaesdec %zmm26, %zmm1, %zmm1 + vaesdec %zmm27, %zmm0, %zmm0 + vaesdec %zmm27, %zmm1, %zmm1 + cmpl $13, %r10d + vmovdqa64 %zmm28, %zmm9 + jl L_AES_XTS_decrypt_avx512_aes_dec_128_aes_dec_block_last + vaesdec %zmm28, %zmm0, %zmm0 + vaesdec %zmm28, %zmm1, %zmm1 + vaesdec %zmm29, %zmm0, %zmm0 + vaesdec %zmm29, %zmm1, %zmm1 + vmovdqa64 %zmm30, %zmm9 +L_AES_XTS_decrypt_avx512_aes_dec_128_aes_dec_block_last: + vaesdeclast %zmm9, %zmm0, %zmm0 + vaesdeclast %zmm9, %zmm1, %zmm1 + vpxorq %zmm4, %zmm0, %zmm0 + vmovdqu64 %zmm0, (%rdx) + vpxorq %zmm5, %zmm1, %zmm1 + vmovdqu64 %zmm1, 64(%rdx) + vextracti32x4 $3, %zmm5, %xmm8 + vpshufd $19, %xmm8, %xmm9 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm9, %xmm9 + vpternlogd $0x78, %xmm12, %xmm9, %xmm8 + addl $0x80, %r13d +L_AES_XTS_decrypt_avx512_done_128: + cmpl %eax, %r13d + movl %eax, %r11d + je L_AES_XTS_decrypt_avx512_done_dec + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_avx512_mul16_64 + subl $16, %r11d + subl %r13d, %r11d + 
cmpl $16, %r11d + jl L_AES_XTS_decrypt_avx512_last_31_start + addl %r13d, %r11d +L_AES_XTS_decrypt_avx512_mul16_64: + andl $0xffffffc0, %r11d + cmpl %r11d, %r13d + je L_AES_XTS_decrypt_avx512_done_64 + # 64 bytes of input + # aes_dec_64 + leaq (%rdi,%r13,1), %rcx + leaq (%rsi,%r13,1), %rdx + vmovdqu64 (%rcx), %zmm0 + vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 + vpsrlvq %zmm15, %zmm5, %zmm6 + vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7 + vpslldq $8, %zmm6, %zmm6 + vpsllvq %zmm14, %zmm5, %zmm4 + vpternlogq $0x96, %zmm6, %zmm7, %zmm4 + # aes_dec_block + vpternlogq $0x96, %zmm4, %zmm16, %zmm0 + vaesdec %zmm17, %zmm0, %zmm0 + vaesdec %zmm18, %zmm0, %zmm0 + vaesdec %zmm19, %zmm0, %zmm0 + vaesdec %zmm20, %zmm0, %zmm0 + vaesdec %zmm21, %zmm0, %zmm0 + vaesdec %zmm22, %zmm0, %zmm0 + vaesdec %zmm23, %zmm0, %zmm0 + vaesdec %zmm24, %zmm0, %zmm0 + vaesdec %zmm25, %zmm0, %zmm0 + cmpl $11, %r10d + vmovdqa64 %zmm26, %zmm9 + jl L_AES_XTS_decrypt_avx512_aes_dec_64_aes_dec_block_last + vaesdec %zmm26, %zmm0, %zmm0 + vaesdec %zmm27, %zmm0, %zmm0 + cmpl $13, %r10d + vmovdqa64 %zmm28, %zmm9 + jl L_AES_XTS_decrypt_avx512_aes_dec_64_aes_dec_block_last + vaesdec %zmm28, %zmm0, %zmm0 + vaesdec %zmm29, %zmm0, %zmm0 + vmovdqa64 %zmm30, %zmm9 +L_AES_XTS_decrypt_avx512_aes_dec_64_aes_dec_block_last: + vaesdeclast %zmm9, %zmm0, %zmm0 + vpxorq %zmm4, %zmm0, %zmm0 + vmovdqu64 %zmm0, (%rdx) + vextracti32x4 $3, %zmm4, %xmm8 + vpshufd $19, %xmm8, %xmm9 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm9, %xmm9 + vpternlogd $0x78, %xmm12, %xmm9, %xmm8 + addl $0x40, %r13d +L_AES_XTS_decrypt_avx512_done_64: + cmpl %eax, %r13d + movl %eax, %r11d + je L_AES_XTS_decrypt_avx512_done_dec + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_avx512_mul16_32 + subl $16, %r11d + subl %r13d, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_avx512_last_31_start + addl %r13d, %r11d +L_AES_XTS_decrypt_avx512_mul16_32: + andl $0xffffffe0, %r11d + cmpl %r11d, %r13d + je L_AES_XTS_decrypt_avx512_done_32 + # 32 bytes of 
input + # aes_dec_32 + leaq (%rdi,%r13,1), %rcx + leaq (%rsi,%r13,1), %rdx + vmovdqu64 (%rcx), %ymm0 + vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 + vpsrlvq %zmm15, %zmm5, %zmm6 + vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7 + vpslldq $8, %zmm6, %zmm6 + vpsllvq %zmm14, %zmm5, %zmm4 + vpternlogq $0x96, %zmm6, %zmm7, %zmm4 + # aes_dec_block + vpternlogq $0x96, %ymm4, %ymm16, %ymm0 + vaesdec %ymm17, %ymm0, %ymm0 + vaesdec %ymm18, %ymm0, %ymm0 + vaesdec %ymm19, %ymm0, %ymm0 + vaesdec %ymm20, %ymm0, %ymm0 + vaesdec %ymm21, %ymm0, %ymm0 + vaesdec %ymm22, %ymm0, %ymm0 + vaesdec %ymm23, %ymm0, %ymm0 + vaesdec %ymm24, %ymm0, %ymm0 + vaesdec %ymm25, %ymm0, %ymm0 + cmpl $11, %r10d + vmovdqa64 %ymm26, %ymm9 + jl L_AES_XTS_decrypt_avx512_aes_dec_32_aes_dec_block_last + vaesdec %ymm26, %ymm0, %ymm0 + vaesdec %ymm27, %ymm0, %ymm0 + cmpl $13, %r10d + vmovdqa64 %ymm28, %ymm9 + jl L_AES_XTS_decrypt_avx512_aes_dec_32_aes_dec_block_last + vaesdec %ymm28, %ymm0, %ymm0 + vaesdec %ymm29, %ymm0, %ymm0 + vmovdqa64 %ymm30, %ymm9 +L_AES_XTS_decrypt_avx512_aes_dec_32_aes_dec_block_last: + vaesdeclast %ymm9, %ymm0, %ymm0 + vpxorq %ymm4, %ymm0, %ymm0 + vmovdqu64 %ymm0, (%rdx) + vextracti32x4 $2, %zmm4, %xmm8 + addl $32, %r13d +L_AES_XTS_decrypt_avx512_done_32: + cmpl %eax, %r13d + movl %eax, %r11d + je L_AES_XTS_decrypt_avx512_done_dec + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_avx512_mul16 + subl $16, %r11d + subl %r13d, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_avx512_last_31_start + addl %r13d, %r11d +L_AES_XTS_decrypt_avx512_mul16: +L_AES_XTS_decrypt_avx512_dec_16: + # 16 bytes of input + leaq (%rdi,%r13,1), %rcx + vmovdqu (%rcx), %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + # aes_dec_block + vpxor (%r8), %xmm0, %xmm0 + vmovdqu 16(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r8), %xmm5 + vaesdec %xmm5, %xmm0, 
%xmm0 + vmovdqu 96(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + cmpl $11, %r10d + vmovdqu 160(%r8), %xmm5 + jl L_AES_XTS_decrypt_avx512_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + cmpl $13, %r10d + vmovdqu 192(%r8), %xmm5 + jl L_AES_XTS_decrypt_avx512_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r8), %xmm5 +L_AES_XTS_decrypt_avx512_aes_dec_block_last: + vaesdeclast %xmm5, %xmm0, %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + leaq (%rsi,%r13,1), %rcx + vmovdqu %xmm0, (%rcx) + vpshufd $19, %xmm8, %xmm4 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm4, %xmm4 + vpternlogd $0x78, %xmm12, %xmm4, %xmm8 + addl $16, %r13d + cmpl %r11d, %r13d + jl L_AES_XTS_decrypt_avx512_dec_16 + cmpl %eax, %r13d + je L_AES_XTS_decrypt_avx512_done_dec +L_AES_XTS_decrypt_avx512_last_31_start: + vpshufd $19, %xmm8, %xmm4 + vpaddq %xmm8, %xmm8, %xmm7 + vpsrad $31, %xmm4, %xmm4 + vpternlogd $0x78, %xmm12, %xmm4, %xmm7 + leaq (%rdi,%r13,1), %rcx + vmovdqu (%rcx), %xmm0 + vpxor %xmm7, %xmm0, %xmm0 + # aes_dec_block + vpxor (%r8), %xmm0, %xmm0 + vmovdqu 16(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + cmpl $11, %r10d + vmovdqu 160(%r8), %xmm5 + jl L_AES_XTS_decrypt_avx512_last_31_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r8), %xmm6 + vaesdec %xmm6, %xmm0, 
%xmm0 + cmpl $13, %r10d + vmovdqu 192(%r8), %xmm5 + jl L_AES_XTS_decrypt_avx512_last_31_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r8), %xmm5 +L_AES_XTS_decrypt_avx512_last_31_aes_dec_block_last: + vaesdeclast %xmm5, %xmm0, %xmm0 + vpxor %xmm7, %xmm0, %xmm0 + vmovdqu %xmm0, (%rsp) + addq $16, %r13 + xorq %rdx, %rdx +L_AES_XTS_decrypt_avx512_last_31_byte_loop: + movb (%rsp,%rdx,1), %r11b + movb (%rdi,%r13,1), %cl + movb %r11b, (%rsi,%r13,1) + movb %cl, (%rsp,%rdx,1) + incl %r13d + incl %edx + cmpl %eax, %r13d + jl L_AES_XTS_decrypt_avx512_last_31_byte_loop + subq %rdx, %r13 + vmovdqu (%rsp), %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + # aes_dec_block + vpxor (%r8), %xmm0, %xmm0 + vmovdqu 16(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + cmpl $11, %r10d + vmovdqu 160(%r8), %xmm5 + jl L_AES_XTS_decrypt_avx512_last_31_2_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + cmpl $13, %r10d + vmovdqu 192(%r8), %xmm5 + jl L_AES_XTS_decrypt_avx512_last_31_2_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r8), %xmm5 +L_AES_XTS_decrypt_avx512_last_31_2_aes_dec_block_last: + vaesdeclast %xmm5, %xmm0, %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + subq $16, %r13 + leaq (%rsi,%r13,1), %rcx + vmovdqu %xmm0, (%rcx) +L_AES_XTS_decrypt_avx512_done_dec: + addq $0x40, %rsp + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size 
AES_XTS_decrypt_avx512,.-AES_XTS_decrypt_avx512 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_XTS_decrypt_update_avx512 +.type AES_XTS_decrypt_update_avx512,@function +.align 16 +AES_XTS_decrypt_update_avx512: +#else +.section __TEXT,__text +.globl _AES_XTS_decrypt_update_avx512 +.p2align 4 +_AES_XTS_decrypt_update_avx512: +#endif /* __APPLE__ */ + pushq %r12 + movq %rdx, %rax + movq %rcx, %r10 + subq $0x40, %rsp + vmovdqu L_avx512_aes_xts_gc_xts(%rip), %xmm12 + vbroadcasti32x4 L_avx512_aes_xts_poly(%rip), %zmm13 + vmovdqu64 L_avx512_aes_xts_shl(%rip), %zmm14 + vmovdqu64 L_avx512_aes_xts_shr(%rip), %zmm15 + vmovdqu (%r8), %xmm8 + xorl %r12d, %r12d + movl %eax, %r11d + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_update_avx512_mul16_256 + subl $16, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_update_avx512_last_31_start +L_AES_XTS_decrypt_update_avx512_mul16_256: + cmpl $32, %r11d + jl L_AES_XTS_decrypt_update_avx512_done_128 + vbroadcasti32x4 (%r10), %zmm16 + vbroadcasti32x4 16(%r10), %zmm17 + vbroadcasti32x4 32(%r10), %zmm18 + vbroadcasti32x4 48(%r10), %zmm19 + vbroadcasti32x4 64(%r10), %zmm20 + vbroadcasti32x4 80(%r10), %zmm21 + vbroadcasti32x4 96(%r10), %zmm22 + vbroadcasti32x4 112(%r10), %zmm23 + vbroadcasti32x4 128(%r10), %zmm24 + vbroadcasti32x4 144(%r10), %zmm25 + vbroadcasti32x4 160(%r10), %zmm26 + cmpl $11, %r9d + jl L_AES_XTS_decrypt_update_avx512_key_cached + vbroadcasti32x4 176(%r10), %zmm27 + vbroadcasti32x4 192(%r10), %zmm28 + cmpl $13, %r9d + jl L_AES_XTS_decrypt_update_avx512_key_cached + vbroadcasti32x4 208(%r10), %zmm29 + vbroadcasti32x4 224(%r10), %zmm30 +L_AES_XTS_decrypt_update_avx512_key_cached: + cmpl $0x100, %r11d + jl L_AES_XTS_decrypt_update_avx512_done_256 + andl $0xffffff00, %r11d + vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 + vpsrlvq %zmm15, %zmm5, %zmm6 + vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7 + vpslldq $8, %zmm6, %zmm6 + vpsllvq %zmm14, %zmm5, %zmm4 + vpternlogq $0x96, %zmm6, %zmm7, %zmm4 + vpsrlq 
$60, %zmm4, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $4, %zmm4, %zmm5 + vpternlogq $0x96, %zmm9, %zmm10, %zmm5 + vpsrlq $60, %zmm5, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $4, %zmm5, %zmm6 + vpternlogq $0x96, %zmm9, %zmm10, %zmm6 + vpsrlq $60, %zmm6, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $4, %zmm6, %zmm7 + vpternlogq $0x96, %zmm9, %zmm10, %zmm7 +L_AES_XTS_decrypt_update_avx512_dec_256: + # 256 bytes of input + # aes_dec_256 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu64 (%rcx), %zmm0 + vmovdqu64 64(%rcx), %zmm1 + vmovdqu64 128(%rcx), %zmm2 + vmovdqu64 192(%rcx), %zmm3 + # aes_dec_block + vpternlogq $0x96, %zmm4, %zmm16, %zmm0 + vpternlogq $0x96, %zmm5, %zmm16, %zmm1 + vpternlogq $0x96, %zmm6, %zmm16, %zmm2 + vpternlogq $0x96, %zmm7, %zmm16, %zmm3 + vaesdec %zmm17, %zmm0, %zmm0 + vaesdec %zmm17, %zmm1, %zmm1 + vaesdec %zmm17, %zmm2, %zmm2 + vaesdec %zmm17, %zmm3, %zmm3 + vaesdec %zmm18, %zmm0, %zmm0 + vaesdec %zmm18, %zmm1, %zmm1 + vaesdec %zmm18, %zmm2, %zmm2 + vaesdec %zmm18, %zmm3, %zmm3 + vaesdec %zmm19, %zmm0, %zmm0 + vaesdec %zmm19, %zmm1, %zmm1 + vaesdec %zmm19, %zmm2, %zmm2 + vaesdec %zmm19, %zmm3, %zmm3 + vaesdec %zmm20, %zmm0, %zmm0 + vaesdec %zmm20, %zmm1, %zmm1 + vaesdec %zmm20, %zmm2, %zmm2 + vaesdec %zmm20, %zmm3, %zmm3 + vaesdec %zmm21, %zmm0, %zmm0 + vaesdec %zmm21, %zmm1, %zmm1 + vaesdec %zmm21, %zmm2, %zmm2 + vaesdec %zmm21, %zmm3, %zmm3 + vaesdec %zmm22, %zmm0, %zmm0 + vaesdec %zmm22, %zmm1, %zmm1 + vaesdec %zmm22, %zmm2, %zmm2 + vaesdec %zmm22, %zmm3, %zmm3 + vaesdec %zmm23, %zmm0, %zmm0 + vaesdec %zmm23, %zmm1, %zmm1 + vaesdec %zmm23, %zmm2, %zmm2 + vaesdec %zmm23, %zmm3, %zmm3 + vaesdec %zmm24, %zmm0, %zmm0 + vaesdec %zmm24, %zmm1, %zmm1 + vaesdec %zmm24, %zmm2, %zmm2 + vaesdec %zmm24, %zmm3, %zmm3 + vaesdec %zmm25, %zmm0, %zmm0 + vaesdec %zmm25, %zmm1, %zmm1 + vaesdec %zmm25, %zmm2, %zmm2 + 
vaesdec %zmm25, %zmm3, %zmm3 + cmpl $11, %r9d + vmovdqa64 %zmm26, %zmm9 + jl L_AES_XTS_decrypt_update_avx512_aes_dec_256_aes_dec_block_last + vaesdec %zmm26, %zmm0, %zmm0 + vaesdec %zmm26, %zmm1, %zmm1 + vaesdec %zmm26, %zmm2, %zmm2 + vaesdec %zmm26, %zmm3, %zmm3 + vaesdec %zmm27, %zmm0, %zmm0 + vaesdec %zmm27, %zmm1, %zmm1 + vaesdec %zmm27, %zmm2, %zmm2 + vaesdec %zmm27, %zmm3, %zmm3 + cmpl $13, %r9d + vmovdqa64 %zmm28, %zmm9 + jl L_AES_XTS_decrypt_update_avx512_aes_dec_256_aes_dec_block_last + vaesdec %zmm28, %zmm0, %zmm0 + vaesdec %zmm28, %zmm1, %zmm1 + vaesdec %zmm28, %zmm2, %zmm2 + vaesdec %zmm28, %zmm3, %zmm3 + vaesdec %zmm29, %zmm0, %zmm0 + vaesdec %zmm29, %zmm1, %zmm1 + vaesdec %zmm29, %zmm2, %zmm2 + vaesdec %zmm29, %zmm3, %zmm3 + vmovdqa64 %zmm30, %zmm9 +L_AES_XTS_decrypt_update_avx512_aes_dec_256_aes_dec_block_last: + vaesdeclast %zmm9, %zmm0, %zmm0 + vaesdeclast %zmm9, %zmm1, %zmm1 + vaesdeclast %zmm9, %zmm2, %zmm2 + vaesdeclast %zmm9, %zmm3, %zmm3 + vpxorq %zmm4, %zmm0, %zmm0 + vmovdqu64 %zmm0, (%rdx) + vpsrlq $48, %zmm4, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $16, %zmm4, %zmm4 + vpternlogq $0x96, %zmm9, %zmm10, %zmm4 + vpxorq %zmm5, %zmm1, %zmm1 + vmovdqu64 %zmm1, 64(%rdx) + vpsrlq $48, %zmm5, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $16, %zmm5, %zmm5 + vpternlogq $0x96, %zmm9, %zmm10, %zmm5 + vpxorq %zmm6, %zmm2, %zmm2 + vmovdqu64 %zmm2, 128(%rdx) + vpsrlq $48, %zmm6, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $16, %zmm6, %zmm6 + vpternlogq $0x96, %zmm9, %zmm10, %zmm6 + vpxorq %zmm7, %zmm3, %zmm3 + vmovdqu64 %zmm3, 192(%rdx) + vpsrlq $48, %zmm7, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $16, %zmm7, %zmm7 + vpternlogq $0x96, %zmm9, %zmm10, %zmm7 + addl $0x100, %r12d + cmpl %r11d, %r12d + jl L_AES_XTS_decrypt_update_avx512_dec_256 + vextracti32x4 $0x00, %zmm4, %xmm8 
+L_AES_XTS_decrypt_update_avx512_done_256: + cmpl %eax, %r12d + movl %eax, %r11d + je L_AES_XTS_decrypt_update_avx512_done_dec + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_update_avx512_mul16_128 + subl $16, %r11d + subl %r12d, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_update_avx512_last_31_start + addl %r12d, %r11d +L_AES_XTS_decrypt_update_avx512_mul16_128: + andl $0xffffff80, %r11d + cmpl %r11d, %r12d + je L_AES_XTS_decrypt_update_avx512_done_128 + # 128 bytes of input + # aes_dec_128 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu64 (%rcx), %zmm0 + vmovdqu64 64(%rcx), %zmm1 + vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 + vpsrlvq %zmm15, %zmm5, %zmm6 + vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7 + vpslldq $8, %zmm6, %zmm6 + vpsllvq %zmm14, %zmm5, %zmm4 + vpternlogq $0x96, %zmm6, %zmm7, %zmm4 + vpsrlq $60, %zmm4, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $4, %zmm4, %zmm5 + vpternlogq $0x96, %zmm9, %zmm10, %zmm5 + # aes_dec_block + vpternlogq $0x96, %zmm4, %zmm16, %zmm0 + vpternlogq $0x96, %zmm5, %zmm16, %zmm1 + vaesdec %zmm17, %zmm0, %zmm0 + vaesdec %zmm17, %zmm1, %zmm1 + vaesdec %zmm18, %zmm0, %zmm0 + vaesdec %zmm18, %zmm1, %zmm1 + vaesdec %zmm19, %zmm0, %zmm0 + vaesdec %zmm19, %zmm1, %zmm1 + vaesdec %zmm20, %zmm0, %zmm0 + vaesdec %zmm20, %zmm1, %zmm1 + vaesdec %zmm21, %zmm0, %zmm0 + vaesdec %zmm21, %zmm1, %zmm1 + vaesdec %zmm22, %zmm0, %zmm0 + vaesdec %zmm22, %zmm1, %zmm1 + vaesdec %zmm23, %zmm0, %zmm0 + vaesdec %zmm23, %zmm1, %zmm1 + vaesdec %zmm24, %zmm0, %zmm0 + vaesdec %zmm24, %zmm1, %zmm1 + vaesdec %zmm25, %zmm0, %zmm0 + vaesdec %zmm25, %zmm1, %zmm1 + cmpl $11, %r9d + vmovdqa64 %zmm26, %zmm9 + jl L_AES_XTS_decrypt_update_avx512_aes_dec_128_aes_dec_block_last + vaesdec %zmm26, %zmm0, %zmm0 + vaesdec %zmm26, %zmm1, %zmm1 + vaesdec %zmm27, %zmm0, %zmm0 + vaesdec %zmm27, %zmm1, %zmm1 + cmpl $13, %r9d + vmovdqa64 %zmm28, %zmm9 + jl 
L_AES_XTS_decrypt_update_avx512_aes_dec_128_aes_dec_block_last + vaesdec %zmm28, %zmm0, %zmm0 + vaesdec %zmm28, %zmm1, %zmm1 + vaesdec %zmm29, %zmm0, %zmm0 + vaesdec %zmm29, %zmm1, %zmm1 + vmovdqa64 %zmm30, %zmm9 +L_AES_XTS_decrypt_update_avx512_aes_dec_128_aes_dec_block_last: + vaesdeclast %zmm9, %zmm0, %zmm0 + vaesdeclast %zmm9, %zmm1, %zmm1 + vpxorq %zmm4, %zmm0, %zmm0 + vmovdqu64 %zmm0, (%rdx) + vpxorq %zmm5, %zmm1, %zmm1 + vmovdqu64 %zmm1, 64(%rdx) + vextracti32x4 $3, %zmm5, %xmm8 + vpshufd $19, %xmm8, %xmm9 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm9, %xmm9 + vpternlogd $0x78, %xmm12, %xmm9, %xmm8 + addl $0x80, %r12d +L_AES_XTS_decrypt_update_avx512_done_128: + cmpl %eax, %r12d + movl %eax, %r11d + je L_AES_XTS_decrypt_update_avx512_done_dec + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_update_avx512_mul16_64 + subl $16, %r11d + subl %r12d, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_update_avx512_last_31_start + addl %r12d, %r11d +L_AES_XTS_decrypt_update_avx512_mul16_64: + andl $0xffffffc0, %r11d + cmpl %r11d, %r12d + je L_AES_XTS_decrypt_update_avx512_done_64 + # 64 bytes of input + # aes_dec_64 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu64 (%rcx), %zmm0 + vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 + vpsrlvq %zmm15, %zmm5, %zmm6 + vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7 + vpslldq $8, %zmm6, %zmm6 + vpsllvq %zmm14, %zmm5, %zmm4 + vpternlogq $0x96, %zmm6, %zmm7, %zmm4 + # aes_dec_block + vpternlogq $0x96, %zmm4, %zmm16, %zmm0 + vaesdec %zmm17, %zmm0, %zmm0 + vaesdec %zmm18, %zmm0, %zmm0 + vaesdec %zmm19, %zmm0, %zmm0 + vaesdec %zmm20, %zmm0, %zmm0 + vaesdec %zmm21, %zmm0, %zmm0 + vaesdec %zmm22, %zmm0, %zmm0 + vaesdec %zmm23, %zmm0, %zmm0 + vaesdec %zmm24, %zmm0, %zmm0 + vaesdec %zmm25, %zmm0, %zmm0 + cmpl $11, %r9d + vmovdqa64 %zmm26, %zmm9 + jl L_AES_XTS_decrypt_update_avx512_aes_dec_64_aes_dec_block_last + vaesdec %zmm26, %zmm0, %zmm0 + vaesdec %zmm27, %zmm0, %zmm0 + cmpl $13, %r9d + vmovdqa64 %zmm28, %zmm9 + jl 
L_AES_XTS_decrypt_update_avx512_aes_dec_64_aes_dec_block_last + vaesdec %zmm28, %zmm0, %zmm0 + vaesdec %zmm29, %zmm0, %zmm0 + vmovdqa64 %zmm30, %zmm9 +L_AES_XTS_decrypt_update_avx512_aes_dec_64_aes_dec_block_last: + vaesdeclast %zmm9, %zmm0, %zmm0 + vpxorq %zmm4, %zmm0, %zmm0 + vmovdqu64 %zmm0, (%rdx) + vextracti32x4 $3, %zmm4, %xmm8 + vpshufd $19, %xmm8, %xmm9 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm9, %xmm9 + vpternlogd $0x78, %xmm12, %xmm9, %xmm8 + addl $0x40, %r12d +L_AES_XTS_decrypt_update_avx512_done_64: + cmpl %eax, %r12d + movl %eax, %r11d + je L_AES_XTS_decrypt_update_avx512_done_dec + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_update_avx512_mul16_32 + subl $16, %r11d + subl %r12d, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_update_avx512_last_31_start + addl %r12d, %r11d +L_AES_XTS_decrypt_update_avx512_mul16_32: + andl $0xffffffe0, %r11d + cmpl %r11d, %r12d + je L_AES_XTS_decrypt_update_avx512_done_32 + # 32 bytes of input + # aes_dec_32 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu64 (%rcx), %ymm0 + vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 + vpsrlvq %zmm15, %zmm5, %zmm6 + vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7 + vpslldq $8, %zmm6, %zmm6 + vpsllvq %zmm14, %zmm5, %zmm4 + vpternlogq $0x96, %zmm6, %zmm7, %zmm4 + # aes_dec_block + vpternlogq $0x96, %ymm4, %ymm16, %ymm0 + vaesdec %ymm17, %ymm0, %ymm0 + vaesdec %ymm18, %ymm0, %ymm0 + vaesdec %ymm19, %ymm0, %ymm0 + vaesdec %ymm20, %ymm0, %ymm0 + vaesdec %ymm21, %ymm0, %ymm0 + vaesdec %ymm22, %ymm0, %ymm0 + vaesdec %ymm23, %ymm0, %ymm0 + vaesdec %ymm24, %ymm0, %ymm0 + vaesdec %ymm25, %ymm0, %ymm0 + cmpl $11, %r9d + vmovdqa64 %ymm26, %ymm9 + jl L_AES_XTS_decrypt_update_avx512_aes_dec_32_aes_dec_block_last + vaesdec %ymm26, %ymm0, %ymm0 + vaesdec %ymm27, %ymm0, %ymm0 + cmpl $13, %r9d + vmovdqa64 %ymm28, %ymm9 + jl L_AES_XTS_decrypt_update_avx512_aes_dec_32_aes_dec_block_last + vaesdec %ymm28, %ymm0, %ymm0 + vaesdec %ymm29, %ymm0, %ymm0 + vmovdqa64 %ymm30, %ymm9 
+L_AES_XTS_decrypt_update_avx512_aes_dec_32_aes_dec_block_last: + vaesdeclast %ymm9, %ymm0, %ymm0 + vpxorq %ymm4, %ymm0, %ymm0 + vmovdqu64 %ymm0, (%rdx) + vextracti32x4 $2, %zmm4, %xmm8 + addl $32, %r12d +L_AES_XTS_decrypt_update_avx512_done_32: + cmpl %eax, %r12d + movl %eax, %r11d + je L_AES_XTS_decrypt_update_avx512_done_dec + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_update_avx512_mul16 + subl $16, %r11d + subl %r12d, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_update_avx512_last_31_start + addl %r12d, %r11d +L_AES_XTS_decrypt_update_avx512_mul16: +L_AES_XTS_decrypt_update_avx512_dec_16: + # 16 bytes of input + leaq (%rdi,%r12,1), %rcx + vmovdqu (%rcx), %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + # aes_dec_block + vpxor (%r10), %xmm0, %xmm0 + vmovdqu 16(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + cmpl $11, %r9d + vmovdqu 160(%r10), %xmm5 + jl L_AES_XTS_decrypt_update_avx512_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r10), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + cmpl $13, %r9d + vmovdqu 192(%r10), %xmm5 + jl L_AES_XTS_decrypt_update_avx512_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r10), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r10), %xmm5 +L_AES_XTS_decrypt_update_avx512_aes_dec_block_last: + vaesdeclast %xmm5, %xmm0, %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + leaq (%rsi,%r12,1), %rcx + vmovdqu %xmm0, (%rcx) + vpshufd $19, %xmm8, %xmm4 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm4, %xmm4 + vpternlogd $0x78, %xmm12, %xmm4, %xmm8 + addl $16, %r12d + cmpl 
%r11d, %r12d + jl L_AES_XTS_decrypt_update_avx512_dec_16 + cmpl %eax, %r12d + je L_AES_XTS_decrypt_update_avx512_done_dec +L_AES_XTS_decrypt_update_avx512_last_31_start: + vpshufd $19, %xmm8, %xmm4 + vpaddq %xmm8, %xmm8, %xmm7 + vpsrad $31, %xmm4, %xmm4 + vpternlogd $0x78, %xmm12, %xmm4, %xmm7 + leaq (%rdi,%r12,1), %rcx + vmovdqu (%rcx), %xmm0 + vpxor %xmm7, %xmm0, %xmm0 + # aes_dec_block + vpxor (%r10), %xmm0, %xmm0 + vmovdqu 16(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + cmpl $11, %r9d + vmovdqu 160(%r10), %xmm5 + jl L_AES_XTS_decrypt_update_avx512_last_31_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r10), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + cmpl $13, %r9d + vmovdqu 192(%r10), %xmm5 + jl L_AES_XTS_decrypt_update_avx512_last_31_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r10), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r10), %xmm5 +L_AES_XTS_decrypt_update_avx512_last_31_aes_dec_block_last: + vaesdeclast %xmm5, %xmm0, %xmm0 + vpxor %xmm7, %xmm0, %xmm0 + vmovdqu %xmm0, (%rsp) + addq $16, %r12 + xorq %rdx, %rdx +L_AES_XTS_decrypt_update_avx512_last_31_byte_loop: + movb (%rsp,%rdx,1), %r11b + movb (%rdi,%r12,1), %cl + movb %r11b, (%rsi,%r12,1) + movb %cl, (%rsp,%rdx,1) + incl %r12d + incl %edx + cmpl %eax, %r12d + jl L_AES_XTS_decrypt_update_avx512_last_31_byte_loop + subq %rdx, %r12 + vmovdqu (%rsp), %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + # aes_dec_block + vpxor (%r10), %xmm0, %xmm0 + vmovdqu 16(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r10), %xmm5 + vaesdec 
%xmm5, %xmm0, %xmm0 + vmovdqu 48(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + cmpl $11, %r9d + vmovdqu 160(%r10), %xmm5 + jl L_AES_XTS_decrypt_update_avx512_last_31_2_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r10), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + cmpl $13, %r9d + vmovdqu 192(%r10), %xmm5 + jl L_AES_XTS_decrypt_update_avx512_last_31_2_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r10), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r10), %xmm5 +L_AES_XTS_decrypt_update_avx512_last_31_2_aes_dec_block_last: + vaesdeclast %xmm5, %xmm0, %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + subq $16, %r12 + leaq (%rsi,%r12,1), %rcx + vmovdqu %xmm0, (%rcx) +L_AES_XTS_decrypt_update_avx512_done_dec: + vmovdqu %xmm8, (%r8) + addq $0x40, %rsp + popq %r12 + repz retq +#ifndef __APPLE__ +.size AES_XTS_decrypt_update_avx512,.-AES_XTS_decrypt_update_avx512 +#endif /* __APPLE__ */ +#endif /* HAVE_INTEL_AVX512 */ #endif /* WOLFSSL_X86_64_BUILD */ #endif /* WOLFSSL_AES_XTS */ diff --git a/wolfcrypt/src/aes_xts_asm.asm b/wolfcrypt/src/aes_xts_asm.asm index b0e5cebf316..a904ffa4ce7 100644 --- a/wolfcrypt/src/aes_xts_asm.asm +++ b/wolfcrypt/src/aes_xts_asm.asm @@ -2831,4 +2831,4472 @@ L_AES_XTS_decrypt_update_avx1_done_dec: AES_XTS_decrypt_update_avx1 ENDP _TEXT ENDS ENDIF +IFDEF HAVE_INTEL_VAES +_TEXT SEGMENT READONLY PARA +AES_XTS_init_vaes PROC + vmovdqu xmm0, OWORD PTR [rcx] + ; aes_enc_block + vpxor xmm0, xmm0, [rdx] + vmovdqu xmm2, OWORD PTR [rdx+16] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rdx+32] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rdx+48] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, 
OWORD PTR [rdx+64] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rdx+80] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rdx+96] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rdx+112] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rdx+128] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rdx+144] + vaesenc xmm0, xmm0, xmm2 + cmp r8d, 11 + vmovdqu xmm2, OWORD PTR [rdx+160] + jl L_AES_XTS_init_vaes_tweak_aes_enc_block_last + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm3, OWORD PTR [rdx+176] + vaesenc xmm0, xmm0, xmm3 + cmp r8d, 13 + vmovdqu xmm2, OWORD PTR [rdx+192] + jl L_AES_XTS_init_vaes_tweak_aes_enc_block_last + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm3, OWORD PTR [rdx+208] + vaesenc xmm0, xmm0, xmm3 + vmovdqu xmm2, OWORD PTR [rdx+224] +L_AES_XTS_init_vaes_tweak_aes_enc_block_last: + vaesenclast xmm0, xmm0, xmm2 + vmovdqu OWORD PTR [rcx], xmm0 + ret +AES_XTS_init_vaes ENDP +_TEXT ENDS +_DATA SEGMENT +ALIGN 16 +L_vaes_aes_xts_gc_xts DWORD \ + 00000087h, 00000000h, 00000001h, 00000000h +ptr_L_vaes_aes_xts_gc_xts QWORD L_vaes_aes_xts_gc_xts +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_vaes_aes_xts_poly DWORD \ + 00000087h, 00000000h, 00000000h, 00000000h +ptr_L_vaes_aes_xts_poly QWORD L_vaes_aes_xts_poly +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_vaes_aes_xts_shl DWORD \ + 00000000h, 00000000h, 00000000h, 00000000h, + 00000001h, 00000000h, 00000001h, 00000000h +ptr_L_vaes_aes_xts_shl QWORD L_vaes_aes_xts_shl +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_vaes_aes_xts_shr DWORD \ + 00000040h, 00000000h, 00000040h, 00000000h, + 0000003fh, 00000000h, 0000003fh, 00000000h +ptr_L_vaes_aes_xts_shr QWORD L_vaes_aes_xts_shr +_DATA ENDS +_TEXT SEGMENT READONLY PARA +AES_XTS_encrypt_vaes PROC + push rdi + push rsi + push r12 + push r13 + mov rdi, rcx + mov rsi, rdx + mov rax, r8 + mov r12, r9 + mov r8, QWORD PTR [rsp+72] + mov r9, QWORD PTR [rsp+80] + mov r10d, DWORD PTR [rsp+88] + sub rsp, 224 + vmovdqu OWORD PTR [rsp+64], xmm6 + vmovdqu OWORD PTR [rsp+80], xmm7 
+ vmovdqu OWORD PTR [rsp+96], xmm8 + vmovdqu OWORD PTR [rsp+112], xmm9 + vmovdqu OWORD PTR [rsp+128], xmm10 + vmovdqu OWORD PTR [rsp+144], xmm11 + vmovdqu OWORD PTR [rsp+160], xmm12 + vmovdqu OWORD PTR [rsp+176], xmm13 + vmovdqu OWORD PTR [rsp+192], xmm14 + vmovdqu OWORD PTR [rsp+208], xmm15 + vmovdqu xmm12, OWORD PTR L_vaes_aes_xts_gc_xts + vbroadcasti128 ymm13, ptr_L_vaes_aes_xts_poly + vmovdqu ymm14, YMMWORD PTR L_vaes_aes_xts_shl + vmovdqu ymm15, YMMWORD PTR L_vaes_aes_xts_shr + vmovdqu xmm8, OWORD PTR [r12] + ; aes_enc_block + vpxor xmm8, xmm8, [r9] + vmovdqu xmm5, OWORD PTR [r9+16] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+32] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+48] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+64] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+80] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+96] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+112] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+128] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+144] + vaesenc xmm8, xmm8, xmm5 + cmp r10d, 11 + vmovdqu xmm5, OWORD PTR [r9+160] + jl L_AES_XTS_encrypt_vaes_tweak_aes_enc_block_last + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm6, OWORD PTR [r9+176] + vaesenc xmm8, xmm8, xmm6 + cmp r10d, 13 + vmovdqu xmm5, OWORD PTR [r9+192] + jl L_AES_XTS_encrypt_vaes_tweak_aes_enc_block_last + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm6, OWORD PTR [r9+208] + vaesenc xmm8, xmm8, xmm6 + vmovdqu xmm5, OWORD PTR [r9+224] +L_AES_XTS_encrypt_vaes_tweak_aes_enc_block_last: + vaesenclast xmm8, xmm8, xmm5 + xor r13d, r13d + cmp eax, 32 + jl L_AES_XTS_encrypt_vaes_done_128 + cmp eax, 128 + mov r11d, eax + jl L_AES_XTS_encrypt_vaes_done_128 + and r11d, 4294967168 + vperm2i128 ymm5, ymm8, ymm8, 0 + vpsrlvq ymm6, ymm5, ymm15 + vpclmulqdq ymm7, ymm6, ymm13, 1 + vpslldq ymm6, ymm6, 8 + vpsllvq ymm4, ymm5, ymm14 + vpxor ymm4, ymm4, ymm7 + vpxor ymm4, ymm4, ymm6 + vpsrlq ymm9, ymm4, 62 + 
vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm5, ymm4, 2 + vpxor ymm5, ymm5, ymm10 + vpxor ymm5, ymm5, ymm9 + vpsrlq ymm9, ymm5, 62 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm6, ymm5, 2 + vpxor ymm6, ymm6, ymm10 + vpxor ymm6, ymm6, ymm9 + vpsrlq ymm9, ymm6, 62 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm7, ymm6, 2 + vpxor ymm7, ymm7, ymm10 + vpxor ymm7, ymm7, ymm9 +L_AES_XTS_encrypt_vaes_enc_128: + ; 128 bytes of input + ; aes_enc_128 + lea rcx, QWORD PTR [rdi+r13] + lea rdx, QWORD PTR [rsi+r13] + vmovdqu ymm0, YMMWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+32] + vmovdqu ymm2, YMMWORD PTR [rcx+64] + vmovdqu ymm3, YMMWORD PTR [rcx+96] + ; aes_enc_block + vbroadcasti128 ymm9, [r8] + vpxor ymm0, ymm0, ymm4 + vpxor ymm0, ymm0, ymm9 + vpxor ymm1, ymm1, ymm5 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm6 + vpxor ymm2, ymm2, ymm9 + vpxor ymm3, ymm3, ymm7 + vpxor ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+16] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vaesenc ymm2, ymm2, ymm9 + vaesenc ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+32] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vaesenc ymm2, ymm2, ymm9 + vaesenc ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+48] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vaesenc ymm2, ymm2, ymm9 + vaesenc ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+64] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vaesenc ymm2, ymm2, ymm9 + vaesenc ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+80] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vaesenc ymm2, ymm2, ymm9 + vaesenc ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+96] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vaesenc ymm2, ymm2, ymm9 + vaesenc ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+112] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vaesenc ymm2, ymm2, ymm9 + vaesenc ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+128] + vaesenc ymm0, ymm0, 
ymm9 + vaesenc ymm1, ymm1, ymm9 + vaesenc ymm2, ymm2, ymm9 + vaesenc ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+144] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vaesenc ymm2, ymm2, ymm9 + vaesenc ymm3, ymm3, ymm9 + cmp r10d, 11 + vbroadcasti128 ymm9, [r8+160] + jl L_AES_XTS_encrypt_vaes_aes_enc_128_aes_enc_block_last + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vaesenc ymm2, ymm2, ymm9 + vaesenc ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+176] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vaesenc ymm2, ymm2, ymm9 + vaesenc ymm3, ymm3, ymm9 + cmp r10d, 13 + vbroadcasti128 ymm9, [r8+192] + jl L_AES_XTS_encrypt_vaes_aes_enc_128_aes_enc_block_last + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vaesenc ymm2, ymm2, ymm9 + vaesenc ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+208] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vaesenc ymm2, ymm2, ymm9 + vaesenc ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+224] +L_AES_XTS_encrypt_vaes_aes_enc_128_aes_enc_block_last: + vaesenclast ymm0, ymm0, ymm9 + vaesenclast ymm1, ymm1, ymm9 + vaesenclast ymm2, ymm2, ymm9 + vaesenclast ymm3, ymm3, ymm9 + vpxor ymm0, ymm0, ymm4 + vmovdqu YMMWORD PTR [rdx], ymm0 + vpsrlq ymm9, ymm4, 56 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm4, ymm4, 8 + vpxor ymm4, ymm4, ymm10 + vpxor ymm4, ymm4, ymm9 + vpxor ymm1, ymm1, ymm5 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vpsrlq ymm9, ymm5, 56 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm5, ymm5, 8 + vpxor ymm5, ymm5, ymm10 + vpxor ymm5, ymm5, ymm9 + vpxor ymm2, ymm2, ymm6 + vmovdqu YMMWORD PTR [rdx+64], ymm2 + vpsrlq ymm9, ymm6, 56 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm6, ymm6, 8 + vpxor ymm6, ymm6, ymm10 + vpxor ymm6, ymm6, ymm9 + vpxor ymm3, ymm3, ymm7 + vmovdqu YMMWORD PTR [rdx+96], ymm3 + vpsrlq ymm9, ymm7, 56 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm7, ymm7, 8 + vpxor ymm7, ymm7, ymm10 + 
vpxor ymm7, ymm7, ymm9 + add r13d, 128 + cmp r13d, r11d + jl L_AES_XTS_encrypt_vaes_enc_128 + vextracti128 xmm8, ymm4, 0 +L_AES_XTS_encrypt_vaes_done_128: + mov r11d, eax + and r11d, 4294967232 + cmp r13d, r11d + je L_AES_XTS_encrypt_vaes_done_64 + ; 64 bytes of input + ; aes_enc_64 + lea rcx, QWORD PTR [rdi+r13] + lea rdx, QWORD PTR [rsi+r13] + vmovdqu ymm0, YMMWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+32] + vperm2i128 ymm5, ymm8, ymm8, 0 + vpsrlvq ymm6, ymm5, ymm15 + vpclmulqdq ymm7, ymm6, ymm13, 1 + vpslldq ymm6, ymm6, 8 + vpsllvq ymm4, ymm5, ymm14 + vpxor ymm4, ymm4, ymm7 + vpxor ymm4, ymm4, ymm6 + vpsrlq ymm9, ymm4, 62 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm5, ymm4, 2 + vpxor ymm5, ymm5, ymm10 + vpxor ymm5, ymm5, ymm9 + ; aes_enc_block + vbroadcasti128 ymm9, [r8] + vpxor ymm0, ymm0, ymm4 + vpxor ymm0, ymm0, ymm9 + vpxor ymm1, ymm1, ymm5 + vpxor ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+16] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+32] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+48] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+64] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+80] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+96] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+112] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+128] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+144] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + cmp r10d, 11 + vbroadcasti128 ymm9, [r8+160] + jl L_AES_XTS_encrypt_vaes_aes_enc_64_aes_enc_block_last + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+176] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + cmp r10d, 13 + vbroadcasti128 ymm9, [r8+192] + jl 
L_AES_XTS_encrypt_vaes_aes_enc_64_aes_enc_block_last + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+208] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+224] +L_AES_XTS_encrypt_vaes_aes_enc_64_aes_enc_block_last: + vaesenclast ymm0, ymm0, ymm9 + vaesenclast ymm1, ymm1, ymm9 + vpxor ymm0, ymm0, ymm4 + vmovdqu YMMWORD PTR [rdx], ymm0 + vpxor ymm1, ymm1, ymm5 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vextracti128 xmm8, ymm5, 1 + vpshufd xmm9, xmm8, 19 + vpaddq xmm8, xmm8, xmm8 + vpsrad xmm9, xmm9, 31 + vpand xmm9, xmm9, xmm12 + vpxor xmm8, xmm8, xmm9 + add r13d, 64 +L_AES_XTS_encrypt_vaes_done_64: + mov r11d, eax + and r11d, 4294967264 + cmp r13d, r11d + je L_AES_XTS_encrypt_vaes_done_32 + ; 32 bytes of input + ; aes_enc_32 + lea rcx, QWORD PTR [rdi+r13] + lea rdx, QWORD PTR [rsi+r13] + vmovdqu ymm0, YMMWORD PTR [rcx] + vperm2i128 ymm5, ymm8, ymm8, 0 + vpsrlvq ymm6, ymm5, ymm15 + vpclmulqdq ymm7, ymm6, ymm13, 1 + vpslldq ymm6, ymm6, 8 + vpsllvq ymm4, ymm5, ymm14 + vpxor ymm4, ymm4, ymm7 + vpxor ymm4, ymm4, ymm6 + ; aes_enc_block + vbroadcasti128 ymm9, [r8] + vpxor ymm0, ymm0, ymm4 + vpxor ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+16] + vaesenc ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+32] + vaesenc ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+48] + vaesenc ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+64] + vaesenc ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+80] + vaesenc ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+96] + vaesenc ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+112] + vaesenc ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+128] + vaesenc ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+144] + vaesenc ymm0, ymm0, ymm9 + cmp r10d, 11 + vbroadcasti128 ymm9, [r8+160] + jl L_AES_XTS_encrypt_vaes_aes_enc_32_aes_enc_block_last + vaesenc ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+176] + vaesenc ymm0, ymm0, ymm9 + cmp r10d, 13 + vbroadcasti128 ymm9, [r8+192] + jl 
L_AES_XTS_encrypt_vaes_aes_enc_32_aes_enc_block_last + vaesenc ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+208] + vaesenc ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+224] +L_AES_XTS_encrypt_vaes_aes_enc_32_aes_enc_block_last: + vaesenclast ymm0, ymm0, ymm9 + vpxor ymm0, ymm0, ymm4 + vmovdqu YMMWORD PTR [rdx], ymm0 + vextracti128 xmm8, ymm4, 1 + vpshufd xmm9, xmm8, 19 + vpaddq xmm8, xmm8, xmm8 + vpsrad xmm9, xmm9, 31 + vpand xmm9, xmm9, xmm12 + vpxor xmm8, xmm8, xmm9 + add r13d, 32 +L_AES_XTS_encrypt_vaes_done_32: + cmp r13d, eax + mov r11d, eax + je L_AES_XTS_encrypt_vaes_done_enc + sub r11d, r13d + cmp r11d, 16 + mov r11d, eax + jl L_AES_XTS_encrypt_vaes_last_15 + and r11d, 4294967280 + ; 16 bytes of input +L_AES_XTS_encrypt_vaes_enc_16: + lea rcx, QWORD PTR [rdi+r13] + vmovdqu xmm0, OWORD PTR [rcx] + vpxor xmm0, xmm0, xmm8 + ; aes_enc_block + vpxor xmm0, xmm0, [r8] + vmovdqu xmm5, OWORD PTR [r8+16] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+32] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+48] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+64] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+80] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+96] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+112] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+128] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+144] + vaesenc xmm0, xmm0, xmm5 + cmp r10d, 11 + vmovdqu xmm5, OWORD PTR [r8+160] + jl L_AES_XTS_encrypt_vaes_aes_enc_block_last + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r8+176] + vaesenc xmm0, xmm0, xmm6 + cmp r10d, 13 + vmovdqu xmm5, OWORD PTR [r8+192] + jl L_AES_XTS_encrypt_vaes_aes_enc_block_last + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r8+208] + vaesenc xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r8+224] +L_AES_XTS_encrypt_vaes_aes_enc_block_last: + vaesenclast xmm0, xmm0, xmm5 + vpxor xmm0, xmm0, xmm8 + lea rcx, QWORD PTR [rsi+r13] + vmovdqu OWORD PTR [rcx], 
xmm0 + vpshufd xmm4, xmm8, 19 + vpaddq xmm8, xmm8, xmm8 + vpsrad xmm4, xmm4, 31 + vpand xmm4, xmm4, xmm12 + vpxor xmm8, xmm8, xmm4 + add r13d, 16 + cmp r13d, r11d + jl L_AES_XTS_encrypt_vaes_enc_16 + cmp r13d, eax + je L_AES_XTS_encrypt_vaes_done_enc +L_AES_XTS_encrypt_vaes_last_15: + sub r13, 16 + lea rcx, QWORD PTR [rsi+r13] + vmovdqu xmm0, OWORD PTR [rcx] + add r13, 16 + vmovdqu OWORD PTR [rsp], xmm0 + xor rdx, rdx +L_AES_XTS_encrypt_vaes_last_15_byte_loop: + mov r11b, BYTE PTR [rsp+rdx] + mov cl, BYTE PTR [rdi+r13] + mov BYTE PTR [rsi+r13], r11b + mov BYTE PTR [rsp+rdx], cl + inc r13d + inc edx + cmp r13d, eax + jl L_AES_XTS_encrypt_vaes_last_15_byte_loop + sub r13, rdx + vmovdqu xmm0, OWORD PTR [rsp] + sub r13, 16 + vpxor xmm0, xmm0, xmm8 + ; aes_enc_block + vpxor xmm0, xmm0, [r8] + vmovdqu xmm5, OWORD PTR [r8+16] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+32] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+48] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+64] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+80] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+96] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+112] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+128] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+144] + vaesenc xmm0, xmm0, xmm5 + cmp r10d, 11 + vmovdqu xmm5, OWORD PTR [r8+160] + jl L_AES_XTS_encrypt_vaes_last_15_aes_enc_block_last + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r8+176] + vaesenc xmm0, xmm0, xmm6 + cmp r10d, 13 + vmovdqu xmm5, OWORD PTR [r8+192] + jl L_AES_XTS_encrypt_vaes_last_15_aes_enc_block_last + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r8+208] + vaesenc xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r8+224] +L_AES_XTS_encrypt_vaes_last_15_aes_enc_block_last: + vaesenclast xmm0, xmm0, xmm5 + vpxor xmm0, xmm0, xmm8 + lea rcx, QWORD PTR [rsi+r13] + vmovdqu OWORD PTR [rcx], xmm0 +L_AES_XTS_encrypt_vaes_done_enc: + vmovdqu xmm6, OWORD 
PTR [rsp+64] + vmovdqu xmm7, OWORD PTR [rsp+80] + vmovdqu xmm8, OWORD PTR [rsp+96] + vmovdqu xmm9, OWORD PTR [rsp+112] + vmovdqu xmm10, OWORD PTR [rsp+128] + vmovdqu xmm11, OWORD PTR [rsp+144] + vmovdqu xmm12, OWORD PTR [rsp+160] + vmovdqu xmm13, OWORD PTR [rsp+176] + vmovdqu xmm14, OWORD PTR [rsp+192] + vmovdqu xmm15, OWORD PTR [rsp+208] + add rsp, 224 + pop r13 + pop r12 + pop rsi + pop rdi + ret +AES_XTS_encrypt_vaes ENDP +_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_XTS_encrypt_update_vaes PROC  ; XTS-style encrypt: args in rcx, rdx, r8, r9 plus two stack slots; the 16-byte tweak is loaded from, and written back to, the buffer named by the 5th argument
+        push rdi  ; rdi, rsi and r12 are overwritten below, so they are preserved for the caller
+        push rsi
+        push r12
+        mov rdi, rcx  ; rdi = input base (read as [rdi+r12])
+        mov rsi, rdx  ; rsi = output base (written as [rsi+r12])
+        mov rax, r8  ; eax = total byte count
+        mov r10, r9  ; r10 = round-key base, stepped in 16-byte strides
+        mov r8, QWORD PTR [rsp+64]  ; 5th argument: tweak pointer (3 pushes + return address + 32-byte home space = 64)
+        mov r9d, DWORD PTR [rsp+72]  ; 6th argument: round count, compared with 11 and 13 below
+        sub rsp, 224  ; [rsp..rsp+15] is scratch for the partial-block tail; [rsp+64..rsp+223] saves xmm6-xmm15
+        vmovdqu OWORD PTR [rsp+64], xmm6
+        vmovdqu OWORD PTR [rsp+80], xmm7
+        vmovdqu OWORD PTR [rsp+96], xmm8
+        vmovdqu OWORD PTR [rsp+112], xmm9
+        vmovdqu OWORD PTR [rsp+128], xmm10
+        vmovdqu OWORD PTR [rsp+144], xmm11
+        vmovdqu OWORD PTR [rsp+160], xmm12
+        vmovdqu OWORD PTR [rsp+176], xmm13
+        vmovdqu OWORD PTR [rsp+192], xmm14
+        vmovdqu OWORD PTR [rsp+208], xmm15
+        vmovdqu xmm12, OWORD PTR L_vaes_aes_xts_gc_xts  ; xmm12 = AND mask used by the one-block tweak step (constant defined outside this chunk)
+        vbroadcasti128 ymm13, ptr_L_vaes_aes_xts_poly  ; ymm13 = carry-less-multiply operand for the wide tweak steps; presumably the reduction polynomial - defined outside this chunk
+        vmovdqu ymm14, YMMWORD PTR L_vaes_aes_xts_shl  ; ymm14 = per-qword left-shift counts for vpsllvq
+        vmovdqu ymm15, YMMWORD PTR L_vaes_aes_xts_shr  ; ymm15 = per-qword right-shift counts for vpsrlvq
+        vmovdqu xmm8, OWORD PTR [r8]  ; xmm8 = running tweak, taken as-is from the caller's buffer
+        xor r12d, r12d  ; r12 = byte offset already processed
+        cmp eax, 32
+        jl L_AES_XTS_encrypt_update_vaes_done_128  ; under 32 bytes: the 64/32 checks below also fall through
+        cmp eax, 128
+        mov r11d, eax
+        jl L_AES_XTS_encrypt_update_vaes_done_128
+        and r11d, 4294967168  ; r11d = byte count rounded down to a multiple of 128 (mask 0xFFFFFF80)
+        vperm2i128 ymm5, ymm8, ymm8, 0  ; copy the tweak into both 128-bit lanes
+        vpsrlvq ymm6, ymm5, ymm15  ; bits that the per-lane left shift will push out of each qword
+        vpclmulqdq ymm7, ymm6, ymm13, 1  ; overflow of the high qword times ymm13, to be folded back in
+        vpslldq ymm6, ymm6, 8  ; overflow of the low qword moves up into the high qword
+        vpsllvq ymm4, ymm5, ymm14
+        vpxor ymm4, ymm4, ymm7
+        vpxor ymm4, ymm4, ymm6  ; ymm4 = tweaks for blocks 0-1 (presumably T and T*x; depends on the shift tables)
+        vpsrlq ymm9, ymm4, 62  ; same fold pattern with a fixed 2-bit shift
+        vpclmulqdq ymm10, ymm9, ymm13, 1
+        vpslldq ymm9, ymm9, 8
+        vpsllq ymm5, ymm4, 2
+        vpxor ymm5, ymm5, ymm10
+        vpxor ymm5, ymm5, ymm9  ; ymm5 = ymm4 advanced by two doublings: tweaks for blocks 2-3
+        vpsrlq ymm9, ymm5, 62
+        vpclmulqdq ymm10, ymm9, ymm13, 1
+        vpslldq ymm9, ymm9, 8
+        vpsllq ymm6, ymm5, 2
+        vpxor ymm6, ymm6, ymm10
+        vpxor ymm6, ymm6, ymm9  ; ymm6 = tweaks for blocks 4-5
+        vpsrlq ymm9, ymm6, 62
+        vpclmulqdq ymm10, ymm9, ymm13, 1
+        vpslldq ymm9, ymm9, 8
+        vpsllq ymm7, ymm6, 2
+        vpxor ymm7, ymm7, ymm10
+        vpxor ymm7, ymm7, ymm9  ; ymm7 = tweaks for blocks 6-7
+L_AES_XTS_encrypt_update_vaes_enc_128:
+        ; 128 bytes of input
+        ; aes_enc_128
+        lea rcx, QWORD PTR [rdi+r12]  ; rcx = current input position
+        lea rdx, QWORD PTR [rsi+r12]  ; rdx = current output position
+        vmovdqu ymm0, YMMWORD PTR [rcx]
+        vmovdqu ymm1, YMMWORD PTR [rcx+32]
+        vmovdqu ymm2, YMMWORD PTR [rcx+64]
+        vmovdqu ymm3, YMMWORD PTR [rcx+96]
+        ; aes_enc_block
+        vbroadcasti128 ymm9, [r10]  ; round key 0 replicated to both lanes
+        vpxor ymm0, ymm0, ymm4  ; pre-whiten each block pair with its tweaks, then add round key 0
+        vpxor ymm0, ymm0, ymm9
+        vpxor ymm1, ymm1, ymm5
+        vpxor ymm1, ymm1, ymm9
+        vpxor ymm2, ymm2, ymm6
+        vpxor ymm2, ymm2, ymm9
+        vpxor ymm3, ymm3, ymm7
+        vpxor ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [r10+16]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vaesenc ymm2, ymm2, ymm9
+        vaesenc ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [r10+32]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vaesenc ymm2, ymm2, ymm9
+        vaesenc ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [r10+48]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vaesenc ymm2, ymm2, ymm9
+        vaesenc ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [r10+64]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vaesenc ymm2, ymm2, ymm9
+        vaesenc ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [r10+80]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vaesenc ymm2, ymm2, ymm9
+        vaesenc ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [r10+96]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vaesenc ymm2, ymm2, ymm9
+        vaesenc ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [r10+112]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vaesenc ymm2, ymm2, ymm9
+        vaesenc ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [r10+128]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vaesenc ymm2, ymm2, ymm9
+        vaesenc ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [r10+144]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vaesenc ymm2, ymm2, ymm9
+        vaesenc ymm3, ymm3, ymm9
+        cmp r9d, 11  ; flags survive the vector load below (it does not touch EFLAGS)
+        vbroadcasti128 ymm9, [r10+160]
+        jl L_AES_XTS_encrypt_update_vaes_aes_enc_128_aes_enc_block_last  ; fewer than 11 rounds: [r10+160] is the final round key
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vaesenc ymm2, ymm2, ymm9
+        vaesenc ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [r10+176]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vaesenc ymm2, ymm2, ymm9
+        vaesenc ymm3, ymm3, ymm9
+        cmp r9d, 13
+        vbroadcasti128 ymm9, [r10+192]
+        jl L_AES_XTS_encrypt_update_vaes_aes_enc_128_aes_enc_block_last  ; fewer than 13 rounds: [r10+192] is the final round key
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vaesenc ymm2, ymm2, ymm9
+        vaesenc ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [r10+208]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vaesenc ymm2, ymm2, ymm9
+        vaesenc ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [r10+224]  ; otherwise the final round key is at +224
+L_AES_XTS_encrypt_update_vaes_aes_enc_128_aes_enc_block_last:
+        vaesenclast ymm0, ymm0, ymm9
+        vaesenclast ymm1, ymm1, ymm9
+        vaesenclast ymm2, ymm2, ymm9
+        vaesenclast ymm3, ymm3, ymm9
+        vpxor ymm0, ymm0, ymm4  ; post-whiten with the same tweaks and store
+        vmovdqu YMMWORD PTR [rdx], ymm0
+        vpsrlq ymm9, ymm4, 56  ; 8-bit shift with fold: moves this tweak pair 8 blocks ahead for the next iteration
+        vpclmulqdq ymm10, ymm9, ymm13, 1
+        vpslldq ymm9, ymm9, 8
+        vpsllq ymm4, ymm4, 8
+        vpxor ymm4, ymm4, ymm10
+        vpxor ymm4, ymm4, ymm9
+        vpxor ymm1, ymm1, ymm5
+        vmovdqu YMMWORD PTR [rdx+32], ymm1
+        vpsrlq ymm9, ymm5, 56
+        vpclmulqdq ymm10, ymm9, ymm13, 1
+        vpslldq ymm9, ymm9, 8
+        vpsllq ymm5, ymm5, 8
+        vpxor ymm5, ymm5, ymm10
+        vpxor ymm5, ymm5, ymm9
+        vpxor ymm2, ymm2, ymm6
+        vmovdqu YMMWORD PTR [rdx+64], ymm2
+        vpsrlq ymm9, ymm6, 56
+        vpclmulqdq ymm10, ymm9, ymm13, 1
+        vpslldq ymm9, ymm9, 8
+        vpsllq ymm6, ymm6, 8
+        vpxor ymm6, ymm6, ymm10
+        vpxor ymm6, ymm6, ymm9
+        vpxor ymm3, ymm3, ymm7
+        vmovdqu YMMWORD PTR [rdx+96], ymm3
+        vpsrlq ymm9, ymm7, 56
+        vpclmulqdq ymm10, ymm9, ymm13, 1
+        vpslldq ymm9, ymm9, 8
+        vpsllq ymm7, ymm7, 8
+        vpxor ymm7, ymm7, ymm10
+        vpxor ymm7, ymm7, ymm9
+        add r12d, 128
+        cmp r12d, r11d
+        jl L_AES_XTS_encrypt_update_vaes_enc_128  ; signed compare of 32-bit offsets
+        vextracti128 xmm8, ymm4, 0  ; low lane of the already-advanced ymm4 is the next unused tweak
+L_AES_XTS_encrypt_update_vaes_done_128:
+        mov r11d, eax
+        and r11d, 4294967232  ; byte count rounded down to a multiple of 64 (mask 0xFFFFFFC0)
+        cmp r12d, r11d
+        je L_AES_XTS_encrypt_update_vaes_done_64  ; no 64-byte chunk pending
+        ; 64 bytes of input
+        ; aes_enc_64
+        lea rcx, QWORD PTR [rdi+r12]
+        lea rdx, QWORD PTR [rsi+r12]
+        vmovdqu ymm0, YMMWORD PTR [rcx]
+        vmovdqu ymm1, YMMWORD PTR [rcx+32]
+        vperm2i128 ymm5, ymm8, ymm8, 0  ; rebuild the tweak pairs from xmm8, same sequence as before the 128-byte loop
+        vpsrlvq ymm6, ymm5, ymm15
+        vpclmulqdq ymm7, ymm6, ymm13, 1
+        vpslldq ymm6, ymm6, 8
+        vpsllvq ymm4, ymm5, ymm14
+        vpxor ymm4, ymm4, ymm7
+        vpxor ymm4, ymm4, ymm6  ; ymm4 = tweaks for blocks 0-1 of this chunk
+        vpsrlq ymm9, ymm4, 62
+        vpclmulqdq ymm10, ymm9, ymm13, 1
+        vpslldq ymm9, ymm9, 8
+        vpsllq ymm5, ymm4, 2
+        vpxor ymm5, ymm5, ymm10
+        vpxor ymm5, ymm5, ymm9  ; ymm5 = tweaks for blocks 2-3 of this chunk
+        ; aes_enc_block
+        vbroadcasti128 ymm9, [r10]
+        vpxor ymm0, ymm0, ymm4
+        vpxor ymm0, ymm0, ymm9
+        vpxor ymm1, ymm1, ymm5
+        vpxor ymm1, ymm1, ymm9
+        vbroadcasti128 ymm9, [r10+16]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vbroadcasti128 ymm9, [r10+32]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vbroadcasti128 ymm9, [r10+48]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vbroadcasti128 ymm9, [r10+64]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vbroadcasti128 ymm9, [r10+80]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vbroadcasti128 ymm9, [r10+96]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vbroadcasti128 ymm9, [r10+112]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vbroadcasti128 ymm9, [r10+128]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vbroadcasti128 ymm9, [r10+144]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        cmp r9d, 11
+        vbroadcasti128 ymm9, [r10+160]
+        jl L_AES_XTS_encrypt_update_vaes_aes_enc_64_aes_enc_block_last  ; same round-count dispatch as the 128-byte path
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vbroadcasti128 ymm9, [r10+176]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        cmp r9d, 13
+        vbroadcasti128 ymm9, [r10+192]
+        jl L_AES_XTS_encrypt_update_vaes_aes_enc_64_aes_enc_block_last
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vbroadcasti128 ymm9, [r10+208]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vbroadcasti128 ymm9, [r10+224]
+L_AES_XTS_encrypt_update_vaes_aes_enc_64_aes_enc_block_last:
+        vaesenclast ymm0, ymm0, ymm9
+        vaesenclast ymm1, ymm1, ymm9
+        vpxor ymm0, ymm0, ymm4
+        vmovdqu YMMWORD PTR [rdx], ymm0
+        vpxor ymm1, ymm1, ymm5
+        vmovdqu YMMWORD PTR [rdx+32], ymm1
+        vextracti128 xmm8, ymm5, 1  ; take the last tweak used (high lane of ymm5) ...
+        vpshufd xmm9, xmm8, 19  ; ... and advance it one step: dwords 3 and 1 are moved so their top bits (127 and 63) can become carries
+        vpaddq xmm8, xmm8, xmm8  ; each qword shifted left by one bit
+        vpsrad xmm9, xmm9, 31  ; spread those top bits across whole dwords
+        vpand xmm9, xmm9, xmm12  ; keep only the bits selected by the xmm12 mask
+        vpxor xmm8, xmm8, xmm9  ; xmm8 = next unused tweak
+        add r12d, 64
+L_AES_XTS_encrypt_update_vaes_done_64:
+        mov r11d, eax
+        and r11d, 4294967264  ; byte count rounded down to a multiple of 32 (mask 0xFFFFFFE0)
+        cmp r12d, r11d
+        je L_AES_XTS_encrypt_update_vaes_done_32  ; no 32-byte chunk pending
+        ; 32 bytes of input
+        ; aes_enc_32
+        lea rcx, QWORD PTR [rdi+r12]
+        lea rdx, QWORD PTR [rsi+r12]
+        vmovdqu ymm0, YMMWORD PTR [rcx]
+        vperm2i128 ymm5, ymm8, ymm8, 0
+        vpsrlvq ymm6, ymm5, ymm15
+        vpclmulqdq ymm7, ymm6, ymm13, 1
+        vpslldq ymm6, ymm6, 8
+        vpsllvq ymm4, ymm5, ymm14
+        vpxor ymm4, ymm4, ymm7
+        vpxor ymm4, ymm4, ymm6  ; ymm4 = tweaks for the two blocks of this chunk
+        ; aes_enc_block
+        vbroadcasti128 ymm9, [r10]
+        vpxor ymm0, ymm0, ymm4
+        vpxor ymm0, ymm0, ymm9
+        vbroadcasti128 ymm9, [r10+16]
+        vaesenc ymm0, ymm0, ymm9
+        vbroadcasti128 ymm9, [r10+32]
+        vaesenc ymm0, ymm0, ymm9
+        vbroadcasti128 ymm9, [r10+48]
+        vaesenc ymm0, ymm0, ymm9
+        vbroadcasti128 ymm9, [r10+64]
+        vaesenc ymm0, ymm0, ymm9
+        vbroadcasti128 ymm9, [r10+80]
+        vaesenc ymm0, ymm0, ymm9
+        vbroadcasti128 ymm9, [r10+96]
+        vaesenc ymm0, ymm0, ymm9
+        vbroadcasti128 ymm9, [r10+112]
+        vaesenc ymm0, ymm0, ymm9
+        vbroadcasti128 ymm9, [r10+128]
+        vaesenc ymm0, ymm0, ymm9
+        vbroadcasti128 ymm9, [r10+144]
+        vaesenc ymm0, ymm0, ymm9
+        cmp r9d, 11
+        vbroadcasti128 ymm9, [r10+160]
+        jl L_AES_XTS_encrypt_update_vaes_aes_enc_32_aes_enc_block_last
+        vaesenc ymm0, ymm0, ymm9
+        vbroadcasti128 ymm9, [r10+176]
+        vaesenc ymm0, ymm0, ymm9
+        cmp r9d, 13
+        vbroadcasti128 ymm9, [r10+192]
+        jl L_AES_XTS_encrypt_update_vaes_aes_enc_32_aes_enc_block_last
+        vaesenc ymm0, ymm0, ymm9
+        vbroadcasti128 ymm9, [r10+208]
+        vaesenc ymm0, ymm0, ymm9
+        vbroadcasti128 ymm9, [r10+224]
+L_AES_XTS_encrypt_update_vaes_aes_enc_32_aes_enc_block_last:
+        vaesenclast ymm0, ymm0, ymm9
+        vpxor ymm0, ymm0, ymm4
+        vmovdqu YMMWORD PTR [rdx], ymm0
+        vextracti128 xmm8, ymm4, 1  ; last tweak used, then one single-block advance as after the 64-byte chunk
+        vpshufd xmm9, xmm8, 19
+        vpaddq xmm8, xmm8, xmm8
+        vpsrad xmm9, xmm9, 31
+        vpand xmm9, xmm9, xmm12
+        vpxor xmm8, xmm8, xmm9
+        add r12d, 32
+L_AES_XTS_encrypt_update_vaes_done_32:
+        cmp r12d, eax
+        mov r11d, eax
+        je L_AES_XTS_encrypt_update_vaes_done_enc  ; everything consumed
+        sub r11d, r12d  ; bytes still to process
+        cmp r11d, 16
+        mov r11d, eax  ; mov leaves the flags from the cmp intact
+        jl L_AES_XTS_encrypt_update_vaes_last_15  ; only a partial block remains
+        and r11d, 4294967280  ; byte count rounded down to a multiple of 16 (mask 0xFFFFFFF0)
+        ; 16 bytes of input
+L_AES_XTS_encrypt_update_vaes_enc_16:
+        lea rcx, QWORD PTR [rdi+r12]
+        vmovdqu xmm0, OWORD PTR [rcx]
+        vpxor xmm0, xmm0, xmm8  ; pre-whiten with the tweak
+        ; aes_enc_block
+        vpxor xmm0, xmm0, [r10]
+        vmovdqu xmm5, OWORD PTR [r10+16]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+32]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+48]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+64]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+80]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+96]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+112]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+128]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+144]
+        vaesenc xmm0, xmm0, xmm5
+        cmp r9d, 11
+        vmovdqu xmm5, OWORD PTR [r10+160]
+        jl L_AES_XTS_encrypt_update_vaes_aes_enc_block_last
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [r10+176]
+        vaesenc xmm0, xmm0, xmm6
+        cmp r9d, 13
+        vmovdqu xmm5, OWORD PTR [r10+192]
+        jl L_AES_XTS_encrypt_update_vaes_aes_enc_block_last
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [r10+208]
+        vaesenc xmm0, xmm0, xmm6
+        vmovdqu xmm5, OWORD PTR [r10+224]
+L_AES_XTS_encrypt_update_vaes_aes_enc_block_last:
+        vaesenclast xmm0, xmm0, xmm5  ; xmm5 always holds the final round key on arrival here
+        vpxor xmm0, xmm0, xmm8  ; post-whiten with the tweak
+        lea rcx, QWORD PTR [rsi+r12]
+        vmovdqu OWORD PTR [rcx], xmm0
+        vpshufd xmm4, xmm8, 19  ; single-block tweak advance (same sequence as above, using xmm4 as temporary)
+        vpaddq xmm8, xmm8, xmm8
+        vpsrad xmm4, xmm4, 31
+        vpand xmm4, xmm4, xmm12
+        vpxor xmm8, xmm8, xmm4
+        add r12d, 16
+        cmp r12d, r11d
+        jl L_AES_XTS_encrypt_update_vaes_enc_16
+        cmp r12d, eax
+        je L_AES_XTS_encrypt_update_vaes_done_enc  ; length was a multiple of 16
+L_AES_XTS_encrypt_update_vaes_last_15:  ; partial final block: merge it with the previous output block (ciphertext-stealing layout)
+        sub r12, 16  ; NOTE(review): assumes at least 16 bytes were already written; with a total under 16 this addresses memory before the buffers - presumably excluded by the caller, confirm
+        lea rcx, QWORD PTR [rsi+r12]
+        vmovdqu xmm0, OWORD PTR [rcx]  ; reload the last full block just written to the output
+        add r12, 16
+        vmovdqu OWORD PTR [rsp], xmm0  ; stage it in the stack scratch area
+        xor rdx, rdx  ; rdx = index within the scratch block
+L_AES_XTS_encrypt_update_vaes_last_15_byte_loop:
+        mov r11b, BYTE PTR [rsp+rdx]  ; byte of the previous output block ...
+        mov cl, BYTE PTR [rdi+r12]  ; ... and the matching trailing input byte
+        mov BYTE PTR [rsi+r12], r11b  ; previous-block byte becomes the trailing output byte
+        mov BYTE PTR [rsp+rdx], cl  ; input byte takes its place in the scratch block
+        inc r12d
+        inc edx
+        cmp r12d, eax
+        jl L_AES_XTS_encrypt_update_vaes_last_15_byte_loop
+        sub r12, rdx  ; rewind past the trailing bytes ...
+        vmovdqu xmm0, OWORD PTR [rsp]  ; merged block: trailing input bytes followed by the rest of the previous output block
+        sub r12, 16  ; ... and back to the start of the previous block
+        vpxor xmm0, xmm0, xmm8
+        ; aes_enc_block
+        vpxor xmm0, xmm0, [r10]
+        vmovdqu xmm5, OWORD PTR [r10+16]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+32]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+48]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+64]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+80]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+96]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+112]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+128]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+144]
+        vaesenc xmm0, xmm0, xmm5
+        cmp r9d, 11
+        vmovdqu xmm5, OWORD PTR [r10+160]
+        jl L_AES_XTS_encrypt_update_vaes_last_15_aes_enc_block_last
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [r10+176]
+        vaesenc xmm0, xmm0, xmm6
+        cmp r9d, 13
+        vmovdqu xmm5, OWORD PTR [r10+192]
+        jl L_AES_XTS_encrypt_update_vaes_last_15_aes_enc_block_last
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [r10+208]
+        vaesenc xmm0, xmm0, xmm6
+        vmovdqu xmm5, OWORD PTR [r10+224]
+L_AES_XTS_encrypt_update_vaes_last_15_aes_enc_block_last:
+        vaesenclast xmm0, xmm0, xmm5
+        vpxor xmm0, xmm0, xmm8
+        lea rcx, QWORD PTR [rsi+r12]
+        vmovdqu OWORD PTR [rcx], xmm0  ; overwrite the previous output block with the encrypted merged block
+L_AES_XTS_encrypt_update_vaes_done_enc:
+        vmovdqu OWORD PTR [r8], xmm8  ; hand the running tweak back through the caller's buffer
+        vmovdqu xmm6, OWORD PTR [rsp+64]  ; restore xmm6-xmm15 from the save area
+        vmovdqu xmm7, OWORD PTR [rsp+80]
+        vmovdqu xmm8, OWORD PTR [rsp+96]
+        vmovdqu xmm9, OWORD PTR [rsp+112]
+        vmovdqu xmm10, OWORD PTR [rsp+128]
+        vmovdqu xmm11, OWORD PTR [rsp+144]
+        vmovdqu xmm12, OWORD PTR [rsp+160]
+        vmovdqu xmm13, OWORD PTR [rsp+176]
+        vmovdqu xmm14, OWORD PTR [rsp+192]
+        vmovdqu xmm15, OWORD PTR [rsp+208]
+        add rsp, 224
+        pop r12
+        pop rsi
+        pop rdi
+        ret  ; NOTE(review): ymm registers were written above and no vzeroupper precedes this ret - confirm that is intended
+AES_XTS_encrypt_update_vaes ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA +AES_XTS_decrypt_vaes PROC + push rdi + push rsi + push r12 + push r13 + mov rdi, rcx + mov rsi, rdx + mov rax, r8 + mov r12, r9 + mov r8, QWORD PTR [rsp+72] + mov r9, QWORD PTR [rsp+80] + mov r10d, DWORD PTR [rsp+88] + sub rsp, 224 + vmovdqu OWORD PTR [rsp+64], xmm6 + vmovdqu OWORD PTR [rsp+80], xmm7 + vmovdqu OWORD PTR [rsp+96], xmm8 + vmovdqu OWORD PTR [rsp+112], xmm9 + vmovdqu OWORD PTR [rsp+128], xmm10 + vmovdqu OWORD PTR [rsp+144], xmm11 + vmovdqu OWORD PTR [rsp+160], xmm12 + vmovdqu OWORD PTR [rsp+176], xmm13 + vmovdqu OWORD PTR [rsp+192], xmm14 + vmovdqu OWORD PTR [rsp+208], xmm15 + vmovdqu xmm12, OWORD PTR L_vaes_aes_xts_gc_xts + vbroadcasti128 ymm13, ptr_L_vaes_aes_xts_poly + vmovdqu ymm14, YMMWORD PTR L_vaes_aes_xts_shl + vmovdqu ymm15, YMMWORD PTR L_vaes_aes_xts_shr + vmovdqu xmm8, OWORD PTR [r12] + ; aes_enc_block + vpxor xmm8, xmm8, [r9] + vmovdqu xmm5, OWORD PTR [r9+16] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+32] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+48] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+64] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+80] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+96] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+112] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+128] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+144] + vaesenc xmm8, xmm8, xmm5 + cmp r10d, 11 + vmovdqu xmm5, OWORD PTR [r9+160] + jl L_AES_XTS_decrypt_vaes_tweak_aes_enc_block_last +
vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm6, OWORD PTR [r9+176] + vaesenc xmm8, xmm8, xmm6 + cmp r10d, 13 + vmovdqu xmm5, OWORD PTR [r9+192] + jl L_AES_XTS_decrypt_vaes_tweak_aes_enc_block_last + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm6, OWORD PTR [r9+208] + vaesenc xmm8, xmm8, xmm6 + vmovdqu xmm5, OWORD PTR [r9+224] +L_AES_XTS_decrypt_vaes_tweak_aes_enc_block_last: + vaesenclast xmm8, xmm8, xmm5 + xor r13d, r13d + mov r11d, eax + and r11d, 4294967280 + cmp r11d, eax + je L_AES_XTS_decrypt_vaes_mul16_128 + sub r11d, 16 + cmp r11d, 16 + jl L_AES_XTS_decrypt_vaes_last_31_start +L_AES_XTS_decrypt_vaes_mul16_128: + cmp r11d, 32 + jl L_AES_XTS_decrypt_vaes_done_128 + cmp r11d, 128 + jl L_AES_XTS_decrypt_vaes_done_128 + and r11d, 4294967168 + vperm2i128 ymm5, ymm8, ymm8, 0 + vpsrlvq ymm6, ymm5, ymm15 + vpclmulqdq ymm7, ymm6, ymm13, 1 + vpslldq ymm6, ymm6, 8 + vpsllvq ymm4, ymm5, ymm14 + vpxor ymm4, ymm4, ymm7 + vpxor ymm4, ymm4, ymm6 + vpsrlq ymm9, ymm4, 62 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm5, ymm4, 2 + vpxor ymm5, ymm5, ymm10 + vpxor ymm5, ymm5, ymm9 + vpsrlq ymm9, ymm5, 62 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm6, ymm5, 2 + vpxor ymm6, ymm6, ymm10 + vpxor ymm6, ymm6, ymm9 + vpsrlq ymm9, ymm6, 62 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm7, ymm6, 2 + vpxor ymm7, ymm7, ymm10 + vpxor ymm7, ymm7, ymm9 +L_AES_XTS_decrypt_vaes_dec_128: + ; 128 bytes of input + ; aes_dec_128 + lea rcx, QWORD PTR [rdi+r13] + lea rdx, QWORD PTR [rsi+r13] + vmovdqu ymm0, YMMWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+32] + vmovdqu ymm2, YMMWORD PTR [rcx+64] + vmovdqu ymm3, YMMWORD PTR [rcx+96] + ; aes_dec_block + vbroadcasti128 ymm9, [r8] + vpxor ymm0, ymm0, ymm4 + vpxor ymm0, ymm0, ymm9 + vpxor ymm1, ymm1, ymm5 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm6 + vpxor ymm2, ymm2, ymm9 + vpxor ymm3, ymm3, ymm7 + vpxor ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+16] + vaesdec ymm0, ymm0, ymm9 + 
vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+32] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+48] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+64] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+80] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+96] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+112] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+128] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+144] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + cmp r10d, 11 + vbroadcasti128 ymm9, [r8+160] + jl L_AES_XTS_decrypt_vaes_aes_dec_128_aes_dec_block_last + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+176] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + cmp r10d, 13 + vbroadcasti128 ymm9, [r8+192] + jl L_AES_XTS_decrypt_vaes_aes_dec_128_aes_dec_block_last + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+208] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+224] +L_AES_XTS_decrypt_vaes_aes_dec_128_aes_dec_block_last: + 
vaesdeclast ymm0, ymm0, ymm9 + vaesdeclast ymm1, ymm1, ymm9 + vaesdeclast ymm2, ymm2, ymm9 + vaesdeclast ymm3, ymm3, ymm9 + vpxor ymm0, ymm0, ymm4 + vmovdqu YMMWORD PTR [rdx], ymm0 + vpsrlq ymm9, ymm4, 56 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm4, ymm4, 8 + vpxor ymm4, ymm4, ymm10 + vpxor ymm4, ymm4, ymm9 + vpxor ymm1, ymm1, ymm5 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vpsrlq ymm9, ymm5, 56 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm5, ymm5, 8 + vpxor ymm5, ymm5, ymm10 + vpxor ymm5, ymm5, ymm9 + vpxor ymm2, ymm2, ymm6 + vmovdqu YMMWORD PTR [rdx+64], ymm2 + vpsrlq ymm9, ymm6, 56 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm6, ymm6, 8 + vpxor ymm6, ymm6, ymm10 + vpxor ymm6, ymm6, ymm9 + vpxor ymm3, ymm3, ymm7 + vmovdqu YMMWORD PTR [rdx+96], ymm3 + vpsrlq ymm9, ymm7, 56 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm7, ymm7, 8 + vpxor ymm7, ymm7, ymm10 + vpxor ymm7, ymm7, ymm9 + add r13d, 128 + cmp r13d, r11d + jl L_AES_XTS_decrypt_vaes_dec_128 + vextracti128 xmm8, ymm4, 0 +L_AES_XTS_decrypt_vaes_done_128: + cmp r13d, eax + mov r11d, eax + je L_AES_XTS_decrypt_vaes_done_dec + and r11d, 4294967280 + cmp r11d, eax + je L_AES_XTS_decrypt_vaes_mul16_64 + sub r11d, 16 + sub r11d, r13d + cmp r11d, 16 + jl L_AES_XTS_decrypt_vaes_last_31_start + add r11d, r13d +L_AES_XTS_decrypt_vaes_mul16_64: + and r11d, 4294967232 + cmp r13d, r11d + je L_AES_XTS_decrypt_vaes_done_64 + ; 64 bytes of input + ; aes_dec_64 + lea rcx, QWORD PTR [rdi+r13] + lea rdx, QWORD PTR [rsi+r13] + vmovdqu ymm0, YMMWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+32] + vperm2i128 ymm5, ymm8, ymm8, 0 + vpsrlvq ymm6, ymm5, ymm15 + vpclmulqdq ymm7, ymm6, ymm13, 1 + vpslldq ymm6, ymm6, 8 + vpsllvq ymm4, ymm5, ymm14 + vpxor ymm4, ymm4, ymm7 + vpxor ymm4, ymm4, ymm6 + vpsrlq ymm9, ymm4, 62 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm5, ymm4, 2 + vpxor ymm5, ymm5, ymm10 + vpxor 
ymm5, ymm5, ymm9 + ; aes_dec_block + vbroadcasti128 ymm9, [r8] + vpxor ymm0, ymm0, ymm4 + vpxor ymm0, ymm0, ymm9 + vpxor ymm1, ymm1, ymm5 + vpxor ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+16] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+32] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+48] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+64] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+80] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+96] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+112] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+128] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+144] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + cmp r10d, 11 + vbroadcasti128 ymm9, [r8+160] + jl L_AES_XTS_decrypt_vaes_aes_dec_64_aes_dec_block_last + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+176] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + cmp r10d, 13 + vbroadcasti128 ymm9, [r8+192] + jl L_AES_XTS_decrypt_vaes_aes_dec_64_aes_dec_block_last + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+208] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+224] +L_AES_XTS_decrypt_vaes_aes_dec_64_aes_dec_block_last: + vaesdeclast ymm0, ymm0, ymm9 + vaesdeclast ymm1, ymm1, ymm9 + vpxor ymm0, ymm0, ymm4 + vmovdqu YMMWORD PTR [rdx], ymm0 + vpxor ymm1, ymm1, ymm5 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vextracti128 xmm8, ymm5, 1 + vpshufd xmm9, xmm8, 19 + vpaddq xmm8, xmm8, xmm8 + vpsrad xmm9, xmm9, 31 + vpand xmm9, xmm9, xmm12 + vpxor xmm8, xmm8, xmm9 + add r13d, 64 +L_AES_XTS_decrypt_vaes_done_64: + cmp r13d, eax + mov r11d, eax + je L_AES_XTS_decrypt_vaes_done_dec + and r11d, 4294967280 + cmp r11d, 
eax + je L_AES_XTS_decrypt_vaes_mul16_32 + sub r11d, 16 + sub r11d, r13d + cmp r11d, 16 + jl L_AES_XTS_decrypt_vaes_last_31_start + add r11d, r13d +L_AES_XTS_decrypt_vaes_mul16_32: + and r11d, 4294967264 + cmp r13d, r11d + je L_AES_XTS_decrypt_vaes_done_32 + ; 32 bytes of input + ; aes_dec_32 + lea rcx, QWORD PTR [rdi+r13] + lea rdx, QWORD PTR [rsi+r13] + vmovdqu ymm0, YMMWORD PTR [rcx] + vperm2i128 ymm5, ymm8, ymm8, 0 + vpsrlvq ymm6, ymm5, ymm15 + vpclmulqdq ymm7, ymm6, ymm13, 1 + vpslldq ymm6, ymm6, 8 + vpsllvq ymm4, ymm5, ymm14 + vpxor ymm4, ymm4, ymm7 + vpxor ymm4, ymm4, ymm6 + ; aes_dec_block + vbroadcasti128 ymm9, [r8] + vpxor ymm0, ymm0, ymm4 + vpxor ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+16] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+32] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+48] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+64] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+80] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+96] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+112] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+128] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+144] + vaesdec ymm0, ymm0, ymm9 + cmp r10d, 11 + vbroadcasti128 ymm9, [r8+160] + jl L_AES_XTS_decrypt_vaes_aes_dec_32_aes_dec_block_last + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+176] + vaesdec ymm0, ymm0, ymm9 + cmp r10d, 13 + vbroadcasti128 ymm9, [r8+192] + jl L_AES_XTS_decrypt_vaes_aes_dec_32_aes_dec_block_last + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+208] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+224] +L_AES_XTS_decrypt_vaes_aes_dec_32_aes_dec_block_last: + vaesdeclast ymm0, ymm0, ymm9 + vpxor ymm0, ymm0, ymm4 + vmovdqu YMMWORD PTR [rdx], ymm0 + vextracti128 xmm8, ymm4, 1 + vpshufd xmm9, xmm8, 19 + vpaddq xmm8, xmm8, xmm8 + vpsrad xmm9, xmm9, 31 + vpand xmm9, xmm9, xmm12 + vpxor xmm8, xmm8, xmm9 + add r13d, 32 +L_AES_XTS_decrypt_vaes_done_32: + cmp r13d, eax + mov r11d, 
eax + je L_AES_XTS_decrypt_vaes_done_dec + and r11d, 4294967280 + cmp r11d, eax + je L_AES_XTS_decrypt_vaes_mul16 + sub r11d, 16 + sub r11d, r13d + cmp r11d, 16 + jl L_AES_XTS_decrypt_vaes_last_31_start + add r11d, r13d +L_AES_XTS_decrypt_vaes_mul16: +L_AES_XTS_decrypt_vaes_dec_16: + ; 16 bytes of input + lea rcx, QWORD PTR [rdi+r13] + vmovdqu xmm0, OWORD PTR [rcx] + vpxor xmm0, xmm0, xmm8 + ; aes_dec_block + vpxor xmm0, xmm0, [r8] + vmovdqu xmm5, OWORD PTR [r8+16] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+32] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+48] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+64] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+80] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+96] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+112] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+128] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+144] + vaesdec xmm0, xmm0, xmm5 + cmp r10d, 11 + vmovdqu xmm5, OWORD PTR [r8+160] + jl L_AES_XTS_decrypt_vaes_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r8+176] + vaesdec xmm0, xmm0, xmm6 + cmp r10d, 13 + vmovdqu xmm5, OWORD PTR [r8+192] + jl L_AES_XTS_decrypt_vaes_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r8+208] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r8+224] +L_AES_XTS_decrypt_vaes_aes_dec_block_last: + vaesdeclast xmm0, xmm0, xmm5 + vpxor xmm0, xmm0, xmm8 + lea rcx, QWORD PTR [rsi+r13] + vmovdqu OWORD PTR [rcx], xmm0 + vpshufd xmm4, xmm8, 19 + vpaddq xmm8, xmm8, xmm8 + vpsrad xmm4, xmm4, 31 + vpand xmm4, xmm4, xmm12 + vpxor xmm8, xmm8, xmm4 + add r13d, 16 + cmp r13d, r11d + jl L_AES_XTS_decrypt_vaes_dec_16 + cmp r13d, eax + je L_AES_XTS_decrypt_vaes_done_dec +L_AES_XTS_decrypt_vaes_last_31_start: + vpshufd xmm4, xmm8, 19 + vpaddq xmm7, xmm8, xmm8 + vpsrad xmm4, xmm4, 31 + vpand xmm4, xmm4, xmm12 + vpxor xmm7, xmm7, xmm4 + lea rcx, QWORD PTR [rdi+r13] 
+ vmovdqu xmm0, OWORD PTR [rcx] + vpxor xmm0, xmm0, xmm7 + ; aes_dec_block + vpxor xmm0, xmm0, [r8] + vmovdqu xmm5, OWORD PTR [r8+16] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+32] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+48] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+64] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+80] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+96] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+112] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+128] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+144] + vaesdec xmm0, xmm0, xmm5 + cmp r10d, 11 + vmovdqu xmm5, OWORD PTR [r8+160] + jl L_AES_XTS_decrypt_vaes_last_31_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r8+176] + vaesdec xmm0, xmm0, xmm6 + cmp r10d, 13 + vmovdqu xmm5, OWORD PTR [r8+192] + jl L_AES_XTS_decrypt_vaes_last_31_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r8+208] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r8+224] +L_AES_XTS_decrypt_vaes_last_31_aes_dec_block_last: + vaesdeclast xmm0, xmm0, xmm5 + vpxor xmm0, xmm0, xmm7 + vmovdqu OWORD PTR [rsp], xmm0 + add r13, 16 + xor rdx, rdx +L_AES_XTS_decrypt_vaes_last_31_byte_loop: + mov r11b, BYTE PTR [rsp+rdx] + mov cl, BYTE PTR [rdi+r13] + mov BYTE PTR [rsi+r13], r11b + mov BYTE PTR [rsp+rdx], cl + inc r13d + inc edx + cmp r13d, eax + jl L_AES_XTS_decrypt_vaes_last_31_byte_loop + sub r13, rdx + vmovdqu xmm0, OWORD PTR [rsp] + vpxor xmm0, xmm0, xmm8 + ; aes_dec_block + vpxor xmm0, xmm0, [r8] + vmovdqu xmm5, OWORD PTR [r8+16] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+32] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+48] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+64] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+80] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+96] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR 
[r8+112] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+128] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+144] + vaesdec xmm0, xmm0, xmm5 + cmp r10d, 11 + vmovdqu xmm5, OWORD PTR [r8+160] + jl L_AES_XTS_decrypt_vaes_last_31_2_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r8+176] + vaesdec xmm0, xmm0, xmm6 + cmp r10d, 13 + vmovdqu xmm5, OWORD PTR [r8+192] + jl L_AES_XTS_decrypt_vaes_last_31_2_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r8+208] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r8+224] +L_AES_XTS_decrypt_vaes_last_31_2_aes_dec_block_last: + vaesdeclast xmm0, xmm0, xmm5 + vpxor xmm0, xmm0, xmm8 + sub r13, 16 + lea rcx, QWORD PTR [rsi+r13] + vmovdqu OWORD PTR [rcx], xmm0 +L_AES_XTS_decrypt_vaes_done_dec: + vmovdqu xmm6, OWORD PTR [rsp+64] + vmovdqu xmm7, OWORD PTR [rsp+80] + vmovdqu xmm8, OWORD PTR [rsp+96] + vmovdqu xmm9, OWORD PTR [rsp+112] + vmovdqu xmm10, OWORD PTR [rsp+128] + vmovdqu xmm11, OWORD PTR [rsp+144] + vmovdqu xmm12, OWORD PTR [rsp+160] + vmovdqu xmm13, OWORD PTR [rsp+176] + vmovdqu xmm14, OWORD PTR [rsp+192] + vmovdqu xmm15, OWORD PTR [rsp+208] + add rsp, 224 + pop r13 + pop r12 + pop rsi + pop rdi + ret +AES_XTS_decrypt_vaes ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +AES_XTS_decrypt_update_vaes PROC + ; AES-XTS decrypt (update form) using 256-bit VAES; the tweak is read from and written back to the caller's buffer. + ; Win64 arguments: rcx = input, rdx = output, r8 = byte count, r9 = round keys, + ; 5th stack argument = 16-byte tweak buffer, 6th stack argument = round count (tested against 11 and 13). + ; xmm6-xmm15 are saved in the local frame and restored before return; [rsp] is a 16-byte scratch block. + push rdi + push rsi + push r12 + mov rdi, rcx + mov rsi, rdx + mov rax, r8 + mov r10, r9 + mov r8, QWORD PTR [rsp+64] + mov r9d, DWORD PTR [rsp+72] + sub rsp, 224 + vmovdqu OWORD PTR [rsp+64], xmm6 + vmovdqu OWORD PTR [rsp+80], xmm7 + vmovdqu OWORD PTR [rsp+96], xmm8 + vmovdqu OWORD PTR [rsp+112], xmm9 + vmovdqu OWORD PTR [rsp+128], xmm10 + vmovdqu OWORD PTR [rsp+144], xmm11 + vmovdqu OWORD PTR [rsp+160], xmm12 + vmovdqu OWORD PTR [rsp+176], xmm13 + vmovdqu OWORD PTR [rsp+192], xmm14 + vmovdqu OWORD PTR [rsp+208], xmm15 + vmovdqu xmm12, OWORD PTR L_vaes_aes_xts_gc_xts + ; Broadcast the 16-byte constant itself; the ptr_ symbol is a QWORD holding the table address, not the table. + vbroadcasti128 ymm13, OWORD PTR L_vaes_aes_xts_poly + vmovdqu ymm14, YMMWORD PTR
L_vaes_aes_xts_shl + vmovdqu ymm15, YMMWORD PTR L_vaes_aes_xts_shr + ; xmm8 = tweak state, r12 = byte offset into input/output + vmovdqu xmm8, OWORD PTR [r8] + xor r12d, r12d + mov r11d, eax + and r11d, 4294967280 + cmp r11d, eax + je L_AES_XTS_decrypt_update_vaes_mul16_128 + sub r11d, 16 + cmp r11d, 16 + jl L_AES_XTS_decrypt_update_vaes_last_31_start +L_AES_XTS_decrypt_update_vaes_mul16_128: + cmp r11d, 32 + jl L_AES_XTS_decrypt_update_vaes_done_128 + cmp r11d, 128 + jl L_AES_XTS_decrypt_update_vaes_done_128 + and r11d, 4294967168 + ; Expand xmm8 into ymm4-ymm7: the eight per-block tweaks for one 128-byte pass + vperm2i128 ymm5, ymm8, ymm8, 0 + vpsrlvq ymm6, ymm5, ymm15 + vpclmulqdq ymm7, ymm6, ymm13, 1 + vpslldq ymm6, ymm6, 8 + vpsllvq ymm4, ymm5, ymm14 + vpxor ymm4, ymm4, ymm7 + vpxor ymm4, ymm4, ymm6 + vpsrlq ymm9, ymm4, 62 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm5, ymm4, 2 + vpxor ymm5, ymm5, ymm10 + vpxor ymm5, ymm5, ymm9 + vpsrlq ymm9, ymm5, 62 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm6, ymm5, 2 + vpxor ymm6, ymm6, ymm10 + vpxor ymm6, ymm6, ymm9 + vpsrlq ymm9, ymm6, 62 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm7, ymm6, 2 + vpxor ymm7, ymm7, ymm10 + vpxor ymm7, ymm7, ymm9 +L_AES_XTS_decrypt_update_vaes_dec_128: + ; 128 bytes of input + ; aes_dec_128 + lea rcx, QWORD PTR [rdi+r12] + lea rdx, QWORD PTR [rsi+r12] + vmovdqu ymm0, YMMWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+32] + vmovdqu ymm2, YMMWORD PTR [rcx+64] + vmovdqu ymm3, YMMWORD PTR [rcx+96] + ; aes_dec_block + vbroadcasti128 ymm9, [r10] + vpxor ymm0, ymm0, ymm4 + vpxor ymm0, ymm0, ymm9 + vpxor ymm1, ymm1, ymm5 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm6 + vpxor ymm2, ymm2, ymm9 + vpxor ymm3, ymm3, ymm7 + vpxor ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r10+16] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r10+32] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r10+48] +
vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r10+64] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r10+80] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r10+96] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r10+112] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r10+128] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r10+144] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + cmp r9d, 11 + vbroadcasti128 ymm9, [r10+160] + jl L_AES_XTS_decrypt_update_vaes_aes_dec_128_aes_dec_block_last + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r10+176] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + cmp r9d, 13 + vbroadcasti128 ymm9, [r10+192] + jl L_AES_XTS_decrypt_update_vaes_aes_dec_128_aes_dec_block_last + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r10+208] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r10+224] +L_AES_XTS_decrypt_update_vaes_aes_dec_128_aes_dec_block_last: + vaesdeclast ymm0, ymm0, ymm9 + vaesdeclast ymm1, ymm1, ymm9 + vaesdeclast ymm2, ymm2, ymm9 + vaesdeclast ymm3, ymm3, ymm9 + vpxor ymm0, ymm0, ymm4 + vmovdqu YMMWORD PTR [rdx], ymm0 + vpsrlq ymm9, ymm4, 56 + vpclmulqdq ymm10,
ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm4, ymm4, 8 + vpxor ymm4, ymm4, ymm10 + vpxor ymm4, ymm4, ymm9 + vpxor ymm1, ymm1, ymm5 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vpsrlq ymm9, ymm5, 56 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm5, ymm5, 8 + vpxor ymm5, ymm5, ymm10 + vpxor ymm5, ymm5, ymm9 + vpxor ymm2, ymm2, ymm6 + vmovdqu YMMWORD PTR [rdx+64], ymm2 + vpsrlq ymm9, ymm6, 56 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm6, ymm6, 8 + vpxor ymm6, ymm6, ymm10 + vpxor ymm6, ymm6, ymm9 + vpxor ymm3, ymm3, ymm7 + vmovdqu YMMWORD PTR [rdx+96], ymm3 + vpsrlq ymm9, ymm7, 56 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm7, ymm7, 8 + vpxor ymm7, ymm7, ymm10 + vpxor ymm7, ymm7, ymm9 + ; Each of ymm4-ymm7 has now been shifted left 8 bits with reduction, ready for the next pass + add r12d, 128 + cmp r12d, r11d + jl L_AES_XTS_decrypt_update_vaes_dec_128 + vextracti128 xmm8, ymm4, 0 +L_AES_XTS_decrypt_update_vaes_done_128: + cmp r12d, eax + mov r11d, eax + je L_AES_XTS_decrypt_update_vaes_done_dec + and r11d, 4294967280 + cmp r11d, eax + je L_AES_XTS_decrypt_update_vaes_mul16_64 + sub r11d, 16 + sub r11d, r12d + cmp r11d, 16 + jl L_AES_XTS_decrypt_update_vaes_last_31_start + add r11d, r12d +L_AES_XTS_decrypt_update_vaes_mul16_64: + and r11d, 4294967232 + cmp r12d, r11d + je L_AES_XTS_decrypt_update_vaes_done_64 + ; 64 bytes of input + ; aes_dec_64 + lea rcx, QWORD PTR [rdi+r12] + lea rdx, QWORD PTR [rsi+r12] + vmovdqu ymm0, YMMWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+32] + vperm2i128 ymm5, ymm8, ymm8, 0 + vpsrlvq ymm6, ymm5, ymm15 + vpclmulqdq ymm7, ymm6, ymm13, 1 + vpslldq ymm6, ymm6, 8 + vpsllvq ymm4, ymm5, ymm14 + vpxor ymm4, ymm4, ymm7 + vpxor ymm4, ymm4, ymm6 + vpsrlq ymm9, ymm4, 62 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm5, ymm4, 2 + vpxor ymm5, ymm5, ymm10 + vpxor ymm5, ymm5, ymm9 + ; aes_dec_block + vbroadcasti128 ymm9, [r10] + vpxor ymm0, ymm0, ymm4 + vpxor ymm0, ymm0, ymm9 + vpxor ymm1, ymm1, ymm5 + vpxor ymm1, ymm1, ymm9 + vbroadcasti128
ymm9, [r10+16] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r10+32] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r10+48] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r10+64] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r10+80] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r10+96] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r10+112] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r10+128] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r10+144] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + cmp r9d, 11 + vbroadcasti128 ymm9, [r10+160] + jl L_AES_XTS_decrypt_update_vaes_aes_dec_64_aes_dec_block_last + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r10+176] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + cmp r9d, 13 + vbroadcasti128 ymm9, [r10+192] + jl L_AES_XTS_decrypt_update_vaes_aes_dec_64_aes_dec_block_last + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r10+208] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r10+224] +L_AES_XTS_decrypt_update_vaes_aes_dec_64_aes_dec_block_last: + vaesdeclast ymm0, ymm0, ymm9 + vaesdeclast ymm1, ymm1, ymm9 + vpxor ymm0, ymm0, ymm4 + vmovdqu YMMWORD PTR [rdx], ymm0 + vpxor ymm1, ymm1, ymm5 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vextracti128 xmm8, ymm5, 1 + vpshufd xmm9, xmm8, 19 + vpaddq xmm8, xmm8, xmm8 + vpsrad xmm9, xmm9, 31 + vpand xmm9, xmm9, xmm12 + vpxor xmm8, xmm8, xmm9 + add r12d, 64 +L_AES_XTS_decrypt_update_vaes_done_64: + cmp r12d, eax + mov r11d, eax + je L_AES_XTS_decrypt_update_vaes_done_dec + and r11d, 4294967280 + cmp r11d, eax + je L_AES_XTS_decrypt_update_vaes_mul16_32 + sub r11d, 16 + sub r11d, r12d + cmp r11d, 16 + jl
L_AES_XTS_decrypt_update_vaes_last_31_start + add r11d, r12d +L_AES_XTS_decrypt_update_vaes_mul16_32: + and r11d, 4294967264 + cmp r12d, r11d + je L_AES_XTS_decrypt_update_vaes_done_32 + ; 32 bytes of input + ; aes_dec_32 + lea rcx, QWORD PTR [rdi+r12] + lea rdx, QWORD PTR [rsi+r12] + vmovdqu ymm0, YMMWORD PTR [rcx] + vperm2i128 ymm5, ymm8, ymm8, 0 + vpsrlvq ymm6, ymm5, ymm15 + vpclmulqdq ymm7, ymm6, ymm13, 1 + vpslldq ymm6, ymm6, 8 + vpsllvq ymm4, ymm5, ymm14 + vpxor ymm4, ymm4, ymm7 + vpxor ymm4, ymm4, ymm6 + ; aes_dec_block + vbroadcasti128 ymm9, [r10] + vpxor ymm0, ymm0, ymm4 + vpxor ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r10+16] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r10+32] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r10+48] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r10+64] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r10+80] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r10+96] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r10+112] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r10+128] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r10+144] + vaesdec ymm0, ymm0, ymm9 + cmp r9d, 11 + vbroadcasti128 ymm9, [r10+160] + jl L_AES_XTS_decrypt_update_vaes_aes_dec_32_aes_dec_block_last + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r10+176] + vaesdec ymm0, ymm0, ymm9 + cmp r9d, 13 + vbroadcasti128 ymm9, [r10+192] + jl L_AES_XTS_decrypt_update_vaes_aes_dec_32_aes_dec_block_last + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r10+208] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r10+224] +L_AES_XTS_decrypt_update_vaes_aes_dec_32_aes_dec_block_last: + vaesdeclast ymm0, ymm0, ymm9 + vpxor ymm0, ymm0, ymm4 + vmovdqu YMMWORD PTR [rdx], ymm0 + vextracti128 xmm8, ymm4, 1 + vpshufd xmm9, xmm8, 19 + vpaddq xmm8, xmm8, xmm8 + vpsrad xmm9, xmm9, 31 + vpand xmm9, xmm9, xmm12 + vpxor xmm8, xmm8, xmm9 + add r12d, 32 +L_AES_XTS_decrypt_update_vaes_done_32: + cmp r12d, eax + mov r11d, eax + je
L_AES_XTS_decrypt_update_vaes_done_dec + and r11d, 4294967280 + cmp r11d, eax + je L_AES_XTS_decrypt_update_vaes_mul16 + sub r11d, 16 + sub r11d, r12d + cmp r11d, 16 + jl L_AES_XTS_decrypt_update_vaes_last_31_start + add r11d, r12d +L_AES_XTS_decrypt_update_vaes_mul16: +L_AES_XTS_decrypt_update_vaes_dec_16: + ; 16 bytes of input + lea rcx, QWORD PTR [rdi+r12] + vmovdqu xmm0, OWORD PTR [rcx] + vpxor xmm0, xmm0, xmm8 + ; aes_dec_block + vpxor xmm0, xmm0, [r10] + vmovdqu xmm5, OWORD PTR [r10+16] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+32] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+48] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+64] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+80] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+96] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+112] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+128] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+144] + vaesdec xmm0, xmm0, xmm5 + cmp r9d, 11 + vmovdqu xmm5, OWORD PTR [r10+160] + jl L_AES_XTS_decrypt_update_vaes_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r10+176] + vaesdec xmm0, xmm0, xmm6 + cmp r9d, 13 + vmovdqu xmm5, OWORD PTR [r10+192] + jl L_AES_XTS_decrypt_update_vaes_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r10+208] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r10+224] +L_AES_XTS_decrypt_update_vaes_aes_dec_block_last: + vaesdeclast xmm0, xmm0, xmm5 + vpxor xmm0, xmm0, xmm8 + lea rcx, QWORD PTR [rsi+r12] + vmovdqu OWORD PTR [rcx], xmm0 + vpshufd xmm4, xmm8, 19 + vpaddq xmm8, xmm8, xmm8 + vpsrad xmm4, xmm4, 31 + vpand xmm4, xmm4, xmm12 + vpxor xmm8, xmm8, xmm4 + add r12d, 16 + cmp r12d, r11d + jl L_AES_XTS_decrypt_update_vaes_dec_16 + cmp r12d, eax + je L_AES_XTS_decrypt_update_vaes_done_dec +L_AES_XTS_decrypt_update_vaes_last_31_start: + ; Byte count is not a multiple of 16: xmm7 = xmm8 advanced by one block position, used for the last full block + vpshufd xmm4, xmm8, 19 + vpaddq xmm7, xmm8, xmm8 + vpsrad xmm4, xmm4, 31
+ vpand xmm4, xmm4, xmm12 + vpxor xmm7, xmm7, xmm4 + lea rcx, QWORD PTR [rdi+r12] + vmovdqu xmm0, OWORD PTR [rcx] + vpxor xmm0, xmm0, xmm7 + ; aes_dec_block + vpxor xmm0, xmm0, [r10] + vmovdqu xmm5, OWORD PTR [r10+16] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+32] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+48] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+64] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+80] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+96] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+112] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+128] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+144] + vaesdec xmm0, xmm0, xmm5 + cmp r9d, 11 + vmovdqu xmm5, OWORD PTR [r10+160] + jl L_AES_XTS_decrypt_update_vaes_last_31_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r10+176] + vaesdec xmm0, xmm0, xmm6 + cmp r9d, 13 + vmovdqu xmm5, OWORD PTR [r10+192] + jl L_AES_XTS_decrypt_update_vaes_last_31_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r10+208] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r10+224] +L_AES_XTS_decrypt_update_vaes_last_31_aes_dec_block_last: + vaesdeclast xmm0, xmm0, xmm5 + vpxor xmm0, xmm0, xmm7 + vmovdqu OWORD PTR [rsp], xmm0 + add r12, 16 + xor rdx, rdx + ; Swap bytes: scratch bytes go to the partial output, leftover input bytes go into the scratch block +L_AES_XTS_decrypt_update_vaes_last_31_byte_loop: + mov r11b, BYTE PTR [rsp+rdx] + mov cl, BYTE PTR [rdi+r12] + mov BYTE PTR [rsi+r12], r11b + mov BYTE PTR [rsp+rdx], cl + inc r12d + inc edx + cmp r12d, eax + jl L_AES_XTS_decrypt_update_vaes_last_31_byte_loop + sub r12, rdx + vmovdqu xmm0, OWORD PTR [rsp] + vpxor xmm0, xmm0, xmm8 + ; aes_dec_block + vpxor xmm0, xmm0, [r10] + vmovdqu xmm5, OWORD PTR [r10+16] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+32] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+48] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+64] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5,
OWORD PTR [r10+80] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+96] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+112] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+128] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+144] + vaesdec xmm0, xmm0, xmm5 + cmp r9d, 11 + vmovdqu xmm5, OWORD PTR [r10+160] + jl L_AES_XTS_decrypt_update_vaes_last_31_2_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r10+176] + vaesdec xmm0, xmm0, xmm6 + cmp r9d, 13 + vmovdqu xmm5, OWORD PTR [r10+192] + jl L_AES_XTS_decrypt_update_vaes_last_31_2_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r10+208] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r10+224] +L_AES_XTS_decrypt_update_vaes_last_31_2_aes_dec_block_last: + vaesdeclast xmm0, xmm0, xmm5 + vpxor xmm0, xmm0, xmm8 + sub r12, 16 + lea rcx, QWORD PTR [rsi+r12] + vmovdqu OWORD PTR [rcx], xmm0 +L_AES_XTS_decrypt_update_vaes_done_dec: + ; Write the tweak state in xmm8 back to the tweak buffer, then restore xmm6-xmm15 + vmovdqu OWORD PTR [r8], xmm8 + vmovdqu xmm6, OWORD PTR [rsp+64] + vmovdqu xmm7, OWORD PTR [rsp+80] + vmovdqu xmm8, OWORD PTR [rsp+96] + vmovdqu xmm9, OWORD PTR [rsp+112] + vmovdqu xmm10, OWORD PTR [rsp+128] + vmovdqu xmm11, OWORD PTR [rsp+144] + vmovdqu xmm12, OWORD PTR [rsp+160] + vmovdqu xmm13, OWORD PTR [rsp+176] + vmovdqu xmm14, OWORD PTR [rsp+192] + vmovdqu xmm15, OWORD PTR [rsp+208] + add rsp, 224 + pop r12 + pop rsi + pop rdi + ret +AES_XTS_decrypt_update_vaes ENDP +_TEXT ENDS +ENDIF +IFDEF HAVE_INTEL_AVX512 +_TEXT SEGMENT READONLY PARA +AES_XTS_init_avx512 PROC + ; Encrypts the 16-byte block at rcx in place with the round keys at rdx; r8d = round count (tested against 11 and 13). + ; Only xmm0, xmm2 and xmm3 are used. + vmovdqu xmm0, OWORD PTR [rcx] + ; aes_enc_block + vpxor xmm0, xmm0, [rdx] + vmovdqu xmm2, OWORD PTR [rdx+16] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rdx+32] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rdx+48] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rdx+64] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rdx+80] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rdx+96] + vaesenc xmm0, xmm0,
xmm2 + vmovdqu xmm2, OWORD PTR [rdx+112] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rdx+128] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rdx+144] + vaesenc xmm0, xmm0, xmm2 + cmp r8d, 11 + vmovdqu xmm2, OWORD PTR [rdx+160] + jl L_AES_XTS_init_avx512_tweak_aes_enc_block_last + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm3, OWORD PTR [rdx+176] + vaesenc xmm0, xmm0, xmm3 + cmp r8d, 13 + vmovdqu xmm2, OWORD PTR [rdx+192] + jl L_AES_XTS_init_avx512_tweak_aes_enc_block_last + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm3, OWORD PTR [rdx+208] + vaesenc xmm0, xmm0, xmm3 + vmovdqu xmm2, OWORD PTR [rdx+224] +L_AES_XTS_init_avx512_tweak_aes_enc_block_last: + vaesenclast xmm0, xmm0, xmm2 + ; Store the encrypted block back over the 16 bytes at rcx + vmovdqu OWORD PTR [rcx], xmm0 + ret +AES_XTS_init_avx512 ENDP +_TEXT ENDS + ; Dword mask 87h,0,1,0: combined with a sign-spread shuffle of the tweak when a single 128-bit tweak is doubled. + ; Each ptr_L_* QWORD below stores the address of the table defined just before it. +_DATA SEGMENT +ALIGN 16 +L_avx512_aes_xts_gc_xts DWORD \ + 00000087h, 00000000h, 00000001h, 00000000h +ptr_L_avx512_aes_xts_gc_xts QWORD L_avx512_aes_xts_gc_xts +_DATA ENDS + ; 87h in the low qword of a 16-byte lane: second operand of the vpclmulqdq reduction steps. +_DATA SEGMENT +ALIGN 16 +L_avx512_aes_xts_poly DWORD \ + 00000087h, 00000000h, 00000000h, 00000000h +ptr_L_avx512_aes_xts_poly QWORD L_avx512_aes_xts_poly +_DATA ENDS + ; Qword shift counts 0,0,1,1,2,2,3,3 (one pair per 128-bit lane), used with vpsllvq. +_DATA SEGMENT +ALIGN 16 +L_avx512_aes_xts_shl DWORD \ + 00000000h, 00000000h, 00000000h, 00000000h, + 00000001h, 00000000h, 00000001h, 00000000h, + 00000002h, 00000000h, 00000002h, 00000000h, + 00000003h, 00000000h, 00000003h, 00000000h +ptr_L_avx512_aes_xts_shl QWORD L_avx512_aes_xts_shl +_DATA ENDS + ; Qword shift counts 64,64,63,63,62,62,61,61 (one pair per 128-bit lane), used with vpsrlvq. +_DATA SEGMENT +ALIGN 16 +L_avx512_aes_xts_shr DWORD \ + 00000040h, 00000000h, 00000040h, 00000000h, + 0000003fh, 00000000h, 0000003fh, 00000000h, + 0000003eh, 00000000h, 0000003eh, 00000000h, + 0000003dh, 00000000h, 0000003dh, 00000000h +ptr_L_avx512_aes_xts_shr QWORD L_avx512_aes_xts_shr +_DATA ENDS +_TEXT SEGMENT READONLY PARA +AES_XTS_encrypt_avx512 PROC + push rdi + push rsi + push r12 + push r13 + mov rdi, rcx + mov rsi, rdx + mov rax, r8 + mov r12, r9 + mov r8, QWORD PTR [rsp+72] + mov r9, QWORD PTR [rsp+80] + mov r10d, DWORD PTR [rsp+88] + sub rsp, 224 + vmovdqu OWORD
PTR [rsp+64], xmm6 + vmovdqu OWORD PTR [rsp+80], xmm7 + vmovdqu OWORD PTR [rsp+96], xmm8 + vmovdqu OWORD PTR [rsp+112], xmm9 + vmovdqu OWORD PTR [rsp+128], xmm10 + vmovdqu OWORD PTR [rsp+144], xmm11 + vmovdqu OWORD PTR [rsp+160], xmm12 + vmovdqu OWORD PTR [rsp+176], xmm13 + vmovdqu OWORD PTR [rsp+192], xmm14 + vmovdqu OWORD PTR [rsp+208], xmm15 + vmovdqu xmm12, OWORD PTR L_avx512_aes_xts_gc_xts + vbroadcasti32x4 zmm13, ptr_L_avx512_aes_xts_poly + vmovdqu64 zmm14, ptr_L_avx512_aes_xts_shl + vmovdqu64 zmm15, ptr_L_avx512_aes_xts_shr + vmovdqu xmm8, OWORD PTR [r12] + ; aes_enc_block + vpxor xmm8, xmm8, [r9] + vmovdqu xmm5, OWORD PTR [r9+16] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+32] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+48] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+64] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+80] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+96] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+112] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+128] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+144] + vaesenc xmm8, xmm8, xmm5 + cmp r10d, 11 + vmovdqu xmm5, OWORD PTR [r9+160] + jl L_AES_XTS_encrypt_avx512_tweak_aes_enc_block_last + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm6, OWORD PTR [r9+176] + vaesenc xmm8, xmm8, xmm6 + cmp r10d, 13 + vmovdqu xmm5, OWORD PTR [r9+192] + jl L_AES_XTS_encrypt_avx512_tweak_aes_enc_block_last + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm6, OWORD PTR [r9+208] + vaesenc xmm8, xmm8, xmm6 + vmovdqu xmm5, OWORD PTR [r9+224] +L_AES_XTS_encrypt_avx512_tweak_aes_enc_block_last: + vaesenclast xmm8, xmm8, xmm5 + xor r13d, r13d + cmp eax, 32 + jl L_AES_XTS_encrypt_avx512_done_128 + vbroadcasti32x4 zmm16, [r8] + vbroadcasti32x4 zmm17, [r8+16] + vbroadcasti32x4 zmm18, [r8+32] + vbroadcasti32x4 zmm19, [r8+48] + vbroadcasti32x4 zmm20, [r8+64] + vbroadcasti32x4 zmm21, [r8+80] + vbroadcasti32x4 zmm22, [r8+96] + vbroadcasti32x4 zmm23, 
[r8+112] + vbroadcasti32x4 zmm24, [r8+128] + vbroadcasti32x4 zmm25, [r8+144] + vbroadcasti32x4 zmm26, [r8+160] + cmp r10d, 11 + jl L_AES_XTS_encrypt_avx512_key_cached + vbroadcasti32x4 zmm27, [r8+176] + vbroadcasti32x4 zmm28, [r8+192] + cmp r10d, 13 + jl L_AES_XTS_encrypt_avx512_key_cached + vbroadcasti32x4 zmm29, [r8+208] + vbroadcasti32x4 zmm30, [r8+224] +L_AES_XTS_encrypt_avx512_key_cached: + cmp eax, 256 + mov r11d, eax + jl L_AES_XTS_encrypt_avx512_done_256 + and r11d, 4294967040 + vshufi64x2 zmm5, zmm8, zmm8, 0 + vpsrlvq zmm6, zmm5, zmm15 + vpclmulqdq zmm7, zmm6, zmm13, 1 + vpslldq zmm6, zmm6, 8 + vpsllvq zmm4, zmm5, zmm14 + vpternlogq zmm4, zmm7, zmm6, 150 + vpsrlq zmm9, zmm4, 60 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm5, zmm4, 4 + vpternlogq zmm5, zmm10, zmm9, 150 + vpsrlq zmm9, zmm5, 60 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm6, zmm5, 4 + vpternlogq zmm6, zmm10, zmm9, 150 + vpsrlq zmm9, zmm6, 60 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm7, zmm6, 4 + vpternlogq zmm7, zmm10, zmm9, 150 +L_AES_XTS_encrypt_avx512_enc_256: + ; 256 bytes of input + ; aes_enc_256 + lea rcx, QWORD PTR [rdi+r13] + lea rdx, QWORD PTR [rsi+r13] + vmovdqu64 zmm0, [rcx] + vmovdqu64 zmm1, [rcx+64] + vmovdqu64 zmm2, [rcx+128] + vmovdqu64 zmm3, [rcx+192] + ; aes_enc_block + vpternlogq zmm0, zmm16, zmm4, 150 + vpternlogq zmm1, zmm16, zmm5, 150 + vpternlogq zmm2, zmm16, zmm6, 150 + vpternlogq zmm3, zmm16, zmm7, 150 + vaesenc zmm0, zmm0, zmm17 + vaesenc zmm1, zmm1, zmm17 + vaesenc zmm2, zmm2, zmm17 + vaesenc zmm3, zmm3, zmm17 + vaesenc zmm0, zmm0, zmm18 + vaesenc zmm1, zmm1, zmm18 + vaesenc zmm2, zmm2, zmm18 + vaesenc zmm3, zmm3, zmm18 + vaesenc zmm0, zmm0, zmm19 + vaesenc zmm1, zmm1, zmm19 + vaesenc zmm2, zmm2, zmm19 + vaesenc zmm3, zmm3, zmm19 + vaesenc zmm0, zmm0, zmm20 + vaesenc zmm1, zmm1, zmm20 + vaesenc zmm2, zmm2, zmm20 + vaesenc zmm3, zmm3, zmm20 + vaesenc zmm0, zmm0, zmm21 + vaesenc zmm1, 
zmm1, zmm21 + vaesenc zmm2, zmm2, zmm21 + vaesenc zmm3, zmm3, zmm21 + vaesenc zmm0, zmm0, zmm22 + vaesenc zmm1, zmm1, zmm22 + vaesenc zmm2, zmm2, zmm22 + vaesenc zmm3, zmm3, zmm22 + vaesenc zmm0, zmm0, zmm23 + vaesenc zmm1, zmm1, zmm23 + vaesenc zmm2, zmm2, zmm23 + vaesenc zmm3, zmm3, zmm23 + vaesenc zmm0, zmm0, zmm24 + vaesenc zmm1, zmm1, zmm24 + vaesenc zmm2, zmm2, zmm24 + vaesenc zmm3, zmm3, zmm24 + vaesenc zmm0, zmm0, zmm25 + vaesenc zmm1, zmm1, zmm25 + vaesenc zmm2, zmm2, zmm25 + vaesenc zmm3, zmm3, zmm25 + cmp r10d, 11 + vmovdqa64 zmm9, zmm26 + jl L_AES_XTS_encrypt_avx512_aes_enc_256_aes_enc_block_last + vaesenc zmm0, zmm0, zmm26 + vaesenc zmm1, zmm1, zmm26 + vaesenc zmm2, zmm2, zmm26 + vaesenc zmm3, zmm3, zmm26 + vaesenc zmm0, zmm0, zmm27 + vaesenc zmm1, zmm1, zmm27 + vaesenc zmm2, zmm2, zmm27 + vaesenc zmm3, zmm3, zmm27 + cmp r10d, 13 + vmovdqa64 zmm9, zmm28 + jl L_AES_XTS_encrypt_avx512_aes_enc_256_aes_enc_block_last + vaesenc zmm0, zmm0, zmm28 + vaesenc zmm1, zmm1, zmm28 + vaesenc zmm2, zmm2, zmm28 + vaesenc zmm3, zmm3, zmm28 + vaesenc zmm0, zmm0, zmm29 + vaesenc zmm1, zmm1, zmm29 + vaesenc zmm2, zmm2, zmm29 + vaesenc zmm3, zmm3, zmm29 + vmovdqa64 zmm9, zmm30 +L_AES_XTS_encrypt_avx512_aes_enc_256_aes_enc_block_last: + vaesenclast zmm0, zmm0, zmm9 + vaesenclast zmm1, zmm1, zmm9 + vaesenclast zmm2, zmm2, zmm9 + vaesenclast zmm3, zmm3, zmm9 + vpxorq zmm0, zmm0, zmm4 + vmovdqu64 [rdx], zmm0 + vpsrlq zmm9, zmm4, 48 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm4, zmm4, 16 + vpternlogq zmm4, zmm10, zmm9, 150 + vpxorq zmm1, zmm1, zmm5 + vmovdqu64 [rdx+64], zmm1 + vpsrlq zmm9, zmm5, 48 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm5, zmm5, 16 + vpternlogq zmm5, zmm10, zmm9, 150 + vpxorq zmm2, zmm2, zmm6 + vmovdqu64 [rdx+128], zmm2 + vpsrlq zmm9, zmm6, 48 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm6, zmm6, 16 + vpternlogq zmm6, zmm10, zmm9, 150 + vpxorq zmm3, zmm3, zmm7 + vmovdqu64 
[rdx+192], zmm3 + vpsrlq zmm9, zmm7, 48 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm7, zmm7, 16 + vpternlogq zmm7, zmm10, zmm9, 150 + add r13d, 256 + cmp r13d, r11d + jl L_AES_XTS_encrypt_avx512_enc_256 + vextracti32x4 xmm8, zmm4, 0 +L_AES_XTS_encrypt_avx512_done_256: + mov r11d, eax + and r11d, 4294967168 + cmp r13d, r11d + je L_AES_XTS_encrypt_avx512_done_128 + ; 128 bytes of input + ; aes_enc_128 + lea rcx, QWORD PTR [rdi+r13] + lea rdx, QWORD PTR [rsi+r13] + vmovdqu64 zmm0, [rcx] + vmovdqu64 zmm1, [rcx+64] + vshufi64x2 zmm5, zmm8, zmm8, 0 + vpsrlvq zmm6, zmm5, zmm15 + vpclmulqdq zmm7, zmm6, zmm13, 1 + vpslldq zmm6, zmm6, 8 + vpsllvq zmm4, zmm5, zmm14 + vpternlogq zmm4, zmm7, zmm6, 150 + vpsrlq zmm9, zmm4, 60 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm5, zmm4, 4 + vpternlogq zmm5, zmm10, zmm9, 150 + ; aes_enc_block + vpternlogq zmm0, zmm16, zmm4, 150 + vpternlogq zmm1, zmm16, zmm5, 150 + vaesenc zmm0, zmm0, zmm17 + vaesenc zmm1, zmm1, zmm17 + vaesenc zmm0, zmm0, zmm18 + vaesenc zmm1, zmm1, zmm18 + vaesenc zmm0, zmm0, zmm19 + vaesenc zmm1, zmm1, zmm19 + vaesenc zmm0, zmm0, zmm20 + vaesenc zmm1, zmm1, zmm20 + vaesenc zmm0, zmm0, zmm21 + vaesenc zmm1, zmm1, zmm21 + vaesenc zmm0, zmm0, zmm22 + vaesenc zmm1, zmm1, zmm22 + vaesenc zmm0, zmm0, zmm23 + vaesenc zmm1, zmm1, zmm23 + vaesenc zmm0, zmm0, zmm24 + vaesenc zmm1, zmm1, zmm24 + vaesenc zmm0, zmm0, zmm25 + vaesenc zmm1, zmm1, zmm25 + cmp r10d, 11 + vmovdqa64 zmm9, zmm26 + jl L_AES_XTS_encrypt_avx512_aes_enc_128_aes_enc_block_last + vaesenc zmm0, zmm0, zmm26 + vaesenc zmm1, zmm1, zmm26 + vaesenc zmm0, zmm0, zmm27 + vaesenc zmm1, zmm1, zmm27 + cmp r10d, 13 + vmovdqa64 zmm9, zmm28 + jl L_AES_XTS_encrypt_avx512_aes_enc_128_aes_enc_block_last + vaesenc zmm0, zmm0, zmm28 + vaesenc zmm1, zmm1, zmm28 + vaesenc zmm0, zmm0, zmm29 + vaesenc zmm1, zmm1, zmm29 + vmovdqa64 zmm9, zmm30 +L_AES_XTS_encrypt_avx512_aes_enc_128_aes_enc_block_last: + vaesenclast zmm0, zmm0, zmm9 + 
vaesenclast zmm1, zmm1, zmm9 + vpxorq zmm0, zmm0, zmm4 + vmovdqu64 [rdx], zmm0 + vpxorq zmm1, zmm1, zmm5 + vmovdqu64 [rdx+64], zmm1 + vextracti32x4 xmm8, zmm5, 3 + vpshufd xmm9, xmm8, 19 + vpaddq xmm8, xmm8, xmm8 + vpsrad xmm9, xmm9, 31 + vpternlogd xmm8, xmm9, xmm12, 120 + add r13d, 128 +L_AES_XTS_encrypt_avx512_done_128: + mov r11d, eax + and r11d, 4294967232 + cmp r13d, r11d + je L_AES_XTS_encrypt_avx512_done_64 + ; 64 bytes of input + ; aes_enc_64 + lea rcx, QWORD PTR [rdi+r13] + lea rdx, QWORD PTR [rsi+r13] + vmovdqu64 zmm0, [rcx] + vshufi64x2 zmm5, zmm8, zmm8, 0 + vpsrlvq zmm6, zmm5, zmm15 + vpclmulqdq zmm7, zmm6, zmm13, 1 + vpslldq zmm6, zmm6, 8 + vpsllvq zmm4, zmm5, zmm14 + vpternlogq zmm4, zmm7, zmm6, 150 + ; aes_enc_block + vpternlogq zmm0, zmm16, zmm4, 150 + vaesenc zmm0, zmm0, zmm17 + vaesenc zmm0, zmm0, zmm18 + vaesenc zmm0, zmm0, zmm19 + vaesenc zmm0, zmm0, zmm20 + vaesenc zmm0, zmm0, zmm21 + vaesenc zmm0, zmm0, zmm22 + vaesenc zmm0, zmm0, zmm23 + vaesenc zmm0, zmm0, zmm24 + vaesenc zmm0, zmm0, zmm25 + cmp r10d, 11 + vmovdqa64 zmm9, zmm26 + jl L_AES_XTS_encrypt_avx512_aes_enc_64_aes_enc_block_last + vaesenc zmm0, zmm0, zmm26 + vaesenc zmm0, zmm0, zmm27 + cmp r10d, 13 + vmovdqa64 zmm9, zmm28 + jl L_AES_XTS_encrypt_avx512_aes_enc_64_aes_enc_block_last + vaesenc zmm0, zmm0, zmm28 + vaesenc zmm0, zmm0, zmm29 + vmovdqa64 zmm9, zmm30 +L_AES_XTS_encrypt_avx512_aes_enc_64_aes_enc_block_last: + vaesenclast zmm0, zmm0, zmm9 + vpxorq zmm0, zmm0, zmm4 + vmovdqu64 [rdx], zmm0 + vextracti32x4 xmm8, zmm4, 3 + vpshufd xmm9, xmm8, 19 + vpaddq xmm8, xmm8, xmm8 + vpsrad xmm9, xmm9, 31 + vpternlogd xmm8, xmm9, xmm12, 120 + add r13d, 64 +L_AES_XTS_encrypt_avx512_done_64: + mov r11d, eax + and r11d, 4294967264 + cmp r13d, r11d + je L_AES_XTS_encrypt_avx512_done_32 + ; 32 bytes of input + ; aes_enc_32 + lea rcx, QWORD PTR [rdi+r13] + lea rdx, QWORD PTR [rsi+r13] + vmovdqu64 ymm0, [rcx] + vshufi64x2 zmm5, zmm8, zmm8, 0 + vpsrlvq zmm6, zmm5, zmm15 + vpclmulqdq zmm7, zmm6, 
zmm13, 1 + vpslldq zmm6, zmm6, 8 + vpsllvq zmm4, zmm5, zmm14 + vpternlogq zmm4, zmm7, zmm6, 150 + ; aes_enc_block + vpternlogq ymm0, ymm16, ymm4, 150 + vaesenc ymm0, ymm0, ymm17 + vaesenc ymm0, ymm0, ymm18 + vaesenc ymm0, ymm0, ymm19 + vaesenc ymm0, ymm0, ymm20 + vaesenc ymm0, ymm0, ymm21 + vaesenc ymm0, ymm0, ymm22 + vaesenc ymm0, ymm0, ymm23 + vaesenc ymm0, ymm0, ymm24 + vaesenc ymm0, ymm0, ymm25 + cmp r10d, 11 + vmovdqa64 ymm9, ymm26 + jl L_AES_XTS_encrypt_avx512_aes_enc_32_aes_enc_block_last + vaesenc ymm0, ymm0, ymm26 + vaesenc ymm0, ymm0, ymm27 + cmp r10d, 13 + vmovdqa64 ymm9, ymm28 + jl L_AES_XTS_encrypt_avx512_aes_enc_32_aes_enc_block_last + vaesenc ymm0, ymm0, ymm28 + vaesenc ymm0, ymm0, ymm29 + vmovdqa64 ymm9, ymm30 +L_AES_XTS_encrypt_avx512_aes_enc_32_aes_enc_block_last: + vaesenclast ymm0, ymm0, ymm9 + vpxorq ymm0, ymm0, ymm4 + vmovdqu64 [rdx], ymm0 + vextracti32x4 xmm8, zmm4, 2 + add r13d, 32 +L_AES_XTS_encrypt_avx512_done_32: + cmp r13d, eax + mov r11d, eax + je L_AES_XTS_encrypt_avx512_done_enc + sub r11d, r13d + cmp r11d, 16 + mov r11d, eax + jl L_AES_XTS_encrypt_avx512_last_15 + and r11d, 4294967280 + ; 16 bytes of input +L_AES_XTS_encrypt_avx512_enc_16: + lea rcx, QWORD PTR [rdi+r13] + vmovdqu xmm0, OWORD PTR [rcx] + vpxor xmm0, xmm0, xmm8 + ; aes_enc_block + vpxor xmm0, xmm0, [r8] + vmovdqu xmm5, OWORD PTR [r8+16] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+32] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+48] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+64] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+80] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+96] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+112] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+128] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+144] + vaesenc xmm0, xmm0, xmm5 + cmp r10d, 11 + vmovdqu xmm5, OWORD PTR [r8+160] + jl L_AES_XTS_encrypt_avx512_aes_enc_block_last + vaesenc xmm0, xmm0, xmm5 
+ vmovdqu xmm6, OWORD PTR [r8+176] + vaesenc xmm0, xmm0, xmm6 + cmp r10d, 13 + vmovdqu xmm5, OWORD PTR [r8+192] + jl L_AES_XTS_encrypt_avx512_aes_enc_block_last + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r8+208] + vaesenc xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r8+224] +L_AES_XTS_encrypt_avx512_aes_enc_block_last: + vaesenclast xmm0, xmm0, xmm5 + vpxor xmm0, xmm0, xmm8 + lea rcx, QWORD PTR [rsi+r13] + vmovdqu OWORD PTR [rcx], xmm0 + vpshufd xmm4, xmm8, 19 + vpaddq xmm8, xmm8, xmm8 + vpsrad xmm4, xmm4, 31 + vpternlogd xmm8, xmm4, xmm12, 120 + add r13d, 16 + cmp r13d, r11d + jl L_AES_XTS_encrypt_avx512_enc_16 + cmp r13d, eax + je L_AES_XTS_encrypt_avx512_done_enc +L_AES_XTS_encrypt_avx512_last_15: + sub r13, 16 + lea rcx, QWORD PTR [rsi+r13] + vmovdqu xmm0, OWORD PTR [rcx] + add r13, 16 + vmovdqu OWORD PTR [rsp], xmm0 + xor rdx, rdx +L_AES_XTS_encrypt_avx512_last_15_byte_loop: + mov r11b, BYTE PTR [rsp+rdx] + mov cl, BYTE PTR [rdi+r13] + mov BYTE PTR [rsi+r13], r11b + mov BYTE PTR [rsp+rdx], cl + inc r13d + inc edx + cmp r13d, eax + jl L_AES_XTS_encrypt_avx512_last_15_byte_loop + sub r13, rdx + vmovdqu xmm0, OWORD PTR [rsp] + sub r13, 16 + vpxor xmm0, xmm0, xmm8 + ; aes_enc_block + vpxor xmm0, xmm0, [r8] + vmovdqu xmm5, OWORD PTR [r8+16] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+32] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+48] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+64] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+80] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+96] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+112] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+128] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+144] + vaesenc xmm0, xmm0, xmm5 + cmp r10d, 11 + vmovdqu xmm5, OWORD PTR [r8+160] + jl L_AES_XTS_encrypt_avx512_last_15_aes_enc_block_last + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r8+176] + vaesenc xmm0, xmm0, xmm6 + cmp r10d, 13 
+ vmovdqu xmm5, OWORD PTR [r8+192]
+ jl L_AES_XTS_encrypt_avx512_last_15_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r8+208]
+ vaesenc xmm0, xmm0, xmm6
+ vmovdqu xmm5, OWORD PTR [r8+224]
+L_AES_XTS_encrypt_avx512_last_15_aes_enc_block_last:
+ vaesenclast xmm0, xmm0, xmm5
+ vpxor xmm0, xmm0, xmm8
+ lea rcx, QWORD PTR [rsi+r13]
+ vmovdqu OWORD PTR [rcx], xmm0
+L_AES_XTS_encrypt_avx512_done_enc:
+ vmovdqu xmm6, OWORD PTR [rsp+64]
+ vmovdqu xmm7, OWORD PTR [rsp+80]
+ vmovdqu xmm8, OWORD PTR [rsp+96]
+ vmovdqu xmm9, OWORD PTR [rsp+112]
+ vmovdqu xmm10, OWORD PTR [rsp+128]
+ vmovdqu xmm11, OWORD PTR [rsp+144]
+ vmovdqu xmm12, OWORD PTR [rsp+160]
+ vmovdqu xmm13, OWORD PTR [rsp+176]
+ vmovdqu xmm14, OWORD PTR [rsp+192]
+ vmovdqu xmm15, OWORD PTR [rsp+208]
+ add rsp, 224
+ pop r13
+ pop r12
+ pop rsi
+ pop rdi
+ ret
+AES_XTS_encrypt_avx512 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+ ; ---------------------------------------------------------------------
+ ; AES_XTS_encrypt_update_avx512
+ ; Encrypts a byte range with AES round keys in XTS fashion (each block is
+ ; XORed with a tweak before and after the AES rounds) using AVX-512/VAES.
+ ; The tweak is read from memory at entry and written back at exit.
+ ;
+ ; Arguments, as consumed by the prologue below:
+ ;   rcx  -> input bytes                       (kept in rdi)
+ ;   rdx  -> output bytes                      (kept in rsi)
+ ;   r8   =  byte count; only eax is tested    (kept in rax)
+ ;   r9   -> round keys, 16 bytes per round    (kept in r10)
+ ;   5th arg, read from [rsp+64] after the three pushes:
+ ;        -> 16-byte tweak                     (kept in r8)
+ ;   6th arg, read from [rsp+72] after the three pushes:
+ ;        =  round count; <11 uses 10 rounds, <13 uses 12, else 14
+ ;                                             (kept in r9d)
+ ;
+ ; r12d is the running byte offset into both buffers.
+ ; xmm6-xmm15 are saved at [rsp+64]..[rsp+223] and restored before ret.
+ ; [rsp]..[rsp+15] is scratch for the trailing partial block.
+ ;
+ ; NOTE(review): byte counts are compared with signed jumps (jl), so this
+ ; presumably assumes counts below 2^31 - confirm against callers.
+ ; NOTE(review): a count of 1..15 reaches the last_15 path with r12 = 0 and
+ ; loads 16 bytes from [rsi-16]; callers presumably guarantee count >= 16 -
+ ; confirm.
+ ; ---------------------------------------------------------------------
+AES_XTS_encrypt_update_avx512 PROC
+ push rdi
+ push rsi
+ push r12
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rax, r8
+ mov r10, r9
+ mov r8, QWORD PTR [rsp+64]
+ mov r9d, DWORD PTR [rsp+72]
+ sub rsp, 224
+ vmovdqu OWORD PTR [rsp+64], xmm6
+ vmovdqu OWORD PTR [rsp+80], xmm7
+ vmovdqu OWORD PTR [rsp+96], xmm8
+ vmovdqu OWORD PTR [rsp+112], xmm9
+ vmovdqu OWORD PTR [rsp+128], xmm10
+ vmovdqu OWORD PTR [rsp+144], xmm11
+ vmovdqu OWORD PTR [rsp+160], xmm12
+ vmovdqu OWORD PTR [rsp+176], xmm13
+ vmovdqu OWORD PTR [rsp+192], xmm14
+ vmovdqu OWORD PTR [rsp+208], xmm15
+ ; Constant tables defined outside this block:
+ ;   xmm12 - mask ANDed in when a single tweak is doubled
+ ;   zmm13 - operand of the carry-less multiplies (one 128-bit value, broadcast)
+ ;   zmm14 / zmm15 - per-qword shift counts for vpsllvq / vpsrlvq
+ vmovdqu xmm12, OWORD PTR L_avx512_aes_xts_gc_xts
+ vbroadcasti32x4 zmm13, ptr_L_avx512_aes_xts_poly
+ vmovdqu64 zmm14, ptr_L_avx512_aes_xts_shl
+ vmovdqu64 zmm15, ptr_L_avx512_aes_xts_shr
+ ; xmm8 = current tweak, r12d = 0 bytes processed so far.
+ vmovdqu xmm8, OWORD PTR [r8]
+ xor r12d, r12d
+ ; Fewer than 32 bytes: skip the round-key broadcast. With r12d = 0 the
+ ; 128/64/32-byte tests below all see a rounded count of 0 and fall through
+ ; to the 16-byte / partial-block code, which loads keys from [r10] directly.
+ cmp eax, 32
+ jl L_AES_XTS_encrypt_update_avx512_done_128
+ ; Broadcast each 16-byte round key into a full vector: zmm16 = round 0 ...
+ ; zmm26 = round 10; zmm27/zmm28 and zmm29/zmm30 only for 12 / 14 rounds.
+ vbroadcasti32x4 zmm16, [r10]
+ vbroadcasti32x4 zmm17, [r10+16]
+ vbroadcasti32x4 zmm18, [r10+32]
+ vbroadcasti32x4 zmm19, [r10+48]
+ vbroadcasti32x4 zmm20, [r10+64]
+ vbroadcasti32x4 zmm21, [r10+80]
+ vbroadcasti32x4 zmm22, [r10+96]
+ vbroadcasti32x4 zmm23, [r10+112]
+ vbroadcasti32x4 zmm24, [r10+128]
+ vbroadcasti32x4 zmm25, [r10+144]
+ vbroadcasti32x4 zmm26, [r10+160]
+ cmp r9d, 11
+ jl L_AES_XTS_encrypt_update_avx512_key_cached
+ vbroadcasti32x4 zmm27, [r10+176]
+ vbroadcasti32x4 zmm28, [r10+192]
+ cmp r9d, 13
+ jl L_AES_XTS_encrypt_update_avx512_key_cached
+ vbroadcasti32x4 zmm29, [r10+208]
+ vbroadcasti32x4 zmm30, [r10+224]
+L_AES_XTS_encrypt_update_avx512_key_cached:
+ cmp eax, 256
+ mov r11d, eax
+ jl L_AES_XTS_encrypt_update_avx512_done_256
+ ; r11d = byte count rounded down to a multiple of 256 (mask 0FFFFFF00h).
+ and r11d, 4294967040
+ ; Expand xmm8 into four tweak vectors zmm4..zmm7 for the 256-byte loop.
+ ; zmm4 comes from the broadcast tweak via the per-lane shift tables
+ ; (presumably lanes = tweak * x^0..x^3; the tables are not visible here).
+ ; zmm5, zmm6, zmm7 are each the previous vector shifted left by 4 bits:
+ ; the 4 bits leaving every qword are isolated (vpsrlq 60), those of the
+ ; high qword are carry-less multiplied by zmm13, those of the low qword are
+ ; moved up into the high qword (vpslldq 8), and everything is combined by
+ ; vpternlogq with imm 150 (96h), which is a three-input XOR.
+ vshufi64x2 zmm5, zmm8, zmm8, 0
+ vpsrlvq zmm6, zmm5, zmm15
+ vpclmulqdq zmm7, zmm6, zmm13, 1
+ vpslldq zmm6, zmm6, 8
+ vpsllvq zmm4, zmm5, zmm14
+ vpternlogq zmm4, zmm7, zmm6, 150
+ vpsrlq zmm9, zmm4, 60
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm5, zmm4, 4
+ vpternlogq zmm5, zmm10, zmm9, 150
+ vpsrlq zmm9, zmm5, 60
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm6, zmm5, 4
+ vpternlogq zmm6, zmm10, zmm9, 150
+ vpsrlq zmm9, zmm6, 60
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm7, zmm6, 4
+ vpternlogq zmm7, zmm10, zmm9, 150
+L_AES_XTS_encrypt_update_avx512_enc_256:
+ ; 256 bytes of input
+ ; aes_enc_256
+ lea rcx, QWORD PTR [rdi+r12]
+ lea rdx, QWORD PTR [rsi+r12]
+ vmovdqu64 zmm0, [rcx]
+ vmovdqu64 zmm1, [rcx+64]
+ vmovdqu64 zmm2, [rcx+128]
+ vmovdqu64 zmm3, [rcx+192]
+ ; aes_enc_block
+ ; data XOR round key 0 XOR tweak in a single three-input XOR per vector.
+ vpternlogq zmm0, zmm16, zmm4, 150
+ vpternlogq zmm1, zmm16, zmm5, 150
+ vpternlogq zmm2, zmm16, zmm6, 150
+ vpternlogq zmm3, zmm16, zmm7, 150
+ vaesenc zmm0, zmm0, zmm17
+ vaesenc zmm1, zmm1, zmm17
+ vaesenc zmm2, zmm2, zmm17
+ vaesenc zmm3, zmm3, zmm17
+ vaesenc zmm0, zmm0, zmm18
+ vaesenc zmm1, zmm1, zmm18
+ vaesenc zmm2, zmm2, zmm18
+ vaesenc zmm3, zmm3, zmm18
+ vaesenc zmm0, zmm0, zmm19
+ vaesenc zmm1, zmm1, zmm19
+ vaesenc zmm2, zmm2, zmm19
+ vaesenc zmm3, zmm3, zmm19
+ vaesenc zmm0, zmm0, zmm20
+ vaesenc zmm1, zmm1, zmm20
+ vaesenc zmm2, zmm2, zmm20
+ vaesenc zmm3, zmm3, zmm20
+ vaesenc zmm0, zmm0, zmm21
+ vaesenc zmm1, zmm1, zmm21
+ vaesenc zmm2, zmm2, zmm21
+ vaesenc zmm3, zmm3, zmm21
+ vaesenc zmm0, zmm0, zmm22
+ vaesenc zmm1, zmm1, zmm22
+ vaesenc zmm2, zmm2, zmm22
+ vaesenc zmm3, zmm3, zmm22
+ vaesenc zmm0, zmm0, zmm23
+ vaesenc zmm1, zmm1, zmm23
+ vaesenc zmm2, zmm2, zmm23
+ vaesenc zmm3, zmm3, zmm23
+ vaesenc zmm0, zmm0, zmm24
+ vaesenc zmm1, zmm1, zmm24
+ vaesenc zmm2, zmm2, zmm24
+ vaesenc zmm3, zmm3, zmm24
+ vaesenc zmm0, zmm0, zmm25
+ vaesenc zmm1, zmm1, zmm25
+ vaesenc zmm2, zmm2, zmm25
+ vaesenc zmm3, zmm3, zmm25
+ ; zmm9 is preloaded with whichever round key turns out to be the last one;
+ ; vmovdqa64 does not modify flags, so the jl still tests the cmp above it.
+ cmp r9d, 11
+ vmovdqa64 zmm9, zmm26
+ jl L_AES_XTS_encrypt_update_avx512_aes_enc_256_aes_enc_block_last
+ vaesenc zmm0, zmm0, zmm26
+ vaesenc zmm1, zmm1, zmm26
+ vaesenc zmm2, zmm2, zmm26
+ vaesenc zmm3, zmm3, zmm26
+ vaesenc zmm0, zmm0, zmm27
+ vaesenc zmm1, zmm1, zmm27
+ vaesenc zmm2, zmm2, zmm27
+ vaesenc zmm3, zmm3, zmm27
+ cmp r9d, 13
+ vmovdqa64 zmm9, zmm28
+ jl L_AES_XTS_encrypt_update_avx512_aes_enc_256_aes_enc_block_last
+ vaesenc zmm0, zmm0, zmm28
+ vaesenc zmm1, zmm1, zmm28
+ vaesenc zmm2, zmm2, zmm28
+ vaesenc zmm3, zmm3, zmm28
+ vaesenc zmm0, zmm0, zmm29
+ vaesenc zmm1, zmm1, zmm29
+ vaesenc zmm2, zmm2, zmm29
+ vaesenc zmm3, zmm3, zmm29
+ vmovdqa64 zmm9, zmm30
+L_AES_XTS_encrypt_update_avx512_aes_enc_256_aes_enc_block_last:
+ vaesenclast zmm0, zmm0, zmm9
+ vaesenclast zmm1, zmm1, zmm9
+ vaesenclast zmm2, zmm2, zmm9
+ vaesenclast zmm3, zmm3, zmm9
+ ; For each vector: XOR the tweak back in, store, then advance that tweak
+ ; vector by a 16-bit shift (same fold-back scheme as above, with the top
+ ; 16 bits of each qword isolated by vpsrlq 48) ready for the next pass.
+ vpxorq zmm0, zmm0, zmm4
+ vmovdqu64 [rdx], zmm0
+ vpsrlq zmm9, zmm4, 48
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm4, zmm4, 16
+ vpternlogq zmm4, zmm10, zmm9, 150
+ vpxorq zmm1, zmm1, zmm5
+ vmovdqu64 [rdx+64], zmm1
+ vpsrlq zmm9, zmm5, 48
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm5, zmm5, 16
+ vpternlogq zmm5, zmm10, zmm9, 150
+ vpxorq zmm2, zmm2, zmm6
+ vmovdqu64 [rdx+128], zmm2
+ vpsrlq zmm9, zmm6, 48
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm6, zmm6, 16
+ vpternlogq zmm6, zmm10, zmm9, 150
+ vpxorq zmm3, zmm3, zmm7
+ vmovdqu64 [rdx+192], zmm3
+ vpsrlq zmm9, zmm7, 48
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm7, zmm7, 16
+ vpternlogq zmm7, zmm10, zmm9, 150
+ add r12d, 256
+ cmp r12d, r11d
+ jl L_AES_XTS_encrypt_update_avx512_enc_256
+ ; Lane 0 of the already-advanced zmm4 is the tweak for the next block.
+ vextracti32x4 xmm8, zmm4, 0
+L_AES_XTS_encrypt_update_avx512_done_256:
+ ; r11d = count rounded down to a multiple of 128; if that equals the
+ ; offset already processed there is no 128-byte chunk to do.
+ mov r11d, eax
+ and r11d, 4294967168
+ cmp r12d, r11d
+ je L_AES_XTS_encrypt_update_avx512_done_128
+ ; 128 bytes of input
+ ; aes_enc_128
+ lea rcx, QWORD PTR [rdi+r12]
+ lea rdx, QWORD PTR [rsi+r12]
+ vmovdqu64 zmm0, [rcx]
+ vmovdqu64 zmm1, [rcx+64]
+ ; Rebuild two tweak vectors from xmm8: zmm4, and zmm5 = zmm4 shifted by 4.
+ vshufi64x2 zmm5, zmm8, zmm8, 0
+ vpsrlvq zmm6, zmm5, zmm15
+ vpclmulqdq zmm7, zmm6, zmm13, 1
+ vpslldq zmm6, zmm6, 8
+ vpsllvq zmm4, zmm5, zmm14
+ vpternlogq zmm4, zmm7, zmm6, 150
+ vpsrlq zmm9, zmm4, 60
+ vpclmulqdq zmm10, zmm9, zmm13, 1
+ vpslldq zmm9, zmm9, 8
+ vpsllq zmm5, zmm4, 4
+ vpternlogq zmm5, zmm10, zmm9, 150
+ ; aes_enc_block
+ vpternlogq zmm0, zmm16, zmm4, 150
+ vpternlogq zmm1, zmm16, zmm5, 150
+ vaesenc zmm0, zmm0, zmm17
+ vaesenc zmm1, zmm1, zmm17
+ vaesenc zmm0, zmm0, zmm18
+ vaesenc zmm1, zmm1, zmm18
+ vaesenc zmm0, zmm0, zmm19
+ vaesenc zmm1, zmm1, zmm19
+ vaesenc zmm0, zmm0, zmm20
+ vaesenc zmm1, zmm1, zmm20
+ vaesenc zmm0, zmm0, zmm21
+ vaesenc zmm1, zmm1, zmm21
+ vaesenc zmm0, zmm0, zmm22
+ vaesenc zmm1, zmm1, zmm22
+ vaesenc zmm0, zmm0, zmm23
+ vaesenc zmm1, zmm1, zmm23
+ vaesenc zmm0, zmm0, zmm24
+ vaesenc zmm1, zmm1, zmm24
+ vaesenc zmm0, zmm0, zmm25
+ vaesenc zmm1, zmm1, zmm25
+ cmp r9d, 11
+ vmovdqa64 zmm9, zmm26
+ jl L_AES_XTS_encrypt_update_avx512_aes_enc_128_aes_enc_block_last
+ vaesenc zmm0, zmm0, zmm26
+ vaesenc zmm1, zmm1, zmm26
+ vaesenc zmm0, zmm0, zmm27
+ vaesenc zmm1, zmm1, zmm27
+ cmp r9d, 13
+ vmovdqa64 zmm9, zmm28
+ jl L_AES_XTS_encrypt_update_avx512_aes_enc_128_aes_enc_block_last
+ vaesenc zmm0, zmm0, zmm28
+ vaesenc zmm1, zmm1, zmm28
+ vaesenc zmm0, zmm0, zmm29
+ vaesenc zmm1, zmm1, zmm29
+ vmovdqa64 zmm9, zmm30
+L_AES_XTS_encrypt_update_avx512_aes_enc_128_aes_enc_block_last:
+ vaesenclast zmm0, zmm0, zmm9
+ vaesenclast zmm1, zmm1, zmm9
+ vpxorq zmm0, zmm0, zmm4
+ vmovdqu64 [rdx], zmm0
+ vpxorq zmm1, zmm1, zmm5
+ vmovdqu64 [rdx+64], zmm1
+ ; Next tweak = last lane of zmm5, doubled once: vpaddq doubles both qwords,
+ ; vpshufd/vpsrad turn sign bits of the old value into all-ones masks, and
+ ; vpternlogd imm 120 (78h) computes xmm8 XOR (xmm9 AND xmm12).
+ vextracti32x4 xmm8, zmm5, 3
+ vpshufd xmm9, xmm8, 19
+ vpaddq xmm8, xmm8, xmm8
+ vpsrad xmm9, xmm9, 31
+ vpternlogd xmm8, xmm9, xmm12, 120
+ add r12d, 128
+L_AES_XTS_encrypt_update_avx512_done_128:
+ ; Same test as above for a remaining 64-byte chunk.
+ mov r11d, eax
+ and r11d, 4294967232
+ cmp r12d, r11d
+ je L_AES_XTS_encrypt_update_avx512_done_64
+ ; 64 bytes of input
+ ; aes_enc_64
+ lea rcx, QWORD PTR [rdi+r12]
+ lea rdx, QWORD PTR [rsi+r12]
+ vmovdqu64 zmm0, [rcx]
+ vshufi64x2 zmm5, zmm8, zmm8, 0
+ vpsrlvq zmm6, zmm5, zmm15
+ vpclmulqdq zmm7, zmm6, zmm13, 1
+ vpslldq zmm6, zmm6, 8
+ vpsllvq zmm4, zmm5, zmm14
+ vpternlogq zmm4, zmm7, zmm6, 150
+ ; aes_enc_block
+ vpternlogq zmm0, zmm16, zmm4, 150
+ vaesenc zmm0, zmm0, zmm17
+ vaesenc zmm0, zmm0, zmm18
+ vaesenc zmm0, zmm0, zmm19
+ vaesenc zmm0, zmm0, zmm20
+ vaesenc zmm0, zmm0, zmm21
+ vaesenc zmm0, zmm0, zmm22
+ vaesenc zmm0, zmm0, zmm23
+ vaesenc zmm0, zmm0, zmm24
+ vaesenc zmm0, zmm0, zmm25
+ cmp r9d, 11
+ vmovdqa64 zmm9, zmm26
+ jl L_AES_XTS_encrypt_update_avx512_aes_enc_64_aes_enc_block_last
+ vaesenc zmm0, zmm0, zmm26
+ vaesenc zmm0, zmm0, zmm27
+ cmp r9d, 13
+ vmovdqa64 zmm9, zmm28
+ jl L_AES_XTS_encrypt_update_avx512_aes_enc_64_aes_enc_block_last
+ vaesenc zmm0, zmm0, zmm28
+ vaesenc zmm0, zmm0, zmm29
+ vmovdqa64 zmm9, zmm30
+L_AES_XTS_encrypt_update_avx512_aes_enc_64_aes_enc_block_last:
+ vaesenclast zmm0, zmm0, zmm9
+ vpxorq zmm0, zmm0, zmm4
+ vmovdqu64 [rdx], zmm0
+ ; Next tweak = last lane of zmm4, doubled once (same sequence as above).
+ vextracti32x4 xmm8, zmm4, 3
+ vpshufd xmm9, xmm8, 19
+ vpaddq xmm8, xmm8, xmm8
+ vpsrad xmm9, xmm9, 31
+ vpternlogd xmm8, xmm9, xmm12, 120
+ add r12d, 64
+L_AES_XTS_encrypt_update_avx512_done_64:
+ ; Same test for a remaining 32-byte chunk.
+ mov r11d, eax
+ and r11d, 4294967264
+ cmp r12d, r11d
+ je L_AES_XTS_encrypt_update_avx512_done_32
+ ; 32 bytes of input
+ ; aes_enc_32
+ lea rcx, QWORD PTR [rdi+r12]
+ lea rdx, QWORD PTR [rsi+r12]
+ vmovdqu64 ymm0, [rcx]
+ vshufi64x2 zmm5, zmm8, zmm8, 0
+ vpsrlvq zmm6, zmm5, zmm15
+ vpclmulqdq zmm7, zmm6, zmm13, 1
+ vpslldq zmm6, zmm6, 8
+ vpsllvq zmm4, zmm5, zmm14
+ vpternlogq zmm4, zmm7, zmm6, 150
+ ; aes_enc_block
+ ; Only the low 256 bits (two blocks) of data, keys and tweaks are used.
+ vpternlogq ymm0, ymm16, ymm4, 150
+ vaesenc ymm0, ymm0, ymm17
+ vaesenc ymm0, ymm0, ymm18
+ vaesenc ymm0, ymm0, ymm19
+ vaesenc ymm0, ymm0, ymm20
+ vaesenc ymm0, ymm0, ymm21
+ vaesenc ymm0, ymm0, ymm22
+ vaesenc ymm0, ymm0, ymm23
+ vaesenc ymm0, ymm0, ymm24
+ vaesenc ymm0, ymm0, ymm25
+ cmp r9d, 11
+ vmovdqa64 ymm9, ymm26
+ jl L_AES_XTS_encrypt_update_avx512_aes_enc_32_aes_enc_block_last
+ vaesenc ymm0, ymm0, ymm26
+ vaesenc ymm0, ymm0, ymm27
+ cmp r9d, 13
+ vmovdqa64 ymm9, ymm28
+ jl L_AES_XTS_encrypt_update_avx512_aes_enc_32_aes_enc_block_last
+ vaesenc ymm0, ymm0, ymm28
+ vaesenc ymm0, ymm0, ymm29
+ vmovdqa64 ymm9, ymm30
+L_AES_XTS_encrypt_update_avx512_aes_enc_32_aes_enc_block_last:
+ vaesenclast ymm0, ymm0, ymm9
+ vpxorq ymm0, ymm0, ymm4
+ vmovdqu64 [rdx], ymm0
+ ; Lanes 0 and 1 of zmm4 were consumed; lane 2 is the next tweak as-is.
+ vextracti32x4 xmm8, zmm4, 2
+ add r12d, 32
+L_AES_XTS_encrypt_update_avx512_done_32:
+ ; Everything consumed -> done. Fewer than 16 bytes left -> partial-block
+ ; path. Otherwise r11d = count rounded down to a multiple of 16.
+ cmp r12d, eax
+ mov r11d, eax
+ je L_AES_XTS_encrypt_update_avx512_done_enc
+ sub r11d, r12d
+ cmp r11d, 16
+ mov r11d, eax
+ jl L_AES_XTS_encrypt_update_avx512_last_15
+ and r11d, 4294967280
+ ; 16 bytes of input
+L_AES_XTS_encrypt_update_avx512_enc_16:
+ ; One block per pass; round keys are loaded straight from [r10] here
+ ; rather than taken from the zmm16..zmm30 copies.
+ lea rcx, QWORD PTR [rdi+r12]
+ vmovdqu xmm0, OWORD PTR [rcx]
+ vpxor xmm0, xmm0, xmm8
+ ; aes_enc_block
+ vpxor xmm0, xmm0, [r10]
+ vmovdqu xmm5, OWORD PTR [r10+16]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+32]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+48]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+64]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+80]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+96]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+112]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+128]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+144]
+ vaesenc xmm0, xmm0, xmm5
+ cmp r9d, 11
+ vmovdqu xmm5, OWORD PTR [r10+160]
+ jl L_AES_XTS_encrypt_update_avx512_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r10+176]
+ vaesenc xmm0, xmm0, xmm6
+ cmp r9d, 13
+ vmovdqu xmm5, OWORD PTR [r10+192]
+ jl L_AES_XTS_encrypt_update_avx512_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r10+208]
+ vaesenc xmm0, xmm0, xmm6
+ vmovdqu xmm5, OWORD PTR [r10+224]
+L_AES_XTS_encrypt_update_avx512_aes_enc_block_last:
+ vaesenclast xmm0, xmm0, xmm5
+ vpxor xmm0, xmm0, xmm8
+ lea rcx, QWORD PTR [rsi+r12]
+ vmovdqu OWORD PTR [rcx], xmm0
+ ; Double the tweak for the next block (same sequence as after the wider
+ ; chunks, with xmm4 holding the masks).
+ vpshufd xmm4, xmm8, 19
+ vpaddq xmm8, xmm8, xmm8
+ vpsrad xmm4, xmm4, 31
+ vpternlogd xmm8, xmm4, xmm12, 120
+ add r12d, 16
+ cmp r12d, r11d
+ jl L_AES_XTS_encrypt_update_avx512_enc_16
+ cmp r12d, eax
+ je L_AES_XTS_encrypt_update_avx512_done_enc
+L_AES_XTS_encrypt_update_avx512_last_15:
+ ; Trailing partial block (ciphertext stealing): copy the previously
+ ; written 16-byte output block at [rsi+r12-16] to the stack scratch.
+ sub r12, 16
+ lea rcx, QWORD PTR [rsi+r12]
+ vmovdqu xmm0, OWORD PTR [rcx]
+ add r12, 16
+ vmovdqu OWORD PTR [rsp], xmm0
+ xor rdx, rdx
+L_AES_XTS_encrypt_update_avx512_last_15_byte_loop:
+ ; For each remaining input byte: the scratch byte goes to the output tail
+ ; and the input byte takes its place in the scratch block.
+ mov r11b, BYTE PTR [rsp+rdx]
+ mov cl, BYTE PTR [rdi+r12]
+ mov BYTE PTR [rsi+r12], r11b
+ mov BYTE PTR [rsp+rdx], cl
+ inc r12d
+ inc edx
+ cmp r12d, eax
+ jl L_AES_XTS_encrypt_update_avx512_last_15_byte_loop
+ ; r12 = offset of the previous full block; the merged scratch block is
+ ; encrypted with the current tweak (xmm8) and stored over that block.
+ sub r12, rdx
+ vmovdqu xmm0, OWORD PTR [rsp]
+ sub r12, 16
+ vpxor xmm0, xmm0, xmm8
+ ; aes_enc_block
+ vpxor xmm0, xmm0, [r10]
+ vmovdqu xmm5, OWORD PTR [r10+16]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+32]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+48]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+64]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+80]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+96]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+112]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+128]
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm5, OWORD PTR [r10+144]
+ vaesenc xmm0, xmm0, xmm5
+ cmp r9d, 11
+ vmovdqu xmm5, OWORD PTR [r10+160]
+ jl L_AES_XTS_encrypt_update_avx512_last_15_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r10+176]
+ vaesenc xmm0, xmm0, xmm6
+ cmp r9d, 13
+ vmovdqu xmm5, OWORD PTR [r10+192]
+ jl L_AES_XTS_encrypt_update_avx512_last_15_aes_enc_block_last
+ vaesenc xmm0, xmm0, xmm5
+ vmovdqu xmm6, OWORD PTR [r10+208]
+ vaesenc xmm0, xmm0, xmm6
+ vmovdqu xmm5, OWORD PTR [r10+224]
+L_AES_XTS_encrypt_update_avx512_last_15_aes_enc_block_last:
+ vaesenclast xmm0, xmm0, xmm5
+ vpxor xmm0, xmm0, xmm8
+ lea rcx, QWORD PTR [rsi+r12]
+ vmovdqu OWORD PTR [rcx], xmm0
+L_AES_XTS_encrypt_update_avx512_done_enc:
+ ; Write the current tweak back through the tweak pointer, then restore
+ ; the saved xmm6-xmm15 and the pushed registers.
+ vmovdqu OWORD PTR [r8], xmm8
+ vmovdqu xmm6, OWORD PTR [rsp+64]
+ vmovdqu xmm7, OWORD PTR [rsp+80]
+ vmovdqu xmm8, OWORD PTR [rsp+96]
+ vmovdqu xmm9, OWORD PTR [rsp+112]
+ vmovdqu xmm10, OWORD PTR [rsp+128]
+ vmovdqu xmm11, OWORD PTR [rsp+144]
+ vmovdqu xmm12, OWORD PTR [rsp+160]
+ vmovdqu xmm13, OWORD PTR [rsp+176]
+ vmovdqu xmm14, OWORD PTR [rsp+192]
+ vmovdqu xmm15, OWORD PTR [rsp+208]
+ add rsp, 224
+ pop r12
+ pop rsi
+ pop rdi
+ ret
+AES_XTS_encrypt_update_avx512 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_XTS_decrypt_avx512 PROC
+ push rdi
+ push rsi
+ push r12
+ push r13
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rax, r8
+ mov r12, r9
+ mov r8, QWORD PTR [rsp+72]
+ mov r9, QWORD PTR [rsp+80]
+ mov r10d, DWORD PTR [rsp+88]
+ sub rsp, 224
+ vmovdqu OWORD PTR [rsp+64], xmm6
+ vmovdqu OWORD PTR [rsp+80], xmm7
+ vmovdqu OWORD PTR [rsp+96], xmm8
+ vmovdqu OWORD PTR [rsp+112], xmm9
+ vmovdqu OWORD PTR [rsp+128], xmm10
+ vmovdqu OWORD PTR [rsp+144], xmm11
+ vmovdqu OWORD PTR [rsp+160], xmm12
+ vmovdqu OWORD PTR [rsp+176], xmm13
+ vmovdqu OWORD PTR [rsp+192], xmm14
+ vmovdqu OWORD PTR [rsp+208], xmm15
+ vmovdqu xmm12, OWORD PTR L_avx512_aes_xts_gc_xts
+ vbroadcasti32x4 zmm13, ptr_L_avx512_aes_xts_poly +
vmovdqu64 zmm14, ptr_L_avx512_aes_xts_shl + vmovdqu64 zmm15, ptr_L_avx512_aes_xts_shr + vmovdqu xmm8, OWORD PTR [r12] + ; aes_enc_block + vpxor xmm8, xmm8, [r9] + vmovdqu xmm5, OWORD PTR [r9+16] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+32] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+48] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+64] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+80] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+96] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+112] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+128] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+144] + vaesenc xmm8, xmm8, xmm5 + cmp r10d, 11 + vmovdqu xmm5, OWORD PTR [r9+160] + jl L_AES_XTS_decrypt_avx512_tweak_aes_enc_block_last + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm6, OWORD PTR [r9+176] + vaesenc xmm8, xmm8, xmm6 + cmp r10d, 13 + vmovdqu xmm5, OWORD PTR [r9+192] + jl L_AES_XTS_decrypt_avx512_tweak_aes_enc_block_last + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm6, OWORD PTR [r9+208] + vaesenc xmm8, xmm8, xmm6 + vmovdqu xmm5, OWORD PTR [r9+224] +L_AES_XTS_decrypt_avx512_tweak_aes_enc_block_last: + vaesenclast xmm8, xmm8, xmm5 + xor r13d, r13d + mov r11d, eax + and r11d, 4294967280 + cmp r11d, eax + je L_AES_XTS_decrypt_avx512_mul16_256 + sub r11d, 16 + cmp r11d, 16 + jl L_AES_XTS_decrypt_avx512_last_31_start +L_AES_XTS_decrypt_avx512_mul16_256: + cmp r11d, 32 + jl L_AES_XTS_decrypt_avx512_done_128 + vbroadcasti32x4 zmm16, [r8] + vbroadcasti32x4 zmm17, [r8+16] + vbroadcasti32x4 zmm18, [r8+32] + vbroadcasti32x4 zmm19, [r8+48] + vbroadcasti32x4 zmm20, [r8+64] + vbroadcasti32x4 zmm21, [r8+80] + vbroadcasti32x4 zmm22, [r8+96] + vbroadcasti32x4 zmm23, [r8+112] + vbroadcasti32x4 zmm24, [r8+128] + vbroadcasti32x4 zmm25, [r8+144] + vbroadcasti32x4 zmm26, [r8+160] + cmp r10d, 11 + jl L_AES_XTS_decrypt_avx512_key_cached + vbroadcasti32x4 zmm27, [r8+176] + vbroadcasti32x4 zmm28, [r8+192] + cmp r10d, 13 
+ jl L_AES_XTS_decrypt_avx512_key_cached + vbroadcasti32x4 zmm29, [r8+208] + vbroadcasti32x4 zmm30, [r8+224] +L_AES_XTS_decrypt_avx512_key_cached: + cmp r11d, 256 + jl L_AES_XTS_decrypt_avx512_done_256 + and r11d, 4294967040 + vshufi64x2 zmm5, zmm8, zmm8, 0 + vpsrlvq zmm6, zmm5, zmm15 + vpclmulqdq zmm7, zmm6, zmm13, 1 + vpslldq zmm6, zmm6, 8 + vpsllvq zmm4, zmm5, zmm14 + vpternlogq zmm4, zmm7, zmm6, 150 + vpsrlq zmm9, zmm4, 60 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm5, zmm4, 4 + vpternlogq zmm5, zmm10, zmm9, 150 + vpsrlq zmm9, zmm5, 60 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm6, zmm5, 4 + vpternlogq zmm6, zmm10, zmm9, 150 + vpsrlq zmm9, zmm6, 60 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm7, zmm6, 4 + vpternlogq zmm7, zmm10, zmm9, 150 +L_AES_XTS_decrypt_avx512_dec_256: + ; 256 bytes of input + ; aes_dec_256 + lea rcx, QWORD PTR [rdi+r13] + lea rdx, QWORD PTR [rsi+r13] + vmovdqu64 zmm0, [rcx] + vmovdqu64 zmm1, [rcx+64] + vmovdqu64 zmm2, [rcx+128] + vmovdqu64 zmm3, [rcx+192] + ; aes_dec_block + vpternlogq zmm0, zmm16, zmm4, 150 + vpternlogq zmm1, zmm16, zmm5, 150 + vpternlogq zmm2, zmm16, zmm6, 150 + vpternlogq zmm3, zmm16, zmm7, 150 + vaesdec zmm0, zmm0, zmm17 + vaesdec zmm1, zmm1, zmm17 + vaesdec zmm2, zmm2, zmm17 + vaesdec zmm3, zmm3, zmm17 + vaesdec zmm0, zmm0, zmm18 + vaesdec zmm1, zmm1, zmm18 + vaesdec zmm2, zmm2, zmm18 + vaesdec zmm3, zmm3, zmm18 + vaesdec zmm0, zmm0, zmm19 + vaesdec zmm1, zmm1, zmm19 + vaesdec zmm2, zmm2, zmm19 + vaesdec zmm3, zmm3, zmm19 + vaesdec zmm0, zmm0, zmm20 + vaesdec zmm1, zmm1, zmm20 + vaesdec zmm2, zmm2, zmm20 + vaesdec zmm3, zmm3, zmm20 + vaesdec zmm0, zmm0, zmm21 + vaesdec zmm1, zmm1, zmm21 + vaesdec zmm2, zmm2, zmm21 + vaesdec zmm3, zmm3, zmm21 + vaesdec zmm0, zmm0, zmm22 + vaesdec zmm1, zmm1, zmm22 + vaesdec zmm2, zmm2, zmm22 + vaesdec zmm3, zmm3, zmm22 + vaesdec zmm0, zmm0, zmm23 + vaesdec zmm1, zmm1, zmm23 + vaesdec zmm2, zmm2, zmm23 + 
vaesdec zmm3, zmm3, zmm23 + vaesdec zmm0, zmm0, zmm24 + vaesdec zmm1, zmm1, zmm24 + vaesdec zmm2, zmm2, zmm24 + vaesdec zmm3, zmm3, zmm24 + vaesdec zmm0, zmm0, zmm25 + vaesdec zmm1, zmm1, zmm25 + vaesdec zmm2, zmm2, zmm25 + vaesdec zmm3, zmm3, zmm25 + cmp r10d, 11 + vmovdqa64 zmm9, zmm26 + jl L_AES_XTS_decrypt_avx512_aes_dec_256_aes_dec_block_last + vaesdec zmm0, zmm0, zmm26 + vaesdec zmm1, zmm1, zmm26 + vaesdec zmm2, zmm2, zmm26 + vaesdec zmm3, zmm3, zmm26 + vaesdec zmm0, zmm0, zmm27 + vaesdec zmm1, zmm1, zmm27 + vaesdec zmm2, zmm2, zmm27 + vaesdec zmm3, zmm3, zmm27 + cmp r10d, 13 + vmovdqa64 zmm9, zmm28 + jl L_AES_XTS_decrypt_avx512_aes_dec_256_aes_dec_block_last + vaesdec zmm0, zmm0, zmm28 + vaesdec zmm1, zmm1, zmm28 + vaesdec zmm2, zmm2, zmm28 + vaesdec zmm3, zmm3, zmm28 + vaesdec zmm0, zmm0, zmm29 + vaesdec zmm1, zmm1, zmm29 + vaesdec zmm2, zmm2, zmm29 + vaesdec zmm3, zmm3, zmm29 + vmovdqa64 zmm9, zmm30 +L_AES_XTS_decrypt_avx512_aes_dec_256_aes_dec_block_last: + vaesdeclast zmm0, zmm0, zmm9 + vaesdeclast zmm1, zmm1, zmm9 + vaesdeclast zmm2, zmm2, zmm9 + vaesdeclast zmm3, zmm3, zmm9 + vpxorq zmm0, zmm0, zmm4 + vmovdqu64 [rdx], zmm0 + vpsrlq zmm9, zmm4, 48 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm4, zmm4, 16 + vpternlogq zmm4, zmm10, zmm9, 150 + vpxorq zmm1, zmm1, zmm5 + vmovdqu64 [rdx+64], zmm1 + vpsrlq zmm9, zmm5, 48 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm5, zmm5, 16 + vpternlogq zmm5, zmm10, zmm9, 150 + vpxorq zmm2, zmm2, zmm6 + vmovdqu64 [rdx+128], zmm2 + vpsrlq zmm9, zmm6, 48 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm6, zmm6, 16 + vpternlogq zmm6, zmm10, zmm9, 150 + vpxorq zmm3, zmm3, zmm7 + vmovdqu64 [rdx+192], zmm3 + vpsrlq zmm9, zmm7, 48 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm7, zmm7, 16 + vpternlogq zmm7, zmm10, zmm9, 150 + add r13d, 256 + cmp r13d, r11d + jl L_AES_XTS_decrypt_avx512_dec_256 + vextracti32x4 xmm8, zmm4, 0 
+L_AES_XTS_decrypt_avx512_done_256: + cmp r13d, eax + mov r11d, eax + je L_AES_XTS_decrypt_avx512_done_dec + and r11d, 4294967280 + cmp r11d, eax + je L_AES_XTS_decrypt_avx512_mul16_128 + sub r11d, 16 + sub r11d, r13d + cmp r11d, 16 + jl L_AES_XTS_decrypt_avx512_last_31_start + add r11d, r13d +L_AES_XTS_decrypt_avx512_mul16_128: + and r11d, 4294967168 + cmp r13d, r11d + je L_AES_XTS_decrypt_avx512_done_128 + ; 128 bytes of input + ; aes_dec_128 + lea rcx, QWORD PTR [rdi+r13] + lea rdx, QWORD PTR [rsi+r13] + vmovdqu64 zmm0, [rcx] + vmovdqu64 zmm1, [rcx+64] + vshufi64x2 zmm5, zmm8, zmm8, 0 + vpsrlvq zmm6, zmm5, zmm15 + vpclmulqdq zmm7, zmm6, zmm13, 1 + vpslldq zmm6, zmm6, 8 + vpsllvq zmm4, zmm5, zmm14 + vpternlogq zmm4, zmm7, zmm6, 150 + vpsrlq zmm9, zmm4, 60 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm5, zmm4, 4 + vpternlogq zmm5, zmm10, zmm9, 150 + ; aes_dec_block + vpternlogq zmm0, zmm16, zmm4, 150 + vpternlogq zmm1, zmm16, zmm5, 150 + vaesdec zmm0, zmm0, zmm17 + vaesdec zmm1, zmm1, zmm17 + vaesdec zmm0, zmm0, zmm18 + vaesdec zmm1, zmm1, zmm18 + vaesdec zmm0, zmm0, zmm19 + vaesdec zmm1, zmm1, zmm19 + vaesdec zmm0, zmm0, zmm20 + vaesdec zmm1, zmm1, zmm20 + vaesdec zmm0, zmm0, zmm21 + vaesdec zmm1, zmm1, zmm21 + vaesdec zmm0, zmm0, zmm22 + vaesdec zmm1, zmm1, zmm22 + vaesdec zmm0, zmm0, zmm23 + vaesdec zmm1, zmm1, zmm23 + vaesdec zmm0, zmm0, zmm24 + vaesdec zmm1, zmm1, zmm24 + vaesdec zmm0, zmm0, zmm25 + vaesdec zmm1, zmm1, zmm25 + cmp r10d, 11 + vmovdqa64 zmm9, zmm26 + jl L_AES_XTS_decrypt_avx512_aes_dec_128_aes_dec_block_last + vaesdec zmm0, zmm0, zmm26 + vaesdec zmm1, zmm1, zmm26 + vaesdec zmm0, zmm0, zmm27 + vaesdec zmm1, zmm1, zmm27 + cmp r10d, 13 + vmovdqa64 zmm9, zmm28 + jl L_AES_XTS_decrypt_avx512_aes_dec_128_aes_dec_block_last + vaesdec zmm0, zmm0, zmm28 + vaesdec zmm1, zmm1, zmm28 + vaesdec zmm0, zmm0, zmm29 + vaesdec zmm1, zmm1, zmm29 + vmovdqa64 zmm9, zmm30 +L_AES_XTS_decrypt_avx512_aes_dec_128_aes_dec_block_last: + vaesdeclast 
zmm0, zmm0, zmm9 + vaesdeclast zmm1, zmm1, zmm9 + vpxorq zmm0, zmm0, zmm4 + vmovdqu64 [rdx], zmm0 + vpxorq zmm1, zmm1, zmm5 + vmovdqu64 [rdx+64], zmm1 + vextracti32x4 xmm8, zmm5, 3 + vpshufd xmm9, xmm8, 19 + vpaddq xmm8, xmm8, xmm8 + vpsrad xmm9, xmm9, 31 + vpternlogd xmm8, xmm9, xmm12, 120 + add r13d, 128 +L_AES_XTS_decrypt_avx512_done_128: + cmp r13d, eax + mov r11d, eax + je L_AES_XTS_decrypt_avx512_done_dec + and r11d, 4294967280 + cmp r11d, eax + je L_AES_XTS_decrypt_avx512_mul16_64 + sub r11d, 16 + sub r11d, r13d + cmp r11d, 16 + jl L_AES_XTS_decrypt_avx512_last_31_start + add r11d, r13d +L_AES_XTS_decrypt_avx512_mul16_64: + and r11d, 4294967232 + cmp r13d, r11d + je L_AES_XTS_decrypt_avx512_done_64 + ; 64 bytes of input + ; aes_dec_64 + lea rcx, QWORD PTR [rdi+r13] + lea rdx, QWORD PTR [rsi+r13] + vmovdqu64 zmm0, [rcx] + vshufi64x2 zmm5, zmm8, zmm8, 0 + vpsrlvq zmm6, zmm5, zmm15 + vpclmulqdq zmm7, zmm6, zmm13, 1 + vpslldq zmm6, zmm6, 8 + vpsllvq zmm4, zmm5, zmm14 + vpternlogq zmm4, zmm7, zmm6, 150 + ; aes_dec_block + vpternlogq zmm0, zmm16, zmm4, 150 + vaesdec zmm0, zmm0, zmm17 + vaesdec zmm0, zmm0, zmm18 + vaesdec zmm0, zmm0, zmm19 + vaesdec zmm0, zmm0, zmm20 + vaesdec zmm0, zmm0, zmm21 + vaesdec zmm0, zmm0, zmm22 + vaesdec zmm0, zmm0, zmm23 + vaesdec zmm0, zmm0, zmm24 + vaesdec zmm0, zmm0, zmm25 + cmp r10d, 11 + vmovdqa64 zmm9, zmm26 + jl L_AES_XTS_decrypt_avx512_aes_dec_64_aes_dec_block_last + vaesdec zmm0, zmm0, zmm26 + vaesdec zmm0, zmm0, zmm27 + cmp r10d, 13 + vmovdqa64 zmm9, zmm28 + jl L_AES_XTS_decrypt_avx512_aes_dec_64_aes_dec_block_last + vaesdec zmm0, zmm0, zmm28 + vaesdec zmm0, zmm0, zmm29 + vmovdqa64 zmm9, zmm30 +L_AES_XTS_decrypt_avx512_aes_dec_64_aes_dec_block_last: + vaesdeclast zmm0, zmm0, zmm9 + vpxorq zmm0, zmm0, zmm4 + vmovdqu64 [rdx], zmm0 + vextracti32x4 xmm8, zmm4, 3 + vpshufd xmm9, xmm8, 19 + vpaddq xmm8, xmm8, xmm8 + vpsrad xmm9, xmm9, 31 + vpternlogd xmm8, xmm9, xmm12, 120 + add r13d, 64 +L_AES_XTS_decrypt_avx512_done_64: + cmp 
r13d, eax + mov r11d, eax + je L_AES_XTS_decrypt_avx512_done_dec + and r11d, 4294967280 + cmp r11d, eax + je L_AES_XTS_decrypt_avx512_mul16_32 + sub r11d, 16 + sub r11d, r13d + cmp r11d, 16 + jl L_AES_XTS_decrypt_avx512_last_31_start + add r11d, r13d +L_AES_XTS_decrypt_avx512_mul16_32: + and r11d, 4294967264 + cmp r13d, r11d + je L_AES_XTS_decrypt_avx512_done_32 + ; 32 bytes of input + ; aes_dec_32 + lea rcx, QWORD PTR [rdi+r13] + lea rdx, QWORD PTR [rsi+r13] + vmovdqu64 ymm0, [rcx] + vshufi64x2 zmm5, zmm8, zmm8, 0 + vpsrlvq zmm6, zmm5, zmm15 + vpclmulqdq zmm7, zmm6, zmm13, 1 + vpslldq zmm6, zmm6, 8 + vpsllvq zmm4, zmm5, zmm14 + vpternlogq zmm4, zmm7, zmm6, 150 + ; aes_dec_block + vpternlogq ymm0, ymm16, ymm4, 150 + vaesdec ymm0, ymm0, ymm17 + vaesdec ymm0, ymm0, ymm18 + vaesdec ymm0, ymm0, ymm19 + vaesdec ymm0, ymm0, ymm20 + vaesdec ymm0, ymm0, ymm21 + vaesdec ymm0, ymm0, ymm22 + vaesdec ymm0, ymm0, ymm23 + vaesdec ymm0, ymm0, ymm24 + vaesdec ymm0, ymm0, ymm25 + cmp r10d, 11 + vmovdqa64 ymm9, ymm26 + jl L_AES_XTS_decrypt_avx512_aes_dec_32_aes_dec_block_last + vaesdec ymm0, ymm0, ymm26 + vaesdec ymm0, ymm0, ymm27 + cmp r10d, 13 + vmovdqa64 ymm9, ymm28 + jl L_AES_XTS_decrypt_avx512_aes_dec_32_aes_dec_block_last + vaesdec ymm0, ymm0, ymm28 + vaesdec ymm0, ymm0, ymm29 + vmovdqa64 ymm9, ymm30 +L_AES_XTS_decrypt_avx512_aes_dec_32_aes_dec_block_last: + vaesdeclast ymm0, ymm0, ymm9 + vpxorq ymm0, ymm0, ymm4 + vmovdqu64 [rdx], ymm0 + vextracti32x4 xmm8, zmm4, 2 + add r13d, 32 +L_AES_XTS_decrypt_avx512_done_32: + cmp r13d, eax + mov r11d, eax + je L_AES_XTS_decrypt_avx512_done_dec + and r11d, 4294967280 + cmp r11d, eax + je L_AES_XTS_decrypt_avx512_mul16 + sub r11d, 16 + sub r11d, r13d + cmp r11d, 16 + jl L_AES_XTS_decrypt_avx512_last_31_start + add r11d, r13d +L_AES_XTS_decrypt_avx512_mul16: +L_AES_XTS_decrypt_avx512_dec_16: + ; 16 bytes of input + lea rcx, QWORD PTR [rdi+r13] + vmovdqu xmm0, OWORD PTR [rcx] + vpxor xmm0, xmm0, xmm8 + ; aes_dec_block + vpxor xmm0, xmm0, 
[r8] + vmovdqu xmm5, OWORD PTR [r8+16] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+32] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+48] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+64] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+80] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+96] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+112] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+128] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+144] + vaesdec xmm0, xmm0, xmm5 + cmp r10d, 11 + vmovdqu xmm5, OWORD PTR [r8+160] + jl L_AES_XTS_decrypt_avx512_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r8+176] + vaesdec xmm0, xmm0, xmm6 + cmp r10d, 13 + vmovdqu xmm5, OWORD PTR [r8+192] + jl L_AES_XTS_decrypt_avx512_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r8+208] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r8+224] +L_AES_XTS_decrypt_avx512_aes_dec_block_last: + vaesdeclast xmm0, xmm0, xmm5 + vpxor xmm0, xmm0, xmm8 + lea rcx, QWORD PTR [rsi+r13] + vmovdqu OWORD PTR [rcx], xmm0 + vpshufd xmm4, xmm8, 19 + vpaddq xmm8, xmm8, xmm8 + vpsrad xmm4, xmm4, 31 + vpternlogd xmm8, xmm4, xmm12, 120 + add r13d, 16 + cmp r13d, r11d + jl L_AES_XTS_decrypt_avx512_dec_16 + cmp r13d, eax + je L_AES_XTS_decrypt_avx512_done_dec +L_AES_XTS_decrypt_avx512_last_31_start: + vpshufd xmm4, xmm8, 19 + vpaddq xmm7, xmm8, xmm8 + vpsrad xmm4, xmm4, 31 + vpternlogd xmm7, xmm4, xmm12, 120 + lea rcx, QWORD PTR [rdi+r13] + vmovdqu xmm0, OWORD PTR [rcx] + vpxor xmm0, xmm0, xmm7 + ; aes_dec_block + vpxor xmm0, xmm0, [r8] + vmovdqu xmm5, OWORD PTR [r8+16] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+32] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+48] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+64] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+80] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+96] + vaesdec 
xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+112] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+128] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+144] + vaesdec xmm0, xmm0, xmm5 + cmp r10d, 11 + vmovdqu xmm5, OWORD PTR [r8+160] + jl L_AES_XTS_decrypt_avx512_last_31_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r8+176] + vaesdec xmm0, xmm0, xmm6 + cmp r10d, 13 + vmovdqu xmm5, OWORD PTR [r8+192] + jl L_AES_XTS_decrypt_avx512_last_31_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r8+208] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r8+224] +L_AES_XTS_decrypt_avx512_last_31_aes_dec_block_last: + vaesdeclast xmm0, xmm0, xmm5 + vpxor xmm0, xmm0, xmm7 + vmovdqu OWORD PTR [rsp], xmm0 + add r13, 16 + xor rdx, rdx +L_AES_XTS_decrypt_avx512_last_31_byte_loop: + mov r11b, BYTE PTR [rsp+rdx] + mov cl, BYTE PTR [rdi+r13] + mov BYTE PTR [rsi+r13], r11b + mov BYTE PTR [rsp+rdx], cl + inc r13d + inc edx + cmp r13d, eax + jl L_AES_XTS_decrypt_avx512_last_31_byte_loop + sub r13, rdx + vmovdqu xmm0, OWORD PTR [rsp] + vpxor xmm0, xmm0, xmm8 + ; aes_dec_block + vpxor xmm0, xmm0, [r8] + vmovdqu xmm5, OWORD PTR [r8+16] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+32] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+48] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+64] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+80] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+96] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+112] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+128] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+144] + vaesdec xmm0, xmm0, xmm5 + cmp r10d, 11 + vmovdqu xmm5, OWORD PTR [r8+160] + jl L_AES_XTS_decrypt_avx512_last_31_2_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r8+176] + vaesdec xmm0, xmm0, xmm6 + cmp r10d, 13 + vmovdqu xmm5, OWORD PTR [r8+192] + jl 
L_AES_XTS_decrypt_avx512_last_31_2_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r8+208] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r8+224] +L_AES_XTS_decrypt_avx512_last_31_2_aes_dec_block_last: + vaesdeclast xmm0, xmm0, xmm5 + vpxor xmm0, xmm0, xmm8 + sub r13, 16 + lea rcx, QWORD PTR [rsi+r13] + vmovdqu OWORD PTR [rcx], xmm0 +L_AES_XTS_decrypt_avx512_done_dec: + vmovdqu xmm6, OWORD PTR [rsp+64] + vmovdqu xmm7, OWORD PTR [rsp+80] + vmovdqu xmm8, OWORD PTR [rsp+96] + vmovdqu xmm9, OWORD PTR [rsp+112] + vmovdqu xmm10, OWORD PTR [rsp+128] + vmovdqu xmm11, OWORD PTR [rsp+144] + vmovdqu xmm12, OWORD PTR [rsp+160] + vmovdqu xmm13, OWORD PTR [rsp+176] + vmovdqu xmm14, OWORD PTR [rsp+192] + vmovdqu xmm15, OWORD PTR [rsp+208] + add rsp, 224 + pop r13 + pop r12 + pop rsi + pop rdi + ret +AES_XTS_decrypt_avx512 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +AES_XTS_decrypt_update_avx512 PROC + push rdi + push rsi + push r12 + mov rdi, rcx + mov rsi, rdx + mov rax, r8 + mov r10, r9 + mov r8, QWORD PTR [rsp+64] + mov r9d, DWORD PTR [rsp+72] + sub rsp, 224 + vmovdqu OWORD PTR [rsp+64], xmm6 + vmovdqu OWORD PTR [rsp+80], xmm7 + vmovdqu OWORD PTR [rsp+96], xmm8 + vmovdqu OWORD PTR [rsp+112], xmm9 + vmovdqu OWORD PTR [rsp+128], xmm10 + vmovdqu OWORD PTR [rsp+144], xmm11 + vmovdqu OWORD PTR [rsp+160], xmm12 + vmovdqu OWORD PTR [rsp+176], xmm13 + vmovdqu OWORD PTR [rsp+192], xmm14 + vmovdqu OWORD PTR [rsp+208], xmm15 + vmovdqu xmm12, OWORD PTR L_avx512_aes_xts_gc_xts + vbroadcasti32x4 zmm13, ptr_L_avx512_aes_xts_poly + vmovdqu64 zmm14, ptr_L_avx512_aes_xts_shl + vmovdqu64 zmm15, ptr_L_avx512_aes_xts_shr + vmovdqu xmm8, OWORD PTR [r8] + xor r12d, r12d + mov r11d, eax + and r11d, 4294967280 + cmp r11d, eax + je L_AES_XTS_decrypt_update_avx512_mul16_256 + sub r11d, 16 + cmp r11d, 16 + jl L_AES_XTS_decrypt_update_avx512_last_31_start +L_AES_XTS_decrypt_update_avx512_mul16_256: + cmp r11d, 32 + jl L_AES_XTS_decrypt_update_avx512_done_128 + 
vbroadcasti32x4 zmm16, [r10] + vbroadcasti32x4 zmm17, [r10+16] + vbroadcasti32x4 zmm18, [r10+32] + vbroadcasti32x4 zmm19, [r10+48] + vbroadcasti32x4 zmm20, [r10+64] + vbroadcasti32x4 zmm21, [r10+80] + vbroadcasti32x4 zmm22, [r10+96] + vbroadcasti32x4 zmm23, [r10+112] + vbroadcasti32x4 zmm24, [r10+128] + vbroadcasti32x4 zmm25, [r10+144] + vbroadcasti32x4 zmm26, [r10+160] + cmp r9d, 11 + jl L_AES_XTS_decrypt_update_avx512_key_cached + vbroadcasti32x4 zmm27, [r10+176] + vbroadcasti32x4 zmm28, [r10+192] + cmp r9d, 13 + jl L_AES_XTS_decrypt_update_avx512_key_cached + vbroadcasti32x4 zmm29, [r10+208] + vbroadcasti32x4 zmm30, [r10+224] +L_AES_XTS_decrypt_update_avx512_key_cached: + cmp r11d, 256 + jl L_AES_XTS_decrypt_update_avx512_done_256 + and r11d, 4294967040 + vshufi64x2 zmm5, zmm8, zmm8, 0 + vpsrlvq zmm6, zmm5, zmm15 + vpclmulqdq zmm7, zmm6, zmm13, 1 + vpslldq zmm6, zmm6, 8 + vpsllvq zmm4, zmm5, zmm14 + vpternlogq zmm4, zmm7, zmm6, 150 + vpsrlq zmm9, zmm4, 60 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm5, zmm4, 4 + vpternlogq zmm5, zmm10, zmm9, 150 + vpsrlq zmm9, zmm5, 60 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm6, zmm5, 4 + vpternlogq zmm6, zmm10, zmm9, 150 + vpsrlq zmm9, zmm6, 60 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm7, zmm6, 4 + vpternlogq zmm7, zmm10, zmm9, 150 +L_AES_XTS_decrypt_update_avx512_dec_256: + ; 256 bytes of input + ; aes_dec_256 + lea rcx, QWORD PTR [rdi+r12] + lea rdx, QWORD PTR [rsi+r12] + vmovdqu64 zmm0, [rcx] + vmovdqu64 zmm1, [rcx+64] + vmovdqu64 zmm2, [rcx+128] + vmovdqu64 zmm3, [rcx+192] + ; aes_dec_block + vpternlogq zmm0, zmm16, zmm4, 150 + vpternlogq zmm1, zmm16, zmm5, 150 + vpternlogq zmm2, zmm16, zmm6, 150 + vpternlogq zmm3, zmm16, zmm7, 150 + vaesdec zmm0, zmm0, zmm17 + vaesdec zmm1, zmm1, zmm17 + vaesdec zmm2, zmm2, zmm17 + vaesdec zmm3, zmm3, zmm17 + vaesdec zmm0, zmm0, zmm18 + vaesdec zmm1, zmm1, zmm18 + vaesdec zmm2, zmm2, zmm18 + vaesdec zmm3, 
zmm3, zmm18 + vaesdec zmm0, zmm0, zmm19 + vaesdec zmm1, zmm1, zmm19 + vaesdec zmm2, zmm2, zmm19 + vaesdec zmm3, zmm3, zmm19 + vaesdec zmm0, zmm0, zmm20 + vaesdec zmm1, zmm1, zmm20 + vaesdec zmm2, zmm2, zmm20 + vaesdec zmm3, zmm3, zmm20 + vaesdec zmm0, zmm0, zmm21 + vaesdec zmm1, zmm1, zmm21 + vaesdec zmm2, zmm2, zmm21 + vaesdec zmm3, zmm3, zmm21 + vaesdec zmm0, zmm0, zmm22 + vaesdec zmm1, zmm1, zmm22 + vaesdec zmm2, zmm2, zmm22 + vaesdec zmm3, zmm3, zmm22 + vaesdec zmm0, zmm0, zmm23 + vaesdec zmm1, zmm1, zmm23 + vaesdec zmm2, zmm2, zmm23 + vaesdec zmm3, zmm3, zmm23 + vaesdec zmm0, zmm0, zmm24 + vaesdec zmm1, zmm1, zmm24 + vaesdec zmm2, zmm2, zmm24 + vaesdec zmm3, zmm3, zmm24 + vaesdec zmm0, zmm0, zmm25 + vaesdec zmm1, zmm1, zmm25 + vaesdec zmm2, zmm2, zmm25 + vaesdec zmm3, zmm3, zmm25 + cmp r9d, 11 + vmovdqa64 zmm9, zmm26 + jl L_AES_XTS_decrypt_update_avx512_aes_dec_256_aes_dec_block_last + vaesdec zmm0, zmm0, zmm26 + vaesdec zmm1, zmm1, zmm26 + vaesdec zmm2, zmm2, zmm26 + vaesdec zmm3, zmm3, zmm26 + vaesdec zmm0, zmm0, zmm27 + vaesdec zmm1, zmm1, zmm27 + vaesdec zmm2, zmm2, zmm27 + vaesdec zmm3, zmm3, zmm27 + cmp r9d, 13 + vmovdqa64 zmm9, zmm28 + jl L_AES_XTS_decrypt_update_avx512_aes_dec_256_aes_dec_block_last + vaesdec zmm0, zmm0, zmm28 + vaesdec zmm1, zmm1, zmm28 + vaesdec zmm2, zmm2, zmm28 + vaesdec zmm3, zmm3, zmm28 + vaesdec zmm0, zmm0, zmm29 + vaesdec zmm1, zmm1, zmm29 + vaesdec zmm2, zmm2, zmm29 + vaesdec zmm3, zmm3, zmm29 + vmovdqa64 zmm9, zmm30 +L_AES_XTS_decrypt_update_avx512_aes_dec_256_aes_dec_block_last: + vaesdeclast zmm0, zmm0, zmm9 + vaesdeclast zmm1, zmm1, zmm9 + vaesdeclast zmm2, zmm2, zmm9 + vaesdeclast zmm3, zmm3, zmm9 + vpxorq zmm0, zmm0, zmm4 + vmovdqu64 [rdx], zmm0 + vpsrlq zmm9, zmm4, 48 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm4, zmm4, 16 + vpternlogq zmm4, zmm10, zmm9, 150 + vpxorq zmm1, zmm1, zmm5 + vmovdqu64 [rdx+64], zmm1 + vpsrlq zmm9, zmm5, 48 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + 
vpsllq zmm5, zmm5, 16 + vpternlogq zmm5, zmm10, zmm9, 150 + vpxorq zmm2, zmm2, zmm6 + vmovdqu64 [rdx+128], zmm2 + vpsrlq zmm9, zmm6, 48 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm6, zmm6, 16 + vpternlogq zmm6, zmm10, zmm9, 150 + vpxorq zmm3, zmm3, zmm7 + vmovdqu64 [rdx+192], zmm3 + vpsrlq zmm9, zmm7, 48 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm7, zmm7, 16 + vpternlogq zmm7, zmm10, zmm9, 150 + add r12d, 256 + cmp r12d, r11d + jl L_AES_XTS_decrypt_update_avx512_dec_256 + vextracti32x4 xmm8, zmm4, 0 +L_AES_XTS_decrypt_update_avx512_done_256: + cmp r12d, eax + mov r11d, eax + je L_AES_XTS_decrypt_update_avx512_done_dec + and r11d, 4294967280 + cmp r11d, eax + je L_AES_XTS_decrypt_update_avx512_mul16_128 + sub r11d, 16 + sub r11d, r12d + cmp r11d, 16 + jl L_AES_XTS_decrypt_update_avx512_last_31_start + add r11d, r12d +L_AES_XTS_decrypt_update_avx512_mul16_128: + and r11d, 4294967168 + cmp r12d, r11d + je L_AES_XTS_decrypt_update_avx512_done_128 + ; 128 bytes of input + ; aes_dec_128 + lea rcx, QWORD PTR [rdi+r12] + lea rdx, QWORD PTR [rsi+r12] + vmovdqu64 zmm0, [rcx] + vmovdqu64 zmm1, [rcx+64] + vshufi64x2 zmm5, zmm8, zmm8, 0 + vpsrlvq zmm6, zmm5, zmm15 + vpclmulqdq zmm7, zmm6, zmm13, 1 + vpslldq zmm6, zmm6, 8 + vpsllvq zmm4, zmm5, zmm14 + vpternlogq zmm4, zmm7, zmm6, 150 + vpsrlq zmm9, zmm4, 60 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm5, zmm4, 4 + vpternlogq zmm5, zmm10, zmm9, 150 + ; aes_dec_block + vpternlogq zmm0, zmm16, zmm4, 150 + vpternlogq zmm1, zmm16, zmm5, 150 + vaesdec zmm0, zmm0, zmm17 + vaesdec zmm1, zmm1, zmm17 + vaesdec zmm0, zmm0, zmm18 + vaesdec zmm1, zmm1, zmm18 + vaesdec zmm0, zmm0, zmm19 + vaesdec zmm1, zmm1, zmm19 + vaesdec zmm0, zmm0, zmm20 + vaesdec zmm1, zmm1, zmm20 + vaesdec zmm0, zmm0, zmm21 + vaesdec zmm1, zmm1, zmm21 + vaesdec zmm0, zmm0, zmm22 + vaesdec zmm1, zmm1, zmm22 + vaesdec zmm0, zmm0, zmm23 + vaesdec zmm1, zmm1, zmm23 + vaesdec zmm0, zmm0, zmm24 + 
vaesdec zmm1, zmm1, zmm24 + vaesdec zmm0, zmm0, zmm25 + vaesdec zmm1, zmm1, zmm25 + cmp r9d, 11 + vmovdqa64 zmm9, zmm26 + jl L_AES_XTS_decrypt_update_avx512_aes_dec_128_aes_dec_block_last + vaesdec zmm0, zmm0, zmm26 + vaesdec zmm1, zmm1, zmm26 + vaesdec zmm0, zmm0, zmm27 + vaesdec zmm1, zmm1, zmm27 + cmp r9d, 13 + vmovdqa64 zmm9, zmm28 + jl L_AES_XTS_decrypt_update_avx512_aes_dec_128_aes_dec_block_last + vaesdec zmm0, zmm0, zmm28 + vaesdec zmm1, zmm1, zmm28 + vaesdec zmm0, zmm0, zmm29 + vaesdec zmm1, zmm1, zmm29 + vmovdqa64 zmm9, zmm30 +L_AES_XTS_decrypt_update_avx512_aes_dec_128_aes_dec_block_last: + vaesdeclast zmm0, zmm0, zmm9 + vaesdeclast zmm1, zmm1, zmm9 + vpxorq zmm0, zmm0, zmm4 + vmovdqu64 [rdx], zmm0 + vpxorq zmm1, zmm1, zmm5 + vmovdqu64 [rdx+64], zmm1 + vextracti32x4 xmm8, zmm5, 3 + vpshufd xmm9, xmm8, 19 + vpaddq xmm8, xmm8, xmm8 + vpsrad xmm9, xmm9, 31 + vpternlogd xmm8, xmm9, xmm12, 120 + add r12d, 128 +L_AES_XTS_decrypt_update_avx512_done_128: + cmp r12d, eax + mov r11d, eax + je L_AES_XTS_decrypt_update_avx512_done_dec + and r11d, 4294967280 + cmp r11d, eax + je L_AES_XTS_decrypt_update_avx512_mul16_64 + sub r11d, 16 + sub r11d, r12d + cmp r11d, 16 + jl L_AES_XTS_decrypt_update_avx512_last_31_start + add r11d, r12d +L_AES_XTS_decrypt_update_avx512_mul16_64: + and r11d, 4294967232 + cmp r12d, r11d + je L_AES_XTS_decrypt_update_avx512_done_64 + ; 64 bytes of input + ; aes_dec_64 + lea rcx, QWORD PTR [rdi+r12] + lea rdx, QWORD PTR [rsi+r12] + vmovdqu64 zmm0, [rcx] + vshufi64x2 zmm5, zmm8, zmm8, 0 + vpsrlvq zmm6, zmm5, zmm15 + vpclmulqdq zmm7, zmm6, zmm13, 1 + vpslldq zmm6, zmm6, 8 + vpsllvq zmm4, zmm5, zmm14 + vpternlogq zmm4, zmm7, zmm6, 150 + ; aes_dec_block + vpternlogq zmm0, zmm16, zmm4, 150 + vaesdec zmm0, zmm0, zmm17 + vaesdec zmm0, zmm0, zmm18 + vaesdec zmm0, zmm0, zmm19 + vaesdec zmm0, zmm0, zmm20 + vaesdec zmm0, zmm0, zmm21 + vaesdec zmm0, zmm0, zmm22 + vaesdec zmm0, zmm0, zmm23 + vaesdec zmm0, zmm0, zmm24 + vaesdec zmm0, zmm0, zmm25 + cmp r9d, 
11 + vmovdqa64 zmm9, zmm26 + jl L_AES_XTS_decrypt_update_avx512_aes_dec_64_aes_dec_block_last + vaesdec zmm0, zmm0, zmm26 + vaesdec zmm0, zmm0, zmm27 + cmp r9d, 13 + vmovdqa64 zmm9, zmm28 + jl L_AES_XTS_decrypt_update_avx512_aes_dec_64_aes_dec_block_last + vaesdec zmm0, zmm0, zmm28 + vaesdec zmm0, zmm0, zmm29 + vmovdqa64 zmm9, zmm30 +L_AES_XTS_decrypt_update_avx512_aes_dec_64_aes_dec_block_last: + vaesdeclast zmm0, zmm0, zmm9 + vpxorq zmm0, zmm0, zmm4 + vmovdqu64 [rdx], zmm0 + vextracti32x4 xmm8, zmm4, 3 + vpshufd xmm9, xmm8, 19 + vpaddq xmm8, xmm8, xmm8 + vpsrad xmm9, xmm9, 31 + vpternlogd xmm8, xmm9, xmm12, 120 + add r12d, 64 +L_AES_XTS_decrypt_update_avx512_done_64: + cmp r12d, eax + mov r11d, eax + je L_AES_XTS_decrypt_update_avx512_done_dec + and r11d, 4294967280 + cmp r11d, eax + je L_AES_XTS_decrypt_update_avx512_mul16_32 + sub r11d, 16 + sub r11d, r12d + cmp r11d, 16 + jl L_AES_XTS_decrypt_update_avx512_last_31_start + add r11d, r12d +L_AES_XTS_decrypt_update_avx512_mul16_32: + and r11d, 4294967264 + cmp r12d, r11d + je L_AES_XTS_decrypt_update_avx512_done_32 + ; 32 bytes of input + ; aes_dec_32 + lea rcx, QWORD PTR [rdi+r12] + lea rdx, QWORD PTR [rsi+r12] + vmovdqu64 ymm0, [rcx] + vshufi64x2 zmm5, zmm8, zmm8, 0 + vpsrlvq zmm6, zmm5, zmm15 + vpclmulqdq zmm7, zmm6, zmm13, 1 + vpslldq zmm6, zmm6, 8 + vpsllvq zmm4, zmm5, zmm14 + vpternlogq zmm4, zmm7, zmm6, 150 + ; aes_dec_block + vpternlogq ymm0, ymm16, ymm4, 150 + vaesdec ymm0, ymm0, ymm17 + vaesdec ymm0, ymm0, ymm18 + vaesdec ymm0, ymm0, ymm19 + vaesdec ymm0, ymm0, ymm20 + vaesdec ymm0, ymm0, ymm21 + vaesdec ymm0, ymm0, ymm22 + vaesdec ymm0, ymm0, ymm23 + vaesdec ymm0, ymm0, ymm24 + vaesdec ymm0, ymm0, ymm25 + cmp r9d, 11 + vmovdqa64 ymm9, ymm26 + jl L_AES_XTS_decrypt_update_avx512_aes_dec_32_aes_dec_block_last + vaesdec ymm0, ymm0, ymm26 + vaesdec ymm0, ymm0, ymm27 + cmp r9d, 13 + vmovdqa64 ymm9, ymm28 + jl L_AES_XTS_decrypt_update_avx512_aes_dec_32_aes_dec_block_last + vaesdec ymm0, ymm0, ymm28 + vaesdec 
ymm0, ymm0, ymm29 + vmovdqa64 ymm9, ymm30 +L_AES_XTS_decrypt_update_avx512_aes_dec_32_aes_dec_block_last: + vaesdeclast ymm0, ymm0, ymm9 + vpxorq ymm0, ymm0, ymm4 + vmovdqu64 [rdx], ymm0 + vextracti32x4 xmm8, zmm4, 2 + add r12d, 32 +L_AES_XTS_decrypt_update_avx512_done_32: + cmp r12d, eax + mov r11d, eax + je L_AES_XTS_decrypt_update_avx512_done_dec + and r11d, 4294967280 + cmp r11d, eax + je L_AES_XTS_decrypt_update_avx512_mul16 + sub r11d, 16 + sub r11d, r12d + cmp r11d, 16 + jl L_AES_XTS_decrypt_update_avx512_last_31_start + add r11d, r12d +L_AES_XTS_decrypt_update_avx512_mul16: +L_AES_XTS_decrypt_update_avx512_dec_16: + ; 16 bytes of input + lea rcx, QWORD PTR [rdi+r12] + vmovdqu xmm0, OWORD PTR [rcx] + vpxor xmm0, xmm0, xmm8 + ; aes_dec_block + vpxor xmm0, xmm0, [r10] + vmovdqu xmm5, OWORD PTR [r10+16] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+32] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+48] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+64] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+80] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+96] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+112] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+128] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+144] + vaesdec xmm0, xmm0, xmm5 + cmp r9d, 11 + vmovdqu xmm5, OWORD PTR [r10+160] + jl L_AES_XTS_decrypt_update_avx512_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r10+176] + vaesdec xmm0, xmm0, xmm6 + cmp r9d, 13 + vmovdqu xmm5, OWORD PTR [r10+192] + jl L_AES_XTS_decrypt_update_avx512_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r10+208] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r10+224] +L_AES_XTS_decrypt_update_avx512_aes_dec_block_last: + vaesdeclast xmm0, xmm0, xmm5 + vpxor xmm0, xmm0, xmm8 + lea rcx, QWORD PTR [rsi+r12] + vmovdqu OWORD PTR [rcx], xmm0 + vpshufd xmm4, xmm8, 19 + vpaddq xmm8, xmm8, xmm8 + 
vpsrad xmm4, xmm4, 31 + vpternlogd xmm8, xmm4, xmm12, 120 + add r12d, 16 + cmp r12d, r11d + jl L_AES_XTS_decrypt_update_avx512_dec_16 + cmp r12d, eax + je L_AES_XTS_decrypt_update_avx512_done_dec +L_AES_XTS_decrypt_update_avx512_last_31_start: + vpshufd xmm4, xmm8, 19 + vpaddq xmm7, xmm8, xmm8 + vpsrad xmm4, xmm4, 31 + vpternlogd xmm7, xmm4, xmm12, 120 + lea rcx, QWORD PTR [rdi+r12] + vmovdqu xmm0, OWORD PTR [rcx] + vpxor xmm0, xmm0, xmm7 + ; aes_dec_block + vpxor xmm0, xmm0, [r10] + vmovdqu xmm5, OWORD PTR [r10+16] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+32] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+48] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+64] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+80] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+96] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+112] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+128] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+144] + vaesdec xmm0, xmm0, xmm5 + cmp r9d, 11 + vmovdqu xmm5, OWORD PTR [r10+160] + jl L_AES_XTS_decrypt_update_avx512_last_31_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r10+176] + vaesdec xmm0, xmm0, xmm6 + cmp r9d, 13 + vmovdqu xmm5, OWORD PTR [r10+192] + jl L_AES_XTS_decrypt_update_avx512_last_31_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r10+208] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r10+224] +L_AES_XTS_decrypt_update_avx512_last_31_aes_dec_block_last: + vaesdeclast xmm0, xmm0, xmm5 + vpxor xmm0, xmm0, xmm7 + vmovdqu OWORD PTR [rsp], xmm0 + add r12, 16 + xor rdx, rdx +L_AES_XTS_decrypt_update_avx512_last_31_byte_loop: + mov r11b, BYTE PTR [rsp+rdx] + mov cl, BYTE PTR [rdi+r12] + mov BYTE PTR [rsi+r12], r11b + mov BYTE PTR [rsp+rdx], cl + inc r12d + inc edx + cmp r12d, eax + jl L_AES_XTS_decrypt_update_avx512_last_31_byte_loop + sub r12, rdx + vmovdqu xmm0, OWORD PTR [rsp] + vpxor xmm0, 
xmm0, xmm8 + ; aes_dec_block + vpxor xmm0, xmm0, [r10] + vmovdqu xmm5, OWORD PTR [r10+16] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+32] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+48] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+64] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+80] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+96] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+112] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+128] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+144] + vaesdec xmm0, xmm0, xmm5 + cmp r9d, 11 + vmovdqu xmm5, OWORD PTR [r10+160] + jl L_AES_XTS_decrypt_update_avx512_last_31_2_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r10+176] + vaesdec xmm0, xmm0, xmm6 + cmp r9d, 13 + vmovdqu xmm5, OWORD PTR [r10+192] + jl L_AES_XTS_decrypt_update_avx512_last_31_2_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r10+208] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r10+224] +L_AES_XTS_decrypt_update_avx512_last_31_2_aes_dec_block_last: + vaesdeclast xmm0, xmm0, xmm5 + vpxor xmm0, xmm0, xmm8 + sub r12, 16 + lea rcx, QWORD PTR [rsi+r12] + vmovdqu OWORD PTR [rcx], xmm0 +L_AES_XTS_decrypt_update_avx512_done_dec: + vmovdqu OWORD PTR [r8], xmm8 + vmovdqu xmm6, OWORD PTR [rsp+64] + vmovdqu xmm7, OWORD PTR [rsp+80] + vmovdqu xmm8, OWORD PTR [rsp+96] + vmovdqu xmm9, OWORD PTR [rsp+112] + vmovdqu xmm10, OWORD PTR [rsp+128] + vmovdqu xmm11, OWORD PTR [rsp+144] + vmovdqu xmm12, OWORD PTR [rsp+160] + vmovdqu xmm13, OWORD PTR [rsp+176] + vmovdqu xmm14, OWORD PTR [rsp+192] + vmovdqu xmm15, OWORD PTR [rsp+208] + add rsp, 224 + pop r12 + pop rsi + pop rdi + ret +AES_XTS_decrypt_update_avx512 ENDP +_TEXT ENDS +ENDIF END diff --git a/wolfcrypt/src/chacha_asm.S b/wolfcrypt/src/chacha_asm.S index 6109e22f603..ba8768bd9c2 100644 --- a/wolfcrypt/src/chacha_asm.S +++ b/wolfcrypt/src/chacha_asm.S @@ -46,6 +46,16 @@ 
#define HAVE_INTEL_AVX2 #endif /* HAVE_INTEL_AVX2 */ #endif /* NO_AVX2_SUPPORT */ +#ifndef NO_VAES_SUPPORT +#ifndef HAVE_INTEL_VAES +#define HAVE_INTEL_VAES +#endif /* HAVE_INTEL_VAES */ +#endif /* NO_VAES_SUPPORT */ +#ifndef NO_AVX512_SUPPORT +#ifndef HAVE_INTEL_AVX512 +#define HAVE_INTEL_AVX512 +#endif /* HAVE_INTEL_AVX512 */ +#endif /* NO_AVX512_SUPPORT */ #ifdef WOLFSSL_X86_64_BUILD #ifndef __APPLE__ diff --git a/wolfcrypt/src/cpuid.c b/wolfcrypt/src/cpuid.c index 2c3670234a6..8963abb49a8 100644 --- a/wolfcrypt/src/cpuid.c +++ b/wolfcrypt/src/cpuid.c @@ -130,6 +130,8 @@ if (cpuid_flag(1, 0, ECX, 22)) { new_cpuid_flags |= CPUID_MOVBE ; } if (cpuid_flag(7, 0, EBX, 3)) { new_cpuid_flags |= CPUID_BMI1 ; } if (cpuid_flag(7, 0, EBX, 29)) { new_cpuid_flags |= CPUID_SHA ; } + if (cpuid_flag(7, 0, ECX, 9)) { new_cpuid_flags |= CPUID_VAES ; } + if (cpuid_flag(7, 0, EBX, 16)) { new_cpuid_flags |= CPUID_AVX512; } (void)wolfSSL_Atomic_Uint_CompareExchange (&cpuid_flags, &old_cpuid_flags, new_cpuid_flags); } diff --git a/wolfcrypt/src/fe_x25519_asm.S b/wolfcrypt/src/fe_x25519_asm.S index 3f0e0dd6a89..7e976fa1f28 100644 --- a/wolfcrypt/src/fe_x25519_asm.S +++ b/wolfcrypt/src/fe_x25519_asm.S @@ -46,6 +46,16 @@ #define HAVE_INTEL_AVX2 #endif /* HAVE_INTEL_AVX2 */ #endif /* NO_AVX2_SUPPORT */ +#ifndef NO_VAES_SUPPORT +#ifndef HAVE_INTEL_VAES +#define HAVE_INTEL_VAES +#endif /* HAVE_INTEL_VAES */ +#endif /* NO_VAES_SUPPORT */ +#ifndef NO_AVX512_SUPPORT +#ifndef HAVE_INTEL_AVX512 +#define HAVE_INTEL_AVX512 +#endif /* HAVE_INTEL_AVX512 */ +#endif /* NO_AVX512_SUPPORT */ #ifndef __APPLE__ .text diff --git a/wolfcrypt/src/include.am b/wolfcrypt/src/include.am index 18d7a339cd5..908c43984cd 100644 --- a/wolfcrypt/src/include.am +++ b/wolfcrypt/src/include.am @@ -16,6 +16,7 @@ EXTRA_DIST += wolfcrypt/src/evp.c EXTRA_DIST += wolfcrypt/src/evp_pk.c EXTRA_DIST += wolfcrypt/src/asm.c EXTRA_DIST += wolfcrypt/src/aes_asm.asm +EXTRA_DIST += wolfcrypt/src/aes_x86_64_asm.asm EXTRA_DIST += 
wolfcrypt/src/aes_gcm_asm.asm EXTRA_DIST += wolfcrypt/src/aes_xts_asm.asm EXTRA_DIST += wolfcrypt/src/chacha_asm.asm diff --git a/wolfcrypt/src/poly1305_asm.S b/wolfcrypt/src/poly1305_asm.S index 7f73e87b67e..f55cce5a079 100644 --- a/wolfcrypt/src/poly1305_asm.S +++ b/wolfcrypt/src/poly1305_asm.S @@ -46,6 +46,16 @@ #define HAVE_INTEL_AVX2 #endif /* HAVE_INTEL_AVX2 */ #endif /* NO_AVX2_SUPPORT */ +#ifndef NO_VAES_SUPPORT +#ifndef HAVE_INTEL_VAES +#define HAVE_INTEL_VAES +#endif /* HAVE_INTEL_VAES */ +#endif /* NO_VAES_SUPPORT */ +#ifndef NO_AVX512_SUPPORT +#ifndef HAVE_INTEL_AVX512 +#define HAVE_INTEL_AVX512 +#endif /* HAVE_INTEL_AVX512 */ +#endif /* NO_AVX512_SUPPORT */ #ifdef WOLFSSL_X86_64_BUILD #ifdef HAVE_INTEL_AVX1 diff --git a/wolfcrypt/src/sha256_asm.S b/wolfcrypt/src/sha256_asm.S index a407b7de1f5..d91a82aff94 100644 --- a/wolfcrypt/src/sha256_asm.S +++ b/wolfcrypt/src/sha256_asm.S @@ -46,6 +46,16 @@ #define HAVE_INTEL_AVX2 #endif /* HAVE_INTEL_AVX2 */ #endif /* NO_AVX2_SUPPORT */ +#ifndef NO_VAES_SUPPORT +#ifndef HAVE_INTEL_VAES +#define HAVE_INTEL_VAES +#endif /* HAVE_INTEL_VAES */ +#endif /* NO_VAES_SUPPORT */ +#ifndef NO_AVX512_SUPPORT +#ifndef HAVE_INTEL_AVX512 +#define HAVE_INTEL_AVX512 +#endif /* HAVE_INTEL_AVX512 */ +#endif /* NO_AVX512_SUPPORT */ #ifdef WOLFSSL_X86_64_BUILD #ifndef __APPLE__ diff --git a/wolfcrypt/src/sha3_asm.S b/wolfcrypt/src/sha3_asm.S index 6abc9d851b1..810a1c67433 100644 --- a/wolfcrypt/src/sha3_asm.S +++ b/wolfcrypt/src/sha3_asm.S @@ -46,6 +46,16 @@ #define HAVE_INTEL_AVX2 #endif /* HAVE_INTEL_AVX2 */ #endif /* NO_AVX2_SUPPORT */ +#ifndef NO_VAES_SUPPORT +#ifndef HAVE_INTEL_VAES +#define HAVE_INTEL_VAES +#endif /* HAVE_INTEL_VAES */ +#endif /* NO_VAES_SUPPORT */ +#ifndef NO_AVX512_SUPPORT +#ifndef HAVE_INTEL_AVX512 +#define HAVE_INTEL_AVX512 +#endif /* HAVE_INTEL_AVX512 */ +#endif /* NO_AVX512_SUPPORT */ #ifndef __APPLE__ .data diff --git a/wolfcrypt/src/sha512_asm.S b/wolfcrypt/src/sha512_asm.S index d0ca1dd4fd4..b3c377deabc 
100644 --- a/wolfcrypt/src/sha512_asm.S +++ b/wolfcrypt/src/sha512_asm.S @@ -46,6 +46,16 @@ #define HAVE_INTEL_AVX2 #endif /* HAVE_INTEL_AVX2 */ #endif /* NO_AVX2_SUPPORT */ +#ifndef NO_VAES_SUPPORT +#ifndef HAVE_INTEL_VAES +#define HAVE_INTEL_VAES +#endif /* HAVE_INTEL_VAES */ +#endif /* NO_VAES_SUPPORT */ +#ifndef NO_AVX512_SUPPORT +#ifndef HAVE_INTEL_AVX512 +#define HAVE_INTEL_AVX512 +#endif /* HAVE_INTEL_AVX512 */ +#endif /* NO_AVX512_SUPPORT */ #ifdef HAVE_INTEL_AVX1 #ifndef __APPLE__ diff --git a/wolfcrypt/src/wc_mldsa_asm.S b/wolfcrypt/src/wc_mldsa_asm.S index 717986e4a5c..e1e77a93783 100644 --- a/wolfcrypt/src/wc_mldsa_asm.S +++ b/wolfcrypt/src/wc_mldsa_asm.S @@ -46,6 +46,16 @@ #define HAVE_INTEL_AVX2 #endif /* HAVE_INTEL_AVX2 */ #endif /* NO_AVX2_SUPPORT */ +#ifndef NO_VAES_SUPPORT +#ifndef HAVE_INTEL_VAES +#define HAVE_INTEL_VAES +#endif /* HAVE_INTEL_VAES */ +#endif /* NO_VAES_SUPPORT */ +#ifndef NO_AVX512_SUPPORT +#ifndef HAVE_INTEL_AVX512 +#define HAVE_INTEL_AVX512 +#endif /* HAVE_INTEL_AVX512 */ +#endif /* NO_AVX512_SUPPORT */ #ifdef WOLFSSL_HAVE_MLDSA #ifdef HAVE_INTEL_AVX2 diff --git a/wolfcrypt/src/wc_mlkem_asm.S b/wolfcrypt/src/wc_mlkem_asm.S index 9b80cf8d432..b399218dfdd 100644 --- a/wolfcrypt/src/wc_mlkem_asm.S +++ b/wolfcrypt/src/wc_mlkem_asm.S @@ -46,6 +46,16 @@ #define HAVE_INTEL_AVX2 #endif /* HAVE_INTEL_AVX2 */ #endif /* NO_AVX2_SUPPORT */ +#ifndef NO_VAES_SUPPORT +#ifndef HAVE_INTEL_VAES +#define HAVE_INTEL_VAES +#endif /* HAVE_INTEL_VAES */ +#endif /* NO_VAES_SUPPORT */ +#ifndef NO_AVX512_SUPPORT +#ifndef HAVE_INTEL_AVX512 +#define HAVE_INTEL_AVX512 +#endif /* HAVE_INTEL_AVX512 */ +#endif /* NO_AVX512_SUPPORT */ #ifdef WOLFSSL_HAVE_MLKEM #ifdef HAVE_INTEL_AVX2 diff --git a/wolfssl-VS2022.vcxproj b/wolfssl-VS2022.vcxproj index a4b8d39b196..81d32758e91 100644 --- a/wolfssl-VS2022.vcxproj +++ b/wolfssl-VS2022.vcxproj @@ -1,577 +1,591 @@ - - - - - Debug - Win32 - - - Debug - x64 - - - Debug - ARM64 - - - DLL Debug - Win32 - - - DLL Debug - 
x64 - - - DLL Debug - ARM64 - - - DLL Release - Win32 - - - DLL Release - x64 - - - DLL Release - ARM64 - - - Release - Win32 - - - Release - x64 - - - Release - ARM64 - - - - {12226DBE-7278-4DFA-A119-5A0294CF0B33} - wolfssl - Win32Proj - wolfssl - - - - StaticLibrary - v143 - Unicode - true - - - DynamicLibrary - v143 - Unicode - true - - - StaticLibrary - v143 - Unicode - true - - - DynamicLibrary - v143 - Unicode - true - - - StaticLibrary - v143 - Unicode - true - - - DynamicLibrary - v143 - Unicode - true - - - StaticLibrary - v143 - Unicode - - - DynamicLibrary - v143 - Unicode - - - StaticLibrary - v143 - Unicode - - - DynamicLibrary - v143 - Unicode - - - StaticLibrary - v143 - Unicode - - - DynamicLibrary - v143 - Unicode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - $(SolutionDir)$(Configuration)\$(Platform)\ - $(Configuration)\$(Platform)\$(ProjectName)_obj\ - - - $(SolutionDir)$(Configuration)\$(Platform)\ - $(Configuration)\$(Platform)\$(ProjectName)_obj\ - - - $(SolutionDir)$(Configuration)\$(Platform)\ - $(Configuration)\$(Platform)\$(ProjectName)_obj\ - - - $(SolutionDir)$(Configuration)\$(Platform)\ - $(Configuration)\$(Platform)\$(ProjectName)_obj\ - - - $(SolutionDir)$(Configuration)\$(Platform)\ - $(Configuration)\$(Platform)\$(ProjectName)_obj\ - - - $(SolutionDir)$(Configuration)\$(Platform)\ - $(Configuration)\$(Platform)\$(ProjectName)_obj\ - - - $(SolutionDir)$(Configuration)\$(Platform)\ - $(Configuration)\$(Platform)\$(ProjectName)_obj\ - - - $(SolutionDir)$(Configuration)\$(Platform)\ - $(Configuration)\$(Platform)\$(ProjectName)_obj\ - - - $(SolutionDir)$(Configuration)\$(Platform)\ - $(Configuration)\$(Platform)\$(ProjectName)_obj\ - - - $(SolutionDir)$(Configuration)\$(Platform)\ - $(Configuration)\$(Platform)\$(ProjectName)_obj\ - - - $(SolutionDir)$(Configuration)\$(Platform)\ - $(Configuration)\$(Platform)\$(ProjectName)_obj\ - - - $(SolutionDir)$(Configuration)\$(Platform)\ - 
$(Configuration)\$(Platform)\$(ProjectName)_obj\ - - - - Disabled - ./;./IDE/WIN;%(AdditionalIncludeDirectories) - WOLFSSL_LIB;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) - true - EnableFastChecks - MultiThreadedDebugDLL - - Level4 - EditAndContinue - 4206;4214;4706;%(DisableSpecificWarnings) - - - - - Disabled - ./;./IDE/WIN;%(AdditionalIncludeDirectories) - WOLFSSL_LIB;BUILDING_WOLFSSL;WOLFSSL_DLL;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) - true - EnableFastChecks - MultiThreadedDebugDLL - - - Level4 - EditAndContinue - 4206;4214;4706;%(DisableSpecificWarnings) - - - ws2_32.lib;%(AdditionalDependencies) - false - true - false - - - - - Disabled - ./;./IDE/WIN;%(AdditionalIncludeDirectories) - WOLFSSL_LIB;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) - EnableFastChecks - MultiThreadedDebugDLL - - - Level4 - ProgramDatabase - 4206;4214;4706;%(DisableSpecificWarnings) - - - - - Disabled - ./;./IDE/WIN;%(AdditionalIncludeDirectories) - WOLFSSL_LIB;BUILDING_WOLFSSL;WOLFSSL_DLL;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) - EnableFastChecks - MultiThreadedDebugDLL - - - Level4 - ProgramDatabase - 4206;4214;4706;%(DisableSpecificWarnings) - - - ws2_32.lib;%(AdditionalDependencies) - false - true - - - - - Disabled - ./;./IDE/WIN;%(AdditionalIncludeDirectories) - WOLFSSL_LIB;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) - EnableFastChecks - MultiThreadedDebugDLL - - - Level4 - ProgramDatabase - 4206;4214;4706;%(DisableSpecificWarnings) - - - - - Disabled - ./;./IDE/WIN;%(AdditionalIncludeDirectories) - WOLFSSL_LIB;BUILDING_WOLFSSL;WOLFSSL_DLL;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) - EnableFastChecks - MultiThreadedDebugDLL - - - Level4 - ProgramDatabase - 4206;4214;4706;%(DisableSpecificWarnings) - - - ws2_32.lib;%(AdditionalDependencies) - false - true - - - - - MaxSpeed - true - ./;./IDE/WIN;%(AdditionalIncludeDirectories) - WOLFSSL_LIB;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) - MultiThreadedDLL - true - - Level3 - 
ProgramDatabase - - - - - MaxSpeed - true - ./;./IDE/WIN;%(AdditionalIncludeDirectories) - WOLFSSL_LIB;BUILDING_WOLFSSL;WOLFSSL_DLL;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) - MultiThreadedDLL - true - - - Level3 - ProgramDatabase - - - ws2_32.lib;%(AdditionalDependencies) - true - - - - - MaxSpeed - true - ./;./IDE/WIN;%(AdditionalIncludeDirectories) - WOLFSSL_LIB;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) - MultiThreadedDLL - true - - - Level3 - ProgramDatabase - - - - - MaxSpeed - true - ./;./IDE/WIN;%(AdditionalIncludeDirectories) - WOLFSSL_LIB;BUILDING_WOLFSSL;WOLFSSL_DLL;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) - MultiThreadedDLL - true - - - Level3 - ProgramDatabase - - - ws2_32.lib;%(AdditionalDependencies) - true - - - - - MaxSpeed - true - ./;./IDE/WIN;%(AdditionalIncludeDirectories) - WOLFSSL_LIB;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) - MultiThreadedDLL - true - - - Level3 - ProgramDatabase - - - - - MaxSpeed - true - ./;./IDE/WIN;%(AdditionalIncludeDirectories) - WOLFSSL_LIB;BUILDING_WOLFSSL;WOLFSSL_DLL;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) - MultiThreadedDLL - true - - - Level3 - ProgramDatabase - - - ws2_32.lib;%(AdditionalDependencies) - true - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - false - false - ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) - ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) - $(OutDir)%(Filename).obj - $(IntDir)%(Filename).obj - false - false - ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) - ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) - $(OutDir)%(Filename).obj - $(IntDir)%(Filename).obj - - - false - false - ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) - ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) - $(OutDir)%(Filename).obj - $(IntDir)%(Filename).obj - false - false - ml64.exe /c /Zi 
/Fo"$(OutDir)%(Filename).obj" %(Identity) - ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) - $(OutDir)%(Filename).obj - $(IntDir)%(Filename).obj - - - false - false - ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) - ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) - $(OutDir)%(Filename).obj - $(IntDir)%(Filename).obj - false - false - ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) - ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) - $(OutDir)%(Filename).obj - $(IntDir)%(Filename).obj - - - false - false - ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) - ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) - $(OutDir)%(Filename).obj - $(IntDir)%(Filename).obj - false - false - ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) - ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) - $(OutDir)%(Filename).obj - $(IntDir)%(Filename).obj - - - false - false - ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) - ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) - $(OutDir)%(Filename).obj - $(IntDir)%(Filename).obj - false - false - ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) - ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) - $(OutDir)%(Filename).obj - $(IntDir)%(Filename).obj - - - false - false - ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) - ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) - $(OutDir)%(Filename).obj - $(IntDir)%(Filename).obj - false - false - ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) - ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) - $(OutDir)%(Filename).obj - $(IntDir)%(Filename).obj - - - - - - true - true - true - true - true - true - - - - - - + + + + + Debug + Win32 + + + Debug + x64 + + + Debug + ARM64 + + + DLL Debug + Win32 + + + DLL Debug + x64 + + + DLL Debug + ARM64 + + + DLL Release + Win32 + + + DLL Release + x64 + + + DLL Release + ARM64 + + + Release + Win32 + + + Release 
+ x64 + + + Release + ARM64 + + + + {12226DBE-7278-4DFA-A119-5A0294CF0B33} + wolfssl + Win32Proj + wolfssl + + + + StaticLibrary + v143 + Unicode + true + + + DynamicLibrary + v143 + Unicode + true + + + StaticLibrary + v143 + Unicode + true + + + DynamicLibrary + v143 + Unicode + true + + + StaticLibrary + v143 + Unicode + true + + + DynamicLibrary + v143 + Unicode + true + + + StaticLibrary + v143 + Unicode + + + DynamicLibrary + v143 + Unicode + + + StaticLibrary + v143 + Unicode + + + DynamicLibrary + v143 + Unicode + + + StaticLibrary + v143 + Unicode + + + DynamicLibrary + v143 + Unicode + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + $(SolutionDir)$(Configuration)\$(Platform)\ + $(Configuration)\$(Platform)\$(ProjectName)_obj\ + + + $(SolutionDir)$(Configuration)\$(Platform)\ + $(Configuration)\$(Platform)\$(ProjectName)_obj\ + + + $(SolutionDir)$(Configuration)\$(Platform)\ + $(Configuration)\$(Platform)\$(ProjectName)_obj\ + + + $(SolutionDir)$(Configuration)\$(Platform)\ + $(Configuration)\$(Platform)\$(ProjectName)_obj\ + + + $(SolutionDir)$(Configuration)\$(Platform)\ + $(Configuration)\$(Platform)\$(ProjectName)_obj\ + + + $(SolutionDir)$(Configuration)\$(Platform)\ + $(Configuration)\$(Platform)\$(ProjectName)_obj\ + + + $(SolutionDir)$(Configuration)\$(Platform)\ + $(Configuration)\$(Platform)\$(ProjectName)_obj\ + + + $(SolutionDir)$(Configuration)\$(Platform)\ + $(Configuration)\$(Platform)\$(ProjectName)_obj\ + + + $(SolutionDir)$(Configuration)\$(Platform)\ + $(Configuration)\$(Platform)\$(ProjectName)_obj\ + + + $(SolutionDir)$(Configuration)\$(Platform)\ + $(Configuration)\$(Platform)\$(ProjectName)_obj\ + + + $(SolutionDir)$(Configuration)\$(Platform)\ + $(Configuration)\$(Platform)\$(ProjectName)_obj\ + + + $(SolutionDir)$(Configuration)\$(Platform)\ + $(Configuration)\$(Platform)\$(ProjectName)_obj\ + + + + Disabled + ./;./IDE/WIN;%(AdditionalIncludeDirectories) + WOLFSSL_LIB;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) + 
true + EnableFastChecks + MultiThreadedDebugDLL + + Level4 + EditAndContinue + 4206;4214;4706;%(DisableSpecificWarnings) + + + + + Disabled + ./;./IDE/WIN;%(AdditionalIncludeDirectories) + WOLFSSL_LIB;BUILDING_WOLFSSL;WOLFSSL_DLL;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebugDLL + + + Level4 + EditAndContinue + 4206;4214;4706;%(DisableSpecificWarnings) + + + ws2_32.lib;%(AdditionalDependencies) + false + true + false + + + + + Disabled + ./;./IDE/WIN;%(AdditionalIncludeDirectories) + WOLFSSL_LIB;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) + EnableFastChecks + MultiThreadedDebugDLL + + + Level4 + ProgramDatabase + 4206;4214;4706;%(DisableSpecificWarnings) + + + + + Disabled + ./;./IDE/WIN;%(AdditionalIncludeDirectories) + WOLFSSL_LIB;BUILDING_WOLFSSL;WOLFSSL_DLL;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) + EnableFastChecks + MultiThreadedDebugDLL + + + Level4 + ProgramDatabase + 4206;4214;4706;%(DisableSpecificWarnings) + + + ws2_32.lib;%(AdditionalDependencies) + false + true + + + + + Disabled + ./;./IDE/WIN;%(AdditionalIncludeDirectories) + WOLFSSL_LIB;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) + EnableFastChecks + MultiThreadedDebugDLL + + + Level4 + ProgramDatabase + 4206;4214;4706;%(DisableSpecificWarnings) + + + + + Disabled + ./;./IDE/WIN;%(AdditionalIncludeDirectories) + WOLFSSL_LIB;BUILDING_WOLFSSL;WOLFSSL_DLL;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) + EnableFastChecks + MultiThreadedDebugDLL + + + Level4 + ProgramDatabase + 4206;4214;4706;%(DisableSpecificWarnings) + + + ws2_32.lib;%(AdditionalDependencies) + false + true + + + + + MaxSpeed + true + ./;./IDE/WIN;%(AdditionalIncludeDirectories) + WOLFSSL_LIB;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) + MultiThreadedDLL + true + + Level3 + ProgramDatabase + + + + + MaxSpeed + true + ./;./IDE/WIN;%(AdditionalIncludeDirectories) + WOLFSSL_LIB;BUILDING_WOLFSSL;WOLFSSL_DLL;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) + 
MultiThreadedDLL + true + + + Level3 + ProgramDatabase + + + ws2_32.lib;%(AdditionalDependencies) + true + + + + + MaxSpeed + true + ./;./IDE/WIN;%(AdditionalIncludeDirectories) + WOLFSSL_LIB;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) + MultiThreadedDLL + true + + + Level3 + ProgramDatabase + + + + + MaxSpeed + true + ./;./IDE/WIN;%(AdditionalIncludeDirectories) + WOLFSSL_LIB;BUILDING_WOLFSSL;WOLFSSL_DLL;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) + MultiThreadedDLL + true + + + Level3 + ProgramDatabase + + + ws2_32.lib;%(AdditionalDependencies) + true + + + + + MaxSpeed + true + ./;./IDE/WIN;%(AdditionalIncludeDirectories) + WOLFSSL_LIB;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) + MultiThreadedDLL + true + + + Level3 + ProgramDatabase + + + + + MaxSpeed + true + ./;./IDE/WIN;%(AdditionalIncludeDirectories) + WOLFSSL_LIB;BUILDING_WOLFSSL;WOLFSSL_DLL;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) + MultiThreadedDLL + true + + + Level3 + ProgramDatabase + + + ws2_32.lib;%(AdditionalDependencies) + true + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + + + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + + + false + false + ml64.exe /c 
/Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + + + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + + + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + + + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + + + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + + + + + + true + true + true + true + true + true + + + + + + diff --git a/wolfssl.vcxproj b/wolfssl.vcxproj 
index c38bc90b99d..44c23ab74ee 100644 --- a/wolfssl.vcxproj +++ b/wolfssl.vcxproj @@ -489,6 +489,20 @@ $(OutDir)%(Filename).obj $(IntDir)%(Filename).obj + + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + false false diff --git a/wolfssl/wolfcrypt/cpuid.h b/wolfssl/wolfcrypt/cpuid.h index bb7e68436b4..aada8801191 100644 --- a/wolfssl/wolfcrypt/cpuid.h +++ b/wolfssl/wolfcrypt/cpuid.h @@ -67,6 +67,8 @@ typedef word32 cpuid_flags_t; #define CPUID_MOVBE 0x0080 /* Move and byte swap */ #define CPUID_BMI1 0x0100 /* ANDN */ #define CPUID_SHA 0x0200 /* SHA-1 and SHA-256 instructions */ + #define CPUID_VAES 0x0400 + #define CPUID_AVX512 0x0800 #define IS_INTEL_AVX1(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_AVX1) #define IS_INTEL_AVX2(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_AVX2) @@ -78,6 +80,8 @@ typedef word32 cpuid_flags_t; #define IS_INTEL_MOVBE(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_MOVBE) #define IS_INTEL_BMI1(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_BMI1) #define IS_INTEL_SHA(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_SHA) + #define IS_INTEL_VAES(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_VAES) + #define IS_INTEL_AVX512(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_AVX512) #elif defined(HAVE_CPUID_AARCH64) diff --git a/wrapper/CSharp/wolfssl.vcxproj b/wrapper/CSharp/wolfssl.vcxproj index 66694f76438..7a963cbd913 100644 --- a/wrapper/CSharp/wolfssl.vcxproj +++ b/wrapper/CSharp/wolfssl.vcxproj @@ -371,6 +371,20 @@ $(OutDir)%(Filename).obj $(IntDir)%(Filename).obj + + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + 
$(IntDir)%(Filename).obj + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + false false