diff --git a/.github/workflows/symbol-prefixes.yml b/.github/workflows/symbol-prefixes.yml index 33142162ccf..fd767ca0092 100644 --- a/.github/workflows/symbol-prefixes.yml +++ b/.github/workflows/symbol-prefixes.yml @@ -49,7 +49,7 @@ jobs: { if (($7 !~ /^[0-9]+$/) || ($8 ~ /^(wc_|wolf|WOLF|__pfx|fe_|sp_[a-zA-Z090-0_]*[0-9])/) || - ($8 ~ /(_avx[12]|_AVX[12]|_sse[12]|_SSE[12]|_aesni|_AESNI|_bmi2|_x64$)/)) + ($8 ~ /(_avx[12]|_AVX[12]|_sse[12]|_SSE[12]|_aesni|_AESNI|_vaes|_VAES|_avx512|_AVX512|_bmi2|_x64$)/)) { next; } diff --git a/.wolfssl_known_macro_extras b/.wolfssl_known_macro_extras index 101ebf2fa88..c08a8580379 100644 --- a/.wolfssl_known_macro_extras +++ b/.wolfssl_known_macro_extras @@ -381,6 +381,7 @@ NO_AES_DECRYPT NO_ARDUINO_DEFAULT NO_ASM NO_ASN_OLD_TYPE_NAMES +NO_AVX512_SUPPORT NO_CAMELLIA_CBC NO_CERT NO_CERT_IN_TICKET @@ -459,6 +460,7 @@ NO_STDIO_FGETS_REMAP NO_STM32_HMAC NO_TKERNEL_MEM_POOL NO_TLSX_PSKKEM_PLAIN_ANNOUNCE +NO_VAES_SUPPORT NO_VERIFY_OID NO_WC_DHGENERATEPUBLIC NO_WC_SHE_GETUID diff --git a/linuxkm/Kbuild b/linuxkm/Kbuild index fe3f823942f..831a45c76a4 100644 --- a/linuxkm/Kbuild +++ b/linuxkm/Kbuild @@ -200,6 +200,8 @@ $(obj)/wolfcrypt/src/aes_gcm_asm.o: asflags-y := $(WOLFSSL_ASFLAGS) $(ASFLAGS_FP $(obj)/wolfcrypt/src/aes_gcm_asm.o: OBJECT_FILES_NON_STANDARD := y $(obj)/wolfcrypt/src/aes_xts_asm.o: asflags-y := $(WOLFSSL_ASFLAGS) $(ASFLAGS_FPU_DISABLE_SIMD_ENABLE) $(obj)/wolfcrypt/src/aes_xts_asm.o: OBJECT_FILES_NON_STANDARD := y +$(obj)/wolfcrypt/src/aes_x86_64_asm.o: asflags-y := $(WOLFSSL_ASFLAGS) $(ASFLAGS_FPU_DISABLE_SIMD_ENABLE) +$(obj)/wolfcrypt/src/aes_x86_64_asm.o: OBJECT_FILES_NON_STANDARD := y $(obj)/wolfcrypt/src/sp_x86_64_asm.o: asflags-y := $(WOLFSSL_ASFLAGS) $(ASFLAGS_FPU_DISABLE_SIMD_ENABLE) $(obj)/wolfcrypt/src/sp_x86_64_asm.o: OBJECT_FILES_NON_STANDARD := y $(obj)/wolfcrypt/src/sha256_asm.o: asflags-y := $(WOLFSSL_ASFLAGS) $(ASFLAGS_FPU_DISABLE_SIMD_ENABLE) diff --git a/src/include.am b/src/include.am index 
4b80e149bac..2e904706f83 100644 --- a/src/include.am +++ b/src/include.am @@ -109,6 +109,7 @@ endif if BUILD_AESNI src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_asm.S +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_x86_64_asm.S if BUILD_X86_ASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_gcm_x86_asm.S else @@ -259,6 +260,7 @@ endif BUILD_PPC64_ASM if BUILD_AESNI src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_asm.S +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_x86_64_asm.S if BUILD_X86_ASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_gcm_x86_asm.S else @@ -532,6 +534,7 @@ endif BUILD_PPC64_ASM if BUILD_AESNI src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_asm.S +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_x86_64_asm.S if BUILD_X86_ASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_gcm_x86_asm.S else @@ -867,6 +870,7 @@ endif BUILD_AES if BUILD_AESNI src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_asm.S +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_x86_64_asm.S if BUILD_X86_ASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_gcm_x86_asm.S else @@ -1708,6 +1712,7 @@ endif if !BUILD_FIPS_V2_PLUS if BUILD_AESNI src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_asm.S +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_x86_64_asm.S if BUILD_X86_ASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_gcm_x86_asm.S else diff --git a/wolfcrypt/benchmark/benchmark.c b/wolfcrypt/benchmark/benchmark.c index 214d873bc2f..a58fd300a3a 100644 --- a/wolfcrypt/benchmark/benchmark.c +++ b/wolfcrypt/benchmark/benchmark.c @@ -4794,6 +4794,8 @@ static void print_cpu_features(void) if (IS_INTEL_MOVBE(cpuid_flags)) printf(" movbe"); if (IS_INTEL_BMI1(cpuid_flags)) printf(" bmi1"); if (IS_INTEL_SHA(cpuid_flags)) printf(" sha"); + if (IS_INTEL_VAES(cpuid_flags)) printf(" vaes"); + if (IS_INTEL_AVX512(cpuid_flags)) printf(" 
avx512"); #endif #ifdef __aarch64__ printf("Aarch64 -"); diff --git a/wolfcrypt/src/aes.c b/wolfcrypt/src/aes.c index 6806acbc965..8a630217b9d 100644 --- a/wolfcrypt/src/aes.c +++ b/wolfcrypt/src/aes.c @@ -809,6 +809,218 @@ block cipher mechanism that uses n-bit binary string parameter key with 128-bits unsigned char* key_schedule) XASM_LINK("AES_256_Key_Expansion_AESNI"); +#ifdef WOLFSSL_X86_64_BUILD + /* Wide ECB / CBC / CTR variants for x86_64. They share the AES-NI key + * schedule declared above and are selected at runtime from intel_flags. + * AES_CBC_decrypt_AESNI is the single max-width path (the by4/by6/by8 + * variants are only used by the 32-bit x86 build). */ + #if defined(USE_INTEL_SPEEDUP) + #ifndef HAVE_INTEL_AVX1 + #define HAVE_INTEL_AVX1 + #endif + #if !defined(NO_AVX2_SUPPORT) && !defined(HAVE_INTEL_AVX2) + #define HAVE_INTEL_AVX2 + #endif + #if !defined(NO_VAES_SUPPORT) && !defined(HAVE_INTEL_VAES) + #define HAVE_INTEL_VAES + #endif + #if !defined(NO_AVX512_SUPPORT) && !defined(HAVE_INTEL_AVX512) + #define HAVE_INTEL_AVX512 + #endif + #endif + + void AES_CTR_encrypt_AESNI(const unsigned char* in, unsigned char* out, + unsigned long length, const unsigned char* KS, int nr, + unsigned char* ctr) XASM_LINK("AES_CTR_encrypt_AESNI"); + #ifdef HAVE_AES_DECRYPT + void AES_CBC_decrypt_AESNI(const unsigned char* in, unsigned char* out, + unsigned char* ivec, unsigned long length, const unsigned char* KS, + int nr) XASM_LINK("AES_CBC_decrypt_AESNI"); + #endif + + #define AES_DECL_VARIANT(suff) \ + void AES_ECB_encrypt_##suff(const unsigned char* in, \ + unsigned char* out, unsigned long length, \ + const unsigned char* KS, int nr) \ + XASM_LINK("AES_ECB_encrypt_" #suff); \ + void AES_CBC_encrypt_##suff(const unsigned char* in, \ + unsigned char* out, unsigned char* ivec, unsigned long length, \ + const unsigned char* KS, int nr) \ + XASM_LINK("AES_CBC_encrypt_" #suff); \ + void AES_CTR_encrypt_##suff(const unsigned char* in, \ + unsigned char* out, 
unsigned long length, \ + const unsigned char* KS, int nr, unsigned char* ctr) \ + XASM_LINK("AES_CTR_encrypt_" #suff) + #ifdef HAVE_AES_DECRYPT + #define AES_DECL_VARIANT_DEC(suff) \ + void AES_ECB_decrypt_##suff(const unsigned char* in, \ + unsigned char* out, unsigned long length, \ + const unsigned char* KS, int nr) \ + XASM_LINK("AES_ECB_decrypt_" #suff); \ + void AES_CBC_decrypt_##suff(const unsigned char* in, \ + unsigned char* out, unsigned char* ivec, \ + unsigned long length, const unsigned char* KS, int nr) \ + XASM_LINK("AES_CBC_decrypt_" #suff) + #else + #define AES_DECL_VARIANT_DEC(suff) /* no decrypt */ + #endif + + #ifdef HAVE_INTEL_AVX1 + AES_DECL_VARIANT(avx1); + AES_DECL_VARIANT_DEC(avx1); + #endif + #ifdef HAVE_INTEL_VAES + AES_DECL_VARIANT(vaes); + AES_DECL_VARIANT_DEC(vaes); + #endif + #ifdef HAVE_INTEL_AVX512 + AES_DECL_VARIANT(avx512); + AES_DECL_VARIANT_DEC(avx512); + #endif + + /* Pick the widest available implementation at runtime. Callers must + * already be inside a VECTOR_REGISTERS_PUSH / SAVE_VECTOR_REGISTERS + * region (all bulk AES-NI call sites are). 
*/ + static WC_INLINE void AesEcbEncryptBlocks(const unsigned char* in, + unsigned char* out, word32 sz, const unsigned char* key, int nr) + { + #ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_ECB_encrypt_avx512(in, out, sz, key, nr); + } + else + #endif + #ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_ECB_encrypt_vaes(in, out, sz, key, nr); + } + else + #endif + #ifdef HAVE_INTEL_AVX1 + if (IS_INTEL_AVX1(intel_flags)) { + AES_ECB_encrypt_avx1(in, out, sz, key, nr); + } + else + #endif + { + AES_ECB_encrypt_AESNI(in, out, sz, key, nr); + } + } + + #ifdef HAVE_AES_DECRYPT + static WC_INLINE void AesEcbDecryptBlocks(const unsigned char* in, + unsigned char* out, word32 sz, const unsigned char* key, int nr) + { + #ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_ECB_decrypt_avx512(in, out, sz, key, nr); + } + else + #endif + #ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_ECB_decrypt_vaes(in, out, sz, key, nr); + } + else + #endif + #ifdef HAVE_INTEL_AVX1 + if (IS_INTEL_AVX1(intel_flags)) { + AES_ECB_decrypt_avx1(in, out, sz, key, nr); + } + else + #endif + { + AES_ECB_decrypt_AESNI(in, out, sz, key, nr); + } + } + #endif + + #ifdef HAVE_AES_CBC + static WC_INLINE void AesCbcEncryptBlocks(const unsigned char* in, + unsigned char* out, unsigned char* iv, word32 sz, + const unsigned char* key, int nr) + { + #ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_CBC_encrypt_avx512(in, out, iv, sz, key, nr); + } + else + #endif + #ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_CBC_encrypt_vaes(in, out, iv, sz, key, nr); + } + else + #endif + #ifdef HAVE_INTEL_AVX1 + if (IS_INTEL_AVX1(intel_flags)) { + AES_CBC_encrypt_avx1(in, out, iv, sz, key, nr); + } + else + #endif + { + 
AES_CBC_encrypt_AESNI(in, out, iv, sz, key, nr); + } + } + #endif /* HAVE_AES_CBC */ + + #ifdef HAVE_AES_DECRYPT + static WC_INLINE void AesCbcDecryptBlocks(const unsigned char* in, + unsigned char* out, unsigned char* iv, word32 sz, + const unsigned char* key, int nr) + { + #ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_CBC_decrypt_avx512(in, out, iv, sz, key, nr); + } + else + #endif + #ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_CBC_decrypt_vaes(in, out, iv, sz, key, nr); + } + else + #endif + #ifdef HAVE_INTEL_AVX1 + if (IS_INTEL_AVX1(intel_flags)) { + AES_CBC_decrypt_avx1(in, out, iv, sz, key, nr); + } + else + #endif + { + AES_CBC_decrypt_AESNI(in, out, iv, sz, key, nr); + } + } + #endif /* HAVE_AES_DECRYPT */ + + static WC_INLINE void AesCtrEncryptBlocks(const unsigned char* in, + unsigned char* out, word32 sz, const unsigned char* key, int nr, + unsigned char* ctr) + { + #ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_CTR_encrypt_avx512(in, out, sz, key, nr, ctr); + } + else + #endif + #ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_CTR_encrypt_vaes(in, out, sz, key, nr, ctr); + } + else + #endif + #ifdef HAVE_INTEL_AVX1 + if (IS_INTEL_AVX1(intel_flags)) { + AES_CTR_encrypt_avx1(in, out, sz, key, nr, ctr); + } + else + #endif + { + AES_CTR_encrypt_AESNI(in, out, sz, key, nr, ctr); + } + } +#endif /* WOLFSSL_X86_64_BUILD */ + static WARN_UNUSED_RESULT int AES_set_encrypt_key_AESNI( const unsigned char *userKey, const int bits, Aes* aes) @@ -6858,8 +7070,13 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) else { tmp_align = tmp + (AESNI_ALIGN - ((wc_ptr_t)tmp % AESNI_ALIGN)); XMEMCPY(tmp_align, in, sz); + #ifdef WOLFSSL_X86_64_BUILD + AesCbcEncryptBlocks(tmp_align, tmp_align, (byte*)aes->reg, sz, + (byte*)aes->key, (int)aes->rounds); + 
#else AES_CBC_encrypt_AESNI(tmp_align, tmp_align, (byte*)aes->reg, sz, (byte*)aes->key, (int)aes->rounds); + #endif /* store iv for next call */ XMEMCPY(aes->reg, tmp_align + sz - WC_AES_BLOCK_SIZE, WC_AES_BLOCK_SIZE); @@ -6873,8 +7090,13 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) ret = BAD_ALIGN_E; #endif } else { + #ifdef WOLFSSL_X86_64_BUILD + AesCbcEncryptBlocks(in, out, (byte*)aes->reg, sz, (byte*)aes->key, + (int)aes->rounds); + #else AES_CBC_encrypt_AESNI(in, out, (byte*)aes->reg, sz, (byte*)aes->key, (int)aes->rounds); + #endif /* store iv for next call */ XMEMCPY(aes->reg, out + sz - WC_AES_BLOCK_SIZE, WC_AES_BLOCK_SIZE); @@ -7056,7 +7278,10 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) /* if input and output same will overwrite input iv */ XMEMCPY(aes->tmp, in + sz - WC_AES_BLOCK_SIZE, WC_AES_BLOCK_SIZE); - #if defined(WOLFSSL_AESNI_BY4) || defined(WOLFSSL_X86_BUILD) + #if defined(WOLFSSL_X86_64_BUILD) + AesCbcDecryptBlocks(in, out, (byte*)aes->reg, sz, (byte*)aes->key, + (int)aes->rounds); + #elif defined(WOLFSSL_AESNI_BY4) || defined(WOLFSSL_X86_BUILD) AES_CBC_decrypt_AESNI_by4(in, out, (byte*)aes->reg, sz, (byte*)aes->key, aes->rounds); #elif defined(WOLFSSL_AESNI_BY6) @@ -7563,6 +7788,19 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) #else VECTOR_REGISTERS_PUSH; + #if defined(WOLFSSL_AESNI) && defined(WOLFSSL_X86_64_BUILD) + if (aes->use_aesni && sz >= WC_AES_BLOCK_SIZE) { + word32 ctrBlocks = sz / WC_AES_BLOCK_SIZE; + word32 ctrBytes = ctrBlocks * WC_AES_BLOCK_SIZE; + AesCtrEncryptBlocks(in, out, ctrBytes, (byte*)aes->key, + (int)aes->rounds, (byte*)aes->reg); + in += ctrBytes; + out += ctrBytes; + sz -= ctrBytes; + aes->left = 0; + } + #endif + #if defined(HAVE_AES_ECB) && !defined(WOLFSSL_PIC32MZ_CRYPT) && \ !defined(XTRANSFORM_AESCTRBLOCK) if (in != out && sz >= WC_AES_BLOCK_SIZE) { @@ -7910,7 +8148,17 @@ void GenerateM0(Gcm* gcm) #if defined(WOLFSSL_AESNI) && 
defined(USE_INTEL_SPEEDUP) #define HAVE_INTEL_AVX1 - #define HAVE_INTEL_AVX2 + #ifndef NO_AVX2_SUPPORT + #define HAVE_INTEL_AVX2 + #endif + #ifdef WOLFSSL_X86_64_BUILD + #ifndef NO_VAES_SUPPORT + #define HAVE_INTEL_VAES + #endif + #ifndef NO_AVX512_SUPPORT + #define HAVE_INTEL_AVX512 + #endif + #endif #endif #if defined(WOLFSSL_AESNI) && defined(GCM_TABLE_4BIT) && \ @@ -8128,6 +8376,24 @@ void AES_GCM_encrypt_avx2(const unsigned char *in, unsigned char *out, word32 tbytes, const unsigned char* key, int nr) XASM_LINK("AES_GCM_encrypt_avx2"); +#ifdef HAVE_INTEL_AVX512 +void AES_GCM_encrypt_avx512(const unsigned char *in, unsigned char *out, + const unsigned char* addt, const unsigned char* ivec, + unsigned char *tag, word32 nbytes, + word32 abytes, word32 ibytes, + word32 tbytes, const unsigned char* key, + int nr) + XASM_LINK("AES_GCM_encrypt_avx512"); +#endif +#ifdef HAVE_INTEL_VAES +void AES_GCM_encrypt_vaes(const unsigned char *in, unsigned char *out, + const unsigned char* addt, const unsigned char* ivec, + unsigned char *tag, word32 nbytes, + word32 abytes, word32 ibytes, + word32 tbytes, const unsigned char* key, + int nr) + XASM_LINK("AES_GCM_encrypt_vaes"); +#endif #endif /* HAVE_INTEL_AVX2 */ #endif /* HAVE_INTEL_AVX1 */ @@ -8152,6 +8418,22 @@ void AES_GCM_decrypt_avx2(const unsigned char *in, unsigned char *out, word32 abytes, word32 ibytes, word32 tbytes, const unsigned char* key, int nr, int* res) XASM_LINK("AES_GCM_decrypt_avx2"); +#ifdef HAVE_INTEL_AVX512 +void AES_GCM_decrypt_avx512(const unsigned char *in, unsigned char *out, + const unsigned char* addt, const unsigned char* ivec, + const unsigned char *tag, word32 nbytes, + word32 abytes, word32 ibytes, word32 tbytes, + const unsigned char* key, int nr, int* res) + XASM_LINK("AES_GCM_decrypt_avx512"); +#endif +#ifdef HAVE_INTEL_VAES +void AES_GCM_decrypt_vaes(const unsigned char *in, unsigned char *out, + const unsigned char* addt, const unsigned char* ivec, + const unsigned char *tag, word32 
nbytes, + word32 abytes, word32 ibytes, word32 tbytes, + const unsigned char* key, int nr, int* res) + XASM_LINK("AES_GCM_decrypt_vaes"); +#endif #endif /* HAVE_INTEL_AVX2 */ #endif /* HAVE_INTEL_AVX1 */ #endif /* HAVE_AES_DECRYPT */ @@ -10535,6 +10817,22 @@ int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, #else #ifdef WOLFSSL_AESNI if (aes->use_aesni) { +#ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_encrypt_avx512(in, out, authIn, iv, authTag, sz, authInSz, ivSz, + authTagSz, (const byte*)aes->key, (int)aes->rounds); + ret = 0; + } + else +#endif +#ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_encrypt_vaes(in, out, authIn, iv, authTag, sz, authInSz, ivSz, + authTagSz, (const byte*)aes->key, (int)aes->rounds); + ret = 0; + } + else +#endif #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_AVX2(intel_flags)) { AES_GCM_encrypt_avx2(in, out, authIn, iv, authTag, sz, authInSz, ivSz, @@ -11293,6 +11591,28 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, #else #ifdef WOLFSSL_AESNI if (aes->use_aesni) { +#ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_decrypt_avx512(in, out, authIn, iv, authTag, sz, authInSz, ivSz, + authTagSz, (byte*)aes->key, (int)aes->rounds, &res); + if (res == 0) + ret = AES_GCM_AUTH_E; + else + ret = 0; + } + else +#endif +#ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_decrypt_vaes(in, out, authIn, iv, authTag, sz, authInSz, ivSz, + authTagSz, (byte*)aes->key, (int)aes->rounds, &res); + if (res == 0) + ret = AES_GCM_AUTH_E; + else + ret = 0; + } + else +#endif #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_AVX2(intel_flags)) { AES_GCM_decrypt_avx2(in, out, authIn, iv, authTag, sz, authInSz, ivSz, @@ -11513,19 +11833,73 @@ static WARN_UNUSED_RESULT int AesGcmFinal_C( extern void AES_GCM_init_avx2(const unsigned char* 
key, int nr, const unsigned char* ivec, unsigned int ibytes, unsigned char* h, unsigned char* counter, unsigned char* initCtr); +#ifdef HAVE_INTEL_AVX512 +extern void AES_GCM_init_avx512(const unsigned char* key, int nr, + const unsigned char* ivec, unsigned int ibytes, unsigned char* h, + unsigned char* counter, unsigned char* initCtr); +#endif +#ifdef HAVE_INTEL_VAES +extern void AES_GCM_init_vaes(const unsigned char* key, int nr, + const unsigned char* ivec, unsigned int ibytes, unsigned char* h, + unsigned char* counter, unsigned char* initCtr); +#endif extern void AES_GCM_aad_update_avx2(const unsigned char* addt, unsigned int abytes, unsigned char* tag, unsigned char* h); +#ifdef HAVE_INTEL_AVX512 +extern void AES_GCM_aad_update_avx512(const unsigned char* addt, + unsigned int abytes, unsigned char* tag, unsigned char* h); +#endif +#ifdef HAVE_INTEL_VAES +extern void AES_GCM_aad_update_vaes(const unsigned char* addt, + unsigned int abytes, unsigned char* tag, unsigned char* h); +#endif extern void AES_GCM_encrypt_block_avx2(const unsigned char* key, int nr, unsigned char* out, const unsigned char* in, unsigned char* counter); +#ifdef HAVE_INTEL_AVX512 +extern void AES_GCM_encrypt_block_avx512(const unsigned char* key, int nr, + unsigned char* out, const unsigned char* in, unsigned char* counter); +#endif +#ifdef HAVE_INTEL_VAES +extern void AES_GCM_encrypt_block_vaes(const unsigned char* key, int nr, + unsigned char* out, const unsigned char* in, unsigned char* counter); +#endif extern void AES_GCM_ghash_block_avx2(const unsigned char* data, unsigned char* tag, unsigned char* h); +#ifdef HAVE_INTEL_AVX512 +extern void AES_GCM_ghash_block_avx512(const unsigned char* data, + unsigned char* tag, unsigned char* h); +#endif +#ifdef HAVE_INTEL_VAES +extern void AES_GCM_ghash_block_vaes(const unsigned char* data, + unsigned char* tag, unsigned char* h); +#endif extern void AES_GCM_encrypt_update_avx2(const unsigned char* key, int nr, unsigned char* out, const 
unsigned char* in, unsigned int nbytes, unsigned char* tag, unsigned char* h, unsigned char* counter); +#ifdef HAVE_INTEL_AVX512 +extern void AES_GCM_encrypt_update_avx512(const unsigned char* key, int nr, + unsigned char* out, const unsigned char* in, unsigned int nbytes, + unsigned char* tag, unsigned char* h, unsigned char* counter); +#endif +#ifdef HAVE_INTEL_VAES +extern void AES_GCM_encrypt_update_vaes(const unsigned char* key, int nr, + unsigned char* out, const unsigned char* in, unsigned int nbytes, + unsigned char* tag, unsigned char* h, unsigned char* counter); +#endif extern void AES_GCM_encrypt_final_avx2(unsigned char* tag, unsigned char* authTag, unsigned int tbytes, unsigned int nbytes, unsigned int abytes, unsigned char* h, unsigned char* initCtr); +#ifdef HAVE_INTEL_AVX512 +extern void AES_GCM_encrypt_final_avx512(unsigned char* tag, + unsigned char* authTag, unsigned int tbytes, unsigned int nbytes, + unsigned int abytes, unsigned char* h, unsigned char* initCtr); +#endif +#ifdef HAVE_INTEL_VAES +extern void AES_GCM_encrypt_final_vaes(unsigned char* tag, + unsigned char* authTag, unsigned int tbytes, unsigned int nbytes, + unsigned int abytes, unsigned char* h, unsigned char* initCtr); +#endif #endif #ifdef HAVE_INTEL_AVX1 extern void AES_GCM_init_avx1(const unsigned char* key, int nr, @@ -11587,6 +11961,20 @@ static WARN_UNUSED_RESULT int AesGcmInit_aesni( aes->aOver = 0; aes->cOver = 0; +#ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_init_avx512((byte*)aes->key, (int)aes->rounds, iv, ivSz, + aes->gcm.H, AES_COUNTER(aes), AES_INITCTR(aes)); + } + else +#endif +#ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_init_vaes((byte*)aes->key, (int)aes->rounds, iv, ivSz, + aes->gcm.H, AES_COUNTER(aes), AES_INITCTR(aes)); + } + else +#endif #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_AVX2(intel_flags)) { AES_GCM_init_avx2((byte*)aes->key, (int)aes->rounds, 
iv, ivSz, @@ -11641,6 +12029,20 @@ static WARN_UNUSED_RESULT int AesGcmAadUpdate_aesni( aes->aOver = (byte)(aes->aOver + sz); if (aes->aOver == WC_AES_BLOCK_SIZE) { /* We have filled up the block and can process. */ +#ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_ghash_block_avx512(AES_LASTGBLOCK(aes), AES_TAG(aes), + aes->gcm.H); + } + else +#endif +#ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_ghash_block_vaes(AES_LASTGBLOCK(aes), AES_TAG(aes), + aes->gcm.H); + } + else +#endif #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_AVX2(intel_flags)) { AES_GCM_ghash_block_avx2(AES_LASTGBLOCK(aes), AES_TAG(aes), @@ -11672,6 +12074,20 @@ static WARN_UNUSED_RESULT int AesGcmAadUpdate_aesni( partial = aSz % WC_AES_BLOCK_SIZE; if (blocks > 0) { /* GHASH full blocks now. */ +#ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_aad_update_avx512(a, blocks * WC_AES_BLOCK_SIZE, + AES_TAG(aes), aes->gcm.H); + } + else +#endif +#ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_aad_update_vaes(a, blocks * WC_AES_BLOCK_SIZE, + AES_TAG(aes), aes->gcm.H); + } + else +#endif #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_AVX2(intel_flags)) { AES_GCM_aad_update_avx2(a, blocks * WC_AES_BLOCK_SIZE, @@ -11705,6 +12121,20 @@ static WARN_UNUSED_RESULT int AesGcmAadUpdate_aesni( XMEMSET(AES_LASTGBLOCK(aes) + aes->aOver, 0, (size_t)WC_AES_BLOCK_SIZE - aes->aOver); /* GHASH last AAD block. 
*/ +#ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_ghash_block_avx512(AES_LASTGBLOCK(aes), AES_TAG(aes), + aes->gcm.H); + } + else +#endif +#ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_ghash_block_vaes(AES_LASTGBLOCK(aes), AES_TAG(aes), + aes->gcm.H); + } + else +#endif #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_AVX2(intel_flags)) { AES_GCM_ghash_block_avx2(AES_LASTGBLOCK(aes), AES_TAG(aes), @@ -11772,6 +12202,20 @@ static WARN_UNUSED_RESULT int AesGcmEncryptUpdate_aesni( aes->cOver = (byte)(aes->cOver + sz); if (aes->cOver == WC_AES_BLOCK_SIZE) { /* We have filled up the block and can process. */ +#ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_ghash_block_avx512(AES_LASTGBLOCK(aes), AES_TAG(aes), + aes->gcm.H); + } + else +#endif +#ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_ghash_block_vaes(AES_LASTGBLOCK(aes), AES_TAG(aes), + aes->gcm.H); + } + else +#endif #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_AVX2(intel_flags)) { AES_GCM_ghash_block_avx2(AES_LASTGBLOCK(aes), AES_TAG(aes), @@ -11804,6 +12248,22 @@ static WARN_UNUSED_RESULT int AesGcmEncryptUpdate_aesni( partial = cSz % WC_AES_BLOCK_SIZE; if (blocks > 0) { /* Encrypt and GHASH full blocks now. 
*/ +#ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_encrypt_update_avx512((byte*)aes->key, (int)aes->rounds, + c, p, blocks * WC_AES_BLOCK_SIZE, AES_TAG(aes), aes->gcm.H, + AES_COUNTER(aes)); + } + else +#endif +#ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_encrypt_update_vaes((byte*)aes->key, (int)aes->rounds, + c, p, blocks * WC_AES_BLOCK_SIZE, AES_TAG(aes), aes->gcm.H, + AES_COUNTER(aes)); + } + else +#endif #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_AVX2(intel_flags)) { AES_GCM_encrypt_update_avx2((byte*)aes->key, (int)aes->rounds, @@ -11832,6 +12292,20 @@ static WARN_UNUSED_RESULT int AesGcmEncryptUpdate_aesni( if (partial != 0) { /* Encrypt the counter - XOR in zeros as proxy for plaintext. */ XMEMSET(AES_LASTGBLOCK(aes), 0, WC_AES_BLOCK_SIZE); +#ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_encrypt_block_avx512((byte*)aes->key, (int)aes->rounds, + AES_LASTGBLOCK(aes), AES_LASTGBLOCK(aes), AES_COUNTER(aes)); + } + else +#endif +#ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_encrypt_block_vaes((byte*)aes->key, (int)aes->rounds, + AES_LASTGBLOCK(aes), AES_LASTGBLOCK(aes), AES_COUNTER(aes)); + } + else +#endif #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_AVX2(intel_flags)) { AES_GCM_encrypt_block_avx2((byte*)aes->key, (int)aes->rounds, @@ -11887,6 +12361,20 @@ static WARN_UNUSED_RESULT int AesGcmEncryptFinal_aesni( /* Fill the rest of the block with zeros. */ XMEMSET(AES_LASTGBLOCK(aes) + over, 0, (size_t)WC_AES_BLOCK_SIZE - over); /* GHASH last cipher block. 
*/ +#ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_ghash_block_avx512(AES_LASTGBLOCK(aes), AES_TAG(aes), + aes->gcm.H); + } + else +#endif +#ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_ghash_block_vaes(AES_LASTGBLOCK(aes), AES_TAG(aes), + aes->gcm.H); + } + else +#endif #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_AVX2(intel_flags)) { AES_GCM_ghash_block_avx2(AES_LASTGBLOCK(aes), AES_TAG(aes), @@ -11907,6 +12395,20 @@ static WARN_UNUSED_RESULT int AesGcmEncryptFinal_aesni( } } /* Calculate the authentication tag. */ +#ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_encrypt_final_avx512(AES_TAG(aes), authTag, authTagSz, aes->cSz, + aes->aSz, aes->gcm.H, AES_INITCTR(aes)); + } + else +#endif +#ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_encrypt_final_vaes(AES_TAG(aes), authTag, authTagSz, aes->cSz, + aes->aSz, aes->gcm.H, AES_INITCTR(aes)); + } + else +#endif #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_AVX2(intel_flags)) { AES_GCM_encrypt_final_avx2(AES_TAG(aes), authTag, authTagSz, aes->cSz, @@ -11940,9 +12442,29 @@ static WARN_UNUSED_RESULT int AesGcmEncryptFinal_aesni( extern void AES_GCM_decrypt_update_avx2(const unsigned char* key, int nr, unsigned char* out, const unsigned char* in, unsigned int nbytes, unsigned char* tag, unsigned char* h, unsigned char* counter); +#ifdef HAVE_INTEL_AVX512 +extern void AES_GCM_decrypt_update_avx512(const unsigned char* key, int nr, + unsigned char* out, const unsigned char* in, unsigned int nbytes, + unsigned char* tag, unsigned char* h, unsigned char* counter); +#endif +#ifdef HAVE_INTEL_VAES +extern void AES_GCM_decrypt_update_vaes(const unsigned char* key, int nr, + unsigned char* out, const unsigned char* in, unsigned int nbytes, + unsigned char* tag, unsigned char* h, unsigned char* counter); +#endif extern void 
AES_GCM_decrypt_final_avx2(unsigned char* tag, const unsigned char* authTag, unsigned int tbytes, unsigned int nbytes, unsigned int abytes, unsigned char* h, unsigned char* initCtr, int* res); +#ifdef HAVE_INTEL_AVX512 +extern void AES_GCM_decrypt_final_avx512(unsigned char* tag, + const unsigned char* authTag, unsigned int tbytes, unsigned int nbytes, + unsigned int abytes, unsigned char* h, unsigned char* initCtr, int* res); +#endif +#ifdef HAVE_INTEL_VAES +extern void AES_GCM_decrypt_final_vaes(unsigned char* tag, + const unsigned char* authTag, unsigned int tbytes, unsigned int nbytes, + unsigned int abytes, unsigned char* h, unsigned char* initCtr, int* res); +#endif #endif #ifdef HAVE_INTEL_AVX1 extern void AES_GCM_decrypt_update_avx1(const unsigned char* key, int nr, @@ -12005,6 +12527,20 @@ static WARN_UNUSED_RESULT int AesGcmDecryptUpdate_aesni( aes->cOver = (byte)(aes->cOver + sz); if (aes->cOver == WC_AES_BLOCK_SIZE) { /* We have filled up the block and can process. */ +#ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_ghash_block_avx512(AES_LASTBLOCK(aes), AES_TAG(aes), + aes->gcm.H); + } + else +#endif +#ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_ghash_block_vaes(AES_LASTBLOCK(aes), AES_TAG(aes), + aes->gcm.H); + } + else +#endif #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_AVX2(intel_flags)) { AES_GCM_ghash_block_avx2(AES_LASTBLOCK(aes), AES_TAG(aes), @@ -12037,6 +12573,22 @@ static WARN_UNUSED_RESULT int AesGcmDecryptUpdate_aesni( partial = cSz % WC_AES_BLOCK_SIZE; if (blocks > 0) { /* Decrypt and GHASH full blocks now. 
*/ +#ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_decrypt_update_avx512((byte*)aes->key, (int)aes->rounds, + p, c, blocks * WC_AES_BLOCK_SIZE, AES_TAG(aes), aes->gcm.H, + AES_COUNTER(aes)); + } + else +#endif +#ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_decrypt_update_vaes((byte*)aes->key, (int)aes->rounds, + p, c, blocks * WC_AES_BLOCK_SIZE, AES_TAG(aes), aes->gcm.H, + AES_COUNTER(aes)); + } + else +#endif #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_AVX2(intel_flags)) { AES_GCM_decrypt_update_avx2((byte*)aes->key, (int)aes->rounds, @@ -12065,6 +12617,20 @@ static WARN_UNUSED_RESULT int AesGcmDecryptUpdate_aesni( if (partial != 0) { /* Encrypt the counter - XOR in zeros as proxy for cipher text. */ XMEMSET(AES_LASTGBLOCK(aes), 0, WC_AES_BLOCK_SIZE); +#ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_encrypt_block_avx512((byte*)aes->key, (int)aes->rounds, + AES_LASTGBLOCK(aes), AES_LASTGBLOCK(aes), AES_COUNTER(aes)); + } + else +#endif +#ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_encrypt_block_vaes((byte*)aes->key, (int)aes->rounds, + AES_LASTGBLOCK(aes), AES_LASTGBLOCK(aes), AES_COUNTER(aes)); + } + else +#endif #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_AVX2(intel_flags)) { AES_GCM_encrypt_block_avx2((byte*)aes->key, (int)aes->rounds, @@ -12127,6 +12693,18 @@ static WARN_UNUSED_RESULT int AesGcmDecryptFinal_aesni( /* Zeroize the unused part of the block. */ XMEMSET(lastBlock + over, 0, (size_t)WC_AES_BLOCK_SIZE - over); /* Hash the last block of cipher text. 
*/ +#ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_ghash_block_avx512(lastBlock, AES_TAG(aes), aes->gcm.H); + } + else +#endif +#ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_ghash_block_vaes(lastBlock, AES_TAG(aes), aes->gcm.H); + } + else +#endif #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_AVX2(intel_flags)) { AES_GCM_ghash_block_avx2(lastBlock, AES_TAG(aes), aes->gcm.H); @@ -12144,6 +12722,20 @@ static WARN_UNUSED_RESULT int AesGcmDecryptFinal_aesni( } } /* Calculate and compare the authentication tag. */ +#ifdef HAVE_INTEL_AVX512 + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_decrypt_final_avx512(AES_TAG(aes), authTag, authTagSz, aes->cSz, + aes->aSz, aes->gcm.H, AES_INITCTR(aes), &res); + } + else +#endif +#ifdef HAVE_INTEL_VAES + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_GCM_decrypt_final_vaes(AES_TAG(aes), authTag, authTagSz, aes->cSz, + aes->aSz, aes->gcm.H, AES_INITCTR(aes), &res); + } + else +#endif #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_AVX2(intel_flags)) { AES_GCM_decrypt_final_avx2(AES_TAG(aes), authTag, authTagSz, aes->cSz, @@ -14537,7 +15129,11 @@ static WARN_UNUSED_RESULT int _AesEcbEncrypt( #else #ifdef WOLFSSL_AESNI if (aes->use_aesni) { + #ifdef WOLFSSL_X86_64_BUILD + AesEcbEncryptBlocks(in, out, sz, (byte*)aes->key, (int)aes->rounds); + #else AES_ECB_encrypt_AESNI(in, out, sz, (byte*)aes->key, (int)aes->rounds); + #endif } else #endif @@ -14632,7 +15228,11 @@ static WARN_UNUSED_RESULT int _AesEcbDecrypt( #else #ifdef WOLFSSL_AESNI if (aes->use_aesni) { + #ifdef WOLFSSL_X86_64_BUILD + AesEcbDecryptBlocks(in, out, sz, (byte*)aes->key, (int)aes->rounds); + #else AES_ECB_decrypt_AESNI(in, out, sz, (byte*)aes->key, (int)aes->rounds); + #endif } else #endif @@ -15797,6 +16397,37 @@ void AES_XTS_encrypt_update_avx1(const unsigned char *in, unsigned char *out, wo 
XASM_LINK("AES_XTS_encrypt_update_avx1"); #endif #endif /* HAVE_INTEL_AVX1 */ +#ifdef HAVE_INTEL_VAES +void AES_XTS_encrypt_vaes(const unsigned char *in, unsigned char *out, + word32 sz, const unsigned char* i, + const unsigned char* key, const unsigned char* key2, + int nr) + XASM_LINK("AES_XTS_encrypt_vaes"); +#ifdef WOLFSSL_AESXTS_STREAM +void AES_XTS_init_vaes(unsigned char* i, const unsigned char* tweak_key, + int tweak_nr) + XASM_LINK("AES_XTS_init_vaes"); +void AES_XTS_encrypt_update_vaes(const unsigned char *in, unsigned char *out, word32 sz, + const unsigned char* key, unsigned char *i, int nr) + XASM_LINK("AES_XTS_encrypt_update_vaes"); +#endif +#endif /* HAVE_INTEL_VAES */ +#ifdef HAVE_INTEL_AVX512 +void AES_XTS_encrypt_avx512(const unsigned char *in, unsigned char *out, + word32 sz, const unsigned char* i, + const unsigned char* key, const unsigned char* key2, + int nr) + XASM_LINK("AES_XTS_encrypt_avx512"); +#ifdef WOLFSSL_AESXTS_STREAM +void AES_XTS_init_avx512(unsigned char* i, const unsigned char* tweak_key, + int tweak_nr) + XASM_LINK("AES_XTS_init_avx512"); +void AES_XTS_encrypt_update_avx512(const unsigned char *in, unsigned char *out, word32 sz, + const unsigned char* key, unsigned char *i, int nr) + XASM_LINK("AES_XTS_encrypt_update_avx512"); +#endif +#endif /* HAVE_INTEL_AVX512 */ + #ifdef HAVE_AES_DECRYPT void AES_XTS_decrypt_aesni(const unsigned char *in, unsigned char *out, word32 sz, @@ -15820,6 +16451,30 @@ void AES_XTS_decrypt_update_avx1(const unsigned char *in, unsigned char *out, wo XASM_LINK("AES_XTS_decrypt_update_avx1"); #endif #endif /* HAVE_INTEL_AVX1 */ +#ifdef HAVE_INTEL_VAES +void AES_XTS_decrypt_vaes(const unsigned char *in, unsigned char *out, + word32 sz, const unsigned char* i, + const unsigned char* key, const unsigned char* key2, + int nr) + XASM_LINK("AES_XTS_decrypt_vaes"); +#ifdef WOLFSSL_AESXTS_STREAM +void AES_XTS_decrypt_update_vaes(const unsigned char *in, unsigned char *out, word32 sz, + const unsigned char* key, 
unsigned char *i, int nr) + XASM_LINK("AES_XTS_decrypt_update_vaes"); +#endif +#endif /* HAVE_INTEL_VAES */ +#ifdef HAVE_INTEL_AVX512 +void AES_XTS_decrypt_avx512(const unsigned char *in, unsigned char *out, + word32 sz, const unsigned char* i, + const unsigned char* key, const unsigned char* key2, + int nr) + XASM_LINK("AES_XTS_decrypt_avx512"); +#ifdef WOLFSSL_AESXTS_STREAM +void AES_XTS_decrypt_update_avx512(const unsigned char *in, unsigned char *out, word32 sz, + const unsigned char* key, unsigned char *i, int nr) + XASM_LINK("AES_XTS_decrypt_update_avx512"); +#endif +#endif /* HAVE_INTEL_AVX512 */ #endif /* HAVE_AES_DECRYPT */ #endif /* WOLFSSL_AESNI */ @@ -16078,6 +16733,26 @@ int wc_AesXtsEncrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz, #elif defined(WOLFSSL_AESNI) if (aes->use_aesni) { SAVE_VECTOR_REGISTERS(return _svr_ret;); +#if defined(HAVE_INTEL_AVX512) + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_XTS_encrypt_avx512(in, out, sz, i, + (const byte*)aes->key, + (const byte*)xaes->tweak.key, + (int)aes->rounds); + ret = 0; + } + else +#endif +#if defined(HAVE_INTEL_VAES) + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_XTS_encrypt_vaes(in, out, sz, i, + (const byte*)aes->key, + (const byte*)xaes->tweak.key, + (int)aes->rounds); + ret = 0; + } + else +#endif #if defined(HAVE_INTEL_AVX1) if (IS_INTEL_AVX1(intel_flags)) { AES_XTS_encrypt_avx1(in, out, sz, i, @@ -16180,6 +16855,24 @@ int wc_AesXtsEncryptInit(XtsAes* xaes, const byte* i, word32 iSz, #ifdef WOLFSSL_AESNI if (aes->use_aesni) { SAVE_VECTOR_REGISTERS(return _svr_ret;); +#if defined(HAVE_INTEL_AVX512) + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_XTS_init_avx512(stream->tweak_block, + (const byte*)xaes->tweak.key, + (int)xaes->tweak.rounds); + ret = 0; + } + else +#endif +#if defined(HAVE_INTEL_VAES) + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_XTS_init_vaes(stream->tweak_block, + 
(const byte*)xaes->tweak.key, + (int)xaes->tweak.rounds); + ret = 0; + } + else +#endif #if defined(HAVE_INTEL_AVX1) if (IS_INTEL_AVX1(intel_flags)) { AES_XTS_init_avx1(stream->tweak_block, @@ -16275,6 +16968,26 @@ static int AesXtsEncryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s #ifdef WOLFSSL_AESNI if (aes->use_aesni) { SAVE_VECTOR_REGISTERS(return _svr_ret;); +#if defined(HAVE_INTEL_AVX512) + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_XTS_encrypt_update_avx512(in, out, sz, + (const byte*)aes->key, + stream->tweak_block, + (int)aes->rounds); + ret = 0; + } + else +#endif +#if defined(HAVE_INTEL_VAES) + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_XTS_encrypt_update_vaes(in, out, sz, + (const byte*)aes->key, + stream->tweak_block, + (int)aes->rounds); + ret = 0; + } + else +#endif #if defined(HAVE_INTEL_AVX1) if (IS_INTEL_AVX1(intel_flags)) { AES_XTS_encrypt_update_avx1(in, out, sz, @@ -16559,6 +17272,26 @@ int wc_AesXtsDecrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz, #elif defined(WOLFSSL_AESNI) if (aes->use_aesni) { SAVE_VECTOR_REGISTERS(return _svr_ret;); +#if defined(HAVE_INTEL_AVX512) + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_XTS_decrypt_avx512(in, out, sz, i, + (const byte*)aes->key, + (const byte*)xaes->tweak.key, + (int)aes->rounds); + ret = 0; + } + else +#endif +#if defined(HAVE_INTEL_VAES) + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_XTS_decrypt_vaes(in, out, sz, i, + (const byte*)aes->key, + (const byte*)xaes->tweak.key, + (int)aes->rounds); + ret = 0; + } + else +#endif #if defined(HAVE_INTEL_AVX1) if (IS_INTEL_AVX1(intel_flags)) { AES_XTS_decrypt_avx1(in, out, sz, i, @@ -16664,6 +17397,24 @@ int wc_AesXtsDecryptInit(XtsAes* xaes, const byte* i, word32 iSz, #ifdef WOLFSSL_AESNI if (aes->use_aesni) { SAVE_VECTOR_REGISTERS(return _svr_ret;); +#if defined(HAVE_INTEL_AVX512) + if (IS_INTEL_AVX512(intel_flags) && 
IS_INTEL_VAES(intel_flags)) { + AES_XTS_init_avx512(stream->tweak_block, + (const byte*)xaes->tweak.key, + (int)xaes->tweak.rounds); + ret = 0; + } + else +#endif +#if defined(HAVE_INTEL_VAES) + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_XTS_init_vaes(stream->tweak_block, + (const byte*)xaes->tweak.key, + (int)xaes->tweak.rounds); + ret = 0; + } + else +#endif #if defined(HAVE_INTEL_AVX1) if (IS_INTEL_AVX1(intel_flags)) { AES_XTS_init_avx1(stream->tweak_block, @@ -16751,6 +17502,26 @@ static int AesXtsDecryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s #ifdef WOLFSSL_AESNI if (aes->use_aesni) { SAVE_VECTOR_REGISTERS(return _svr_ret;); +#if defined(HAVE_INTEL_AVX512) + if (IS_INTEL_AVX512(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_XTS_decrypt_update_avx512(in, out, sz, + (const byte*)aes->key, + stream->tweak_block, + (int)aes->rounds); + ret = 0; + } + else +#endif +#if defined(HAVE_INTEL_VAES) + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_VAES(intel_flags)) { + AES_XTS_decrypt_update_vaes(in, out, sz, + (const byte*)aes->key, + stream->tweak_block, + (int)aes->rounds); + ret = 0; + } + else +#endif #if defined(HAVE_INTEL_AVX1) if (IS_INTEL_AVX1(intel_flags)) { AES_XTS_decrypt_update_avx1(in, out, sz, diff --git a/wolfcrypt/src/aes_asm.S b/wolfcrypt/src/aes_asm.S index 0371ca8cb22..d4131676542 100644 --- a/wolfcrypt/src/aes_asm.S +++ b/wolfcrypt/src/aes_asm.S @@ -46,1314 +46,7 @@ #endif /* WOLFSSL_USER_SETTINGS_ASM */ #endif /* WOLFSSL_USER_SETTINGS */ -#ifdef WOLFSSL_X86_64_BUILD - -/* -AES_CBC_encrypt_AESNI (const unsigned char *in, - unsigned char *out, - unsigned char ivec[16], - unsigned long length, - const unsigned char *KS, - int nr) -*/ -#ifndef __APPLE__ -.globl AES_CBC_encrypt_AESNI -AES_CBC_encrypt_AESNI: -#else -.globl _AES_CBC_encrypt_AESNI -_AES_CBC_encrypt_AESNI: -#endif -# parameter 1: %rdi -# parameter 2: %rsi -# parameter 3: %rdx -# parameter 4: %rcx -# parameter 5: %r8 -# parameter 6: %r9d -movq 
%rcx, %r10 -shrq $4, %rcx -shlq $60, %r10 -je NO_PARTS -addq $1, %rcx -NO_PARTS: -subq $16, %rsi -movdqa (%rdx), %xmm1 -LOOP: -pxor (%rdi), %xmm1 -pxor (%r8), %xmm1 -addq $16,%rsi -addq $16,%rdi -cmpl $12, %r9d -aesenc 16(%r8),%xmm1 -aesenc 32(%r8),%xmm1 -aesenc 48(%r8),%xmm1 -aesenc 64(%r8),%xmm1 -aesenc 80(%r8),%xmm1 -aesenc 96(%r8),%xmm1 -aesenc 112(%r8),%xmm1 -aesenc 128(%r8),%xmm1 -aesenc 144(%r8),%xmm1 -movdqa 160(%r8),%xmm2 -jb LAST -cmpl $14, %r9d - -aesenc 160(%r8),%xmm1 -aesenc 176(%r8),%xmm1 -movdqa 192(%r8),%xmm2 -jb LAST -aesenc 192(%r8),%xmm1 -aesenc 208(%r8),%xmm1 -movdqa 224(%r8),%xmm2 -LAST: -decq %rcx -aesenclast %xmm2,%xmm1 -movdqu %xmm1,(%rsi) -jne LOOP -ret - - -#if defined(WOLFSSL_AESNI_BY4) - -/* -AES_CBC_decrypt_AESNI_by4 (const unsigned char *in, - unsigned char *out, - unsigned char ivec[16], - unsigned long length, - const unsigned char *KS, - int nr) -*/ -#ifndef __APPLE__ -.globl AES_CBC_decrypt_AESNI_by4 -AES_CBC_decrypt_AESNI_by4: -#else -.globl _AES_CBC_decrypt_AESNI_by4 -_AES_CBC_decrypt_AESNI_by4: -#endif -# parameter 1: %rdi -# parameter 2: %rsi -# parameter 3: %rdx -# parameter 4: %rcx -# parameter 5: %r8 -# parameter 6: %r9d - - movq %rcx, %r10 - shrq $4, %rcx - shlq $60, %r10 - je DNO_PARTS_4 - addq $1, %rcx -DNO_PARTS_4: - movq %rcx, %r10 - shlq $62, %r10 - shrq $62, %r10 - shrq $2, %rcx - movdqu (%rdx),%xmm5 - je DREMAINDER_4 - subq $64, %rsi -DLOOP_4: - movdqu (%rdi), %xmm1 - movdqu 16(%rdi), %xmm2 - movdqu 32(%rdi), %xmm3 - movdqu 48(%rdi), %xmm4 - movdqa %xmm1, %xmm6 - movdqa %xmm2, %xmm7 - movdqa %xmm3, %xmm8 - movdqa %xmm4, %xmm15 - movdqa (%r8), %xmm9 - movdqa 16(%r8), %xmm10 - movdqa 32(%r8), %xmm11 - movdqa 48(%r8), %xmm12 - pxor %xmm9, %xmm1 - pxor %xmm9, %xmm2 - pxor %xmm9, %xmm3 - pxor %xmm9, %xmm4 - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm3 - aesdec %xmm10, %xmm4 - aesdec %xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm11, %xmm3 - aesdec %xmm11, %xmm4 - aesdec %xmm12, %xmm1 - aesdec 
%xmm12, %xmm2 - aesdec %xmm12, %xmm3 - aesdec %xmm12, %xmm4 - movdqa 64(%r8), %xmm9 - movdqa 80(%r8), %xmm10 - movdqa 96(%r8), %xmm11 - movdqa 112(%r8), %xmm12 - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm3 - aesdec %xmm9, %xmm4 - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm3 - aesdec %xmm10, %xmm4 - aesdec %xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm11, %xmm3 - aesdec %xmm11, %xmm4 - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdec %xmm12, %xmm3 - aesdec %xmm12, %xmm4 - movdqa 128(%r8), %xmm9 - movdqa 144(%r8), %xmm10 - movdqa 160(%r8), %xmm11 - cmpl $12, %r9d - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm3 - aesdec %xmm9, %xmm4 - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm3 - aesdec %xmm10, %xmm4 - jb DLAST_4 - movdqa 160(%r8), %xmm9 - movdqa 176(%r8), %xmm10 - movdqa 192(%r8), %xmm11 - cmpl $14, %r9d - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm3 - aesdec %xmm9, %xmm4 - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm3 - aesdec %xmm10, %xmm4 - jb DLAST_4 - movdqa 192(%r8), %xmm9 - movdqa 208(%r8), %xmm10 - movdqa 224(%r8), %xmm11 - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm3 - aesdec %xmm9, %xmm4 - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm3 - aesdec %xmm10, %xmm4 -DLAST_4: - addq $64, %rdi - addq $64, %rsi - decq %rcx - aesdeclast %xmm11, %xmm1 - aesdeclast %xmm11, %xmm2 - aesdeclast %xmm11, %xmm3 - aesdeclast %xmm11, %xmm4 - pxor %xmm5, %xmm1 - pxor %xmm6, %xmm2 - pxor %xmm7, %xmm3 - pxor %xmm8, %xmm4 - movdqu %xmm1, (%rsi) - movdqu %xmm2, 16(%rsi) - movdqu %xmm3, 32(%rsi) - movdqu %xmm4, 48(%rsi) - movdqa %xmm15,%xmm5 - jne DLOOP_4 - addq $64, %rsi -DREMAINDER_4: - cmpq $0, %r10 - je DEND_4 -DLOOP_4_2: - movdqu (%rdi), %xmm1 - movdqa %xmm1, %xmm15 - addq $16, %rdi - pxor (%r8), %xmm1 - movdqu 160(%r8), %xmm2 - cmpl $12, %r9d - aesdec 16(%r8), %xmm1 - aesdec 32(%r8), %xmm1 - aesdec 48(%r8), 
%xmm1 - aesdec 64(%r8), %xmm1 - aesdec 80(%r8), %xmm1 - aesdec 96(%r8), %xmm1 - aesdec 112(%r8), %xmm1 - aesdec 128(%r8), %xmm1 - aesdec 144(%r8), %xmm1 - jb DLAST_4_2 - movdqu 192(%r8), %xmm2 - cmpl $14, %r9d - aesdec 160(%r8), %xmm1 - aesdec 176(%r8), %xmm1 - jb DLAST_4_2 - movdqu 224(%r8), %xmm2 - aesdec 192(%r8), %xmm1 - aesdec 208(%r8), %xmm1 -DLAST_4_2: - aesdeclast %xmm2, %xmm1 - pxor %xmm5, %xmm1 - movdqa %xmm15, %xmm5 - movdqu %xmm1, (%rsi) - addq $16, %rsi - decq %r10 - jne DLOOP_4_2 -DEND_4: - ret - -#elif defined(WOLFSSL_AESNI_BY6) - -/* -AES_CBC_decrypt_AESNI_by6 (const unsigned char *in, - unsigned char *out, - unsigned char ivec[16], - unsigned long length, - const unsigned char *KS, - int nr) -*/ -#ifndef __APPLE__ -.globl AES_CBC_decrypt_AESNI_by6 -AES_CBC_decrypt_AESNI_by6: -#else -.globl _AES_CBC_decrypt_AESNI_by6 -_AES_CBC_decrypt_AESNI_by6: -#endif -# parameter 1: %rdi - in -# parameter 2: %rsi - out -# parameter 3: %rdx - ivec -# parameter 4: %rcx - length -# parameter 5: %r8 - KS -# parameter 6: %r9d - nr - - movq %rcx, %r10 - shrq $4, %rcx - shlq $60, %r10 - je DNO_PARTS_6 - addq $1, %rcx -DNO_PARTS_6: - movq %rax, %r12 - movq %rdx, %r13 - movq %rbx, %r14 - movq $0, %rdx - movq %rcx, %rax - movq $6, %rbx - div %rbx - movq %rax, %rcx - movq %rdx, %r10 - movq %r12, %rax - movq %r13, %rdx - movq %r14, %rbx - cmpq $0, %rcx - movdqu (%rdx), %xmm7 - je DREMAINDER_6 - subq $96, %rsi -DLOOP_6: - movdqu (%rdi), %xmm1 - movdqu 16(%rdi), %xmm2 - movdqu 32(%rdi), %xmm3 - movdqu 48(%rdi), %xmm4 - movdqu 64(%rdi), %xmm5 - movdqu 80(%rdi), %xmm6 - movdqa (%r8), %xmm8 - movdqa 16(%r8), %xmm9 - movdqa 32(%r8), %xmm10 - movdqa 48(%r8), %xmm11 - pxor %xmm8, %xmm1 - pxor %xmm8, %xmm2 - pxor %xmm8, %xmm3 - pxor %xmm8, %xmm4 - pxor %xmm8, %xmm5 - pxor %xmm8, %xmm6 - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm3 - aesdec %xmm9, %xmm4 - aesdec %xmm9, %xmm5 - aesdec %xmm9, %xmm6 - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm3 - 
aesdec %xmm10, %xmm4 - aesdec %xmm10, %xmm5 - aesdec %xmm10, %xmm6 - aesdec %xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm11, %xmm3 - aesdec %xmm11, %xmm4 - aesdec %xmm11, %xmm5 - aesdec %xmm11, %xmm6 - movdqa 64(%r8), %xmm8 - movdqa 80(%r8), %xmm9 - movdqa 96(%r8), %xmm10 - movdqa 112(%r8), %xmm11 - aesdec %xmm8, %xmm1 - aesdec %xmm8, %xmm2 - aesdec %xmm8, %xmm3 - aesdec %xmm8, %xmm4 - aesdec %xmm8, %xmm5 - aesdec %xmm8, %xmm6 - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm3 - aesdec %xmm9, %xmm4 - aesdec %xmm9, %xmm5 - aesdec %xmm9, %xmm6 - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm3 - aesdec %xmm10, %xmm4 - aesdec %xmm10, %xmm5 - aesdec %xmm10, %xmm6 - aesdec %xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm11, %xmm3 - aesdec %xmm11, %xmm4 - aesdec %xmm11, %xmm5 - aesdec %xmm11, %xmm6 - movdqa 128(%r8), %xmm8 - movdqa 144(%r8), %xmm9 - movdqa 160(%r8), %xmm10 - cmpl $12, %r9d - aesdec %xmm8, %xmm1 - aesdec %xmm8, %xmm2 - aesdec %xmm8, %xmm3 - aesdec %xmm8, %xmm4 - aesdec %xmm8, %xmm5 - aesdec %xmm8, %xmm6 - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm3 - aesdec %xmm9, %xmm4 - aesdec %xmm9, %xmm5 - aesdec %xmm9, %xmm6 - jb DLAST_6 - movdqa 160(%r8), %xmm8 - movdqa 176(%r8), %xmm9 - movdqa 192(%r8), %xmm10 - cmpl $14, %r9d - aesdec %xmm8, %xmm1 - aesdec %xmm8, %xmm2 - aesdec %xmm8, %xmm3 - aesdec %xmm8, %xmm4 - aesdec %xmm8, %xmm5 - aesdec %xmm8, %xmm6 - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm3 - aesdec %xmm9, %xmm4 - aesdec %xmm9, %xmm5 - aesdec %xmm9, %xmm6 - jb DLAST_6 - movdqa 192(%r8), %xmm8 - movdqa 208(%r8), %xmm9 - movdqa 224(%r8), %xmm10 - aesdec %xmm8, %xmm1 - aesdec %xmm8, %xmm2 - aesdec %xmm8, %xmm3 - aesdec %xmm8, %xmm4 - aesdec %xmm8, %xmm5 - aesdec %xmm8, %xmm6 - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm3 - aesdec %xmm9, %xmm4 - aesdec %xmm9, %xmm5 - aesdec %xmm9, %xmm6 -DLAST_6: - addq $96, %rsi - aesdeclast %xmm10, %xmm1 - aesdeclast %xmm10, 
%xmm2 - aesdeclast %xmm10, %xmm3 - aesdeclast %xmm10, %xmm4 - aesdeclast %xmm10, %xmm5 - aesdeclast %xmm10, %xmm6 - movdqu (%rdi), %xmm8 - movdqu 16(%rdi), %xmm9 - movdqu 32(%rdi), %xmm10 - movdqu 48(%rdi), %xmm11 - movdqu 64(%rdi), %xmm12 - movdqu 80(%rdi), %xmm13 - pxor %xmm7, %xmm1 - pxor %xmm8, %xmm2 - pxor %xmm9, %xmm3 - pxor %xmm10, %xmm4 - pxor %xmm11, %xmm5 - pxor %xmm12, %xmm6 - movdqu %xmm13, %xmm7 - movdqu %xmm1, (%rsi) - movdqu %xmm2, 16(%rsi) - movdqu %xmm3, 32(%rsi) - movdqu %xmm4, 48(%rsi) - movdqu %xmm5, 64(%rsi) - movdqu %xmm6, 80(%rsi) - addq $96, %rdi - decq %rcx - jne DLOOP_6 - addq $96, %rsi -DREMAINDER_6: - cmpq $0, %r10 - je DEND_6 -DLOOP_6_2: - movdqu (%rdi), %xmm1 - movdqa %xmm1, %xmm10 - addq $16, %rdi - pxor (%r8), %xmm1 - movdqu 160(%r8), %xmm2 - cmpl $12, %r9d - aesdec 16(%r8), %xmm1 - aesdec 32(%r8), %xmm1 - aesdec 48(%r8), %xmm1 - aesdec 64(%r8), %xmm1 - aesdec 80(%r8), %xmm1 - aesdec 96(%r8), %xmm1 - aesdec 112(%r8), %xmm1 - aesdec 128(%r8), %xmm1 - aesdec 144(%r8), %xmm1 - jb DLAST_6_2 - movdqu 192(%r8), %xmm2 - cmpl $14, %r9d - aesdec 160(%r8), %xmm1 - aesdec 176(%r8), %xmm1 - jb DLAST_6_2 - movdqu 224(%r8), %xmm2 - aesdec 192(%r8), %xmm1 - aesdec 208(%r8), %xmm1 -DLAST_6_2: - aesdeclast %xmm2, %xmm1 - pxor %xmm7, %xmm1 - movdqa %xmm10, %xmm7 - movdqu %xmm1, (%rsi) - addq $16, %rsi - decq %r10 - jne DLOOP_6_2 -DEND_6: - ret - -#else /* WOLFSSL_AESNI_BYx */ - -/* -AES_CBC_decrypt_AESNI_by8 (const unsigned char *in, - unsigned char *out, - unsigned char ivec[16], - unsigned long length, - const unsigned char *KS, - int nr) -*/ -#ifndef __APPLE__ -.globl AES_CBC_decrypt_AESNI_by8 -AES_CBC_decrypt_AESNI_by8: -#else -.globl _AES_CBC_decrypt_AESNI_by8 -_AES_CBC_decrypt_AESNI_by8: -#endif -# parameter 1: %rdi - in -# parameter 2: %rsi - out -# parameter 3: %rdx - ivec -# parameter 4: %rcx - length -# parameter 5: %r8 - KS -# parameter 6: %r9d - nr - - movq %rcx, %r10 - shrq $4, %rcx - shlq $60, %r10 - je DNO_PARTS_8 - addq $1, %rcx 
-DNO_PARTS_8: - movq %rcx, %r10 - shlq $61, %r10 - shrq $61, %r10 - shrq $3, %rcx - movdqu (%rdx), %xmm9 - je DREMAINDER_8 - subq $128, %rsi -DLOOP_8: - movdqu (%rdi), %xmm1 - movdqu 16(%rdi), %xmm2 - movdqu 32(%rdi), %xmm3 - movdqu 48(%rdi), %xmm4 - movdqu 64(%rdi), %xmm5 - movdqu 80(%rdi), %xmm6 - movdqu 96(%rdi), %xmm7 - movdqu 112(%rdi), %xmm8 - movdqa (%r8), %xmm10 - movdqa 16(%r8), %xmm11 - movdqa 32(%r8), %xmm12 - movdqa 48(%r8), %xmm13 - pxor %xmm10, %xmm1 - pxor %xmm10, %xmm2 - pxor %xmm10, %xmm3 - pxor %xmm10, %xmm4 - pxor %xmm10, %xmm5 - pxor %xmm10, %xmm6 - pxor %xmm10, %xmm7 - pxor %xmm10, %xmm8 - aesdec %xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm11, %xmm3 - aesdec %xmm11, %xmm4 - aesdec %xmm11, %xmm5 - aesdec %xmm11, %xmm6 - aesdec %xmm11, %xmm7 - aesdec %xmm11, %xmm8 - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdec %xmm12, %xmm3 - aesdec %xmm12, %xmm4 - aesdec %xmm12, %xmm5 - aesdec %xmm12, %xmm6 - aesdec %xmm12, %xmm7 - aesdec %xmm12, %xmm8 - aesdec %xmm13, %xmm1 - aesdec %xmm13, %xmm2 - aesdec %xmm13, %xmm3 - aesdec %xmm13, %xmm4 - aesdec %xmm13, %xmm5 - aesdec %xmm13, %xmm6 - aesdec %xmm13, %xmm7 - aesdec %xmm13, %xmm8 - movdqa 64(%r8), %xmm10 - movdqa 80(%r8), %xmm11 - movdqa 96(%r8), %xmm12 - movdqa 112(%r8), %xmm13 - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm3 - aesdec %xmm10, %xmm4 - aesdec %xmm10, %xmm5 - aesdec %xmm10, %xmm6 - aesdec %xmm10, %xmm7 - aesdec %xmm10, %xmm8 - aesdec %xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm11, %xmm3 - aesdec %xmm11, %xmm4 - aesdec %xmm11, %xmm5 - aesdec %xmm11, %xmm6 - aesdec %xmm11, %xmm7 - aesdec %xmm11, %xmm8 - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdec %xmm12, %xmm3 - aesdec %xmm12, %xmm4 - aesdec %xmm12, %xmm5 - aesdec %xmm12, %xmm6 - aesdec %xmm12, %xmm7 - aesdec %xmm12, %xmm8 - aesdec %xmm13, %xmm1 - aesdec %xmm13, %xmm2 - aesdec %xmm13, %xmm3 - aesdec %xmm13, %xmm4 - aesdec %xmm13, %xmm5 - aesdec %xmm13, %xmm6 - aesdec %xmm13, %xmm7 - aesdec %xmm13, 
%xmm8 - movdqa 128(%r8), %xmm10 - movdqa 144(%r8), %xmm11 - movdqa 160(%r8), %xmm12 - cmpl $12, %r9d - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm3 - aesdec %xmm10, %xmm4 - aesdec %xmm10, %xmm5 - aesdec %xmm10, %xmm6 - aesdec %xmm10, %xmm7 - aesdec %xmm10, %xmm8 - aesdec %xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm11, %xmm3 - aesdec %xmm11, %xmm4 - aesdec %xmm11, %xmm5 - aesdec %xmm11, %xmm6 - aesdec %xmm11, %xmm7 - aesdec %xmm11, %xmm8 - jb DLAST_8 - movdqa 160(%r8), %xmm10 - movdqa 176(%r8), %xmm11 - movdqa 192(%r8), %xmm12 - cmpl $14, %r9d - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm3 - aesdec %xmm10, %xmm4 - aesdec %xmm10, %xmm5 - aesdec %xmm10, %xmm6 - aesdec %xmm10, %xmm7 - aesdec %xmm10, %xmm8 - aesdec %xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm11, %xmm3 - aesdec %xmm11, %xmm4 - aesdec %xmm11, %xmm5 - aesdec %xmm11, %xmm6 - aesdec %xmm11, %xmm7 - aesdec %xmm11, %xmm8 - jb DLAST_8 - movdqa 192(%r8), %xmm10 - movdqa 208(%r8), %xmm11 - movdqa 224(%r8), %xmm12 - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm3 - aesdec %xmm10, %xmm4 - aesdec %xmm10, %xmm5 - aesdec %xmm10, %xmm6 - aesdec %xmm10, %xmm7 - aesdec %xmm10, %xmm8 - aesdec %xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm11, %xmm3 - aesdec %xmm11, %xmm4 - aesdec %xmm11, %xmm5 - aesdec %xmm11, %xmm6 - aesdec %xmm11, %xmm7 - aesdec %xmm11, %xmm8 -DLAST_8: - addq $128, %rsi - aesdeclast %xmm12, %xmm1 - aesdeclast %xmm12, %xmm2 - aesdeclast %xmm12, %xmm3 - aesdeclast %xmm12, %xmm4 - aesdeclast %xmm12, %xmm5 - aesdeclast %xmm12, %xmm6 - aesdeclast %xmm12, %xmm7 - aesdeclast %xmm12, %xmm8 - movdqu (%rdi), %xmm10 - movdqu 16(%rdi), %xmm11 - movdqu 32(%rdi), %xmm12 - movdqu 48(%rdi), %xmm13 - pxor %xmm9, %xmm1 - pxor %xmm10, %xmm2 - pxor %xmm11, %xmm3 - pxor %xmm12, %xmm4 - pxor %xmm13, %xmm5 - movdqu 64(%rdi), %xmm10 - movdqu 80(%rdi), %xmm11 - movdqu 96(%rdi), %xmm12 - movdqu 112(%rdi), %xmm9 - pxor %xmm10, %xmm6 - pxor %xmm11, %xmm7 
- pxor %xmm12, %xmm8 - movdqu %xmm1, (%rsi) - movdqu %xmm2, 16(%rsi) - movdqu %xmm3, 32(%rsi) - movdqu %xmm4, 48(%rsi) - movdqu %xmm5, 64(%rsi) - movdqu %xmm6, 80(%rsi) - movdqu %xmm7, 96(%rsi) - movdqu %xmm8, 112(%rsi) - addq $128, %rdi - decq %rcx - jne DLOOP_8 - addq $128, %rsi -DREMAINDER_8: - cmpq $0, %r10 - je DEND_8 -DLOOP_8_2: - movdqu (%rdi), %xmm1 - movdqa %xmm1, %xmm10 - addq $16, %rdi - pxor (%r8), %xmm1 - movdqu 160(%r8), %xmm2 - cmpl $12, %r9d - aesdec 16(%r8), %xmm1 - aesdec 32(%r8), %xmm1 - aesdec 48(%r8), %xmm1 - aesdec 64(%r8), %xmm1 - aesdec 80(%r8), %xmm1 - aesdec 96(%r8), %xmm1 - aesdec 112(%r8), %xmm1 - aesdec 128(%r8), %xmm1 - aesdec 144(%r8), %xmm1 - jb DLAST_8_2 - movdqu 192(%r8), %xmm2 - cmpl $14, %r9d - aesdec 160(%r8), %xmm1 - aesdec 176(%r8), %xmm1 - jb DLAST_8_2 - movdqu 224(%r8), %xmm2 - aesdec 192(%r8), %xmm1 - aesdec 208(%r8), %xmm1 -DLAST_8_2: - aesdeclast %xmm2, %xmm1 - pxor %xmm9, %xmm1 - movdqa %xmm10, %xmm9 - movdqu %xmm1, (%rsi) - addq $16, %rsi - decq %r10 - jne DLOOP_8_2 -DEND_8: - ret - -#endif /* WOLFSSL_AESNI_BYx */ - - -/* -AES_ECB_encrypt_AESNI (const unsigned char *in, - unsigned char *out, - unsigned long length, - const unsigned char *KS, - int nr) -*/ -#ifndef __APPLE__ -.globl AES_ECB_encrypt_AESNI -AES_ECB_encrypt_AESNI: -#else -.globl _AES_ECB_encrypt_AESNI -_AES_ECB_encrypt_AESNI: -#endif -# parameter 1: %rdi -# parameter 2: %rsi -# parameter 3: %rdx -# parameter 4: %rcx -# parameter 5: %r8d - movq %rdx, %r10 - shrq $4, %rdx - shlq $60, %r10 - je EECB_NO_PARTS_4 - addq $1, %rdx -EECB_NO_PARTS_4: - movq %rdx, %r10 - shlq $62, %r10 - shrq $62, %r10 - shrq $2, %rdx - je EECB_REMAINDER_4 - subq $64, %rsi -EECB_LOOP_4: - movdqu (%rdi), %xmm1 - movdqu 16(%rdi), %xmm2 - movdqu 32(%rdi), %xmm3 - movdqu 48(%rdi), %xmm4 - movdqa (%rcx), %xmm9 - movdqa 16(%rcx), %xmm10 - movdqa 32(%rcx), %xmm11 - movdqa 48(%rcx), %xmm12 - pxor %xmm9, %xmm1 - pxor %xmm9, %xmm2 - pxor %xmm9, %xmm3 - pxor %xmm9, %xmm4 - aesenc %xmm10, %xmm1 - 
aesenc %xmm10, %xmm2 - aesenc %xmm10, %xmm3 - aesenc %xmm10, %xmm4 - aesenc %xmm11, %xmm1 - aesenc %xmm11, %xmm2 - aesenc %xmm11, %xmm3 - aesenc %xmm11, %xmm4 - aesenc %xmm12, %xmm1 - aesenc %xmm12, %xmm2 - aesenc %xmm12, %xmm3 - aesenc %xmm12, %xmm4 - movdqa 64(%rcx), %xmm9 - movdqa 80(%rcx), %xmm10 - movdqa 96(%rcx), %xmm11 - movdqa 112(%rcx), %xmm12 - aesenc %xmm9, %xmm1 - aesenc %xmm9, %xmm2 - aesenc %xmm9, %xmm3 - aesenc %xmm9, %xmm4 - aesenc %xmm10, %xmm1 - aesenc %xmm10, %xmm2 - aesenc %xmm10, %xmm3 - aesenc %xmm10, %xmm4 - aesenc %xmm11, %xmm1 - aesenc %xmm11, %xmm2 - aesenc %xmm11, %xmm3 - aesenc %xmm11, %xmm4 - aesenc %xmm12, %xmm1 - aesenc %xmm12, %xmm2 - aesenc %xmm12, %xmm3 - aesenc %xmm12, %xmm4 - movdqa 128(%rcx), %xmm9 - movdqa 144(%rcx), %xmm10 - movdqa 160(%rcx), %xmm11 - cmpl $12, %r8d - aesenc %xmm9, %xmm1 - aesenc %xmm9, %xmm2 - aesenc %xmm9, %xmm3 - aesenc %xmm9, %xmm4 - aesenc %xmm10, %xmm1 - aesenc %xmm10, %xmm2 - aesenc %xmm10, %xmm3 - aesenc %xmm10, %xmm4 - jb EECB_LAST_4 - movdqa 160(%rcx), %xmm9 - movdqa 176(%rcx), %xmm10 - movdqa 192(%rcx), %xmm11 - cmpl $14, %r8d - aesenc %xmm9, %xmm1 - aesenc %xmm9, %xmm2 - aesenc %xmm9, %xmm3 - aesenc %xmm9, %xmm4 - aesenc %xmm10, %xmm1 - aesenc %xmm10, %xmm2 - aesenc %xmm10, %xmm3 - aesenc %xmm10, %xmm4 - jb EECB_LAST_4 - movdqa 192(%rcx), %xmm9 - movdqa 208(%rcx), %xmm10 - movdqa 224(%rcx), %xmm11 - aesenc %xmm9, %xmm1 - aesenc %xmm9, %xmm2 - aesenc %xmm9, %xmm3 - aesenc %xmm9, %xmm4 - aesenc %xmm10, %xmm1 - aesenc %xmm10, %xmm2 - aesenc %xmm10, %xmm3 - aesenc %xmm10, %xmm4 -EECB_LAST_4: - addq $64, %rdi - addq $64, %rsi - decq %rdx - aesenclast %xmm11, %xmm1 - aesenclast %xmm11, %xmm2 - aesenclast %xmm11, %xmm3 - aesenclast %xmm11, %xmm4 - movdqu %xmm1, (%rsi) - movdqu %xmm2, 16(%rsi) - movdqu %xmm3, 32(%rsi) - movdqu %xmm4, 48(%rsi) - jne EECB_LOOP_4 - addq $64, %rsi -EECB_REMAINDER_4: - cmpq $0, %r10 - je EECB_END_4 -EECB_LOOP_4_2: - movdqu (%rdi), %xmm1 - addq $16, %rdi - pxor (%rcx), %xmm1 - 
movdqu 160(%rcx), %xmm2 - aesenc 16(%rcx), %xmm1 - aesenc 32(%rcx), %xmm1 - aesenc 48(%rcx), %xmm1 - aesenc 64(%rcx), %xmm1 - aesenc 80(%rcx), %xmm1 - aesenc 96(%rcx), %xmm1 - aesenc 112(%rcx), %xmm1 - aesenc 128(%rcx), %xmm1 - aesenc 144(%rcx), %xmm1 - cmpl $12, %r8d - jb EECB_LAST_4_2 - movdqu 192(%rcx), %xmm2 - aesenc 160(%rcx), %xmm1 - aesenc 176(%rcx), %xmm1 - cmpl $14, %r8d - jb EECB_LAST_4_2 - movdqu 224(%rcx), %xmm2 - aesenc 192(%rcx), %xmm1 - aesenc 208(%rcx), %xmm1 -EECB_LAST_4_2: - aesenclast %xmm2, %xmm1 - movdqu %xmm1, (%rsi) - addq $16, %rsi - decq %r10 - jne EECB_LOOP_4_2 -EECB_END_4: - ret - - -/* -AES_ECB_decrypt_AESNI (const unsigned char *in, - unsigned char *out, - unsigned long length, - const unsigned char *KS, - int nr) -*/ -#ifndef __APPLE__ -.globl AES_ECB_decrypt_AESNI -AES_ECB_decrypt_AESNI: -#else -.globl _AES_ECB_decrypt_AESNI -_AES_ECB_decrypt_AESNI: -#endif -# parameter 1: %rdi -# parameter 2: %rsi -# parameter 3: %rdx -# parameter 4: %rcx -# parameter 5: %r8d - - movq %rdx, %r10 - shrq $4, %rdx - shlq $60, %r10 - je DECB_NO_PARTS_4 - addq $1, %rdx -DECB_NO_PARTS_4: - movq %rdx, %r10 - shlq $62, %r10 - shrq $62, %r10 - shrq $2, %rdx - je DECB_REMAINDER_4 - subq $64, %rsi -DECB_LOOP_4: - movdqu (%rdi), %xmm1 - movdqu 16(%rdi), %xmm2 - movdqu 32(%rdi), %xmm3 - movdqu 48(%rdi), %xmm4 - movdqa (%rcx), %xmm9 - movdqa 16(%rcx), %xmm10 - movdqa 32(%rcx), %xmm11 - movdqa 48(%rcx), %xmm12 - pxor %xmm9, %xmm1 - pxor %xmm9, %xmm2 - pxor %xmm9, %xmm3 - pxor %xmm9, %xmm4 - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm3 - aesdec %xmm10, %xmm4 - aesdec %xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm11, %xmm3 - aesdec %xmm11, %xmm4 - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdec %xmm12, %xmm3 - aesdec %xmm12, %xmm4 - movdqa 64(%rcx), %xmm9 - movdqa 80(%rcx), %xmm10 - movdqa 96(%rcx), %xmm11 - movdqa 112(%rcx), %xmm12 - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm3 - aesdec %xmm9, %xmm4 - aesdec %xmm10, 
%xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm3 - aesdec %xmm10, %xmm4 - aesdec %xmm11, %xmm1 - aesdec %xmm11, %xmm2 - aesdec %xmm11, %xmm3 - aesdec %xmm11, %xmm4 - aesdec %xmm12, %xmm1 - aesdec %xmm12, %xmm2 - aesdec %xmm12, %xmm3 - aesdec %xmm12, %xmm4 - movdqa 128(%rcx), %xmm9 - movdqa 144(%rcx), %xmm10 - movdqa 160(%rcx), %xmm11 - cmpl $12, %r8d - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm3 - aesdec %xmm9, %xmm4 - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm3 - aesdec %xmm10, %xmm4 - jb DECB_LAST_4 - movdqa 160(%rcx), %xmm9 - movdqa 176(%rcx), %xmm10 - movdqa 192(%rcx), %xmm11 - cmpl $14, %r8d - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm3 - aesdec %xmm9, %xmm4 - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm3 - aesdec %xmm10, %xmm4 - jb DECB_LAST_4 - movdqa 192(%rcx), %xmm9 - movdqa 208(%rcx), %xmm10 - movdqa 224(%rcx), %xmm11 - aesdec %xmm9, %xmm1 - aesdec %xmm9, %xmm2 - aesdec %xmm9, %xmm3 - aesdec %xmm9, %xmm4 - aesdec %xmm10, %xmm1 - aesdec %xmm10, %xmm2 - aesdec %xmm10, %xmm3 - aesdec %xmm10, %xmm4 -DECB_LAST_4: - addq $64, %rdi - addq $64, %rsi - decq %rdx - aesdeclast %xmm11, %xmm1 - aesdeclast %xmm11, %xmm2 - aesdeclast %xmm11, %xmm3 - aesdeclast %xmm11, %xmm4 - movdqu %xmm1, (%rsi) - movdqu %xmm2, 16(%rsi) - movdqu %xmm3, 32(%rsi) - movdqu %xmm4, 48(%rsi) - jne DECB_LOOP_4 - addq $64, %rsi -DECB_REMAINDER_4: - cmpq $0, %r10 - je DECB_END_4 -DECB_LOOP_4_2: - movdqu (%rdi), %xmm1 - addq $16, %rdi - pxor (%rcx), %xmm1 - movdqu 160(%rcx), %xmm2 - cmpl $12, %r8d - aesdec 16(%rcx), %xmm1 - aesdec 32(%rcx), %xmm1 - aesdec 48(%rcx), %xmm1 - aesdec 64(%rcx), %xmm1 - aesdec 80(%rcx), %xmm1 - aesdec 96(%rcx), %xmm1 - aesdec 112(%rcx), %xmm1 - aesdec 128(%rcx), %xmm1 - aesdec 144(%rcx), %xmm1 - jb DECB_LAST_4_2 - cmpl $14, %r8d - movdqu 192(%rcx), %xmm2 - aesdec 160(%rcx), %xmm1 - aesdec 176(%rcx), %xmm1 - jb DECB_LAST_4_2 - movdqu 224(%rcx), %xmm2 - aesdec 192(%rcx), %xmm1 - 
aesdec 208(%rcx), %xmm1 -DECB_LAST_4_2: - aesdeclast %xmm2, %xmm1 - movdqu %xmm1, (%rsi) - addq $16, %rsi - decq %r10 - jne DECB_LOOP_4_2 -DECB_END_4: - ret - - - - -/* -void AES_128_Key_Expansion_AESNI(const unsigned char* userkey, - unsigned char* key_schedule); -*/ -#ifndef __APPLE__ -.globl AES_128_Key_Expansion_AESNI -.align 16,0x90 -AES_128_Key_Expansion_AESNI: -#else -.globl _AES_128_Key_Expansion_AESNI -.p2align 4 -_AES_128_Key_Expansion_AESNI: -#endif -# parameter 1: %rdi -# parameter 2: %rsi - -movdqu (%rdi), %xmm1 -movdqa %xmm1, (%rsi) - - -ASSISTS: -aeskeygenassist $1, %xmm1, %xmm2 -call PREPARE_ROUNDKEY_128 -movdqa %xmm1, 16(%rsi) -aeskeygenassist $2, %xmm1, %xmm2 -call PREPARE_ROUNDKEY_128 -movdqa %xmm1, 32(%rsi) -aeskeygenassist $4, %xmm1, %xmm2 -call PREPARE_ROUNDKEY_128 -movdqa %xmm1, 48(%rsi) -aeskeygenassist $8, %xmm1, %xmm2 -call PREPARE_ROUNDKEY_128 -movdqa %xmm1, 64(%rsi) -aeskeygenassist $16, %xmm1, %xmm2 -call PREPARE_ROUNDKEY_128 -movdqa %xmm1, 80(%rsi) -aeskeygenassist $32, %xmm1, %xmm2 -call PREPARE_ROUNDKEY_128 -movdqa %xmm1, 96(%rsi) -aeskeygenassist $64, %xmm1, %xmm2 -call PREPARE_ROUNDKEY_128 -movdqa %xmm1, 112(%rsi) -aeskeygenassist $0x80, %xmm1, %xmm2 -call PREPARE_ROUNDKEY_128 -movdqa %xmm1, 128(%rsi) -aeskeygenassist $0x1b, %xmm1, %xmm2 -call PREPARE_ROUNDKEY_128 -movdqa %xmm1, 144(%rsi) -aeskeygenassist $0x36, %xmm1, %xmm2 -call PREPARE_ROUNDKEY_128 -movdqa %xmm1, 160(%rsi) -ret - -PREPARE_ROUNDKEY_128: -pshufd $255, %xmm2, %xmm2 -movdqa %xmm1, %xmm3 -pslldq $4, %xmm3 -pxor %xmm3, %xmm1 -pslldq $4, %xmm3 -pxor %xmm3, %xmm1 -pslldq $4, %xmm3 -pxor %xmm3, %xmm1 -pxor %xmm2, %xmm1 -ret - - -/* -void AES_192_Key_Expansion_AESNI (const unsigned char *userkey, - unsigned char *key) -*/ -#ifndef __APPLE__ -.globl AES_192_Key_Expansion_AESNI -AES_192_Key_Expansion_AESNI: -#else -.globl _AES_192_Key_Expansion_AESNI -_AES_192_Key_Expansion_AESNI: -#endif -# parameter 1: %rdi -# parameter 2: %rsi - -movdqu (%rdi), %xmm1 -movq 16(%rdi), 
%xmm3 -movdqa %xmm1, (%rsi) -movdqa %xmm3, %xmm5 - -aeskeygenassist $0x1, %xmm3, %xmm2 -call PREPARE_ROUNDKEY_192 -shufpd $0, %xmm1, %xmm5 -movdqa %xmm5, 16(%rsi) -movdqa %xmm1, %xmm6 -shufpd $1, %xmm3, %xmm6 -movdqa %xmm6, 32(%rsi) - -aeskeygenassist $0x2, %xmm3, %xmm2 -call PREPARE_ROUNDKEY_192 -movdqa %xmm1, 48(%rsi) -movdqa %xmm3, %xmm5 - -aeskeygenassist $0x4, %xmm3, %xmm2 -call PREPARE_ROUNDKEY_192 -shufpd $0, %xmm1, %xmm5 -movdqa %xmm5, 64(%rsi) -movdqa %xmm1, %xmm6 -shufpd $1, %xmm3, %xmm6 -movdqa %xmm6, 80(%rsi) - -aeskeygenassist $0x8, %xmm3, %xmm2 -call PREPARE_ROUNDKEY_192 -movdqa %xmm1, 96(%rsi) -movdqa %xmm3, %xmm5 - -aeskeygenassist $0x10, %xmm3, %xmm2 -call PREPARE_ROUNDKEY_192 -shufpd $0, %xmm1, %xmm5 -movdqa %xmm5, 112(%rsi) -movdqa %xmm1, %xmm6 -shufpd $1, %xmm3, %xmm6 -movdqa %xmm6, 128(%rsi) - -aeskeygenassist $0x20, %xmm3, %xmm2 -call PREPARE_ROUNDKEY_192 -movdqa %xmm1, 144(%rsi) -movdqa %xmm3, %xmm5 - -aeskeygenassist $0x40, %xmm3, %xmm2 -call PREPARE_ROUNDKEY_192 -shufpd $0, %xmm1, %xmm5 -movdqa %xmm5, 160(%rsi) -movdqa %xmm1, %xmm6 -shufpd $1, %xmm3, %xmm6 -movdqa %xmm6, 176(%rsi) - -aeskeygenassist $0x80, %xmm3, %xmm2 -call PREPARE_ROUNDKEY_192 -movdqa %xmm1, 192(%rsi) -movdqa %xmm3, 208(%rsi) -ret - -PREPARE_ROUNDKEY_192: -pshufd $0x55, %xmm2, %xmm2 -movdqu %xmm1, %xmm4 -pslldq $4, %xmm4 -pxor %xmm4, %xmm1 - -pslldq $4, %xmm4 -pxor %xmm4, %xmm1 -pslldq $4, %xmm4 -pxor %xmm4, %xmm1 -pxor %xmm2, %xmm1 -pshufd $0xff, %xmm1, %xmm2 -movdqu %xmm3, %xmm4 -pslldq $4, %xmm4 -pxor %xmm4, %xmm3 -pxor %xmm2, %xmm3 -ret - - -/* -void AES_256_Key_Expansion_AESNI (const unsigned char *userkey, - unsigned char *key) -*/ -#ifndef __APPLE__ -.globl AES_256_Key_Expansion_AESNI -AES_256_Key_Expansion_AESNI: -#else -.globl _AES_256_Key_Expansion_AESNI -_AES_256_Key_Expansion_AESNI: -#endif -# parameter 1: %rdi -# parameter 2: %rsi - -movdqu (%rdi), %xmm1 -movdqu 16(%rdi), %xmm3 -movdqa %xmm1, (%rsi) -movdqa %xmm3, 16(%rsi) - -aeskeygenassist $0x1, %xmm3, 
%xmm2 -call MAKE_RK256_a -movdqa %xmm1, 32(%rsi) -aeskeygenassist $0x0, %xmm1, %xmm2 -call MAKE_RK256_b -movdqa %xmm3, 48(%rsi) -aeskeygenassist $0x2, %xmm3, %xmm2 -call MAKE_RK256_a -movdqa %xmm1, 64(%rsi) -aeskeygenassist $0x0, %xmm1, %xmm2 -call MAKE_RK256_b -movdqa %xmm3, 80(%rsi) -aeskeygenassist $0x4, %xmm3, %xmm2 -call MAKE_RK256_a -movdqa %xmm1, 96(%rsi) -aeskeygenassist $0x0, %xmm1, %xmm2 -call MAKE_RK256_b -movdqa %xmm3, 112(%rsi) -aeskeygenassist $0x8, %xmm3, %xmm2 -call MAKE_RK256_a -movdqa %xmm1, 128(%rsi) -aeskeygenassist $0x0, %xmm1, %xmm2 -call MAKE_RK256_b -movdqa %xmm3, 144(%rsi) -aeskeygenassist $0x10, %xmm3, %xmm2 -call MAKE_RK256_a -movdqa %xmm1, 160(%rsi) -aeskeygenassist $0x0, %xmm1, %xmm2 -call MAKE_RK256_b -movdqa %xmm3, 176(%rsi) -aeskeygenassist $0x20, %xmm3, %xmm2 -call MAKE_RK256_a -movdqa %xmm1, 192(%rsi) - -aeskeygenassist $0x0, %xmm1, %xmm2 -call MAKE_RK256_b -movdqa %xmm3, 208(%rsi) -aeskeygenassist $0x40, %xmm3, %xmm2 -call MAKE_RK256_a -movdqa %xmm1, 224(%rsi) - -ret - -MAKE_RK256_a: -pshufd $0xff, %xmm2, %xmm2 -movdqa %xmm1, %xmm4 -pslldq $4, %xmm4 -pxor %xmm4, %xmm1 -pslldq $4, %xmm4 -pxor %xmm4, %xmm1 -pslldq $4, %xmm4 -pxor %xmm4, %xmm1 -pxor %xmm2, %xmm1 -ret - -MAKE_RK256_b: -pshufd $0xaa, %xmm2, %xmm2 -movdqa %xmm3, %xmm4 -pslldq $4, %xmm4 -pxor %xmm4, %xmm3 -pslldq $4, %xmm4 -pxor %xmm4, %xmm3 -pslldq $4, %xmm4 -pxor %xmm4, %xmm3 -pxor %xmm2, %xmm3 -ret - -#elif defined WOLFSSL_X86_BUILD +#if defined WOLFSSL_X86_BUILD /* AES_CBC_encrypt_AESNI (const unsigned char *in, @@ -2238,7 +931,7 @@ MAKE_RK256_b: pxor %xmm2, %xmm3 ret -#endif /* WOLFSSL_X86_64_BUILD */ +#endif /* WOLFSSL_X86_BUILD */ #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits diff --git a/wolfcrypt/src/aes_asm.asm b/wolfcrypt/src/aes_asm.asm index 4b5e1250d52..cb06a54ac52 100644 --- a/wolfcrypt/src/aes_asm.asm +++ b/wolfcrypt/src/aes_asm.asm @@ -1,1531 +1,54 @@ -; /* aes_asm.asm -; * -; * Copyright (C) 2006-2026 wolfSSL Inc. 
-; * -; * This file is part of wolfSSL. -; * -; * wolfSSL is free software; you can redistribute it and/or modify -; * it under the terms of the GNU General Public License as published by -; * the Free Software Foundation; either version 3 of the License, or -; * (at your option) any later version. -; * -; * wolfSSL is distributed in the hope that it will be useful, -; * but WITHOUT ANY WARRANTY; without even the implied warranty of -; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -; * GNU General Public License for more details. -; * -; * You should have received a copy of the GNU General Public License -; * along with this program; if not, write to the Free Software -; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA -; */ +; /* aes_asm.asm +; * +; * Copyright (C) 2006-2026 wolfSSL Inc. +; * +; * This file is part of wolfSSL. +; * +; * wolfSSL is free software; you can redistribute it and/or modify +; * it under the terms of the GNU General Public License as published by +; * the Free Software Foundation; either version 3 of the License, or +; * (at your option) any later version. +; * +; * wolfSSL is distributed in the hope that it will be useful, +; * but WITHOUT ANY WARRANTY; without even the implied warranty of +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; * GNU General Public License for more details. 
+; * +; * You should have received a copy of the GNU General Public License +; * along with this program; if not, write to the Free Software +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA +; */ -; -; -; /* See Intel Advanced Encryption Standard (AES) Instructions Set White Paper -; * by Israel, Intel Mobility Group Development Center, Israel Shay Gueron -; */ -; -; /* This file is in intel asm syntax, see .s for at&t syntax */ -; - - -fips_version = 0 -IFDEF HAVE_FIPS - fips_version = 1 - IFDEF HAVE_FIPS_VERSION - fips_version = HAVE_FIPS_VERSION - ENDIF -ENDIF - -IF fips_version GE 2 - fipsAb SEGMENT ALIAS(".fipsA$b") 'CODE' -ELSE - _text SEGMENT -ENDIF - - -; /* -; AES_CBC_encrypt_AESNI[const ,unsigned char*in -; unsigned ,char*out -; unsigned ,char ivec+16 -; unsigned ,long length -; const ,unsigned char*KS -; int nr] -; */ -AES_CBC_encrypt_AESNI PROC -;# parameter 1: rdi -;# parameter 2: rsi -;# parameter 3: rdx -;# parameter 4: rcx -;# parameter 5: r8 -;# parameter 6: r9d - -; save rdi and rsi to rax and r11, restore before ret - mov rax,rdi - mov r11,rsi - -; convert to what we had for att&t convention - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,[rsp+40] - mov r9d,[rsp+48] - - mov r10,rcx - shr rcx,4 - shl r10,60 - je NO_PARTS - add rcx,1 -NO_PARTS: - sub rsi,16 - movdqa xmm1,[rdx] -LOOP_1: - pxor xmm1,[rdi] - pxor xmm1,[r8] - add rsi,16 - add rdi,16 - cmp r9d,12 - aesenc xmm1,16[r8] - aesenc xmm1,32[r8] - aesenc xmm1,48[r8] - aesenc xmm1,64[r8] - aesenc xmm1,80[r8] - aesenc xmm1,96[r8] - aesenc xmm1,112[r8] - aesenc xmm1,128[r8] - aesenc xmm1,144[r8] - movdqa xmm2,160[r8] - jb LAST - cmp r9d,14 - - aesenc xmm1,160[r8] - aesenc xmm1,176[r8] - movdqa xmm2,192[r8] - jb LAST - aesenc xmm1,192[r8] - aesenc xmm1,208[r8] - movdqa xmm2,224[r8] -LAST: - dec rcx - aesenclast xmm1,xmm2 - movdqu [rsi],xmm1 - jne LOOP_1 - ; restore non volatile rdi,rsi - mov rdi,rax - mov rsi,r11 - ret -AES_CBC_encrypt_AESNI ENDP - - 
-; void AES_CBC_decrypt_AESNI_by4(const unsigned char* in, -; unsigned char* out, -; unsigned char ivec[16], -; unsigned long length, -; const unsigned char* KS, -; int nr) -AES_CBC_decrypt_AESNI_by4 PROC -; parameter 1: rdi -; parameter 2: rsi -; parameter 3: rdx -; parameter 4: rcx -; parameter 5: r8 -; parameter 6: r9d - - ; save rdi and rsi to rax and r11, restore before ret - mov rax, rdi - mov r11, rsi - ; convert to what we had for att&t convention - mov rdi, rcx - mov rsi, rdx - mov rdx, r8 - mov rcx,r9 - mov r8, [rsp+40] - mov r9d, [rsp+48] - ; on microsoft xmm6-xmm15 are non volatile, - ; let's save on stack and restore at end - sub rsp, 8+8*16 ; 8 = align stack , 8 xmm6-12,15 16 bytes each - movdqa [rsp+0], xmm6 - movdqa [rsp+16], xmm7 - movdqa [rsp+32], xmm8 - movdqa [rsp+48], xmm9 - movdqa [rsp+64], xmm10 - movdqa [rsp+80], xmm11 - movdqa [rsp+96], xmm12 - movdqa [rsp+112], xmm15 - ; back to our original code, more or less - mov r10, rcx - shr rcx, 4 - shl r10, 60 - je DNO_PARTS_4 - add rcx, 1 -DNO_PARTS_4: - mov r10, rcx - shl r10, 62 - shr r10, 62 - shr rcx, 2 - movdqu xmm5, [rdx] - je DREMAINDER_4 - sub rsi, 64 -DLOOP_4: - movdqu xmm1, [rdi] - movdqu xmm2, 16[rdi] - movdqu xmm3, 32[rdi] - movdqu xmm4, 48[rdi] - movdqa xmm6, xmm1 - movdqa xmm7, xmm2 - movdqa xmm8, xmm3 - movdqa xmm15, xmm4 - movdqa xmm9, [r8] - movdqa xmm10, 16[r8] - movdqa xmm11, 32[r8] - movdqa xmm12, 48[r8] - pxor xmm1, xmm9 - pxor xmm2, xmm9 - pxor xmm3, xmm9 - pxor xmm4, xmm9 - aesdec xmm1, xmm10 - aesdec xmm2, xmm10 - aesdec xmm3, xmm10 - aesdec xmm4, xmm10 - aesdec xmm1, xmm11 - aesdec xmm2, xmm11 - aesdec xmm3, xmm11 - aesdec xmm4, xmm11 - aesdec xmm1, xmm12 - aesdec xmm2, xmm12 - aesdec xmm3, xmm12 - aesdec xmm4, xmm12 - movdqa xmm9, 64[r8] - movdqa xmm10, 80[r8] - movdqa xmm11, 96[r8] - movdqa xmm12, 112[r8] - aesdec xmm1, xmm9 - aesdec xmm2, xmm9 - aesdec xmm3, xmm9 - aesdec xmm4, xmm9 - aesdec xmm1, xmm10 - aesdec xmm2, xmm10 - aesdec xmm3, xmm10 - aesdec xmm4, xmm10 - 
aesdec xmm1, xmm11 - aesdec xmm2, xmm11 - aesdec xmm3, xmm11 - aesdec xmm4, xmm11 - aesdec xmm1, xmm12 - aesdec xmm2, xmm12 - aesdec xmm3, xmm12 - aesdec xmm4, xmm12 - movdqa xmm9, 128[r8] - movdqa xmm10, 144[r8] - movdqa xmm11, 160[r8] - cmp r9d, 12 - aesdec xmm1, xmm9 - aesdec xmm2, xmm9 - aesdec xmm3, xmm9 - aesdec xmm4, xmm9 - aesdec xmm1, xmm10 - aesdec xmm2, xmm10 - aesdec xmm3, xmm10 - aesdec xmm4, xmm10 - jb DLAST_4 - movdqa xmm9, 160[r8] - movdqa xmm10, 176[r8] - movdqa xmm11, 192[r8] - cmp r9d, 14 - aesdec xmm1, xmm9 - aesdec xmm2, xmm9 - aesdec xmm3, xmm9 - aesdec xmm4, xmm9 - aesdec xmm1, xmm10 - aesdec xmm2, xmm10 - aesdec xmm3, xmm10 - aesdec xmm4, xmm10 - jb DLAST_4 - movdqa xmm9, 192[r8] - movdqa xmm10, 208[r8] - movdqa xmm11, 224[r8] - aesdec xmm1, xmm9 - aesdec xmm2, xmm9 - aesdec xmm3, xmm9 - aesdec xmm4, xmm9 - aesdec xmm1, xmm10 - aesdec xmm2, xmm10 - aesdec xmm3, xmm10 - aesdec xmm4, xmm10 -DLAST_4: - add rdi, 64 - add rsi, 64 - dec rcx - aesdeclast xmm1, xmm11 - aesdeclast xmm2, xmm11 - aesdeclast xmm3, xmm11 - aesdeclast xmm4, xmm11 - pxor xmm1, xmm5 - pxor xmm2, xmm6 - pxor xmm3, xmm7 - pxor xmm4, xmm8 - movdqu [rsi], xmm1 - movdqu 16[rsi], xmm2 - movdqu 32[rsi], xmm3 - movdqu 48[rsi], xmm4 - movdqa xmm5, xmm15 - jne DLOOP_4 - add rsi, 64 -DREMAINDER_4: - cmp r10, 0 - je DEND_4 -DLOOP_4_2: - movdqu xmm1, [rdi] - movdqa xmm15, xmm1 - add rdi, 16 - pxor xmm1, [r8] - movdqu xmm2, 160[r8] - cmp r9d, 12 - aesdec xmm1, 16[r8] - aesdec xmm1, 32[r8] - aesdec xmm1, 48[r8] - aesdec xmm1, 64[r8] - aesdec xmm1, 80[r8] - aesdec xmm1, 96[r8] - aesdec xmm1, 112[r8] - aesdec xmm1, 128[r8] - aesdec xmm1, 144[r8] - jb DLAST_4_2 - movdqu xmm2, 192[r8] - cmp r9d, 14 - aesdec xmm1, 160[r8] - aesdec xmm1, 176[r8] - jb DLAST_4_2 - movdqu xmm2, 224[r8] - aesdec xmm1, 192[r8] - aesdec xmm1, 208[r8] -DLAST_4_2: - aesdeclast xmm1, xmm2 - pxor xmm1, xmm5 - movdqa xmm5, xmm15 - movdqu [rsi], xmm1 - add rsi, 16 - dec r10 - jne DLOOP_4_2 -DEND_4: - ; restore non volatile 
rdi,rsi - mov rdi, rax - mov rsi, r11 - ; restore non volatile xmms from stack - movdqa xmm6, [rsp+0] - movdqa xmm7, [rsp+16] - movdqa xmm8, [rsp+32] - movdqa xmm9, [rsp+48] - movdqa xmm10, [rsp+64] - movdqa xmm11, [rsp+80] - movdqa xmm12, [rsp+96] - movdqa xmm15, [rsp+112] - add rsp, 8+8*16 ; 8 = align stack , 8 xmm6-12,15 16 bytes each - ret -AES_CBC_decrypt_AESNI_by4 ENDP - - -; void AES_CBC_decrypt_AESNI_by6(const unsigned char *in, -; unsigned char *out, -; unsigned char ivec[16], -; unsigned long length, -; const unsigned char *KS, -; int nr) -AES_CBC_decrypt_AESNI_by6 PROC -; parameter 1: rdi - in -; parameter 2: rsi - out -; parameter 3: rdx - ivec -; parameter 4: rcx - length -; parameter 5: r8 - KS -; parameter 6: r9d - nr - - ; save rdi and rsi to rax and r11, restore before ret - mov rax, rdi - mov r11, rsi - ; convert to what we had for att&t convention - mov rdi, rcx - mov rsi, rdx - mov rdx, r8 - mov rcx, r9 - mov r8, [rsp+40] - mov r9d, [rsp+48] - ; on microsoft xmm6-xmm15 are non volatile, - ; let's save on stack and restore at end - sub rsp, 8+9*16 ; 8 = align stack , 9 xmm6-14 16 bytes each - movdqa [rsp+0], xmm6 - movdqa [rsp+16], xmm7 - movdqa [rsp+32], xmm8 - movdqa [rsp+48], xmm9 - movdqa [rsp+64], xmm10 - movdqa [rsp+80], xmm11 - movdqa [rsp+96], xmm12 - movdqa [rsp+112], xmm13 - movdqa [rsp+128], xmm14 - ; back to our original code, more or less - mov r10, rcx - shr rcx, 4 - shl r10, 60 - je DNO_PARTS_6 - add rcx, 1 -DNO_PARTS_6: - mov r12, rax - mov r13, rdx - mov r14, rbx - mov rdx, 0 - mov rax, rcx - mov rbx, 6 - div rbx - mov rcx, rax - mov r10, rdx - mov rax, r12 - mov rdx, r13 - mov rbx, r14 - cmp rcx, 0 - movdqu xmm7, [rdx] - je DREMAINDER_6 - sub rsi, 96 -DLOOP_6: - movdqu xmm1, [rdi] - movdqu xmm2, 16[rdi] - movdqu xmm3, 32[rdi] - movdqu xmm4, 48[rdi] - movdqu xmm5, 64[rdi] - movdqu xmm6, 80[rdi] - movdqa xmm8, [r8] - movdqa xmm9, 16[r8] - movdqa xmm10, 32[r8] - movdqa xmm11, 48[r8] - pxor xmm1, xmm8 - pxor xmm2, xmm8 - pxor xmm3, 
xmm8 - pxor xmm4, xmm8 - pxor xmm5, xmm8 - pxor xmm6, xmm8 - aesdec xmm1, xmm9 - aesdec xmm2, xmm9 - aesdec xmm3, xmm9 - aesdec xmm4, xmm9 - aesdec xmm5, xmm9 - aesdec xmm6, xmm9 - aesdec xmm1, xmm10 - aesdec xmm2, xmm10 - aesdec xmm3, xmm10 - aesdec xmm4, xmm10 - aesdec xmm5, xmm10 - aesdec xmm6, xmm10 - aesdec xmm1, xmm11 - aesdec xmm2, xmm11 - aesdec xmm3, xmm11 - aesdec xmm4, xmm11 - aesdec xmm5, xmm11 - aesdec xmm6, xmm11 - movdqa xmm8, 64[r8] - movdqa xmm9, 80[r8] - movdqa xmm10, 96[r8] - movdqa xmm11, 112[r8] - aesdec xmm1, xmm8 - aesdec xmm2, xmm8 - aesdec xmm3, xmm8 - aesdec xmm4, xmm8 - aesdec xmm5, xmm8 - aesdec xmm6, xmm8 - aesdec xmm1, xmm9 - aesdec xmm2, xmm9 - aesdec xmm3, xmm9 - aesdec xmm4, xmm9 - aesdec xmm5, xmm9 - aesdec xmm6, xmm9 - aesdec xmm1, xmm10 - aesdec xmm2, xmm10 - aesdec xmm3, xmm10 - aesdec xmm4, xmm10 - aesdec xmm5, xmm10 - aesdec xmm6, xmm10 - aesdec xmm1, xmm11 - aesdec xmm2, xmm11 - aesdec xmm3, xmm11 - aesdec xmm4, xmm11 - aesdec xmm5, xmm11 - aesdec xmm6, xmm11 - movdqa xmm8, 128[r8] - movdqa xmm9, 144[r8] - movdqa xmm10, 160[r8] - cmp r9d, 12 - aesdec xmm1, xmm8 - aesdec xmm2, xmm8 - aesdec xmm3, xmm8 - aesdec xmm4, xmm8 - aesdec xmm5, xmm8 - aesdec xmm6, xmm8 - aesdec xmm1, xmm9 - aesdec xmm2, xmm9 - aesdec xmm3, xmm9 - aesdec xmm4, xmm9 - aesdec xmm5, xmm9 - aesdec xmm6, xmm9 - jb DLAST_6 - movdqa xmm8, 160[r8] - movdqa xmm9, 176[r8] - movdqa xmm10, 192[r8] - cmp r9d, 14 - aesdec xmm1, xmm8 - aesdec xmm2, xmm8 - aesdec xmm3, xmm8 - aesdec xmm4, xmm8 - aesdec xmm5, xmm8 - aesdec xmm6, xmm8 - aesdec xmm1, xmm9 - aesdec xmm2, xmm9 - aesdec xmm3, xmm9 - aesdec xmm4, xmm9 - aesdec xmm5, xmm9 - aesdec xmm6, xmm9 - jb DLAST_6 - movdqa xmm8, 192[r8] - movdqa xmm9, 208[r8] - movdqa xmm10, 224[r8] - aesdec xmm1, xmm8 - aesdec xmm2, xmm8 - aesdec xmm3, xmm8 - aesdec xmm4, xmm8 - aesdec xmm5, xmm8 - aesdec xmm6, xmm8 - aesdec xmm1, xmm9 - aesdec xmm2, xmm9 - aesdec xmm3, xmm9 - aesdec xmm4, xmm9 - aesdec xmm5, xmm9 - aesdec xmm6, xmm9 
-DLAST_6: - add rsi, 96 - aesdeclast xmm1, xmm10 - aesdeclast xmm2, xmm10 - aesdeclast xmm3, xmm10 - aesdeclast xmm4, xmm10 - aesdeclast xmm5, xmm10 - aesdeclast xmm6, xmm10 - movdqu xmm8, [rdi] - movdqu xmm9, 16[rdi] - movdqu xmm10, 32[rdi] - movdqu xmm11, 48[rdi] - movdqu xmm12, 64[rdi] - movdqu xmm13, 80[rdi] - pxor xmm1, xmm7 - pxor xmm2, xmm8 - pxor xmm3, xmm9 - pxor xmm4, xmm10 - pxor xmm5, xmm11 - pxor xmm6, xmm12 - movdqu xmm7, xmm13 - movdqu [rsi], xmm1 - movdqu 16[rsi], xmm2 - movdqu 32[rsi], xmm3 - movdqu 48[rsi], xmm4 - movdqu 64[rsi], xmm5 - movdqu 80[rsi], xmm6 - add rdi, 96 - dec rcx - jne DLOOP_6 - add rsi, 96 -DREMAINDER_6: - cmp r10, 0 - je DEND_6 -DLOOP_6_2: - movdqu xmm1, [rdi] - movdqa xmm10, xmm1 - add rdi, 16 - pxor xmm1, [r8] - movdqu xmm2, 160[r8] - cmp r9d, 12 - aesdec xmm1, 16[r8] - aesdec xmm1, 32[r8] - aesdec xmm1, 48[r8] - aesdec xmm1, 64[r8] - aesdec xmm1, 80[r8] - aesdec xmm1, 96[r8] - aesdec xmm1, 112[r8] - aesdec xmm1, 128[r8] - aesdec xmm1, 144[r8] - jb DLAST_6_2 - movdqu xmm2, 192[r8] - cmp r9d, 14 - aesdec xmm1, 160[r8] - aesdec xmm1, 176[r8] - jb DLAST_6_2 - movdqu xmm2, 224[r8] - aesdec xmm1, 192[r8] - aesdec xmm1, 208[r8] -DLAST_6_2: - aesdeclast xmm1, xmm2 - pxor xmm1, xmm7 - movdqa xmm7, xmm10 - movdqu [rsi], xmm1 - add rsi, 16 - dec r10 - jne DLOOP_6_2 -DEND_6: - ; restore non volatile rdi,rsi - mov rdi, rax - mov rsi, r11 - ; restore non volatile xmms from stack - movdqa xmm6, [rsp+0] - movdqa xmm7, [rsp+16] - movdqa xmm8, [rsp+32] - movdqa xmm9, [rsp+48] - movdqa xmm10, [rsp+64] - movdqa xmm11, [rsp+80] - movdqa xmm12, [rsp+96] - movdqa xmm13, [rsp+112] - movdqa xmm14, [rsp+128] - add rsp, 8+9*16 ; 8 = align stack , 9 xmm6-14 16 bytes each - ret -AES_CBC_decrypt_AESNI_by6 ENDP - - -; void AES_CBC_decrypt_AESNI_by8(const unsigned char *in, -; unsigned char *out, -; unsigned char ivec[16], -; unsigned long length, -; const unsigned char *KS, -; int nr) -AES_CBC_decrypt_AESNI_by8 PROC -; parameter 1: rdi - in -; parameter 
2: rsi - out -; parameter 3: rdx - ivec -; parameter 4: rcx - length -; parameter 5: r8 - KS -; parameter 6: r9d - nr - - ; save rdi and rsi to rax and r11, restore before ret - mov rax, rdi - mov r11, rsi - ; convert to what we had for att&t convention - mov rdi, rcx - mov rsi, rdx - mov rdx, r8 - mov rcx,r9 - mov r8, [rsp+40] - mov r9d, [rsp+48] - ; on microsoft xmm6-xmm15 are non volatile, - ; let's save on stack and restore at end - sub rsp, 8+8*16 ; 8 = align stack , 8 xmm6-13 16 bytes each - movdqa [rsp+0], xmm6 - movdqa [rsp+16], xmm7 - movdqa [rsp+32], xmm8 - movdqa [rsp+48], xmm9 - movdqa [rsp+64], xmm10 - movdqa [rsp+80], xmm11 - movdqa [rsp+96], xmm12 - movdqa [rsp+112], xmm13 - ; back to our original code, more or less - mov r10, rcx - shr rcx, 4 - shl r10, 60 - je DNO_PARTS_8 - add rcx, 1 -DNO_PARTS_8: - mov r10, rcx - shl r10, 61 - shr r10, 61 - shr rcx, 3 - movdqu xmm9, [rdx] - je DREMAINDER_8 - sub rsi, 128 -DLOOP_8: - movdqu xmm1, [rdi] - movdqu xmm2, 16[rdi] - movdqu xmm3, 32[rdi] - movdqu xmm4, 48[rdi] - movdqu xmm5, 64[rdi] - movdqu xmm6, 80[rdi] - movdqu xmm7, 96[rdi] - movdqu xmm8, 112[rdi] - movdqa xmm10, [r8] - movdqa xmm11, 16[r8] - movdqa xmm12, 32[r8] - movdqa xmm13, 48[r8] - pxor xmm1, xmm10 - pxor xmm2, xmm10 - pxor xmm3, xmm10 - pxor xmm4, xmm10 - pxor xmm5, xmm10 - pxor xmm6, xmm10 - pxor xmm7, xmm10 - pxor xmm8, xmm10 - aesdec xmm1, xmm11 - aesdec xmm2, xmm11 - aesdec xmm3, xmm11 - aesdec xmm4, xmm11 - aesdec xmm5, xmm11 - aesdec xmm6, xmm11 - aesdec xmm7, xmm11 - aesdec xmm8, xmm11 - aesdec xmm1, xmm12 - aesdec xmm2, xmm12 - aesdec xmm3, xmm12 - aesdec xmm4, xmm12 - aesdec xmm5, xmm12 - aesdec xmm6, xmm12 - aesdec xmm7, xmm12 - aesdec xmm8, xmm12 - aesdec xmm1, xmm13 - aesdec xmm2, xmm13 - aesdec xmm3, xmm13 - aesdec xmm4, xmm13 - aesdec xmm5, xmm13 - aesdec xmm6, xmm13 - aesdec xmm7, xmm13 - aesdec xmm8, xmm13 - movdqa xmm10, 64[r8] - movdqa xmm11, 80[r8] - movdqa xmm12, 96[r8] - movdqa xmm13, 112[r8] - aesdec xmm1, xmm10 - aesdec 
xmm2, xmm10 - aesdec xmm3, xmm10 - aesdec xmm4, xmm10 - aesdec xmm5, xmm10 - aesdec xmm6, xmm10 - aesdec xmm7, xmm10 - aesdec xmm8, xmm10 - aesdec xmm1, xmm11 - aesdec xmm2, xmm11 - aesdec xmm3, xmm11 - aesdec xmm4, xmm11 - aesdec xmm5, xmm11 - aesdec xmm6, xmm11 - aesdec xmm7, xmm11 - aesdec xmm8, xmm11 - aesdec xmm1, xmm12 - aesdec xmm2, xmm12 - aesdec xmm3, xmm12 - aesdec xmm4, xmm12 - aesdec xmm5, xmm12 - aesdec xmm6, xmm12 - aesdec xmm7, xmm12 - aesdec xmm8, xmm12 - aesdec xmm1, xmm13 - aesdec xmm2, xmm13 - aesdec xmm3, xmm13 - aesdec xmm4, xmm13 - aesdec xmm5, xmm13 - aesdec xmm6, xmm13 - aesdec xmm7, xmm13 - aesdec xmm8, xmm13 - movdqa xmm10, 128[r8] - movdqa xmm11, 144[r8] - movdqa xmm12, 160[r8] - cmp r9d, 12 - aesdec xmm1, xmm10 - aesdec xmm2, xmm10 - aesdec xmm3, xmm10 - aesdec xmm4, xmm10 - aesdec xmm5, xmm10 - aesdec xmm6, xmm10 - aesdec xmm7, xmm10 - aesdec xmm8, xmm10 - aesdec xmm1, xmm11 - aesdec xmm2, xmm11 - aesdec xmm3, xmm11 - aesdec xmm4, xmm11 - aesdec xmm5, xmm11 - aesdec xmm6, xmm11 - aesdec xmm7, xmm11 - aesdec xmm8, xmm11 - jb DLAST_8 - movdqa xmm10, 160[r8] - movdqa xmm11, 176[r8] - movdqa xmm12, 192[r8] - cmp r9d, 14 - aesdec xmm1, xmm10 - aesdec xmm2, xmm10 - aesdec xmm3, xmm10 - aesdec xmm4, xmm10 - aesdec xmm5, xmm10 - aesdec xmm6, xmm10 - aesdec xmm7, xmm10 - aesdec xmm8, xmm10 - aesdec xmm1, xmm11 - aesdec xmm2, xmm11 - aesdec xmm3, xmm11 - aesdec xmm4, xmm11 - aesdec xmm5, xmm11 - aesdec xmm6, xmm11 - aesdec xmm7, xmm11 - aesdec xmm8, xmm11 - jb DLAST_8 - movdqa xmm10, 192[r8] - movdqa xmm11, 208[r8] - movdqa xmm12, 224[r8] - aesdec xmm1, xmm10 - aesdec xmm2, xmm10 - aesdec xmm3, xmm10 - aesdec xmm4, xmm10 - aesdec xmm5, xmm10 - aesdec xmm6, xmm10 - aesdec xmm7, xmm10 - aesdec xmm8, xmm10 - aesdec xmm1, xmm11 - aesdec xmm2, xmm11 - aesdec xmm3, xmm11 - aesdec xmm4, xmm11 - aesdec xmm5, xmm11 - aesdec xmm6, xmm11 - aesdec xmm7, xmm11 - aesdec xmm8, xmm11 -DLAST_8: - add rsi, 128 - aesdeclast xmm1, xmm12 - aesdeclast xmm2, xmm12 - 
aesdeclast xmm3, xmm12 - aesdeclast xmm4, xmm12 - aesdeclast xmm5, xmm12 - aesdeclast xmm6, xmm12 - aesdeclast xmm7, xmm12 - aesdeclast xmm8, xmm12 - movdqu xmm10, [rdi] - movdqu xmm11, 16[rdi] - movdqu xmm12, 32[rdi] - movdqu xmm13, 48[rdi] - pxor xmm1, xmm9 - pxor xmm2, xmm10 - pxor xmm3, xmm11 - pxor xmm4, xmm12 - pxor xmm5, xmm13 - movdqu xmm10, 64[rdi] - movdqu xmm11, 80[rdi] - movdqu xmm12, 96[rdi] - movdqu xmm9, 112[rdi] - pxor xmm6, xmm10 - pxor xmm7, xmm11 - pxor xmm8, xmm12 - movdqu [rsi], xmm1 - movdqu 16[rsi], xmm2 - movdqu 32[rsi], xmm3 - movdqu 48[rsi], xmm4 - movdqu 64[rsi], xmm5 - movdqu 80[rsi], xmm6 - movdqu 96[rsi], xmm7 - movdqu 112[rsi], xmm8 - add rdi, 128 - dec rcx - jne DLOOP_8 - add rsi, 128 -DREMAINDER_8: - cmp r10, 0 - je DEND_8 -DLOOP_8_2: - movdqu xmm1, [rdi] - movdqa xmm10, xmm1 - add rdi, 16 - pxor xmm1, [r8] - movdqu xmm2, 160[r8] - cmp r9d, 12 - aesdec xmm1, 16[r8] - aesdec xmm1, 32[r8] - aesdec xmm1, 48[r8] - aesdec xmm1, 64[r8] - aesdec xmm1, 80[r8] - aesdec xmm1, 96[r8] - aesdec xmm1, 112[r8] - aesdec xmm1, 128[r8] - aesdec xmm1, 144[r8] - jb DLAST_8_2 - movdqu xmm2, 192[r8] - cmp r9d, 14 - aesdec xmm1, 160[r8] - aesdec xmm1, 176[r8] - jb DLAST_8_2 - movdqu xmm2, 224[r8] - aesdec xmm1, 192[r8] - aesdec xmm1, 208[r8] -DLAST_8_2: - aesdeclast xmm1, xmm2 - pxor xmm1, xmm9 - movdqa xmm9, xmm10 - movdqu [rsi], xmm1 - add rsi, 16 - dec r10 - jne DLOOP_8_2 -DEND_8: - ; restore non volatile rdi,rsi - mov rdi, rax - mov rsi, r11 - ; restore non volatile xmms from stack - movdqa xmm6, [rsp+0] - movdqa xmm7, [rsp+16] - movdqa xmm8, [rsp+32] - movdqa xmm9, [rsp+48] - movdqa xmm10, [rsp+64] - movdqa xmm11, [rsp+80] - movdqa xmm12, [rsp+96] - movdqa xmm13, [rsp+112] - add rsp, 8+8*16 ; 8 = align stack , 8 xmm6-13 16 bytes each - ret -AES_CBC_decrypt_AESNI_by8 ENDP - - -; /* -; AES_ECB_encrypt_AESNI[const ,unsigned char*in -; unsigned ,char*out -; unsigned ,long length -; const ,unsigned char*KS -; int nr] -; */ -; . 
globl AES_ECB_encrypt_AESNI -AES_ECB_encrypt_AESNI PROC -;# parameter 1: rdi -;# parameter 2: rsi -;# parameter 3: rdx -;# parameter 4: rcx -;# parameter 5: r8d - -; save rdi and rsi to rax and r11, restore before ret - mov rax,rdi - mov r11,rsi - -; convert to what we had for att&t convention - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8d,[rsp+40] - -; on microsoft xmm6-xmm15 are non volaitle, let's save on stack and restore at end - sub rsp,8+4*16 ; 8 = align stack , 4 xmm9-12, 16 bytes each - movdqa [rsp+0], xmm9 - movdqa [rsp+16], xmm10 - movdqa [rsp+32], xmm11 - movdqa [rsp+48], xmm12 - - - mov r10,rdx - shr rdx,4 - shl r10,60 - je EECB_NO_PARTS_4 - add rdx,1 -EECB_NO_PARTS_4: - mov r10,rdx - shl r10,62 - shr r10,62 - shr rdx,2 - je EECB_REMAINDER_4 - sub rsi,64 -EECB_LOOP_4: - movdqu xmm1,[rdi] - movdqu xmm2,16[rdi] - movdqu xmm3,32[rdi] - movdqu xmm4,48[rdi] - movdqa xmm9,[rcx] - movdqa xmm10,16[rcx] - movdqa xmm11,32[rcx] - movdqa xmm12,48[rcx] - pxor xmm1,xmm9 - pxor xmm2,xmm9 - pxor xmm3,xmm9 - pxor xmm4,xmm9 - aesenc xmm1,xmm10 - aesenc xmm2,xmm10 - aesenc xmm3,xmm10 - aesenc xmm4,xmm10 - aesenc xmm1,xmm11 - aesenc xmm2,xmm11 - aesenc xmm3,xmm11 - aesenc xmm4,xmm11 - aesenc xmm1,xmm12 - aesenc xmm2,xmm12 - aesenc xmm3,xmm12 - aesenc xmm4,xmm12 - movdqa xmm9,64[rcx] - movdqa xmm10,80[rcx] - movdqa xmm11,96[rcx] - movdqa xmm12,112[rcx] - aesenc xmm1,xmm9 - aesenc xmm2,xmm9 - aesenc xmm3,xmm9 - aesenc xmm4,xmm9 - aesenc xmm1,xmm10 - aesenc xmm2,xmm10 - aesenc xmm3,xmm10 - aesenc xmm4,xmm10 - aesenc xmm1,xmm11 - aesenc xmm2,xmm11 - aesenc xmm3,xmm11 - aesenc xmm4,xmm11 - aesenc xmm1,xmm12 - aesenc xmm2,xmm12 - aesenc xmm3,xmm12 - aesenc xmm4,xmm12 - movdqa xmm9,128[rcx] - movdqa xmm10,144[rcx] - movdqa xmm11,160[rcx] - cmp r8d,12 - aesenc xmm1,xmm9 - aesenc xmm2,xmm9 - aesenc xmm3,xmm9 - aesenc xmm4,xmm9 - aesenc xmm1,xmm10 - aesenc xmm2,xmm10 - aesenc xmm3,xmm10 - aesenc xmm4,xmm10 - jb EECB_LAST_4 - movdqa xmm9,160[rcx] - movdqa 
xmm10,176[rcx] - movdqa xmm11,192[rcx] - cmp r8d,14 - aesenc xmm1,xmm9 - aesenc xmm2,xmm9 - aesenc xmm3,xmm9 - aesenc xmm4,xmm9 - aesenc xmm1,xmm10 - aesenc xmm2,xmm10 - aesenc xmm3,xmm10 - aesenc xmm4,xmm10 - jb EECB_LAST_4 - movdqa xmm9,192[rcx] - movdqa xmm10,208[rcx] - movdqa xmm11,224[rcx] - aesenc xmm1,xmm9 - aesenc xmm2,xmm9 - aesenc xmm3,xmm9 - aesenc xmm4,xmm9 - aesenc xmm1,xmm10 - aesenc xmm2,xmm10 - aesenc xmm3,xmm10 - aesenc xmm4,xmm10 -EECB_LAST_4: - add rdi,64 - add rsi,64 - dec rdx - aesenclast xmm1,xmm11 - aesenclast xmm2,xmm11 - aesenclast xmm3,xmm11 - aesenclast xmm4,xmm11 - movdqu [rsi],xmm1 - movdqu 16[rsi],xmm2 - movdqu 32[rsi],xmm3 - movdqu 48[rsi],xmm4 - jne EECB_LOOP_4 - add rsi,64 -EECB_REMAINDER_4: - cmp r10,0 - je EECB_END_4 -EECB_LOOP_4_2: - movdqu xmm1,[rdi] - add rdi,16 - pxor xmm1,[rcx] - movdqu xmm2,160[rcx] - aesenc xmm1,16[rcx] - aesenc xmm1,32[rcx] - aesenc xmm1,48[rcx] - aesenc xmm1,64[rcx] - aesenc xmm1,80[rcx] - aesenc xmm1,96[rcx] - aesenc xmm1,112[rcx] - aesenc xmm1,128[rcx] - aesenc xmm1,144[rcx] - cmp r8d,12 - jb EECB_LAST_4_2 - movdqu xmm2,192[rcx] - aesenc xmm1,160[rcx] - aesenc xmm1,176[rcx] - cmp r8d,14 - jb EECB_LAST_4_2 - movdqu xmm2,224[rcx] - aesenc xmm1,192[rcx] - aesenc xmm1,208[rcx] -EECB_LAST_4_2: - aesenclast xmm1,xmm2 - movdqu [rsi],xmm1 - add rsi,16 - dec r10 - jne EECB_LOOP_4_2 -EECB_END_4: - ; restore non volatile rdi,rsi - mov rdi,rax - mov rsi,r11 - ; restore non volatile xmms from stack - movdqa xmm9, [rsp+0] - movdqa xmm10, [rsp+16] - movdqa xmm11, [rsp+32] - movdqa xmm12, [rsp+48] - add rsp,8+4*16 ; 8 = align stack , 4 xmm9-12 16 bytes each - ret -AES_ECB_encrypt_AESNI ENDP - -; /* -; AES_ECB_decrypt_AESNI[const ,unsigned char*in -; unsigned ,char*out -; unsigned ,long length -; const ,unsigned char*KS -; int nr] -; */ -; . 
globl AES_ECB_decrypt_AESNI -AES_ECB_decrypt_AESNI PROC -;# parameter 1: rdi -;# parameter 2: rsi -;# parameter 3: rdx -;# parameter 4: rcx -;# parameter 5: r8d - -; save rdi and rsi to rax and r11, restore before ret - mov rax,rdi - mov r11,rsi - -; convert to what we had for att&t convention - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8d,[rsp+40] - -; on microsoft xmm6-xmm15 are non volaitle, let's save on stack and restore at end - sub rsp,8+4*16 ; 8 = align stack , 4 xmm9-12, 16 bytes each - movdqa [rsp+0], xmm9 - movdqa [rsp+16], xmm10 - movdqa [rsp+32], xmm11 - movdqa [rsp+48], xmm12 - - mov r10,rdx - shr rdx,4 - shl r10,60 - je DECB_NO_PARTS_4 - add rdx,1 -DECB_NO_PARTS_4: - mov r10,rdx - shl r10,62 - shr r10,62 - shr rdx,2 - je DECB_REMAINDER_4 - sub rsi,64 -DECB_LOOP_4: - movdqu xmm1,[rdi] - movdqu xmm2,16[rdi] - movdqu xmm3,32[rdi] - movdqu xmm4,48[rdi] - movdqa xmm9,[rcx] - movdqa xmm10,16[rcx] - movdqa xmm11,32[rcx] - movdqa xmm12,48[rcx] - pxor xmm1,xmm9 - pxor xmm2,xmm9 - pxor xmm3,xmm9 - pxor xmm4,xmm9 - aesdec xmm1,xmm10 - aesdec xmm2,xmm10 - aesdec xmm3,xmm10 - aesdec xmm4,xmm10 - aesdec xmm1,xmm11 - aesdec xmm2,xmm11 - aesdec xmm3,xmm11 - aesdec xmm4,xmm11 - aesdec xmm1,xmm12 - aesdec xmm2,xmm12 - aesdec xmm3,xmm12 - aesdec xmm4,xmm12 - movdqa xmm9,64[rcx] - movdqa xmm10,80[rcx] - movdqa xmm11,96[rcx] - movdqa xmm12,112[rcx] - aesdec xmm1,xmm9 - aesdec xmm2,xmm9 - aesdec xmm3,xmm9 - aesdec xmm4,xmm9 - aesdec xmm1,xmm10 - aesdec xmm2,xmm10 - aesdec xmm3,xmm10 - aesdec xmm4,xmm10 - aesdec xmm1,xmm11 - aesdec xmm2,xmm11 - aesdec xmm3,xmm11 - aesdec xmm4,xmm11 - aesdec xmm1,xmm12 - aesdec xmm2,xmm12 - aesdec xmm3,xmm12 - aesdec xmm4,xmm12 - movdqa xmm9,128[rcx] - movdqa xmm10,144[rcx] - movdqa xmm11,160[rcx] - cmp r8d,12 - aesdec xmm1,xmm9 - aesdec xmm2,xmm9 - aesdec xmm3,xmm9 - aesdec xmm4,xmm9 - aesdec xmm1,xmm10 - aesdec xmm2,xmm10 - aesdec xmm3,xmm10 - aesdec xmm4,xmm10 - jb DECB_LAST_4 - movdqa xmm9,160[rcx] - movdqa 
xmm10,176[rcx] - movdqa xmm11,192[rcx] - cmp r8d,14 - aesdec xmm1,xmm9 - aesdec xmm2,xmm9 - aesdec xmm3,xmm9 - aesdec xmm4,xmm9 - aesdec xmm1,xmm10 - aesdec xmm2,xmm10 - aesdec xmm3,xmm10 - aesdec xmm4,xmm10 - jb DECB_LAST_4 - movdqa xmm9,192[rcx] - movdqa xmm10,208[rcx] - movdqa xmm11,224[rcx] - aesdec xmm1,xmm9 - aesdec xmm2,xmm9 - aesdec xmm3,xmm9 - aesdec xmm4,xmm9 - aesdec xmm1,xmm10 - aesdec xmm2,xmm10 - aesdec xmm3,xmm10 - aesdec xmm4,xmm10 -DECB_LAST_4: - add rdi,64 - add rsi,64 - dec rdx - aesdeclast xmm1,xmm11 - aesdeclast xmm2,xmm11 - aesdeclast xmm3,xmm11 - aesdeclast xmm4,xmm11 - movdqu [rsi],xmm1 - movdqu 16[rsi],xmm2 - movdqu 32[rsi],xmm3 - movdqu 48[rsi],xmm4 - jne DECB_LOOP_4 - add rsi,64 -DECB_REMAINDER_4: - cmp r10,0 - je DECB_END_4 -DECB_LOOP_4_2: - movdqu xmm1,[rdi] - add rdi,16 - pxor xmm1,[rcx] - movdqu xmm2,160[rcx] - cmp r8d,12 - aesdec xmm1,16[rcx] - aesdec xmm1,32[rcx] - aesdec xmm1,48[rcx] - aesdec xmm1,64[rcx] - aesdec xmm1,80[rcx] - aesdec xmm1,96[rcx] - aesdec xmm1,112[rcx] - aesdec xmm1,128[rcx] - aesdec xmm1,144[rcx] - jb DECB_LAST_4_2 - cmp r8d,14 - movdqu xmm2,192[rcx] - aesdec xmm1,160[rcx] - aesdec xmm1,176[rcx] - jb DECB_LAST_4_2 - movdqu xmm2,224[rcx] - aesdec xmm1,192[rcx] - aesdec xmm1,208[rcx] -DECB_LAST_4_2: - aesdeclast xmm1,xmm2 - movdqu [rsi],xmm1 - add rsi,16 - dec r10 - jne DECB_LOOP_4_2 -DECB_END_4: - ; restore non volatile rdi,rsi - mov rdi,rax - mov rsi,r11 - ; restore non volatile xmms from stack - movdqa xmm9, [rsp+0] - movdqa xmm10, [rsp+16] - movdqa xmm11, [rsp+32] - movdqa xmm12, [rsp+48] - add rsp,8+4*16 ; 8 = align stack , 4 xmm9-12 16 bytes each - ret -AES_ECB_decrypt_AESNI ENDP - - - -; /* -; void ,AES_128_Key_Expansion_AESNI[const unsigned char*userkey -; unsigned char*key_schedule]/ -; */ -; . align 16,0x90 -; . 
globl AES_128_Key_Expansion_AESNI -AES_128_Key_Expansion_AESNI PROC -;# parameter 1: rdi -;# parameter 2: rsi - -; save rdi and rsi to rax and r11, restore before ret - mov rax,rdi - mov r11,rsi - -; convert to what we had for att&t convention - mov rdi,rcx - mov rsi,rdx - - mov dword ptr 240[rsi],10 - - movdqu xmm1,[rdi] - movdqa [rsi],xmm1 - - -ASSISTS: - aeskeygenassist xmm2,xmm1,1 - call PREPARE_ROUNDKEY_128 - movdqa 16[rsi],xmm1 - - aeskeygenassist xmm2,xmm1,2 - call PREPARE_ROUNDKEY_128 - movdqa 32[rsi],xmm1 - - aeskeygenassist xmm2,xmm1,4 - call PREPARE_ROUNDKEY_128 - movdqa 48[rsi],xmm1 - - aeskeygenassist xmm2,xmm1,8 - call PREPARE_ROUNDKEY_128 - movdqa 64[rsi],xmm1 - - aeskeygenassist xmm2,xmm1,16 - call PREPARE_ROUNDKEY_128 - movdqa 80[rsi],xmm1 - - aeskeygenassist xmm2,xmm1,32 - call PREPARE_ROUNDKEY_128 - movdqa 96[rsi],xmm1 - - aeskeygenassist xmm2,xmm1,64 - call PREPARE_ROUNDKEY_128 - movdqa 112[rsi],xmm1 - aeskeygenassist xmm2,xmm1,80h - call PREPARE_ROUNDKEY_128 - movdqa 128[rsi],xmm1 - aeskeygenassist xmm2,xmm1,1bh - call PREPARE_ROUNDKEY_128 - movdqa 144[rsi],xmm1 - aeskeygenassist xmm2,xmm1,36h - call PREPARE_ROUNDKEY_128 - movdqa 160[rsi],xmm1 - ; restore non volatile rdi,rsi - mov rdi,rax - mov rsi,r11 - ret - -PREPARE_ROUNDKEY_128: - pshufd xmm2,xmm2,255 - movdqa xmm3,xmm1 - pslldq xmm3,4 - pxor xmm1,xmm3 - pslldq xmm3,4 - pxor xmm1,xmm3 - pslldq xmm3,4 - pxor xmm1,xmm3 - pxor xmm1,xmm2 - ret -AES_128_Key_Expansion_AESNI ENDP - -; /* -; void ,AES_192_Key_Expansion_AESNI[const unsigned char*userkey -; unsigned char*key] -; */ -; . 
globl AES_192_Key_Expansion_AESNI -AES_192_Key_Expansion_AESNI PROC -;# parameter 1: rdi -;# parameter 2: rsi - -; save rdi and rsi to rax and r11, restore before ret - mov rax,rdi - mov r11,rsi - -; convert to what we had for att&t convention - mov rdi,rcx - mov rsi,rdx - -; on microsoft xmm6-xmm15 are non volaitle, let's save on stack and restore at end - sub rsp,8+1*16 ; 8 = align stack , 1 xmm6, 16 bytes each - movdqa [rsp+0], xmm6 - - movdqu xmm1,[rdi] - movq xmm3,qword ptr 16[rdi] - movdqa [rsi],xmm1 - movdqa xmm5,xmm3 - - aeskeygenassist xmm2,xmm3,1h - call PREPARE_ROUNDKEY_192 - shufpd xmm5,xmm1,0 - movdqa 16[rsi],xmm5 - movdqa xmm6,xmm1 - shufpd xmm6,xmm3,1 - movdqa 32[rsi],xmm6 - - aeskeygenassist xmm2,xmm3,2h - call PREPARE_ROUNDKEY_192 - movdqa 48[rsi],xmm1 - movdqa xmm5,xmm3 - - aeskeygenassist xmm2,xmm3,4h - call PREPARE_ROUNDKEY_192 - shufpd xmm5,xmm1,0 - movdqa 64[rsi],xmm5 - movdqa xmm6,xmm1 - shufpd xmm6,xmm3,1 - movdqa 80[rsi],xmm6 - - aeskeygenassist xmm2,xmm3,8h - call PREPARE_ROUNDKEY_192 - movdqa 96[rsi],xmm1 - movdqa xmm5,xmm3 - - aeskeygenassist xmm2,xmm3,10h - call PREPARE_ROUNDKEY_192 - shufpd xmm5,xmm1,0 - movdqa 112[rsi],xmm5 - movdqa xmm6,xmm1 - shufpd xmm6,xmm3,1 - movdqa 128[rsi],xmm6 - - aeskeygenassist xmm2,xmm3,20h - call PREPARE_ROUNDKEY_192 - movdqa 144[rsi],xmm1 - movdqa xmm5,xmm3 - - aeskeygenassist xmm2,xmm3,40h - call PREPARE_ROUNDKEY_192 - shufpd xmm5,xmm1,0 - movdqa 160[rsi],xmm5 - movdqa xmm6,xmm1 - shufpd xmm6,xmm3,1 - movdqa 176[rsi],xmm6 - - aeskeygenassist xmm2,xmm3,80h - call PREPARE_ROUNDKEY_192 - movdqa 192[rsi],xmm1 - movdqa 208[rsi],xmm3 - ; restore non volatile rdi,rsi - mov rdi,rax - mov rsi,r11 -; restore non volatile xmms from stack - movdqa xmm6, [rsp+0] - add rsp,8+1*16 ; 8 = align stack , 1 xmm6 16 bytes each - ret - -PREPARE_ROUNDKEY_192: - pshufd xmm2,xmm2,55h - movdqu xmm4,xmm1 - pslldq xmm4,4 - pxor xmm1,xmm4 - - pslldq xmm4,4 - pxor xmm1,xmm4 - pslldq xmm4,4 - pxor xmm1,xmm4 - pxor xmm1,xmm2 - pshufd 
xmm2,xmm1,0ffh - movdqu xmm4,xmm3 - pslldq xmm4,4 - pxor xmm3,xmm4 - pxor xmm3,xmm2 - ret -AES_192_Key_Expansion_AESNI ENDP - -; /* -; void ,AES_256_Key_Expansion_AESNI[const unsigned char*userkey -; unsigned char*key] -; */ -; . globl AES_256_Key_Expansion_AESNI -AES_256_Key_Expansion_AESNI PROC -;# parameter 1: rdi -;# parameter 2: rsi - -; save rdi and rsi to rax and r11, restore before ret - mov rax,rdi - mov r11,rsi - -; convert to what we had for att&t convention - mov rdi,rcx - mov rsi,rdx - - movdqu xmm1,[rdi] - movdqu xmm3,16[rdi] - movdqa [rsi],xmm1 - movdqa 16[rsi],xmm3 - - aeskeygenassist xmm2,xmm3,1h - call MAKE_RK256_a - movdqa 32[rsi],xmm1 - aeskeygenassist xmm2,xmm1,0h - call MAKE_RK256_b - movdqa 48[rsi],xmm3 - aeskeygenassist xmm2,xmm3,2h - call MAKE_RK256_a - movdqa 64[rsi],xmm1 - aeskeygenassist xmm2,xmm1,0h - call MAKE_RK256_b - movdqa 80[rsi],xmm3 - aeskeygenassist xmm2,xmm3,4h - call MAKE_RK256_a - movdqa 96[rsi],xmm1 - aeskeygenassist xmm2,xmm1,0h - call MAKE_RK256_b - movdqa 112[rsi],xmm3 - aeskeygenassist xmm2,xmm3,8h - call MAKE_RK256_a - movdqa 128[rsi],xmm1 - aeskeygenassist xmm2,xmm1,0h - call MAKE_RK256_b - movdqa 144[rsi],xmm3 - aeskeygenassist xmm2,xmm3,10h - call MAKE_RK256_a - movdqa 160[rsi],xmm1 - aeskeygenassist xmm2,xmm1,0h - call MAKE_RK256_b - movdqa 176[rsi],xmm3 - aeskeygenassist xmm2,xmm3,20h - call MAKE_RK256_a - movdqa 192[rsi],xmm1 - - aeskeygenassist xmm2,xmm1,0h - call MAKE_RK256_b - movdqa 208[rsi],xmm3 - aeskeygenassist xmm2,xmm3,40h - call MAKE_RK256_a - movdqa 224[rsi],xmm1 - - ; restore non volatile rdi,rsi - mov rdi,rax - mov rsi,r11 - ret -AES_256_Key_Expansion_AESNI ENDP - -MAKE_RK256_a: - pshufd xmm2,xmm2,0ffh - movdqa xmm4,xmm1 - pslldq xmm4,4 - pxor xmm1,xmm4 - pslldq xmm4,4 - pxor xmm1,xmm4 - pslldq xmm4,4 - pxor xmm1,xmm4 - pxor xmm1,xmm2 - ret - -MAKE_RK256_b: - pshufd xmm2,xmm2,0aah - movdqa xmm4,xmm3 - pslldq xmm4,4 - pxor xmm3,xmm4 - pslldq xmm4,4 - pxor xmm3,xmm4 - pslldq xmm4,4 - pxor xmm3,xmm4 - 
pxor xmm3,xmm2 - ret - - -IF fips_version GE 2 - fipsAb ENDS -ELSE - _text ENDS -ENDIF - -END +; +; +; /* See Intel Advanced Encryption Standard (AES) Instructions Set White Paper +; * by Shay Gueron, Intel Mobility Group, Israel Development Center +; */ +; +; /* This file is in intel asm syntax, see .s for at&t syntax */ +; + + +fips_version = 0 +IFDEF HAVE_FIPS + fips_version = 1 + IFDEF HAVE_FIPS_VERSION + fips_version = HAVE_FIPS_VERSION + ENDIF +ENDIF + +IF fips_version GE 2 + fipsAb SEGMENT ALIAS(".fipsA$b") 'CODE' +ELSE + _text SEGMENT +ENDIF + +IF fips_version GE 2 + fipsAb ENDS +ELSE + _text ENDS +ENDIF + +END diff --git a/wolfcrypt/src/aes_gcm_asm.S b/wolfcrypt/src/aes_gcm_asm.S index e75f2c9b942..e82445fca15 100644 --- a/wolfcrypt/src/aes_gcm_asm.S +++ b/wolfcrypt/src/aes_gcm_asm.S @@ -46,6 +46,16 @@ #define HAVE_INTEL_AVX2 #endif /* HAVE_INTEL_AVX2 */ #endif /* NO_AVX2_SUPPORT */ +#ifndef NO_VAES_SUPPORT +#ifndef HAVE_INTEL_VAES +#define HAVE_INTEL_VAES +#endif /* HAVE_INTEL_VAES */ +#endif /* NO_VAES_SUPPORT */ +#ifndef NO_AVX512_SUPPORT +#ifndef HAVE_INTEL_AVX512 +#define HAVE_INTEL_AVX512 +#endif /* HAVE_INTEL_AVX512 */ +#endif /* NO_AVX512_SUPPORT */ #ifdef WOLFSSL_X86_64_BUILD #ifndef __APPLE__ @@ -194,10 +204,10 @@ _GCM_generate_m0_aesni: por %xmm5, %xmm1 por %xmm6, %xmm2 por %xmm7, %xmm3 - vpshufb %xmm9, %xmm0, %xmm0 - vpshufb %xmm9, %xmm1, %xmm1 - vpshufb %xmm9, %xmm2, %xmm2 - vpshufb %xmm9, %xmm3, %xmm3 + pshufb %xmm9, %xmm0 + pshufb %xmm9, %xmm1 + pshufb %xmm9, %xmm2 + pshufb %xmm9, %xmm3 movdqu %xmm0, 256(%rsi) movdqu %xmm1, 272(%rsi) movdqu %xmm2, 288(%rsi) @@ -230,10 +240,10 @@ _GCM_generate_m0_aesni: por %xmm5, %xmm1 por %xmm6, %xmm2 por %xmm7, %xmm3 - vpshufb %xmm9, %xmm0, %xmm0 - vpshufb %xmm9, %xmm1, %xmm1 - vpshufb %xmm9, %xmm2, %xmm2 - vpshufb %xmm9, %xmm3, %xmm3 + pshufb %xmm9, %xmm0 + pshufb %xmm9, %xmm1 + pshufb %xmm9, %xmm2 + pshufb %xmm9, %xmm3 movdqu %xmm0, 320(%rsi) movdqu %xmm1, 336(%rsi) movdqu %xmm2, 352(%rsi) @@ 
-266,10 +276,10 @@ _GCM_generate_m0_aesni: por %xmm5, %xmm1 por %xmm6, %xmm2 por %xmm7, %xmm3 - vpshufb %xmm9, %xmm0, %xmm0 - vpshufb %xmm9, %xmm1, %xmm1 - vpshufb %xmm9, %xmm2, %xmm2 - vpshufb %xmm9, %xmm3, %xmm3 + pshufb %xmm9, %xmm0 + pshufb %xmm9, %xmm1 + pshufb %xmm9, %xmm2 + pshufb %xmm9, %xmm3 movdqu %xmm0, 384(%rsi) movdqu %xmm1, 400(%rsi) movdqu %xmm2, 416(%rsi) @@ -302,10 +312,10 @@ _GCM_generate_m0_aesni: por %xmm5, %xmm1 por %xmm6, %xmm2 por %xmm7, %xmm3 - vpshufb %xmm9, %xmm0, %xmm0 - vpshufb %xmm9, %xmm1, %xmm1 - vpshufb %xmm9, %xmm2, %xmm2 - vpshufb %xmm9, %xmm3, %xmm3 + pshufb %xmm9, %xmm0 + pshufb %xmm9, %xmm1 + pshufb %xmm9, %xmm2 + pshufb %xmm9, %xmm3 movdqu %xmm0, 448(%rsi) movdqu %xmm1, 464(%rsi) movdqu %xmm2, 480(%rsi) @@ -16577,6 +16587,14213 @@ L_AES_GCM_decrypt_final_avx2_cmp_tag_done: #endif /* __APPLE__ */ #endif /* WOLFSSL_AESGCM_STREAM */ #endif /* HAVE_INTEL_AVX2 */ +#ifdef HAVE_INTEL_VAES +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 32 +#else +.p2align 5 +#endif /* __APPLE__ */ +L_vaes_aes_gcm_inc_y0: +.quad 0x0000000000000000,0x0000000000000000 +.quad 0x0000000000000000,0x0000000000000001 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 32 +#else +.p2align 5 +#endif /* __APPLE__ */ +L_vaes_aes_gcm_inc_y1: +.quad 0x0000000000000000,0x0000000000000002 +.quad 0x0000000000000000,0x0000000000000003 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 32 +#else +.p2align 5 +#endif /* __APPLE__ */ +L_vaes_aes_gcm_inc_y2: +.quad 0x0000000000000000,0x0000000000000004 +.quad 0x0000000000000000,0x0000000000000005 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 32 +#else +.p2align 5 +#endif /* __APPLE__ */ +L_vaes_aes_gcm_inc_y3: +.quad 0x0000000000000000,0x0000000000000006 +.quad 
0x0000000000000000,0x0000000000000007 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_GCM_generate_m0_vaes_rev8: +.quad 0x08090a0b0c0d0e0f,0x0001020304050607 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_GCM_generate_m0_vaes_mod2_128: +.quad 0x0000000000000000,0xe100000000000000 +#ifndef __APPLE__ +.text +.globl GCM_generate_m0_vaes +.type GCM_generate_m0_vaes,@function +.align 16 +GCM_generate_m0_vaes: /* Fills a 512-byte table at (%rsi) from the 16-byte block read at (%rdi): offsets 0..240 hold the sixteen XOR combinations of the input block and three values derived from it (entry 0 is zero), and offsets 256..496 hold those same sixteen entries shifted right by 4 bits. NOTE(review): presumably the lookup table for a table-driven GHASH; confirm against the C caller. */ +#else +.section __TEXT,__text +.globl _GCM_generate_m0_vaes +.p2align 4 +_GCM_generate_m0_vaes: +#endif /* __APPLE__ */ + vmovdqu L_GCM_generate_m0_vaes_rev8(%rip), %xmm9 + vmovdqu L_GCM_generate_m0_vaes_mod2_128(%rip), %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqu (%rdi), %xmm0 + vmovdqu %xmm8, (%rsi) + vmovdqu %xmm0, %xmm8 + vpshufb %xmm9, %xmm0, %xmm0 /* reverse the 16 bytes of the input block with the rev8 mask */ + vpsllq $63, %xmm0, %xmm5 + vpsrlq $0x01, %xmm0, %xmm4 + vpslldq $8, %xmm5, %xmm1 + vpsrldq $8, %xmm5, %xmm5 + vpshufd $0xff, %xmm1, %xmm1 + vpor %xmm5, %xmm4, %xmm4 + vpsrad $31, %xmm1, %xmm1 + vpand %xmm10, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 /* xmm1 = xmm0 shifted right one bit as a 128-bit value, XORed with mod2_128 when the bit shifted out was set; the next two groups repeat this step to get xmm2 from xmm1 and xmm3 from xmm2 */ + vpsllq $63, %xmm1, %xmm5 + vpsrlq $0x01, %xmm1, %xmm4 + vpslldq $8, %xmm5, %xmm2 + vpsrldq $8, %xmm5, %xmm5 + vpshufd $0xff, %xmm2, %xmm2 + vpor %xmm5, %xmm4, %xmm4 + vpsrad $31, %xmm2, %xmm2 + vpand %xmm10, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpsllq $63, %xmm2, %xmm5 + vpsrlq $0x01, %xmm2, %xmm4 + vpslldq $8, %xmm5, %xmm3 + vpsrldq $8, %xmm5, %xmm5 + vpshufd $0xff, %xmm3, %xmm3 + vpor %xmm5, %xmm4, %xmm4 + vpsrad $31, %xmm3, %xmm3 + vpand %xmm10, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpshufb %xmm9, %xmm3, %xmm3 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm0, %xmm0 + vpxor %xmm2, %xmm3, %xmm8 + vmovdqu %xmm3, 16(%rsi) + vmovdqu %xmm2, 32(%rsi) + vmovdqu %xmm8, 48(%rsi) + vmovdqu %xmm1, 64(%rsi) + vpxor 
%xmm1, %xmm3, %xmm4 + vpxor %xmm1, %xmm2, %xmm5 + vpxor %xmm1, %xmm8, %xmm6 + vmovdqu %xmm4, 80(%rsi) + vmovdqu %xmm5, 96(%rsi) + vmovdqu %xmm6, 112(%rsi) + vmovdqu %xmm0, 128(%rsi) + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm0, %xmm3, %xmm4 + vpxor %xmm0, %xmm2, %xmm6 + vmovdqu %xmm4, 144(%rsi) + vmovdqu %xmm6, 160(%rsi) + vpxor %xmm6, %xmm3, %xmm6 + vmovdqu %xmm6, 176(%rsi) + vmovdqu %xmm1, 192(%rsi) + vpxor %xmm1, %xmm3, %xmm4 + vpxor %xmm1, %xmm2, %xmm5 + vpxor %xmm1, %xmm8, %xmm6 + vmovdqu %xmm4, 208(%rsi) + vmovdqu %xmm5, 224(%rsi) + vmovdqu %xmm6, 240(%rsi) /* offsets 0..240 are now filled; the rest of the routine derives offsets 256..496 from them, four entries at a time */ + vmovdqu (%rsi), %xmm0 + vmovdqu 16(%rsi), %xmm1 + vmovdqu 32(%rsi), %xmm2 + vmovdqu 48(%rsi), %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vpsllq $60, %xmm0, %xmm4 + vpsllq $60, %xmm1, %xmm5 + vpsllq $60, %xmm2, %xmm6 + vpsllq $60, %xmm3, %xmm7 + vpsrlq $4, %xmm0, %xmm0 + vpsrlq $4, %xmm1, %xmm1 + vpsrlq $4, %xmm2, %xmm2 + vpsrlq $4, %xmm3, %xmm3 + vpsrldq $8, %xmm4, %xmm4 + vpsrldq $8, %xmm5, %xmm5 + vpsrldq $8, %xmm6, %xmm6 + vpsrldq $8, %xmm7, %xmm7 + vpor %xmm4, %xmm0, %xmm0 + vpor %xmm5, %xmm1, %xmm1 + vpor %xmm6, %xmm2, %xmm2 + vpor %xmm7, %xmm3, %xmm3 /* each of xmm0..xmm3 is now its byte-reversed entry shifted right 4 bits across the full 128 bits; it is byte-reversed back before being stored */ + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vmovdqu %xmm0, 256(%rsi) + vmovdqu %xmm1, 272(%rsi) + vmovdqu %xmm2, 288(%rsi) + vmovdqu %xmm3, 304(%rsi) + vmovdqu 64(%rsi), %xmm0 + vmovdqu 80(%rsi), %xmm1 + vmovdqu 96(%rsi), %xmm2 + vmovdqu 112(%rsi), %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vpsllq $60, %xmm0, %xmm4 + vpsllq $60, %xmm1, %xmm5 + vpsllq $60, %xmm2, %xmm6 + vpsllq $60, %xmm3, %xmm7 + vpsrlq $4, %xmm0, %xmm0 + vpsrlq $4, %xmm1, %xmm1 + vpsrlq $4, %xmm2, %xmm2 + vpsrlq $4, %xmm3, %xmm3 + vpsrldq $8, %xmm4, %xmm4 + vpsrldq $8, %xmm5, %xmm5 + vpsrldq $8, %xmm6, %xmm6 + vpsrldq $8, %xmm7, %xmm7 + vpor 
%xmm4, %xmm0, %xmm0 + vpor %xmm5, %xmm1, %xmm1 + vpor %xmm6, %xmm2, %xmm2 + vpor %xmm7, %xmm3, %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vmovdqu %xmm0, 320(%rsi) + vmovdqu %xmm1, 336(%rsi) + vmovdqu %xmm2, 352(%rsi) + vmovdqu %xmm3, 368(%rsi) + vmovdqu 128(%rsi), %xmm0 + vmovdqu 144(%rsi), %xmm1 + vmovdqu 160(%rsi), %xmm2 + vmovdqu 176(%rsi), %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vpsllq $60, %xmm0, %xmm4 + vpsllq $60, %xmm1, %xmm5 + vpsllq $60, %xmm2, %xmm6 + vpsllq $60, %xmm3, %xmm7 + vpsrlq $4, %xmm0, %xmm0 + vpsrlq $4, %xmm1, %xmm1 + vpsrlq $4, %xmm2, %xmm2 + vpsrlq $4, %xmm3, %xmm3 + vpsrldq $8, %xmm4, %xmm4 + vpsrldq $8, %xmm5, %xmm5 + vpsrldq $8, %xmm6, %xmm6 + vpsrldq $8, %xmm7, %xmm7 + vpor %xmm4, %xmm0, %xmm0 + vpor %xmm5, %xmm1, %xmm1 + vpor %xmm6, %xmm2, %xmm2 + vpor %xmm7, %xmm3, %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vmovdqu %xmm0, 384(%rsi) + vmovdqu %xmm1, 400(%rsi) + vmovdqu %xmm2, 416(%rsi) + vmovdqu %xmm3, 432(%rsi) + vmovdqu 192(%rsi), %xmm0 + vmovdqu 208(%rsi), %xmm1 + vmovdqu 224(%rsi), %xmm2 + vmovdqu 240(%rsi), %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vpsllq $60, %xmm0, %xmm4 + vpsllq $60, %xmm1, %xmm5 + vpsllq $60, %xmm2, %xmm6 + vpsllq $60, %xmm3, %xmm7 + vpsrlq $4, %xmm0, %xmm0 + vpsrlq $4, %xmm1, %xmm1 + vpsrlq $4, %xmm2, %xmm2 + vpsrlq $4, %xmm3, %xmm3 + vpsrldq $8, %xmm4, %xmm4 + vpsrldq $8, %xmm5, %xmm5 + vpsrldq $8, %xmm6, %xmm6 + vpsrldq $8, %xmm7, %xmm7 + vpor %xmm4, %xmm0, %xmm0 + vpor %xmm5, %xmm1, %xmm1 + vpor %xmm6, %xmm2, %xmm2 + vpor %xmm7, %xmm3, %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vmovdqu 
%xmm0, 448(%rsi) + vmovdqu %xmm1, 464(%rsi) + vmovdqu %xmm2, 480(%rsi) + vmovdqu %xmm3, 496(%rsi) + repz retq +#ifndef __APPLE__ +.size GCM_generate_m0_vaes,.-GCM_generate_m0_vaes +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_vaes_aes_gcm_one: +.quad 0x0000000000000000,0x0000000000000001 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_vaes_aes_gcm_two: +.quad 0x0000000000000000,0x0000000000000002 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_vaes_aes_gcm_three: +.quad 0x0000000000000000,0x0000000000000003 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_vaes_aes_gcm_four: +.quad 0x0000000000000000,0x0000000000000004 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_vaes_aes_gcm_five: +.quad 0x0000000000000000,0x0000000000000005 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_vaes_aes_gcm_six: +.quad 0x0000000000000000,0x0000000000000006 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_vaes_aes_gcm_seven: +.quad 0x0000000000000000,0x0000000000000007 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_vaes_aes_gcm_eight: +.quad 0x0000000000000000,0x0000000000000008 +#ifndef __APPLE__ +.data 
+#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_vaes_aes_gcm_bswap_epi64: +.quad 0x0001020304050607,0x08090a0b0c0d0e0f +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_vaes_aes_gcm_bswap_mask: +.quad 0x08090a0b0c0d0e0f,0x0001020304050607 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_vaes_aes_gcm_mod2_128: +.quad 0x0000000000000001,0xc200000000000000 +#ifndef __APPLE__ +.text +.globl AES_GCM_encrypt_vaes +.type AES_GCM_encrypt_vaes,@function +.align 16 +AES_GCM_encrypt_vaes: +#else +.section __TEXT,__text +.globl _AES_GCM_encrypt_vaes +.p2align 4 +_AES_GCM_encrypt_vaes: +#endif /* __APPLE__ */ + pushq %r13 + pushq %r12 + pushq %rbx + pushq %r14 + pushq %r15 + movq %rdx, %r12 + movq %rcx, %rax + movl 48(%rsp), %r11d + movl 56(%rsp), %ebx + movl 64(%rsp), %r14d + movq 72(%rsp), %r15 + movl 80(%rsp), %r10d + subq $0x230, %rsp + vpxor %xmm5, %xmm5, %xmm5 + vpxor %xmm15, %xmm15, %xmm15 + movl %ebx, %edx + cmpl $12, %edx + jne L_AES_GCM_encrypt_vaes_iv_not_12 + # # Calculate values when IV is 12 bytes + # Set counter based on IV + movl $0x1000000, %ecx + vmovq (%rax), %xmm5 + vpinsrd $2, 8(%rax), %xmm5, %xmm5 + vpinsrd $3, %ecx, %xmm5, %xmm5 + # H = Encrypt X(=0) and T = Encrypt counter + vmovdqa (%r15), %xmm6 + vpxor %xmm6, %xmm5, %xmm1 + vmovdqa 16(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 32(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 48(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 64(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 80(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 
96(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 112(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 128(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 144(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm4 + jl L_AES_GCM_encrypt_vaes_calc_iv_12_last + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 176(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm4 + jl L_AES_GCM_encrypt_vaes_calc_iv_12_last + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 208(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 224(%r15), %xmm4 +L_AES_GCM_encrypt_vaes_calc_iv_12_last: + vaesenclast %xmm4, %xmm6, %xmm6 + vaesenclast %xmm4, %xmm1, %xmm1 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm6, %xmm6 + vmovdqu %xmm1, 528(%rsp) + jmp L_AES_GCM_encrypt_vaes_iv_done +L_AES_GCM_encrypt_vaes_iv_not_12: + # Calculate values when IV is not 12 bytes + # H = Encrypt X(=0) + vmovdqa (%r15), %xmm6 + vaesenc 16(%r15), %xmm6, %xmm6 + vaesenc 32(%r15), %xmm6, %xmm6 + vaesenc 48(%r15), %xmm6, %xmm6 + vaesenc 64(%r15), %xmm6, %xmm6 + vaesenc 80(%r15), %xmm6, %xmm6 + vaesenc 96(%r15), %xmm6, %xmm6 + vaesenc 112(%r15), %xmm6, %xmm6 + vaesenc 128(%r15), %xmm6, %xmm6 + vaesenc 144(%r15), %xmm6, %xmm6 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm8 + jl L_AES_GCM_encrypt_vaes_calc_iv_1_aesenc_avx_last + vaesenc %xmm8, %xmm6, %xmm6 + vaesenc 176(%r15), %xmm6, %xmm6 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm8 + jl L_AES_GCM_encrypt_vaes_calc_iv_1_aesenc_avx_last + vaesenc %xmm8, %xmm6, %xmm6 + vaesenc 208(%r15), %xmm6, %xmm6 + vmovdqa 224(%r15), %xmm8 +L_AES_GCM_encrypt_vaes_calc_iv_1_aesenc_avx_last: + vaesenclast %xmm8, %xmm6, %xmm6 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm6, %xmm6 + # Calc 
counter + # Initialization vector + cmpl $0x00, %edx + movq $0x00, %rcx + je L_AES_GCM_encrypt_vaes_calc_iv_done + cmpl $16, %edx + jl L_AES_GCM_encrypt_vaes_calc_iv_lt16 + andl $0xfffffff0, %edx +L_AES_GCM_encrypt_vaes_calc_iv_16_loop: + vmovdqu (%rax,%rcx,1), %xmm7 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + vpxor %xmm7, %xmm5, %xmm5 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm5, %xmm1 + vpshufd $0x4e, %xmm6, %xmm2 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm3 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm0 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm6, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm4 + vmovdqa %xmm3, %xmm5 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm5, %xmm5 + vpsrld $31, %xmm4, %xmm0 + vpsrld $31, %xmm5, %xmm1 + vpslld $0x01, %xmm4, %xmm4 + vpslld $0x01, %xmm5, %xmm5 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm5, %xmm5 + vpor %xmm0, %xmm4, %xmm4 + vpor %xmm1, %xmm5, %xmm5 + vpslld $31, %xmm4, %xmm0 + vpslld $30, %xmm4, %xmm1 + vpslld $25, %xmm4, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm3 + vpsrld $7, %xmm4, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_encrypt_vaes_calc_iv_16_loop + movl %ebx, %edx + cmpl %edx, %ecx + je L_AES_GCM_encrypt_vaes_calc_iv_done +L_AES_GCM_encrypt_vaes_calc_iv_lt16: + subq $16, %rsp + vpxor %xmm7, %xmm7, %xmm7 + xorl %ebx, %ebx + vmovdqu %xmm7, (%rsp) +L_AES_GCM_encrypt_vaes_calc_iv_loop: + movzbl (%rax,%rcx,1), %r13d + movb %r13b, (%rsp,%rbx,1) + incl %ecx + incl %ebx + cmpl %edx, %ecx + jl 
L_AES_GCM_encrypt_vaes_calc_iv_loop + vmovdqu (%rsp), %xmm7 + addq $16, %rsp + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + vpxor %xmm7, %xmm5, %xmm5 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm5, %xmm1 + vpshufd $0x4e, %xmm6, %xmm2 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm3 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm0 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm6, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm4 + vmovdqa %xmm3, %xmm5 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm5, %xmm5 + vpsrld $31, %xmm4, %xmm0 + vpsrld $31, %xmm5, %xmm1 + vpslld $0x01, %xmm4, %xmm4 + vpslld $0x01, %xmm5, %xmm5 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm5, %xmm5 + vpor %xmm0, %xmm4, %xmm4 + vpor %xmm1, %xmm5, %xmm5 + vpslld $31, %xmm4, %xmm0 + vpslld $30, %xmm4, %xmm1 + vpslld $25, %xmm4, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm3 + vpsrld $7, %xmm4, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 +L_AES_GCM_encrypt_vaes_calc_iv_done: + # T = Encrypt counter + vpxor %xmm0, %xmm0, %xmm0 + shll $3, %edx + vmovq %rdx, %xmm0 + vpxor %xmm0, %xmm5, %xmm5 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm5, %xmm1 + vpshufd $0x4e, %xmm6, %xmm2 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm3 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm0 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm6, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm4 + vmovdqa %xmm3, %xmm5 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm5, %xmm5 + vpsrld 
$31, %xmm4, %xmm0 + vpsrld $31, %xmm5, %xmm1 + vpslld $0x01, %xmm4, %xmm4 + vpslld $0x01, %xmm5, %xmm5 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm5, %xmm5 + vpor %xmm0, %xmm4, %xmm4 + vpor %xmm1, %xmm5, %xmm5 + vpslld $31, %xmm4, %xmm0 + vpslld $30, %xmm4, %xmm1 + vpslld $25, %xmm4, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm3 + vpsrld $7, %xmm4, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5 + # Encrypt counter + vmovdqa (%r15), %xmm7 + vpxor %xmm5, %xmm7, %xmm7 + vaesenc 16(%r15), %xmm7, %xmm7 + vaesenc 32(%r15), %xmm7, %xmm7 + vaesenc 48(%r15), %xmm7, %xmm7 + vaesenc 64(%r15), %xmm7, %xmm7 + vaesenc 80(%r15), %xmm7, %xmm7 + vaesenc 96(%r15), %xmm7, %xmm7 + vaesenc 112(%r15), %xmm7, %xmm7 + vaesenc 128(%r15), %xmm7, %xmm7 + vaesenc 144(%r15), %xmm7, %xmm7 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm8 + jl L_AES_GCM_encrypt_vaes_calc_iv_2_aesenc_avx_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 176(%r15), %xmm7, %xmm7 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm8 + jl L_AES_GCM_encrypt_vaes_calc_iv_2_aesenc_avx_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 208(%r15), %xmm7, %xmm7 + vmovdqa 224(%r15), %xmm8 +L_AES_GCM_encrypt_vaes_calc_iv_2_aesenc_avx_last: + vaesenclast %xmm8, %xmm7, %xmm7 + vmovdqu %xmm7, 528(%rsp) +L_AES_GCM_encrypt_vaes_iv_done: + # Additional authentication data + movl %r11d, %edx + cmpl $0x00, %edx + je L_AES_GCM_encrypt_vaes_calc_aad_done + xorl %ecx, %ecx + cmpl $16, %edx + jl L_AES_GCM_encrypt_vaes_calc_aad_lt16 + andl $0xfffffff0, %edx +L_AES_GCM_encrypt_vaes_calc_aad_16_loop: + vmovdqu (%r12,%rcx,1), %xmm7 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, 
%xmm7 + vpxor %xmm7, %xmm15, %xmm15 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm15, %xmm1 + vpshufd $0x4e, %xmm6, %xmm2 + vpclmulqdq $0x11, %xmm15, %xmm6, %xmm3 + vpclmulqdq $0x00, %xmm15, %xmm6, %xmm0 + vpxor %xmm15, %xmm1, %xmm1 + vpxor %xmm6, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm4 + vmovdqa %xmm3, %xmm15 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm15, %xmm15 + vpsrld $31, %xmm4, %xmm0 + vpsrld $31, %xmm15, %xmm1 + vpslld $0x01, %xmm4, %xmm4 + vpslld $0x01, %xmm15, %xmm15 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm15, %xmm15 + vpor %xmm0, %xmm4, %xmm4 + vpor %xmm1, %xmm15, %xmm15 + vpslld $31, %xmm4, %xmm0 + vpslld $30, %xmm4, %xmm1 + vpslld $25, %xmm4, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm3 + vpsrld $7, %xmm4, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm2, %xmm15, %xmm15 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_encrypt_vaes_calc_aad_16_loop + movl %r11d, %edx + cmpl %edx, %ecx + je L_AES_GCM_encrypt_vaes_calc_aad_done +L_AES_GCM_encrypt_vaes_calc_aad_lt16: + subq $16, %rsp + vpxor %xmm7, %xmm7, %xmm7 + xorl %ebx, %ebx + vmovdqu %xmm7, (%rsp) +L_AES_GCM_encrypt_vaes_calc_aad_loop: + movzbl (%r12,%rcx,1), %r13d + movb %r13b, (%rsp,%rbx,1) + incl %ecx + incl %ebx + cmpl %edx, %ecx + jl L_AES_GCM_encrypt_vaes_calc_aad_loop + vmovdqu (%rsp), %xmm7 + addq $16, %rsp + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + vpxor %xmm7, %xmm15, %xmm15 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm15, %xmm1 + vpshufd $0x4e, %xmm6, %xmm2 + vpclmulqdq $0x11, %xmm15, %xmm6, %xmm3 + vpclmulqdq $0x00, 
%xmm15, %xmm6, %xmm0 + vpxor %xmm15, %xmm1, %xmm1 + vpxor %xmm6, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm4 + vmovdqa %xmm3, %xmm15 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm15, %xmm15 + vpsrld $31, %xmm4, %xmm0 + vpsrld $31, %xmm15, %xmm1 + vpslld $0x01, %xmm4, %xmm4 + vpslld $0x01, %xmm15, %xmm15 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm15, %xmm15 + vpor %xmm0, %xmm4, %xmm4 + vpor %xmm1, %xmm15, %xmm15 + vpslld $31, %xmm4, %xmm0 + vpslld $30, %xmm4, %xmm1 + vpslld $25, %xmm4, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm3 + vpsrld $7, %xmm4, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm2, %xmm15, %xmm15 +L_AES_GCM_encrypt_vaes_calc_aad_done: + # Calculate counter and H + vpsrlq $63, %xmm6, %xmm8 + vpsllq $0x01, %xmm6, %xmm7 + vpslldq $8, %xmm8, %xmm8 + vpor %xmm8, %xmm7, %xmm7 + vpshufd $0xff, %xmm6, %xmm6 + vpsrad $31, %xmm6, %xmm6 + vpshufb L_vaes_aes_gcm_bswap_epi64(%rip), %xmm5, %xmm5 + vpand L_vaes_aes_gcm_mod2_128(%rip), %xmm6, %xmm6 + vpaddd L_vaes_aes_gcm_one(%rip), %xmm5, %xmm5 + vpxor %xmm7, %xmm6, %xmm6 + vmovdqu %xmm5, 512(%rsp) + xorl %ebx, %ebx + cmpl $0x80, %r9d + jl L_AES_GCM_encrypt_vaes_done_128 + vmovdqa %xmm15, %xmm2 + # H ^ 1 + vmovdqu %xmm6, (%rsp) + # H ^ 2 + vpclmulqdq $0x00, %xmm6, %xmm6, %xmm7 + vpclmulqdq $0x11, %xmm6, %xmm6, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + 
vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm0 + vmovdqu %xmm0, 16(%rsp) + # H ^ 3 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm6, %xmm0, %xmm7 + vpclmulqdq $0x01, %xmm6, %xmm0, %xmm8 + vpclmulqdq $16, %xmm6, %xmm0, %xmm9 + vpclmulqdq $0x11, %xmm6, %xmm0, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm1 + vmovdqu %xmm1, 32(%rsp) + # H ^ 4 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm3 + vmovdqu %xmm3, 48(%rsp) + # H ^ 5 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 64(%rsp) + # H ^ 6 + vpclmulqdq $0x00, %xmm1, %xmm1, %xmm7 + vpclmulqdq $0x11, %xmm1, %xmm1, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, 
%xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 80(%rsp) + # H ^ 7 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm1, %xmm3, %xmm7 + vpclmulqdq $0x01, %xmm1, %xmm3, %xmm8 + vpclmulqdq $16, %xmm1, %xmm3, %xmm9 + vpclmulqdq $0x11, %xmm1, %xmm3, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 96(%rsp) + # H ^ 8 + vpclmulqdq $0x00, %xmm3, %xmm3, %xmm7 + vpclmulqdq $0x11, %xmm3, %xmm3, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 112(%rsp) + cmpl $0x100, %r9d + jl L_AES_GCM_encrypt_vaes_no_ext + # H ^ 9 + vmovdqu 48(%rsp), %xmm0 + vmovdqu 64(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 128(%rsp) + # H 
^ 10 + vmovdqu 64(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 144(%rsp) + # H ^ 11 + vmovdqu 64(%rsp), %xmm0 + vmovdqu 80(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 160(%rsp) + # H ^ 12 + vmovdqu 80(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 176(%rsp) + # H ^ 13 + vmovdqu 80(%rsp), %xmm0 + vmovdqu 96(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + 
vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 192(%rsp) + # H ^ 14 + vmovdqu 96(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 208(%rsp) + # H ^ 15 + vmovdqu 96(%rsp), %xmm0 + vmovdqu 112(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 224(%rsp) + # H ^ 16 + vmovdqu 112(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 240(%rsp) + vmovdqu 224(%rsp), %ymm7 + vpermq $0x4e, %ymm7, %ymm7 + vmovdqu 192(%rsp), %ymm8 + 
vpermq $0x4e, %ymm8, %ymm8 + vmovdqu 160(%rsp), %ymm9 + vpermq $0x4e, %ymm9, %ymm9 + vmovdqu 128(%rsp), %ymm10 + vpermq $0x4e, %ymm10, %ymm10 + vmovdqu %ymm7, 256(%rsp) + vmovdqu %ymm8, 288(%rsp) + vmovdqu %ymm9, 320(%rsp) + vmovdqu %ymm10, 352(%rsp) + vmovdqu 96(%rsp), %ymm7 + vpermq $0x4e, %ymm7, %ymm7 + vmovdqu 64(%rsp), %ymm8 + vpermq $0x4e, %ymm8, %ymm8 + vmovdqu 32(%rsp), %ymm9 + vpermq $0x4e, %ymm9, %ymm9 + vmovdqu (%rsp), %ymm10 + vpermq $0x4e, %ymm10, %ymm10 + vmovdqu %ymm7, 384(%rsp) + vmovdqu %ymm8, 416(%rsp) + vmovdqu %ymm9, 448(%rsp) + vmovdqu %ymm10, 480(%rsp) +L_AES_GCM_encrypt_vaes_no_ext: + vbroadcasti128 L_vaes_aes_gcm_mod2_128(%rip), %ymm14 + cmpl $0x100, %r9d + jl L_AES_GCM_encrypt_vaes_after_256 + movl %r9d, %r13d + andl $0xffffff00, %r13d +L_AES_GCM_encrypt_vaes_loop_256: + # 256 bytes of input + leaq (%rsi,%rbx,1), %rcx + movq %rcx, 544(%rsp) + vbroadcasti128 L_vaes_aes_gcm_bswap_epi64(%rip), %ymm6 + vbroadcasti128 512(%rsp), %ymm4 + vpaddd L_vaes_aes_gcm_inc_y0(%rip), %ymm4, %ymm0 + vpshufb %ymm6, %ymm0, %ymm0 + vpaddd L_vaes_aes_gcm_inc_y1(%rip), %ymm4, %ymm1 + vpshufb %ymm6, %ymm1, %ymm1 + vpaddd L_vaes_aes_gcm_inc_y2(%rip), %ymm4, %ymm2 + vpshufb %ymm6, %ymm2, %ymm2 + vpaddd L_vaes_aes_gcm_inc_y3(%rip), %ymm4, %ymm3 + vpshufb %ymm6, %ymm3, %ymm3 + vmovdqu 512(%rsp), %xmm7 + vpaddd L_vaes_aes_gcm_eight(%rip), %xmm7, %xmm7 + vmovdqu %xmm7, 512(%rsp) + vbroadcasti128 (%r15), %ymm4 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm4, %ymm2, %ymm2 + vpxor %ymm4, %ymm3, %ymm3 + vbroadcasti128 16(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 32(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 48(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 
+ vbroadcasti128 64(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 80(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 96(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 112(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 128(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 144(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $11, %r10d + vbroadcasti128 160(%r15), %ymm4 + jl L_AES_GCM_encrypt_vaes_p1_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 176(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $13, %r10d + vbroadcasti128 192(%r15), %ymm4 + jl L_AES_GCM_encrypt_vaes_p1_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 208(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 224(%r15), %ymm4 +L_AES_GCM_encrypt_vaes_p1_vaes_ctr8_last: + vaesenclast %ymm4, %ymm0, %ymm0 + vaesenclast %ymm4, %ymm1, %ymm1 + vaesenclast %ymm4, %ymm2, %ymm2 + vaesenclast %ymm4, %ymm3, %ymm3 + leaq (%rdi,%rbx,1), %rcx + leaq (%rsi,%rbx,1), %rdx + vmovdqu (%rcx), %ymm5 + vpxor %ymm5, %ymm0, %ymm0 + vmovdqu 
%ymm0, (%rdx) + vmovdqu 32(%rcx), %ymm5 + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vmovdqu 64(%rcx), %ymm5 + vpxor %ymm5, %ymm2, %ymm2 + vmovdqu %ymm2, 64(%rdx) + vmovdqu 96(%rcx), %ymm5 + vpxor %ymm5, %ymm3, %ymm3 + vmovdqu %ymm3, 96(%rdx) + addl $0x80, %ebx + vbroadcasti128 512(%rsp), %ymm4 + vpaddd L_vaes_aes_gcm_inc_y0(%rip), %ymm4, %ymm0 + vpshufb %ymm6, %ymm0, %ymm0 + vpaddd L_vaes_aes_gcm_inc_y1(%rip), %ymm4, %ymm1 + vpshufb %ymm6, %ymm1, %ymm1 + vpaddd L_vaes_aes_gcm_inc_y2(%rip), %ymm4, %ymm2 + vpshufb %ymm6, %ymm2, %ymm2 + vpaddd L_vaes_aes_gcm_inc_y3(%rip), %ymm4, %ymm3 + vpshufb %ymm6, %ymm3, %ymm3 + vmovdqu 512(%rsp), %xmm7 + vpaddd L_vaes_aes_gcm_eight(%rip), %xmm7, %xmm7 + vmovdqu %xmm7, 512(%rsp) + vbroadcasti128 (%r15), %ymm4 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm4, %ymm2, %ymm2 + vpxor %ymm4, %ymm3, %ymm3 + vbroadcasti128 16(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 32(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 48(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 64(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 80(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 96(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 112(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 128(%r15), %ymm4 + vaesenc 
%ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 144(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $11, %r10d + vbroadcasti128 160(%r15), %ymm4 + jl L_AES_GCM_encrypt_vaes_p2_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 176(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $13, %r10d + vbroadcasti128 192(%r15), %ymm4 + jl L_AES_GCM_encrypt_vaes_p2_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 208(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 224(%r15), %ymm4 +L_AES_GCM_encrypt_vaes_p2_vaes_ctr8_last: + vaesenclast %ymm4, %ymm0, %ymm0 + vaesenclast %ymm4, %ymm1, %ymm1 + vaesenclast %ymm4, %ymm2, %ymm2 + vaesenclast %ymm4, %ymm3, %ymm3 + leaq (%rdi,%rbx,1), %rcx + leaq (%rsi,%rbx,1), %rdx + vmovdqu (%rcx), %ymm5 + vpxor %ymm5, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vmovdqu 32(%rcx), %ymm5 + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vmovdqu 64(%rcx), %ymm5 + vpxor %ymm5, %ymm2, %ymm2 + vmovdqu %ymm2, 64(%rdx) + vmovdqu 96(%rcx), %ymm5 + vpxor %ymm5, %ymm3, %ymm3 + vmovdqu %ymm3, 96(%rdx) + addl $0x80, %ebx + vbroadcasti128 L_vaes_aes_gcm_bswap_mask(%rip), %ymm6 + movq 544(%rsp), %rcx + vpxor %ymm4, %ymm4, %ymm4 + vinserti128 $0x00, %xmm15, %ymm4, %ymm4 + vmovdqu 256(%rsp), %ymm7 + vmovdqu 288(%rsp), %ymm8 + vmovdqu 320(%rsp), %ymm9 + vmovdqu 352(%rsp), %ymm10 + vmovdqu (%rcx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpxor %ymm4, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm7, %ymm5, %ymm0 + 
vpclmulqdq $0x01, %ymm7, %ymm5, %ymm1 + vpclmulqdq $16, %ymm7, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm7, %ymm5, %ymm3 + vmovdqa %ymm0, %ymm11 + vpxor %ymm1, %ymm2, %ymm12 + vmovdqa %ymm3, %ymm13 + vmovdqu 32(%rcx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm8, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm8, %ymm5, %ymm1 + vpclmulqdq $16, %ymm8, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm8, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 64(%rcx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm9, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm9, %ymm5, %ymm1 + vpclmulqdq $16, %ymm9, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm9, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 96(%rcx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm10, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm10, %ymm5, %ymm1 + vpclmulqdq $16, %ymm10, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm10, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 384(%rsp), %ymm7 + vmovdqu 416(%rsp), %ymm8 + vmovdqu 448(%rsp), %ymm9 + vmovdqu 480(%rsp), %ymm10 + vmovdqu 128(%rcx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm7, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm7, %ymm5, %ymm1 + vpclmulqdq $16, %ymm7, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm7, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 160(%rcx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm8, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm8, %ymm5, %ymm1 + vpclmulqdq $16, %ymm8, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm8, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 192(%rcx), 
%ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm9, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm9, %ymm5, %ymm1 + vpclmulqdq $16, %ymm9, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm9, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 224(%rcx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm10, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm10, %ymm5, %ymm1 + vpclmulqdq $16, %ymm10, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm10, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vpclmulqdq $0x01, %ymm11, %ymm14, %ymm5 + vpshufd $0x4e, %ymm11, %ymm11 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm11, %ymm12, %ymm12 + vpclmulqdq $0x01, %ymm12, %ymm14, %ymm5 + vpshufd $0x4e, %ymm12, %ymm12 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm12, %ymm13, %ymm13 + vextracti128 $0x01, %ymm13, %xmm0 + vpxor %xmm0, %xmm13, %xmm15 + cmpl %r13d, %ebx + jl L_AES_GCM_encrypt_vaes_loop_256 +L_AES_GCM_encrypt_vaes_after_256: + movl %r9d, %r13d + andl $0xffffff80, %r13d + cmpl %r13d, %ebx + jge L_AES_GCM_encrypt_vaes_after_128 + # 128 bytes of input + leaq (%rsi,%rbx,1), %rcx + movq %rcx, 544(%rsp) + vbroadcasti128 L_vaes_aes_gcm_bswap_epi64(%rip), %ymm6 + vbroadcasti128 512(%rsp), %ymm4 + vpaddd L_vaes_aes_gcm_inc_y0(%rip), %ymm4, %ymm0 + vpshufb %ymm6, %ymm0, %ymm0 + vpaddd L_vaes_aes_gcm_inc_y1(%rip), %ymm4, %ymm1 + vpshufb %ymm6, %ymm1, %ymm1 + vpaddd L_vaes_aes_gcm_inc_y2(%rip), %ymm4, %ymm2 + vpshufb %ymm6, %ymm2, %ymm2 + vpaddd L_vaes_aes_gcm_inc_y3(%rip), %ymm4, %ymm3 + vpshufb %ymm6, %ymm3, %ymm3 + vmovdqu 512(%rsp), %xmm7 + vpaddd L_vaes_aes_gcm_eight(%rip), %xmm7, %xmm7 + vmovdqu %xmm7, 512(%rsp) + vbroadcasti128 (%r15), %ymm4 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm4, %ymm2, %ymm2 + vpxor %ymm4, %ymm3, %ymm3 + vbroadcasti128 16(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, 
%ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 32(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 48(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 64(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 80(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 96(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 112(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 128(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 144(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $11, %r10d + vbroadcasti128 160(%r15), %ymm4 + jl L_AES_GCM_encrypt_vaes_8_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 176(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $13, %r10d + vbroadcasti128 192(%r15), %ymm4 + jl L_AES_GCM_encrypt_vaes_8_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 208(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, 
%ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 224(%r15), %ymm4 +L_AES_GCM_encrypt_vaes_8_vaes_ctr8_last: + vaesenclast %ymm4, %ymm0, %ymm0 + vaesenclast %ymm4, %ymm1, %ymm1 + vaesenclast %ymm4, %ymm2, %ymm2 + vaesenclast %ymm4, %ymm3, %ymm3 + leaq (%rdi,%rbx,1), %rcx + leaq (%rsi,%rbx,1), %rdx + vmovdqu (%rcx), %ymm5 + vpxor %ymm5, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vmovdqu 32(%rcx), %ymm5 + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vmovdqu 64(%rcx), %ymm5 + vpxor %ymm5, %ymm2, %ymm2 + vmovdqu %ymm2, 64(%rdx) + vmovdqu 96(%rcx), %ymm5 + vpxor %ymm5, %ymm3, %ymm3 + vmovdqu %ymm3, 96(%rdx) + addl $0x80, %ebx + vmovdqu 96(%rsp), %ymm7 + vpermq $0x4e, %ymm7, %ymm7 + vmovdqu 64(%rsp), %ymm8 + vpermq $0x4e, %ymm8, %ymm8 + vmovdqu 32(%rsp), %ymm9 + vpermq $0x4e, %ymm9, %ymm9 + vmovdqu (%rsp), %ymm10 + vpermq $0x4e, %ymm10, %ymm10 + vbroadcasti128 L_vaes_aes_gcm_bswap_mask(%rip), %ymm6 + movq 544(%rsp), %rcx + vpxor %ymm4, %ymm4, %ymm4 + vinserti128 $0x00, %xmm15, %ymm4, %ymm4 + vmovdqu (%rcx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpxor %ymm4, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm7, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm7, %ymm5, %ymm1 + vpclmulqdq $16, %ymm7, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm7, %ymm5, %ymm3 + vmovdqa %ymm0, %ymm11 + vpxor %ymm1, %ymm2, %ymm12 + vmovdqa %ymm3, %ymm13 + vmovdqu 32(%rcx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm8, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm8, %ymm5, %ymm1 + vpclmulqdq $16, %ymm8, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm8, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 64(%rcx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm9, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm9, %ymm5, %ymm1 + vpclmulqdq $16, %ymm9, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm9, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 
+ vmovdqu 96(%rcx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm10, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm10, %ymm5, %ymm1 + vpclmulqdq $16, %ymm10, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm10, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vpclmulqdq $0x01, %ymm11, %ymm14, %ymm5 + vpshufd $0x4e, %ymm11, %ymm11 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm11, %ymm12, %ymm12 + vpclmulqdq $0x01, %ymm12, %ymm14, %ymm5 + vpshufd $0x4e, %ymm12, %ymm12 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm12, %ymm13, %ymm13 + vextracti128 $0x01, %ymm13, %xmm0 + vpxor %xmm0, %xmm13, %xmm15 +L_AES_GCM_encrypt_vaes_after_128: + vmovdqu (%rsp), %xmm6 +L_AES_GCM_encrypt_vaes_done_128: + movl %r9d, %edx + cmpl %edx, %ebx + jge L_AES_GCM_encrypt_vaes_done_enc + movl %r9d, %r13d + andl $0xfffffff0, %r13d + cmpl %r13d, %ebx + jge L_AES_GCM_encrypt_vaes_last_block_done + vmovdqu 512(%rsp), %xmm8 + vpshufb L_vaes_aes_gcm_bswap_epi64(%rip), %xmm8, %xmm7 + vpaddd L_vaes_aes_gcm_one(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, 512(%rsp) + vpxor (%r15), %xmm7, %xmm7 + vaesenc 16(%r15), %xmm7, %xmm7 + vaesenc 32(%r15), %xmm7, %xmm7 + vaesenc 48(%r15), %xmm7, %xmm7 + vaesenc 64(%r15), %xmm7, %xmm7 + vaesenc 80(%r15), %xmm7, %xmm7 + vaesenc 96(%r15), %xmm7, %xmm7 + vaesenc 112(%r15), %xmm7, %xmm7 + vaesenc 128(%r15), %xmm7, %xmm7 + vaesenc 144(%r15), %xmm7, %xmm7 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm8 + jl L_AES_GCM_encrypt_vaes_aesenc_block_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 176(%r15), %xmm7, %xmm7 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm8 + jl L_AES_GCM_encrypt_vaes_aesenc_block_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 208(%r15), %xmm7, %xmm7 + vmovdqa 224(%r15), %xmm8 +L_AES_GCM_encrypt_vaes_aesenc_block_last: + vaesenclast %xmm8, %xmm7, %xmm7 + vmovdqu (%rdi,%rbx,1), %xmm8 + vpxor %xmm8, %xmm7, %xmm7 + vmovdqu %xmm7, (%rsi,%rbx,1) + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + 
vpxor %xmm7, %xmm15, %xmm15 + addl $16, %ebx + cmpl %r13d, %ebx + jge L_AES_GCM_encrypt_vaes_last_block_ghash +L_AES_GCM_encrypt_vaes_last_block_start: + vmovdqu (%rdi,%rbx,1), %xmm12 + vmovdqu 512(%rsp), %xmm8 + vpshufb L_vaes_aes_gcm_bswap_epi64(%rip), %xmm8, %xmm7 + vpaddd L_vaes_aes_gcm_one(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, 512(%rsp) + vpxor (%r15), %xmm7, %xmm7 + vpclmulqdq $16, %xmm6, %xmm15, %xmm9 + vaesenc 16(%r15), %xmm7, %xmm7 + vaesenc 32(%r15), %xmm7, %xmm7 + vpclmulqdq $0x01, %xmm6, %xmm15, %xmm10 + vaesenc 48(%r15), %xmm7, %xmm7 + vaesenc 64(%r15), %xmm7, %xmm7 + vpclmulqdq $0x00, %xmm6, %xmm15, %xmm11 + vaesenc 80(%r15), %xmm7, %xmm7 + vpclmulqdq $0x11, %xmm6, %xmm15, %xmm1 + vaesenc 96(%r15), %xmm7, %xmm7 + vpxor %xmm10, %xmm9, %xmm9 + vpslldq $8, %xmm9, %xmm2 + vpsrldq $8, %xmm9, %xmm9 + vaesenc 112(%r15), %xmm7, %xmm7 + vpxor %xmm11, %xmm2, %xmm2 + vpxor %xmm9, %xmm1, %xmm3 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm0 + vpclmulqdq $16, %xmm0, %xmm2, %xmm10 + vaesenc 128(%r15), %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm9 + vpxor %xmm10, %xmm9, %xmm9 + vpclmulqdq $16, %xmm0, %xmm9, %xmm10 + vaesenc 144(%r15), %xmm7, %xmm7 + vpshufd $0x4e, %xmm9, %xmm9 + vpxor %xmm10, %xmm9, %xmm9 + vpxor %xmm3, %xmm9, %xmm15 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm8 + jl L_AES_GCM_encrypt_vaes_aesenc_gfmul_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 176(%r15), %xmm7, %xmm7 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm8 + jl L_AES_GCM_encrypt_vaes_aesenc_gfmul_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 208(%r15), %xmm7, %xmm7 + vmovdqa 224(%r15), %xmm8 +L_AES_GCM_encrypt_vaes_aesenc_gfmul_last: + vaesenclast %xmm8, %xmm7, %xmm7 + vmovdqa %xmm12, %xmm0 + vpxor %xmm0, %xmm7, %xmm7 + vmovdqu %xmm7, (%rsi,%rbx,1) + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + addl $16, %ebx + vpxor %xmm7, %xmm15, %xmm15 + cmpl %r13d, %ebx + jl L_AES_GCM_encrypt_vaes_last_block_start +L_AES_GCM_encrypt_vaes_last_block_ghash: + # ghash_gfmul_red_avx + vpclmulqdq $0x00, 
%xmm6, %xmm15, %xmm7 + vpclmulqdq $0x01, %xmm6, %xmm15, %xmm8 + vpclmulqdq $16, %xmm6, %xmm15, %xmm9 + vpclmulqdq $0x11, %xmm6, %xmm15, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm15 +L_AES_GCM_encrypt_vaes_last_block_done: + movl %r9d, %ecx + movl %ecx, %edx + andl $15, %ecx + jz L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_done + vmovdqu 512(%rsp), %xmm5 + vpshufb L_vaes_aes_gcm_bswap_epi64(%rip), %xmm5, %xmm5 + vpxor (%r15), %xmm5, %xmm5 + vaesenc 16(%r15), %xmm5, %xmm5 + vaesenc 32(%r15), %xmm5, %xmm5 + vaesenc 48(%r15), %xmm5, %xmm5 + vaesenc 64(%r15), %xmm5, %xmm5 + vaesenc 80(%r15), %xmm5, %xmm5 + vaesenc 96(%r15), %xmm5, %xmm5 + vaesenc 112(%r15), %xmm5, %xmm5 + vaesenc 128(%r15), %xmm5, %xmm5 + vaesenc 144(%r15), %xmm5, %xmm5 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm8 + jl L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_aesenc_avx_last + vaesenc %xmm8, %xmm5, %xmm5 + vaesenc 176(%r15), %xmm5, %xmm5 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm8 + jl L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_aesenc_avx_last + vaesenc %xmm8, %xmm5, %xmm5 + vaesenc 208(%r15), %xmm5, %xmm5 + vmovdqa 224(%r15), %xmm8 +L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_aesenc_avx_last: + vaesenclast %xmm8, %xmm5, %xmm5 + subq $16, %rsp + xorl %ecx, %ecx + vmovdqu %xmm5, (%rsp) +L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_loop: + movzbl (%rdi,%rbx,1), %r13d + xorb (%rsp,%rcx,1), %r13b + movb %r13b, (%rsi,%rbx,1) + movb %r13b, (%rsp,%rcx,1) + incl %ebx + incl %ecx + cmpl %edx, %ebx + jl L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_loop + xorq %r13, %r13 + cmpl $16, %ecx + je L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_finish_enc 
+L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_byte_loop: + movb %r13b, (%rsp,%rcx,1) + incl %ecx + cmpl $16, %ecx + jl L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_byte_loop +L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_finish_enc: + vmovdqu (%rsp), %xmm5 + addq $16, %rsp + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5 + vpxor %xmm5, %xmm15, %xmm15 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm6, %xmm15, %xmm7 + vpclmulqdq $0x01, %xmm6, %xmm15, %xmm8 + vpclmulqdq $16, %xmm6, %xmm15, %xmm9 + vpclmulqdq $0x11, %xmm6, %xmm15, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm15 +L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_done: +L_AES_GCM_encrypt_vaes_done_enc: + movl %r9d, %edx + movl %r11d, %ecx + shlq $3, %rdx + shlq $3, %rcx + vmovq %rdx, %xmm0 + vmovq %rcx, %xmm1 + vpunpcklqdq %xmm1, %xmm0, %xmm0 + vpxor %xmm0, %xmm15, %xmm15 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm6, %xmm15, %xmm7 + vpclmulqdq $0x01, %xmm6, %xmm15, %xmm8 + vpclmulqdq $16, %xmm6, %xmm15, %xmm9 + vpclmulqdq $0x11, %xmm6, %xmm15, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm15 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm15, %xmm15 + vmovdqu 528(%rsp), %xmm0 + vpxor %xmm15, %xmm0, %xmm0 + cmpl $16, %r14d + je L_AES_GCM_encrypt_vaes_store_tag_16 + xorq %rcx, %rcx + vmovdqu %xmm0, (%rsp) +L_AES_GCM_encrypt_vaes_store_tag_loop: + movzbl (%rsp,%rcx,1), %r13d + 
movb %r13b, (%r8,%rcx,1) + incl %ecx + cmpl %r14d, %ecx + jne L_AES_GCM_encrypt_vaes_store_tag_loop + jmp L_AES_GCM_encrypt_vaes_store_tag_done +L_AES_GCM_encrypt_vaes_store_tag_16: + vmovdqu %xmm0, (%r8) +L_AES_GCM_encrypt_vaes_store_tag_done: + vzeroupper + addq $0x230, %rsp + popq %r15 + popq %r14 + popq %rbx + popq %r12 + popq %r13 + repz retq +#ifndef __APPLE__ +.size AES_GCM_encrypt_vaes,.-AES_GCM_encrypt_vaes +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_GCM_decrypt_vaes +.type AES_GCM_decrypt_vaes,@function +.align 16 +AES_GCM_decrypt_vaes: +#else +.section __TEXT,__text +.globl _AES_GCM_decrypt_vaes +.p2align 4 +_AES_GCM_decrypt_vaes: +#endif /* __APPLE__ */ + pushq %r13 + pushq %r12 + pushq %rbx + pushq %r14 + pushq %r15 + pushq %rbp + movq %rdx, %r12 + movq %rcx, %rax + movl 56(%rsp), %r11d + movl 64(%rsp), %ebx + movl 72(%rsp), %r14d + movq 80(%rsp), %r15 + movl 88(%rsp), %r10d + movq 96(%rsp), %rbp + subq $0x220, %rsp + vpxor %xmm5, %xmm5, %xmm5 + vpxor %xmm15, %xmm15, %xmm15 + cmpl $12, %ebx + movl %ebx, %edx + jne L_AES_GCM_decrypt_vaes_iv_not_12 + # # Calculate values when IV is 12 bytes + # Set counter based on IV + movl $0x1000000, %ecx + vmovq (%rax), %xmm5 + vpinsrd $2, 8(%rax), %xmm5, %xmm5 + vpinsrd $3, %ecx, %xmm5, %xmm5 + # H = Encrypt X(=0) and T = Encrypt counter + vmovdqa (%r15), %xmm6 + vpxor %xmm6, %xmm5, %xmm1 + vmovdqa 16(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 32(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 48(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 64(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 80(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 96(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 112(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 
128(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 144(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm4 + jl L_AES_GCM_decrypt_vaes_calc_iv_12_last + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 176(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm4 + jl L_AES_GCM_decrypt_vaes_calc_iv_12_last + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 208(%r15), %xmm4 + vaesenc %xmm4, %xmm6, %xmm6 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqa 224(%r15), %xmm4 +L_AES_GCM_decrypt_vaes_calc_iv_12_last: + vaesenclast %xmm4, %xmm6, %xmm6 + vaesenclast %xmm4, %xmm1, %xmm1 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm6, %xmm6 + vmovdqu %xmm1, 528(%rsp) + jmp L_AES_GCM_decrypt_vaes_iv_done +L_AES_GCM_decrypt_vaes_iv_not_12: + # Calculate values when IV is not 12 bytes + # H = Encrypt X(=0) + vmovdqa (%r15), %xmm6 + vaesenc 16(%r15), %xmm6, %xmm6 + vaesenc 32(%r15), %xmm6, %xmm6 + vaesenc 48(%r15), %xmm6, %xmm6 + vaesenc 64(%r15), %xmm6, %xmm6 + vaesenc 80(%r15), %xmm6, %xmm6 + vaesenc 96(%r15), %xmm6, %xmm6 + vaesenc 112(%r15), %xmm6, %xmm6 + vaesenc 128(%r15), %xmm6, %xmm6 + vaesenc 144(%r15), %xmm6, %xmm6 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm8 + jl L_AES_GCM_decrypt_vaes_calc_iv_1_aesenc_avx_last + vaesenc %xmm8, %xmm6, %xmm6 + vaesenc 176(%r15), %xmm6, %xmm6 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm8 + jl L_AES_GCM_decrypt_vaes_calc_iv_1_aesenc_avx_last + vaesenc %xmm8, %xmm6, %xmm6 + vaesenc 208(%r15), %xmm6, %xmm6 + vmovdqa 224(%r15), %xmm8 +L_AES_GCM_decrypt_vaes_calc_iv_1_aesenc_avx_last: + vaesenclast %xmm8, %xmm6, %xmm6 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm6, %xmm6 + # Calc counter + # Initialization vector + cmpl $0x00, %edx + movq $0x00, %rcx + je L_AES_GCM_decrypt_vaes_calc_iv_done + cmpl $16, %edx + jl L_AES_GCM_decrypt_vaes_calc_iv_lt16 + andl 
$0xfffffff0, %edx +L_AES_GCM_decrypt_vaes_calc_iv_16_loop: + vmovdqu (%rax,%rcx,1), %xmm7 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + vpxor %xmm7, %xmm5, %xmm5 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm5, %xmm1 + vpshufd $0x4e, %xmm6, %xmm2 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm3 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm0 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm6, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm4 + vmovdqa %xmm3, %xmm5 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm5, %xmm5 + vpsrld $31, %xmm4, %xmm0 + vpsrld $31, %xmm5, %xmm1 + vpslld $0x01, %xmm4, %xmm4 + vpslld $0x01, %xmm5, %xmm5 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm5, %xmm5 + vpor %xmm0, %xmm4, %xmm4 + vpor %xmm1, %xmm5, %xmm5 + vpslld $31, %xmm4, %xmm0 + vpslld $30, %xmm4, %xmm1 + vpslld $25, %xmm4, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm3 + vpsrld $7, %xmm4, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_decrypt_vaes_calc_iv_16_loop + movl %ebx, %edx + cmpl %edx, %ecx + je L_AES_GCM_decrypt_vaes_calc_iv_done +L_AES_GCM_decrypt_vaes_calc_iv_lt16: + subq $16, %rsp + vpxor %xmm7, %xmm7, %xmm7 + xorl %ebx, %ebx + vmovdqu %xmm7, (%rsp) +L_AES_GCM_decrypt_vaes_calc_iv_loop: + movzbl (%rax,%rcx,1), %r13d + movb %r13b, (%rsp,%rbx,1) + incl %ecx + incl %ebx + cmpl %edx, %ecx + jl L_AES_GCM_decrypt_vaes_calc_iv_loop + vmovdqu (%rsp), %xmm7 + addq $16, %rsp + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + vpxor %xmm7, %xmm5, %xmm5 + # ghash_gfmul_avx + vpshufd 
$0x4e, %xmm5, %xmm1 + vpshufd $0x4e, %xmm6, %xmm2 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm3 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm0 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm6, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm4 + vmovdqa %xmm3, %xmm5 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm5, %xmm5 + vpsrld $31, %xmm4, %xmm0 + vpsrld $31, %xmm5, %xmm1 + vpslld $0x01, %xmm4, %xmm4 + vpslld $0x01, %xmm5, %xmm5 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm5, %xmm5 + vpor %xmm0, %xmm4, %xmm4 + vpor %xmm1, %xmm5, %xmm5 + vpslld $31, %xmm4, %xmm0 + vpslld $30, %xmm4, %xmm1 + vpslld $25, %xmm4, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm3 + vpsrld $7, %xmm4, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 +L_AES_GCM_decrypt_vaes_calc_iv_done: + # T = Encrypt counter + vpxor %xmm0, %xmm0, %xmm0 + shll $3, %edx + vmovq %rdx, %xmm0 + vpxor %xmm0, %xmm5, %xmm5 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm5, %xmm1 + vpshufd $0x4e, %xmm6, %xmm2 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm3 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm0 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm6, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm4 + vmovdqa %xmm3, %xmm5 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm5, %xmm5 + vpsrld $31, %xmm4, %xmm0 + vpsrld $31, %xmm5, %xmm1 + vpslld $0x01, %xmm4, %xmm4 + vpslld $0x01, %xmm5, %xmm5 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor 
%xmm2, %xmm5, %xmm5 + vpor %xmm0, %xmm4, %xmm4 + vpor %xmm1, %xmm5, %xmm5 + vpslld $31, %xmm4, %xmm0 + vpslld $30, %xmm4, %xmm1 + vpslld $25, %xmm4, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm3 + vpsrld $7, %xmm4, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5 + # Encrypt counter + vmovdqa (%r15), %xmm7 + vpxor %xmm5, %xmm7, %xmm7 + vaesenc 16(%r15), %xmm7, %xmm7 + vaesenc 32(%r15), %xmm7, %xmm7 + vaesenc 48(%r15), %xmm7, %xmm7 + vaesenc 64(%r15), %xmm7, %xmm7 + vaesenc 80(%r15), %xmm7, %xmm7 + vaesenc 96(%r15), %xmm7, %xmm7 + vaesenc 112(%r15), %xmm7, %xmm7 + vaesenc 128(%r15), %xmm7, %xmm7 + vaesenc 144(%r15), %xmm7, %xmm7 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm8 + jl L_AES_GCM_decrypt_vaes_calc_iv_2_aesenc_avx_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 176(%r15), %xmm7, %xmm7 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm8 + jl L_AES_GCM_decrypt_vaes_calc_iv_2_aesenc_avx_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 208(%r15), %xmm7, %xmm7 + vmovdqa 224(%r15), %xmm8 +L_AES_GCM_decrypt_vaes_calc_iv_2_aesenc_avx_last: + vaesenclast %xmm8, %xmm7, %xmm7 + vmovdqu %xmm7, 528(%rsp) +L_AES_GCM_decrypt_vaes_iv_done: + # Additional authentication data + movl %r11d, %edx + cmpl $0x00, %edx + je L_AES_GCM_decrypt_vaes_calc_aad_done + xorl %ecx, %ecx + cmpl $16, %edx + jl L_AES_GCM_decrypt_vaes_calc_aad_lt16 + andl $0xfffffff0, %edx +L_AES_GCM_decrypt_vaes_calc_aad_16_loop: + vmovdqu (%r12,%rcx,1), %xmm7 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + vpxor %xmm7, %xmm15, %xmm15 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm15, %xmm1 + vpshufd $0x4e, %xmm6, %xmm2 + vpclmulqdq $0x11, %xmm15, %xmm6, %xmm3 + vpclmulqdq $0x00, %xmm15, %xmm6, 
%xmm0 + vpxor %xmm15, %xmm1, %xmm1 + vpxor %xmm6, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm4 + vmovdqa %xmm3, %xmm15 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm15, %xmm15 + vpsrld $31, %xmm4, %xmm0 + vpsrld $31, %xmm15, %xmm1 + vpslld $0x01, %xmm4, %xmm4 + vpslld $0x01, %xmm15, %xmm15 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm15, %xmm15 + vpor %xmm0, %xmm4, %xmm4 + vpor %xmm1, %xmm15, %xmm15 + vpslld $31, %xmm4, %xmm0 + vpslld $30, %xmm4, %xmm1 + vpslld $25, %xmm4, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm3 + vpsrld $7, %xmm4, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm2, %xmm15, %xmm15 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_decrypt_vaes_calc_aad_16_loop + movl %r11d, %edx + cmpl %edx, %ecx + je L_AES_GCM_decrypt_vaes_calc_aad_done +L_AES_GCM_decrypt_vaes_calc_aad_lt16: + subq $16, %rsp + vpxor %xmm7, %xmm7, %xmm7 + xorl %ebx, %ebx + vmovdqu %xmm7, (%rsp) +L_AES_GCM_decrypt_vaes_calc_aad_loop: + movzbl (%r12,%rcx,1), %r13d + movb %r13b, (%rsp,%rbx,1) + incl %ecx + incl %ebx + cmpl %edx, %ecx + jl L_AES_GCM_decrypt_vaes_calc_aad_loop + vmovdqu (%rsp), %xmm7 + addq $16, %rsp + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + vpxor %xmm7, %xmm15, %xmm15 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm15, %xmm1 + vpshufd $0x4e, %xmm6, %xmm2 + vpclmulqdq $0x11, %xmm15, %xmm6, %xmm3 + vpclmulqdq $0x00, %xmm15, %xmm6, %xmm0 + vpxor %xmm15, %xmm1, %xmm1 + vpxor %xmm6, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm4 + 
vmovdqa %xmm3, %xmm15 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm15, %xmm15 + vpsrld $31, %xmm4, %xmm0 + vpsrld $31, %xmm15, %xmm1 + vpslld $0x01, %xmm4, %xmm4 + vpslld $0x01, %xmm15, %xmm15 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm15, %xmm15 + vpor %xmm0, %xmm4, %xmm4 + vpor %xmm1, %xmm15, %xmm15 + vpslld $31, %xmm4, %xmm0 + vpslld $30, %xmm4, %xmm1 + vpslld $25, %xmm4, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm3 + vpsrld $7, %xmm4, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm2, %xmm15, %xmm15 +L_AES_GCM_decrypt_vaes_calc_aad_done: + # Calculate counter and H + vpsrlq $63, %xmm6, %xmm8 + vpsllq $0x01, %xmm6, %xmm7 + vpslldq $8, %xmm8, %xmm8 + vpor %xmm8, %xmm7, %xmm7 + vpshufd $0xff, %xmm6, %xmm6 + vpsrad $31, %xmm6, %xmm6 + vpshufb L_vaes_aes_gcm_bswap_epi64(%rip), %xmm5, %xmm5 + vpand L_vaes_aes_gcm_mod2_128(%rip), %xmm6, %xmm6 + vpaddd L_vaes_aes_gcm_one(%rip), %xmm5, %xmm5 + vpxor %xmm7, %xmm6, %xmm6 + vmovdqu %xmm5, 512(%rsp) + xorl %ebx, %ebx + cmpl $0x80, %r9d + jl L_AES_GCM_decrypt_vaes_done_128 + vmovdqa %xmm15, %xmm2 + # H ^ 1 + vmovdqu %xmm6, (%rsp) + # H ^ 2 + vpclmulqdq $0x00, %xmm6, %xmm6, %xmm7 + vpclmulqdq $0x11, %xmm6, %xmm6, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm0 + vmovdqu %xmm0, 16(%rsp) + # H ^ 3 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm6, 
%xmm0, %xmm7 + vpclmulqdq $0x01, %xmm6, %xmm0, %xmm8 + vpclmulqdq $16, %xmm6, %xmm0, %xmm9 + vpclmulqdq $0x11, %xmm6, %xmm0, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm1 + vmovdqu %xmm1, 32(%rsp) + # H ^ 4 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm3 + vmovdqu %xmm3, 48(%rsp) + # H ^ 5 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 64(%rsp) + # H ^ 6 + vpclmulqdq $0x00, %xmm1, %xmm1, %xmm7 + vpclmulqdq $0x11, %xmm1, %xmm1, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + 
vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 80(%rsp) + # H ^ 7 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm1, %xmm3, %xmm7 + vpclmulqdq $0x01, %xmm1, %xmm3, %xmm8 + vpclmulqdq $16, %xmm1, %xmm3, %xmm9 + vpclmulqdq $0x11, %xmm1, %xmm3, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 96(%rsp) + # H ^ 8 + vpclmulqdq $0x00, %xmm3, %xmm3, %xmm7 + vpclmulqdq $0x11, %xmm3, %xmm3, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 112(%rsp) + cmpl $0x100, %r9d + jl L_AES_GCM_decrypt_vaes_no_ext + # H ^ 9 + vmovdqu 48(%rsp), %xmm0 + vmovdqu 64(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 128(%rsp) + # H ^ 10 + vmovdqu 64(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq 
$0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 144(%rsp) + # H ^ 11 + vmovdqu 64(%rsp), %xmm0 + vmovdqu 80(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 160(%rsp) + # H ^ 12 + vmovdqu 80(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 176(%rsp) + # H ^ 13 + vmovdqu 80(%rsp), %xmm0 + vmovdqu 96(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, 
%xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 192(%rsp) + # H ^ 14 + vmovdqu 96(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 208(%rsp) + # H ^ 15 + vmovdqu 96(%rsp), %xmm0 + vmovdqu 112(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 224(%rsp) + # H ^ 16 + vmovdqu 112(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 240(%rsp) + vmovdqu 224(%rsp), %ymm7 + vpermq $0x4e, %ymm7, %ymm7 + vmovdqu 192(%rsp), %ymm8 + vpermq $0x4e, %ymm8, %ymm8 + vmovdqu 160(%rsp), %ymm9 + vpermq $0x4e, %ymm9, %ymm9 + vmovdqu 128(%rsp), %ymm10 + vpermq $0x4e, %ymm10, %ymm10 + vmovdqu %ymm7, 256(%rsp) + vmovdqu %ymm8, 288(%rsp) + vmovdqu 
%ymm9, 320(%rsp) + vmovdqu %ymm10, 352(%rsp) + vmovdqu 96(%rsp), %ymm7 + vpermq $0x4e, %ymm7, %ymm7 + vmovdqu 64(%rsp), %ymm8 + vpermq $0x4e, %ymm8, %ymm8 + vmovdqu 32(%rsp), %ymm9 + vpermq $0x4e, %ymm9, %ymm9 + vmovdqu (%rsp), %ymm10 + vpermq $0x4e, %ymm10, %ymm10 + vmovdqu %ymm7, 384(%rsp) + vmovdqu %ymm8, 416(%rsp) + vmovdqu %ymm9, 448(%rsp) + vmovdqu %ymm10, 480(%rsp) +L_AES_GCM_decrypt_vaes_no_ext: + vbroadcasti128 L_vaes_aes_gcm_mod2_128(%rip), %ymm14 + cmpl $0x100, %r9d + jl L_AES_GCM_decrypt_vaes_after_256 + movl %r9d, %r13d + andl $0xffffff00, %r13d +L_AES_GCM_decrypt_vaes_loop_256: + # 256 bytes of input + leaq (%rdi,%rbx,1), %rax + vbroadcasti128 L_vaes_aes_gcm_bswap_mask(%rip), %ymm6 + vpxor %ymm4, %ymm4, %ymm4 + vinserti128 $0x00, %xmm15, %ymm4, %ymm4 + vmovdqu 256(%rsp), %ymm7 + vmovdqu 288(%rsp), %ymm8 + vmovdqu 320(%rsp), %ymm9 + vmovdqu 352(%rsp), %ymm10 + vmovdqu (%rax), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpxor %ymm4, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm7, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm7, %ymm5, %ymm1 + vpclmulqdq $16, %ymm7, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm7, %ymm5, %ymm3 + vmovdqa %ymm0, %ymm11 + vpxor %ymm1, %ymm2, %ymm12 + vmovdqa %ymm3, %ymm13 + vmovdqu 32(%rax), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm8, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm8, %ymm5, %ymm1 + vpclmulqdq $16, %ymm8, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm8, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 64(%rax), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm9, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm9, %ymm5, %ymm1 + vpclmulqdq $16, %ymm9, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm9, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 96(%rax), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm10, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm10, 
%ymm5, %ymm1 + vpclmulqdq $16, %ymm10, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm10, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 384(%rsp), %ymm7 + vmovdqu 416(%rsp), %ymm8 + vmovdqu 448(%rsp), %ymm9 + vmovdqu 480(%rsp), %ymm10 + vmovdqu 128(%rax), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm7, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm7, %ymm5, %ymm1 + vpclmulqdq $16, %ymm7, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm7, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 160(%rax), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm8, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm8, %ymm5, %ymm1 + vpclmulqdq $16, %ymm8, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm8, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 192(%rax), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm9, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm9, %ymm5, %ymm1 + vpclmulqdq $16, %ymm9, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm9, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 224(%rax), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm10, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm10, %ymm5, %ymm1 + vpclmulqdq $16, %ymm10, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm10, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vpclmulqdq $0x01, %ymm11, %ymm14, %ymm5 + vpshufd $0x4e, %ymm11, %ymm11 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm11, %ymm12, %ymm12 + vpclmulqdq $0x01, %ymm12, %ymm14, %ymm5 + vpshufd $0x4e, %ymm12, %ymm12 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm12, %ymm13, %ymm13 + vextracti128 $0x01, %ymm13, %xmm0 + vpxor %xmm0, %xmm13, 
%xmm15 + vbroadcasti128 L_vaes_aes_gcm_bswap_epi64(%rip), %ymm6 + vbroadcasti128 512(%rsp), %ymm4 + vpaddd L_vaes_aes_gcm_inc_y0(%rip), %ymm4, %ymm0 + vpshufb %ymm6, %ymm0, %ymm0 + vpaddd L_vaes_aes_gcm_inc_y1(%rip), %ymm4, %ymm1 + vpshufb %ymm6, %ymm1, %ymm1 + vpaddd L_vaes_aes_gcm_inc_y2(%rip), %ymm4, %ymm2 + vpshufb %ymm6, %ymm2, %ymm2 + vpaddd L_vaes_aes_gcm_inc_y3(%rip), %ymm4, %ymm3 + vpshufb %ymm6, %ymm3, %ymm3 + vmovdqu 512(%rsp), %xmm7 + vpaddd L_vaes_aes_gcm_eight(%rip), %xmm7, %xmm7 + vmovdqu %xmm7, 512(%rsp) + vbroadcasti128 (%r15), %ymm4 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm4, %ymm2, %ymm2 + vpxor %ymm4, %ymm3, %ymm3 + vbroadcasti128 16(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 32(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 48(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 64(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 80(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 96(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 112(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 128(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 144(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc 
%ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $11, %r10d + vbroadcasti128 160(%r15), %ymm4 + jl L_AES_GCM_decrypt_vaes_l1_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 176(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $13, %r10d + vbroadcasti128 192(%r15), %ymm4 + jl L_AES_GCM_decrypt_vaes_l1_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 208(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 224(%r15), %ymm4 +L_AES_GCM_decrypt_vaes_l1_vaes_ctr8_last: + vaesenclast %ymm4, %ymm0, %ymm0 + vaesenclast %ymm4, %ymm1, %ymm1 + vaesenclast %ymm4, %ymm2, %ymm2 + vaesenclast %ymm4, %ymm3, %ymm3 + leaq (%rdi,%rbx,1), %rcx + leaq (%rsi,%rbx,1), %rdx + vmovdqu (%rcx), %ymm5 + vpxor %ymm5, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vmovdqu 32(%rcx), %ymm5 + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vmovdqu 64(%rcx), %ymm5 + vpxor %ymm5, %ymm2, %ymm2 + vmovdqu %ymm2, 64(%rdx) + vmovdqu 96(%rcx), %ymm5 + vpxor %ymm5, %ymm3, %ymm3 + vmovdqu %ymm3, 96(%rdx) + addl $0x80, %ebx + vbroadcasti128 512(%rsp), %ymm4 + vpaddd L_vaes_aes_gcm_inc_y0(%rip), %ymm4, %ymm0 + vpshufb %ymm6, %ymm0, %ymm0 + vpaddd L_vaes_aes_gcm_inc_y1(%rip), %ymm4, %ymm1 + vpshufb %ymm6, %ymm1, %ymm1 + vpaddd L_vaes_aes_gcm_inc_y2(%rip), %ymm4, %ymm2 + vpshufb %ymm6, %ymm2, %ymm2 + vpaddd L_vaes_aes_gcm_inc_y3(%rip), %ymm4, %ymm3 + vpshufb %ymm6, %ymm3, %ymm3 + vmovdqu 512(%rsp), %xmm7 + vpaddd L_vaes_aes_gcm_eight(%rip), %xmm7, %xmm7 + vmovdqu %xmm7, 512(%rsp) + vbroadcasti128 (%r15), %ymm4 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm4, %ymm2, %ymm2 + vpxor %ymm4, %ymm3, 
%ymm3 + vbroadcasti128 16(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 32(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 48(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 64(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 80(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 96(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 112(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 128(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 144(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $11, %r10d + vbroadcasti128 160(%r15), %ymm4 + jl L_AES_GCM_decrypt_vaes_l2_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 176(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $13, %r10d + vbroadcasti128 192(%r15), %ymm4 + jl L_AES_GCM_decrypt_vaes_l2_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 
208(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 224(%r15), %ymm4 +L_AES_GCM_decrypt_vaes_l2_vaes_ctr8_last: + vaesenclast %ymm4, %ymm0, %ymm0 + vaesenclast %ymm4, %ymm1, %ymm1 + vaesenclast %ymm4, %ymm2, %ymm2 + vaesenclast %ymm4, %ymm3, %ymm3 + leaq (%rdi,%rbx,1), %rcx + leaq (%rsi,%rbx,1), %rdx + vmovdqu (%rcx), %ymm5 + vpxor %ymm5, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vmovdqu 32(%rcx), %ymm5 + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vmovdqu 64(%rcx), %ymm5 + vpxor %ymm5, %ymm2, %ymm2 + vmovdqu %ymm2, 64(%rdx) + vmovdqu 96(%rcx), %ymm5 + vpxor %ymm5, %ymm3, %ymm3 + vmovdqu %ymm3, 96(%rdx) + addl $0x80, %ebx + cmpl %r13d, %ebx + jl L_AES_GCM_decrypt_vaes_loop_256 +L_AES_GCM_decrypt_vaes_after_256: + vmovdqu 96(%rsp), %ymm7 + vpermq $0x4e, %ymm7, %ymm7 + vmovdqu 64(%rsp), %ymm8 + vpermq $0x4e, %ymm8, %ymm8 + vmovdqu 32(%rsp), %ymm9 + vpermq $0x4e, %ymm9, %ymm9 + vmovdqu (%rsp), %ymm10 + vpermq $0x4e, %ymm10, %ymm10 + movl %r9d, %r13d + andl $0xffffff80, %r13d + cmpl %r13d, %ebx + jge L_AES_GCM_decrypt_vaes_after_128 + # 128 bytes of input + leaq (%rdi,%rbx,1), %rax + vbroadcasti128 L_vaes_aes_gcm_bswap_mask(%rip), %ymm6 + vpxor %ymm4, %ymm4, %ymm4 + vinserti128 $0x00, %xmm15, %ymm4, %ymm4 + vmovdqu (%rax), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpxor %ymm4, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm7, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm7, %ymm5, %ymm1 + vpclmulqdq $16, %ymm7, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm7, %ymm5, %ymm3 + vmovdqa %ymm0, %ymm11 + vpxor %ymm1, %ymm2, %ymm12 + vmovdqa %ymm3, %ymm13 + vmovdqu 32(%rax), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm8, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm8, %ymm5, %ymm1 + vpclmulqdq $16, %ymm8, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm8, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 
64(%rax), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm9, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm9, %ymm5, %ymm1 + vpclmulqdq $16, %ymm9, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm9, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 96(%rax), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm10, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm10, %ymm5, %ymm1 + vpclmulqdq $16, %ymm10, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm10, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vpclmulqdq $0x01, %ymm11, %ymm14, %ymm5 + vpshufd $0x4e, %ymm11, %ymm11 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm11, %ymm12, %ymm12 + vpclmulqdq $0x01, %ymm12, %ymm14, %ymm5 + vpshufd $0x4e, %ymm12, %ymm12 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm12, %ymm13, %ymm13 + vextracti128 $0x01, %ymm13, %xmm0 + vpxor %xmm0, %xmm13, %xmm15 + vbroadcasti128 L_vaes_aes_gcm_bswap_epi64(%rip), %ymm6 + vbroadcasti128 512(%rsp), %ymm4 + vpaddd L_vaes_aes_gcm_inc_y0(%rip), %ymm4, %ymm0 + vpshufb %ymm6, %ymm0, %ymm0 + vpaddd L_vaes_aes_gcm_inc_y1(%rip), %ymm4, %ymm1 + vpshufb %ymm6, %ymm1, %ymm1 + vpaddd L_vaes_aes_gcm_inc_y2(%rip), %ymm4, %ymm2 + vpshufb %ymm6, %ymm2, %ymm2 + vpaddd L_vaes_aes_gcm_inc_y3(%rip), %ymm4, %ymm3 + vpshufb %ymm6, %ymm3, %ymm3 + vmovdqu 512(%rsp), %xmm7 + vpaddd L_vaes_aes_gcm_eight(%rip), %xmm7, %xmm7 + vmovdqu %xmm7, 512(%rsp) + vbroadcasti128 (%r15), %ymm4 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm4, %ymm2, %ymm2 + vpxor %ymm4, %ymm3, %ymm3 + vbroadcasti128 16(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 32(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 48(%r15), %ymm4 
+ vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 64(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 80(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 96(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 112(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 128(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 144(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $11, %r10d + vbroadcasti128 160(%r15), %ymm4 + jl L_AES_GCM_decrypt_vaes_t_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 176(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $13, %r10d + vbroadcasti128 192(%r15), %ymm4 + jl L_AES_GCM_decrypt_vaes_t_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 208(%r15), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 224(%r15), %ymm4 +L_AES_GCM_decrypt_vaes_t_vaes_ctr8_last: + vaesenclast %ymm4, %ymm0, %ymm0 + vaesenclast %ymm4, %ymm1, %ymm1 + vaesenclast %ymm4, %ymm2, %ymm2 + vaesenclast %ymm4, %ymm3, %ymm3 + 
leaq (%rdi,%rbx,1), %rcx + leaq (%rsi,%rbx,1), %rdx + vmovdqu (%rcx), %ymm5 + vpxor %ymm5, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vmovdqu 32(%rcx), %ymm5 + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vmovdqu 64(%rcx), %ymm5 + vpxor %ymm5, %ymm2, %ymm2 + vmovdqu %ymm2, 64(%rdx) + vmovdqu 96(%rcx), %ymm5 + vpxor %ymm5, %ymm3, %ymm3 + vmovdqu %ymm3, 96(%rdx) + addl $0x80, %ebx +L_AES_GCM_decrypt_vaes_after_128: + vmovdqu (%rsp), %xmm6 +L_AES_GCM_decrypt_vaes_done_128: + movl %r9d, %edx + cmpl %edx, %ebx + jge L_AES_GCM_decrypt_vaes_done_dec + movl %r9d, %r13d + andl $0xfffffff0, %r13d + cmpl %r13d, %ebx + jge L_AES_GCM_decrypt_vaes_last_block_done +L_AES_GCM_decrypt_vaes_last_block_start: + vmovdqu (%rdi,%rbx,1), %xmm12 + vmovdqa %xmm6, %xmm0 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm12, %xmm1 + vpxor %xmm15, %xmm1, %xmm1 + vmovdqu 512(%rsp), %xmm8 + vpshufb L_vaes_aes_gcm_bswap_epi64(%rip), %xmm8, %xmm7 + vpaddd L_vaes_aes_gcm_one(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, 512(%rsp) + vpxor (%r15), %xmm7, %xmm7 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vaesenc 16(%r15), %xmm7, %xmm7 + vaesenc 32(%r15), %xmm7, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm10 + vaesenc 48(%r15), %xmm7, %xmm7 + vaesenc 64(%r15), %xmm7, %xmm7 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm11 + vaesenc 80(%r15), %xmm7, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm1 + vaesenc 96(%r15), %xmm7, %xmm7 + vpxor %xmm10, %xmm9, %xmm9 + vpslldq $8, %xmm9, %xmm2 + vpsrldq $8, %xmm9, %xmm9 + vaesenc 112(%r15), %xmm7, %xmm7 + vpxor %xmm11, %xmm2, %xmm2 + vpxor %xmm9, %xmm1, %xmm3 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm0 + vpclmulqdq $16, %xmm0, %xmm2, %xmm10 + vaesenc 128(%r15), %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm9 + vpxor %xmm10, %xmm9, %xmm9 + vpclmulqdq $16, %xmm0, %xmm9, %xmm10 + vaesenc 144(%r15), %xmm7, %xmm7 + vpshufd $0x4e, %xmm9, %xmm9 + vpxor %xmm10, %xmm9, %xmm9 + vpxor %xmm3, %xmm9, %xmm15 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm8 + jl L_AES_GCM_decrypt_vaes_aesenc_gfmul_last + 
vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 176(%r15), %xmm7, %xmm7 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm8 + jl L_AES_GCM_decrypt_vaes_aesenc_gfmul_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 208(%r15), %xmm7, %xmm7 + vmovdqa 224(%r15), %xmm8 +L_AES_GCM_decrypt_vaes_aesenc_gfmul_last: + vaesenclast %xmm8, %xmm7, %xmm7 + vmovdqa %xmm12, %xmm0 + vpxor %xmm0, %xmm7, %xmm7 + vmovdqu %xmm7, (%rsi,%rbx,1) + addl $16, %ebx + cmpl %r13d, %ebx + jl L_AES_GCM_decrypt_vaes_last_block_start +L_AES_GCM_decrypt_vaes_last_block_done: + movl %r9d, %ecx + movl %ecx, %edx + andl $15, %ecx + jz L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_done + vmovdqu 512(%rsp), %xmm5 + vpshufb L_vaes_aes_gcm_bswap_epi64(%rip), %xmm5, %xmm5 + vpxor (%r15), %xmm5, %xmm5 + vaesenc 16(%r15), %xmm5, %xmm5 + vaesenc 32(%r15), %xmm5, %xmm5 + vaesenc 48(%r15), %xmm5, %xmm5 + vaesenc 64(%r15), %xmm5, %xmm5 + vaesenc 80(%r15), %xmm5, %xmm5 + vaesenc 96(%r15), %xmm5, %xmm5 + vaesenc 112(%r15), %xmm5, %xmm5 + vaesenc 128(%r15), %xmm5, %xmm5 + vaesenc 144(%r15), %xmm5, %xmm5 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm8 + jl L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_aesenc_avx_last + vaesenc %xmm8, %xmm5, %xmm5 + vaesenc 176(%r15), %xmm5, %xmm5 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm8 + jl L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_aesenc_avx_last + vaesenc %xmm8, %xmm5, %xmm5 + vaesenc 208(%r15), %xmm5, %xmm5 + vmovdqa 224(%r15), %xmm8 +L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_aesenc_avx_last: + vaesenclast %xmm8, %xmm5, %xmm5 + subq $32, %rsp + xorl %ecx, %ecx + vmovdqu %xmm5, (%rsp) + vpxor %xmm0, %xmm0, %xmm0 + vmovdqu %xmm0, 16(%rsp) +L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_loop: + movzbl (%rdi,%rbx,1), %r13d + movb %r13b, 16(%rsp,%rcx,1) + xorb (%rsp,%rcx,1), %r13b + movb %r13b, (%rsi,%rbx,1) + incl %ebx + incl %ecx + cmpl %edx, %ebx + jl L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_loop + vmovdqu 16(%rsp), %xmm5 + addq $32, %rsp + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5 + 
vpxor %xmm5, %xmm15, %xmm15 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm6, %xmm15, %xmm7 + vpclmulqdq $0x01, %xmm6, %xmm15, %xmm8 + vpclmulqdq $16, %xmm6, %xmm15, %xmm9 + vpclmulqdq $0x11, %xmm6, %xmm15, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm15 +L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_done: +L_AES_GCM_decrypt_vaes_done_dec: + movl %r9d, %edx + movl %r11d, %ecx + shlq $3, %rdx + shlq $3, %rcx + vmovq %rdx, %xmm0 + vmovq %rcx, %xmm1 + vpunpcklqdq %xmm1, %xmm0, %xmm0 + vpxor %xmm0, %xmm15, %xmm15 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm6, %xmm15, %xmm7 + vpclmulqdq $0x01, %xmm6, %xmm15, %xmm8 + vpclmulqdq $16, %xmm6, %xmm15, %xmm9 + vpclmulqdq $0x11, %xmm6, %xmm15, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm15 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm15, %xmm15 + vmovdqu 528(%rsp), %xmm0 + vpxor %xmm15, %xmm0, %xmm0 + cmpl $16, %r14d + je L_AES_GCM_decrypt_vaes_cmp_tag_16 + subq $16, %rsp + xorq %rcx, %rcx + xorq %rbx, %rbx + vmovdqu %xmm0, (%rsp) +L_AES_GCM_decrypt_vaes_cmp_tag_loop: + movzbl (%rsp,%rcx,1), %r13d + xorb (%r8,%rcx,1), %r13b + orb %r13b, %bl + incl %ecx + cmpl %r14d, %ecx + jne L_AES_GCM_decrypt_vaes_cmp_tag_loop + cmpb $0x00, %bl + sete %bl + addq $16, %rsp + xorq %rcx, %rcx + jmp L_AES_GCM_decrypt_vaes_cmp_tag_done +L_AES_GCM_decrypt_vaes_cmp_tag_16: + vmovdqu (%r8), %xmm1 + vpcmpeqb %xmm1, 
%xmm0, %xmm0 + vpmovmskb %xmm0, %rdx + # %%edx == 0xFFFF then return 1 else => return 0 + xorl %ebx, %ebx + cmpl $0xffff, %edx + sete %bl +L_AES_GCM_decrypt_vaes_cmp_tag_done: + movl %ebx, (%rbp) + vzeroupper + addq $0x220, %rsp + popq %rbp + popq %r15 + popq %r14 + popq %rbx + popq %r12 + popq %r13 + repz retq +#ifndef __APPLE__ +.size AES_GCM_decrypt_vaes,.-AES_GCM_decrypt_vaes +#endif /* __APPLE__ */ +#ifdef WOLFSSL_AESGCM_STREAM +#ifndef __APPLE__ +.text +.globl AES_GCM_init_vaes +.type AES_GCM_init_vaes,@function +.align 16 +AES_GCM_init_vaes: +#else +.section __TEXT,__text +.globl _AES_GCM_init_vaes +.p2align 4 +_AES_GCM_init_vaes: +#endif /* __APPLE__ */ + /* AES_GCM_init_vaes: derive the GHASH key H, the encrypted initial counter block T and the starting counter from the IV. As read by the code below: %rdi = round keys (16-byte aligned loads), %esi = round count (compared against 11 and 13 to pick the last round key), %rdx = IV bytes, %ecx = IV length in bytes, %r8 = output for the byte-reversed H, %r9 = output for the counter, and the first stack argument (24(%rsp) after the two pushes) = output for T. The three outputs are written with aligned stores. */ pushq %r12 + pushq %r13 + movq %rdx, %r10 + movl %ecx, %r11d + movq 24(%rsp), %rax + subq $16, %rsp + vpxor %xmm4, %xmm4, %xmm4 + movl %r11d, %edx + cmpl $12, %edx + jne L_AES_GCM_init_vaes_iv_not_12 + # # Calculate values when IV is 12 bytes + # Set counter based on IV + /* dword 3 = 0x01000000, which is the byte sequence 00 00 00 01 in memory: a 32-bit counter of 1 placed after the 12 IV bytes */ movl $0x1000000, %ecx + vmovq (%r10), %xmm4 + vpinsrd $2, 8(%r10), %xmm4, %xmm4 + vpinsrd $3, %ecx, %xmm4, %xmm4 + # H = Encrypt X(=0) and T = Encrypt counter + /* xmm5 starts as rk[0] (the all-zero block XOR rk[0]) and xmm1 as counter XOR rk[0]; each later round key is loaded once into xmm6 and applied to both blocks */ vmovdqa (%rdi), %xmm5 + vpxor %xmm5, %xmm4, %xmm1 + vmovdqa 16(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 32(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 48(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 64(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 80(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 96(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 112(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 128(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 144(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + cmpl $11, %esi + vmovdqa 160(%rdi), %xmm6 + jl 
L_AES_GCM_init_vaes_calc_iv_12_last + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 176(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + cmpl $13, %esi + vmovdqa 192(%rdi), %xmm6 + jl L_AES_GCM_init_vaes_calc_iv_12_last + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 208(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 224(%rdi), %xmm6 +L_AES_GCM_init_vaes_calc_iv_12_last: + /* xmm6 holds the final round key for whichever round count was selected above */ vaesenclast %xmm6, %xmm5, %xmm5 + vaesenclast %xmm6, %xmm1, %xmm1 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5 + vmovdqu %xmm1, %xmm15 + jmp L_AES_GCM_init_vaes_iv_done +L_AES_GCM_init_vaes_iv_not_12: + # Calculate values when IV is not 12 bytes + # H = Encrypt X(=0) + vmovdqa (%rdi), %xmm5 + vaesenc 16(%rdi), %xmm5, %xmm5 + vaesenc 32(%rdi), %xmm5, %xmm5 + vaesenc 48(%rdi), %xmm5, %xmm5 + vaesenc 64(%rdi), %xmm5, %xmm5 + vaesenc 80(%rdi), %xmm5, %xmm5 + vaesenc 96(%rdi), %xmm5, %xmm5 + vaesenc 112(%rdi), %xmm5, %xmm5 + vaesenc 128(%rdi), %xmm5, %xmm5 + vaesenc 144(%rdi), %xmm5, %xmm5 + cmpl $11, %esi + vmovdqa 160(%rdi), %xmm8 + jl L_AES_GCM_init_vaes_calc_iv_1_aesenc_avx_last + vaesenc %xmm8, %xmm5, %xmm5 + vaesenc 176(%rdi), %xmm5, %xmm5 + cmpl $13, %esi + vmovdqa 192(%rdi), %xmm8 + jl L_AES_GCM_init_vaes_calc_iv_1_aesenc_avx_last + vaesenc %xmm8, %xmm5, %xmm5 + vaesenc 208(%rdi), %xmm5, %xmm5 + vmovdqa 224(%rdi), %xmm8 +L_AES_GCM_init_vaes_calc_iv_1_aesenc_avx_last: + vaesenclast %xmm8, %xmm5, %xmm5 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5 + # Calc counter + # Initialization vector + cmpl $0x00, %edx + movq $0x00, %rcx + je L_AES_GCM_init_vaes_calc_iv_done + cmpl $16, %edx + jl L_AES_GCM_init_vaes_calc_iv_lt16 + /* bound for the full-block loop: the length rounded down to a multiple of 16 */ andl $0xfffffff0, %edx +L_AES_GCM_init_vaes_calc_iv_16_loop: + vmovdqu (%r10,%rcx,1), %xmm7 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + vpxor %xmm7, %xmm4, %xmm4 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm4, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 
+ vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm6 + vmovdqa %xmm3, %xmm4 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm6, %xmm6 + vpxor %xmm1, %xmm4, %xmm4 + vpsrld $31, %xmm6, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm6, %xmm6 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm6, %xmm6 + vpor %xmm1, %xmm4, %xmm4 + vpslld $31, %xmm6, %xmm0 + vpslld $30, %xmm6, %xmm1 + vpslld $25, %xmm6, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm6, %xmm6 + vpsrld $0x01, %xmm6, %xmm2 + vpsrld $2, %xmm6, %xmm3 + vpsrld $7, %xmm6, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm6, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_init_vaes_calc_iv_16_loop + /* restore the full IV length; if the loop consumed all of it there is no partial block to hash */ movl %r11d, %edx + cmpl %edx, %ecx + je L_AES_GCM_init_vaes_calc_iv_done +L_AES_GCM_init_vaes_calc_iv_lt16: + /* copy the trailing IV bytes into a zeroed 16-byte stack buffer so the last block is zero-padded */ subq $16, %rsp + vpxor %xmm7, %xmm7, %xmm7 + xorl %r13d, %r13d + vmovdqu %xmm7, (%rsp) +L_AES_GCM_init_vaes_calc_iv_loop: + movzbl (%r10,%rcx,1), %r12d + movb %r12b, (%rsp,%r13,1) + incl %ecx + incl %r13d + cmpl %edx, %ecx + jl L_AES_GCM_init_vaes_calc_iv_loop + vmovdqu (%rsp), %xmm7 + addq $16, %rsp + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + vpxor %xmm7, %xmm4, %xmm4 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm4, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, 
%xmm1, %xmm1 + vmovdqa %xmm0, %xmm6 + vmovdqa %xmm3, %xmm4 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm6, %xmm6 + vpxor %xmm1, %xmm4, %xmm4 + vpsrld $31, %xmm6, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm6, %xmm6 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm6, %xmm6 + vpor %xmm1, %xmm4, %xmm4 + vpslld $31, %xmm6, %xmm0 + vpslld $30, %xmm6, %xmm1 + vpslld $25, %xmm6, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm6, %xmm6 + vpsrld $0x01, %xmm6, %xmm2 + vpsrld $2, %xmm6, %xmm3 + vpsrld $7, %xmm6, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm6, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 +L_AES_GCM_init_vaes_calc_iv_done: + # T = Encrypt counter + /* fold the IV length in bits (edx shifted left by 3) into the hash as a final block and multiply by H once more */ vpxor %xmm0, %xmm0, %xmm0 + shll $3, %edx + vmovq %rdx, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm4, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm6 + vmovdqa %xmm3, %xmm4 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm6, %xmm6 + vpxor %xmm1, %xmm4, %xmm4 + vpsrld $31, %xmm6, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm6, %xmm6 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm6, %xmm6 + vpor %xmm1, %xmm4, %xmm4 + vpslld $31, %xmm6, %xmm0 + vpslld $30, %xmm6, %xmm1 + vpslld $25, %xmm6, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq 
$12, %xmm0, %xmm0 + vpxor %xmm0, %xmm6, %xmm6 + vpsrld $0x01, %xmm6, %xmm2 + vpsrld $2, %xmm6, %xmm3 + vpsrld $7, %xmm6, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm6, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4 + # Encrypt counter + vmovdqa (%rdi), %xmm7 + vpxor %xmm4, %xmm7, %xmm7 + vaesenc 16(%rdi), %xmm7, %xmm7 + vaesenc 32(%rdi), %xmm7, %xmm7 + vaesenc 48(%rdi), %xmm7, %xmm7 + vaesenc 64(%rdi), %xmm7, %xmm7 + vaesenc 80(%rdi), %xmm7, %xmm7 + vaesenc 96(%rdi), %xmm7, %xmm7 + vaesenc 112(%rdi), %xmm7, %xmm7 + vaesenc 128(%rdi), %xmm7, %xmm7 + vaesenc 144(%rdi), %xmm7, %xmm7 + cmpl $11, %esi + vmovdqa 160(%rdi), %xmm8 + jl L_AES_GCM_init_vaes_calc_iv_2_aesenc_avx_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 176(%rdi), %xmm7, %xmm7 + cmpl $13, %esi + vmovdqa 192(%rdi), %xmm8 + jl L_AES_GCM_init_vaes_calc_iv_2_aesenc_avx_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 208(%rdi), %xmm7, %xmm7 + vmovdqa 224(%rdi), %xmm8 +L_AES_GCM_init_vaes_calc_iv_2_aesenc_avx_last: + vaesenclast %xmm8, %xmm7, %xmm7 + vmovdqu %xmm7, %xmm15 +L_AES_GCM_init_vaes_iv_done: + /* results: (%rax) receives the encrypted counter block from xmm15, (%r8) the byte-reversed H from xmm5, and (%r9) the counter after the bswap_epi64 shuffle and the L_vaes_aes_gcm_one add */ vmovdqa %xmm15, (%rax) + vpshufb L_vaes_aes_gcm_bswap_epi64(%rip), %xmm4, %xmm4 + vpaddd L_vaes_aes_gcm_one(%rip), %xmm4, %xmm4 + vmovdqa %xmm5, (%r8) + vmovdqa %xmm4, (%r9) + addq $16, %rsp + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size AES_GCM_init_vaes,.-AES_GCM_init_vaes +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_GCM_aad_update_vaes +.type AES_GCM_aad_update_vaes,@function +.align 16 +AES_GCM_aad_update_vaes: +#else +.section __TEXT,__text +.globl _AES_GCM_aad_update_vaes +.p2align 4 +_AES_GCM_aad_update_vaes: +#endif /* __APPLE__ */ + /* AES_GCM_aad_update_vaes: fold 16-byte blocks of data into a GHASH accumulator. As read by the code below: %rdi = data, %esi = byte count, %rdx = accumulator (aligned 16-byte load on entry, aligned store on exit), %rcx = multiplier (aligned load; presumably the hash key H, confirm against the caller). The byte count is only tested after a block has been processed, so at least one block is always read; presumably callers pass a non-zero multiple of 16, confirm. */ movq %rcx, %rax + vmovdqa (%rdx), %xmm5 + vmovdqa (%rax), %xmm6 + xorl %ecx, %ecx +L_AES_GCM_aad_update_vaes_16_loop: + /* byte-reverse the block with bswap_mask and XOR it into the accumulator before the multiply */ vmovdqu (%rdi,%rcx,1), %xmm7 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + vpxor %xmm7, %xmm5, %xmm5 + # 
ghash_gfmul_avx + vpshufd $0x4e, %xmm5, %xmm1 + vpshufd $0x4e, %xmm6, %xmm2 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm3 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm0 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm6, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm4 + vmovdqa %xmm3, %xmm5 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm5, %xmm5 + vpsrld $31, %xmm4, %xmm0 + vpsrld $31, %xmm5, %xmm1 + vpslld $0x01, %xmm4, %xmm4 + vpslld $0x01, %xmm5, %xmm5 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm5, %xmm5 + vpor %xmm0, %xmm4, %xmm4 + vpor %xmm1, %xmm5, %xmm5 + vpslld $31, %xmm4, %xmm0 + vpslld $30, %xmm4, %xmm1 + vpslld $25, %xmm4, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm3 + vpsrld $7, %xmm4, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + addl $16, %ecx + cmpl %esi, %ecx + jl L_AES_GCM_aad_update_vaes_16_loop + vmovdqa %xmm5, (%rdx) + repz retq +#ifndef __APPLE__ +.size AES_GCM_aad_update_vaes,.-AES_GCM_aad_update_vaes +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_GCM_encrypt_block_vaes +.type AES_GCM_encrypt_block_vaes,@function +.align 16 +AES_GCM_encrypt_block_vaes: +#else +.section __TEXT,__text +.globl _AES_GCM_encrypt_block_vaes +.p2align 4 +_AES_GCM_encrypt_block_vaes: +#endif /* __APPLE__ */ + /* AES_GCM_encrypt_block_vaes: encrypt the current counter block and XOR the result over one 16-byte block. As read by the code below: %rdi = round keys, %esi = round count (compared against 11 and 13 to pick the last round key), %rdx = 16-byte output, %rcx = 16-byte input, %r8 = counter block, which is loaded, advanced by L_vaes_aes_gcm_one and stored back (unaligned accesses). */ movq %rdx, %r10 + movq %rcx, %r11 + vmovdqu (%r8), %xmm1 + /* xmm0 = the counter as loaded (before the increment), shuffled with bswap_epi64 to form the cipher input */ vpshufb L_vaes_aes_gcm_bswap_epi64(%rip), %xmm1, %xmm0 + vpaddd L_vaes_aes_gcm_one(%rip), %xmm1, %xmm1 + vmovdqu %xmm1, (%r8) + vpxor (%rdi), %xmm0, %xmm0 + vaesenc 16(%rdi), %xmm0, %xmm0 + vaesenc 32(%rdi), %xmm0, %xmm0 + vaesenc 48(%rdi), %xmm0, 
%xmm0 + vaesenc 64(%rdi), %xmm0, %xmm0 + vaesenc 80(%rdi), %xmm0, %xmm0 + vaesenc 96(%rdi), %xmm0, %xmm0 + vaesenc 112(%rdi), %xmm0, %xmm0 + vaesenc 128(%rdi), %xmm0, %xmm0 + vaesenc 144(%rdi), %xmm0, %xmm0 + cmpl $11, %esi + vmovdqa 160(%rdi), %xmm1 + jl L_AES_GCM_encrypt_block_vaes_aesenc_block_last + vaesenc %xmm1, %xmm0, %xmm0 + vaesenc 176(%rdi), %xmm0, %xmm0 + cmpl $13, %esi + vmovdqa 192(%rdi), %xmm1 + jl L_AES_GCM_encrypt_block_vaes_aesenc_block_last + vaesenc %xmm1, %xmm0, %xmm0 + vaesenc 208(%rdi), %xmm0, %xmm0 + vmovdqa 224(%rdi), %xmm1 +L_AES_GCM_encrypt_block_vaes_aesenc_block_last: + vaesenclast %xmm1, %xmm0, %xmm0 + vmovdqu (%r11), %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vmovdqu %xmm0, (%r10) + /* the byte-reversed output block is left in xmm0; nothing after this point stores it */ vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 + vzeroupper + repz retq +#ifndef __APPLE__ +.size AES_GCM_encrypt_block_vaes,.-AES_GCM_encrypt_block_vaes +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_GCM_ghash_block_vaes +.type AES_GCM_ghash_block_vaes,@function +.align 16 +AES_GCM_ghash_block_vaes: +#else +.section __TEXT,__text +.globl _AES_GCM_ghash_block_vaes +.p2align 4 +_AES_GCM_ghash_block_vaes: +#endif /* __APPLE__ */ + /* AES_GCM_ghash_block_vaes: fold one 16-byte block into a GHASH accumulator. As read by the code: %rdi = data block (unaligned load), %rsi = accumulator (aligned load here, aligned store just before the return), %rdx = multiplier (aligned load; presumably the hash key H, confirm against the caller). */ vmovdqa (%rsi), %xmm4 + vmovdqa (%rdx), %xmm5 + vmovdqu (%rdi), %xmm7 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + vpxor %xmm7, %xmm4, %xmm4 + # ghash_gfmul_avx + /* three carry-less multiplies (high halves, low halves, and the XOR of each operand's halves) are combined into a 256-bit product held as xmm4 (high) and xmm6 (low) */ vpshufd $0x4e, %xmm4, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm6 + vmovdqa %xmm3, %xmm4 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm6, %xmm6 + vpxor %xmm1, %xmm4, %xmm4 + vpsrld $31, %xmm6, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm6, %xmm6 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, 
%xmm4, %xmm4 + vpor %xmm0, %xmm6, %xmm6 + vpor %xmm1, %xmm4, %xmm4 + vpslld $31, %xmm6, %xmm0 + vpslld $30, %xmm6, %xmm1 + vpslld $25, %xmm6, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm6, %xmm6 + vpsrld $0x01, %xmm6, %xmm2 + vpsrld $2, %xmm6, %xmm3 + vpsrld $7, %xmm6, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm6, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vmovdqa %xmm4, (%rsi) + vzeroupper + repz retq +#ifndef __APPLE__ +.size AES_GCM_ghash_block_vaes,.-AES_GCM_ghash_block_vaes +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_GCM_encrypt_update_vaes +.type AES_GCM_encrypt_update_vaes,@function +.align 16 +AES_GCM_encrypt_update_vaes: +#else +.section __TEXT,__text +.globl _AES_GCM_encrypt_update_vaes +.p2align 4 +_AES_GCM_encrypt_update_vaes: +#endif /* __APPLE__ */ + pushq %r13 + pushq %r12 + pushq %r14 + pushq %r15 + pushq %rbx + movq %rdx, %r10 + movq %rcx, %r11 + movq 48(%rsp), %rax + movq 56(%rsp), %r12 + subq $0x210, %rsp + vmovdqa (%r9), %xmm15 + vmovdqa (%rax), %xmm6 + vpsrlq $63, %xmm6, %xmm8 + vpsllq $0x01, %xmm6, %xmm7 + vpslldq $8, %xmm8, %xmm8 + vpor %xmm8, %xmm7, %xmm7 + vpshufd $0xff, %xmm6, %xmm6 + vpsrad $31, %xmm6, %xmm6 + vpand L_vaes_aes_gcm_mod2_128(%rip), %xmm6, %xmm6 + vpxor %xmm7, %xmm6, %xmm6 + xorl %r14d, %r14d + cmpl $0x80, %r8d + jl L_AES_GCM_encrypt_update_vaes_done_128 + vmovdqa %xmm15, %xmm2 + # H ^ 1 + vmovdqu %xmm6, (%rsp) + # H ^ 2 + vpclmulqdq $0x00, %xmm6, %xmm6, %xmm7 + vpclmulqdq $0x11, %xmm6, %xmm6, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + 
vmovdqa %xmm10, %xmm0 + vmovdqu %xmm0, 16(%rsp) + # H ^ 3 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm6, %xmm0, %xmm7 + vpclmulqdq $0x01, %xmm6, %xmm0, %xmm8 + vpclmulqdq $16, %xmm6, %xmm0, %xmm9 + vpclmulqdq $0x11, %xmm6, %xmm0, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm1 + vmovdqu %xmm1, 32(%rsp) + # H ^ 4 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm3 + vmovdqu %xmm3, 48(%rsp) + # H ^ 5 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 64(%rsp) + # H ^ 6 + vpclmulqdq $0x00, %xmm1, %xmm1, %xmm7 + vpclmulqdq $0x11, %xmm1, %xmm1, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, 
%xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 80(%rsp) + # H ^ 7 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm1, %xmm3, %xmm7 + vpclmulqdq $0x01, %xmm1, %xmm3, %xmm8 + vpclmulqdq $16, %xmm1, %xmm3, %xmm9 + vpclmulqdq $0x11, %xmm1, %xmm3, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 96(%rsp) + # H ^ 8 + vpclmulqdq $0x00, %xmm3, %xmm3, %xmm7 + vpclmulqdq $0x11, %xmm3, %xmm3, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 112(%rsp) + cmpl $0x100, %r8d + jl L_AES_GCM_encrypt_update_vaes_no_ext + # H ^ 9 + vmovdqu 48(%rsp), %xmm0 + vmovdqu 64(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 128(%rsp) + # H ^ 10 + vmovdqu 64(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq 
$0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 144(%rsp) + # H ^ 11 + vmovdqu 64(%rsp), %xmm0 + vmovdqu 80(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 160(%rsp) + # H ^ 12 + vmovdqu 80(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 176(%rsp) + # H ^ 13 + vmovdqu 80(%rsp), %xmm0 + vmovdqu 96(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 
+ vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 192(%rsp) + # H ^ 14 + vmovdqu 96(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 208(%rsp) + # H ^ 15 + vmovdqu 96(%rsp), %xmm0 + vmovdqu 112(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 224(%rsp) + # H ^ 16 + vmovdqu 112(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 240(%rsp) + vmovdqu 224(%rsp), %ymm7 + vpermq $0x4e, %ymm7, %ymm7 + vmovdqu 192(%rsp), %ymm8 + vpermq $0x4e, %ymm8, %ymm8 + vmovdqu 160(%rsp), %ymm9 + vpermq $0x4e, %ymm9, %ymm9 + 
vmovdqu 128(%rsp), %ymm10 + vpermq $0x4e, %ymm10, %ymm10 + vmovdqu %ymm7, 256(%rsp) + vmovdqu %ymm8, 288(%rsp) + vmovdqu %ymm9, 320(%rsp) + vmovdqu %ymm10, 352(%rsp) + vmovdqu 96(%rsp), %ymm7 + vpermq $0x4e, %ymm7, %ymm7 + vmovdqu 64(%rsp), %ymm8 + vpermq $0x4e, %ymm8, %ymm8 + vmovdqu 32(%rsp), %ymm9 + vpermq $0x4e, %ymm9, %ymm9 + vmovdqu (%rsp), %ymm10 + vpermq $0x4e, %ymm10, %ymm10 + vmovdqu %ymm7, 384(%rsp) + vmovdqu %ymm8, 416(%rsp) + vmovdqu %ymm9, 448(%rsp) + vmovdqu %ymm10, 480(%rsp) +L_AES_GCM_encrypt_update_vaes_no_ext: + vbroadcasti128 L_vaes_aes_gcm_mod2_128(%rip), %ymm14 + cmpl $0x100, %r8d + jl L_AES_GCM_encrypt_update_vaes_after_256 + movl %r8d, %r13d + andl $0xffffff00, %r13d +L_AES_GCM_encrypt_update_vaes_loop_256: + # 256 bytes of input + leaq (%r10,%r14,1), %r15 + vbroadcasti128 L_vaes_aes_gcm_bswap_epi64(%rip), %ymm6 + vbroadcasti128 (%r12), %ymm4 + vpaddd L_vaes_aes_gcm_inc_y0(%rip), %ymm4, %ymm0 + vpshufb %ymm6, %ymm0, %ymm0 + vpaddd L_vaes_aes_gcm_inc_y1(%rip), %ymm4, %ymm1 + vpshufb %ymm6, %ymm1, %ymm1 + vpaddd L_vaes_aes_gcm_inc_y2(%rip), %ymm4, %ymm2 + vpshufb %ymm6, %ymm2, %ymm2 + vpaddd L_vaes_aes_gcm_inc_y3(%rip), %ymm4, %ymm3 + vpshufb %ymm6, %ymm3, %ymm3 + vmovdqu (%r12), %xmm7 + vpaddd L_vaes_aes_gcm_eight(%rip), %xmm7, %xmm7 + vmovdqu %xmm7, (%r12) + vbroadcasti128 (%rdi), %ymm4 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm4, %ymm2, %ymm2 + vpxor %ymm4, %ymm3, %ymm3 + vbroadcasti128 16(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 32(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 48(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 64(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + 
vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 80(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 96(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 112(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 128(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 144(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $11, %esi + vbroadcasti128 160(%rdi), %ymm4 + jl L_AES_GCM_encrypt_update_vaes_p1_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 176(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $13, %esi + vbroadcasti128 192(%rdi), %ymm4 + jl L_AES_GCM_encrypt_update_vaes_p1_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 208(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 224(%rdi), %ymm4 +L_AES_GCM_encrypt_update_vaes_p1_vaes_ctr8_last: + vaesenclast %ymm4, %ymm0, %ymm0 + vaesenclast %ymm4, %ymm1, %ymm1 + vaesenclast %ymm4, %ymm2, %ymm2 + vaesenclast %ymm4, %ymm3, %ymm3 + leaq (%r11,%r14,1), %rcx + leaq (%r10,%r14,1), %rdx + vmovdqu (%rcx), %ymm5 + vpxor %ymm5, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vmovdqu 32(%rcx), %ymm5 + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu 
%ymm1, 32(%rdx) + vmovdqu 64(%rcx), %ymm5 + vpxor %ymm5, %ymm2, %ymm2 + vmovdqu %ymm2, 64(%rdx) + vmovdqu 96(%rcx), %ymm5 + vpxor %ymm5, %ymm3, %ymm3 + vmovdqu %ymm3, 96(%rdx) + addl $0x80, %r14d + vbroadcasti128 (%r12), %ymm4 + vpaddd L_vaes_aes_gcm_inc_y0(%rip), %ymm4, %ymm0 + vpshufb %ymm6, %ymm0, %ymm0 + vpaddd L_vaes_aes_gcm_inc_y1(%rip), %ymm4, %ymm1 + vpshufb %ymm6, %ymm1, %ymm1 + vpaddd L_vaes_aes_gcm_inc_y2(%rip), %ymm4, %ymm2 + vpshufb %ymm6, %ymm2, %ymm2 + vpaddd L_vaes_aes_gcm_inc_y3(%rip), %ymm4, %ymm3 + vpshufb %ymm6, %ymm3, %ymm3 + vmovdqu (%r12), %xmm7 + vpaddd L_vaes_aes_gcm_eight(%rip), %xmm7, %xmm7 + vmovdqu %xmm7, (%r12) + vbroadcasti128 (%rdi), %ymm4 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm4, %ymm2, %ymm2 + vpxor %ymm4, %ymm3, %ymm3 + vbroadcasti128 16(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 32(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 48(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 64(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 80(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 96(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 112(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 128(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc 
%ymm4, %ymm3, %ymm3 + vbroadcasti128 144(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $11, %esi + vbroadcasti128 160(%rdi), %ymm4 + jl L_AES_GCM_encrypt_update_vaes_p2_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 176(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $13, %esi + vbroadcasti128 192(%rdi), %ymm4 + jl L_AES_GCM_encrypt_update_vaes_p2_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 208(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 224(%rdi), %ymm4 +L_AES_GCM_encrypt_update_vaes_p2_vaes_ctr8_last: + vaesenclast %ymm4, %ymm0, %ymm0 + vaesenclast %ymm4, %ymm1, %ymm1 + vaesenclast %ymm4, %ymm2, %ymm2 + vaesenclast %ymm4, %ymm3, %ymm3 + leaq (%r11,%r14,1), %rcx + leaq (%r10,%r14,1), %rdx + vmovdqu (%rcx), %ymm5 + vpxor %ymm5, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vmovdqu 32(%rcx), %ymm5 + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vmovdqu 64(%rcx), %ymm5 + vpxor %ymm5, %ymm2, %ymm2 + vmovdqu %ymm2, 64(%rdx) + vmovdqu 96(%rcx), %ymm5 + vpxor %ymm5, %ymm3, %ymm3 + vmovdqu %ymm3, 96(%rdx) + addl $0x80, %r14d + vbroadcasti128 L_vaes_aes_gcm_bswap_mask(%rip), %ymm6 + vpxor %ymm4, %ymm4, %ymm4 + vinserti128 $0x00, %xmm15, %ymm4, %ymm4 + vmovdqu 256(%rsp), %ymm7 + vmovdqu 288(%rsp), %ymm8 + vmovdqu 320(%rsp), %ymm9 + vmovdqu 352(%rsp), %ymm10 + vmovdqu (%r15), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpxor %ymm4, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm7, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm7, %ymm5, %ymm1 + vpclmulqdq $16, %ymm7, %ymm5, %ymm2 + vpclmulqdq $0x11, 
%ymm7, %ymm5, %ymm3 + vmovdqa %ymm0, %ymm11 + vpxor %ymm1, %ymm2, %ymm12 + vmovdqa %ymm3, %ymm13 + vmovdqu 32(%r15), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm8, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm8, %ymm5, %ymm1 + vpclmulqdq $16, %ymm8, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm8, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 64(%r15), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm9, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm9, %ymm5, %ymm1 + vpclmulqdq $16, %ymm9, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm9, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 96(%r15), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm10, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm10, %ymm5, %ymm1 + vpclmulqdq $16, %ymm10, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm10, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 384(%rsp), %ymm7 + vmovdqu 416(%rsp), %ymm8 + vmovdqu 448(%rsp), %ymm9 + vmovdqu 480(%rsp), %ymm10 + vmovdqu 128(%r15), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm7, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm7, %ymm5, %ymm1 + vpclmulqdq $16, %ymm7, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm7, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 160(%r15), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm8, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm8, %ymm5, %ymm1 + vpclmulqdq $16, %ymm8, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm8, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 192(%r15), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm9, %ymm5, %ymm0 + vpclmulqdq $0x01, 
%ymm9, %ymm5, %ymm1 + vpclmulqdq $16, %ymm9, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm9, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 224(%r15), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm10, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm10, %ymm5, %ymm1 + vpclmulqdq $16, %ymm10, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm10, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vpclmulqdq $0x01, %ymm11, %ymm14, %ymm5 + vpshufd $0x4e, %ymm11, %ymm11 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm11, %ymm12, %ymm12 + vpclmulqdq $0x01, %ymm12, %ymm14, %ymm5 + vpshufd $0x4e, %ymm12, %ymm12 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm12, %ymm13, %ymm13 + vextracti128 $0x01, %ymm13, %xmm0 + vpxor %xmm0, %xmm13, %xmm15 + cmpl %r13d, %r14d + jl L_AES_GCM_encrypt_update_vaes_loop_256 +L_AES_GCM_encrypt_update_vaes_after_256: + movl %r8d, %r13d + andl $0xffffff80, %r13d + cmpl %r13d, %r14d + jge L_AES_GCM_encrypt_update_vaes_after_128 + # 128 bytes of input + leaq (%r10,%r14,1), %r15 + vbroadcasti128 L_vaes_aes_gcm_bswap_epi64(%rip), %ymm6 + vbroadcasti128 (%r12), %ymm4 + vpaddd L_vaes_aes_gcm_inc_y0(%rip), %ymm4, %ymm0 + vpshufb %ymm6, %ymm0, %ymm0 + vpaddd L_vaes_aes_gcm_inc_y1(%rip), %ymm4, %ymm1 + vpshufb %ymm6, %ymm1, %ymm1 + vpaddd L_vaes_aes_gcm_inc_y2(%rip), %ymm4, %ymm2 + vpshufb %ymm6, %ymm2, %ymm2 + vpaddd L_vaes_aes_gcm_inc_y3(%rip), %ymm4, %ymm3 + vpshufb %ymm6, %ymm3, %ymm3 + vmovdqu (%r12), %xmm7 + vpaddd L_vaes_aes_gcm_eight(%rip), %xmm7, %xmm7 + vmovdqu %xmm7, (%r12) + vbroadcasti128 (%rdi), %ymm4 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm4, %ymm2, %ymm2 + vpxor %ymm4, %ymm3, %ymm3 + vbroadcasti128 16(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 32(%rdi), %ymm4 
+ vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 48(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 64(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 80(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 96(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 112(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 128(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 144(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $11, %esi + vbroadcasti128 160(%rdi), %ymm4 + jl L_AES_GCM_encrypt_update_vaes_8_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 176(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $13, %esi + vbroadcasti128 192(%rdi), %ymm4 + jl L_AES_GCM_encrypt_update_vaes_8_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 208(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 224(%rdi), %ymm4 
+L_AES_GCM_encrypt_update_vaes_8_vaes_ctr8_last: + vaesenclast %ymm4, %ymm0, %ymm0 + vaesenclast %ymm4, %ymm1, %ymm1 + vaesenclast %ymm4, %ymm2, %ymm2 + vaesenclast %ymm4, %ymm3, %ymm3 + leaq (%r11,%r14,1), %rcx + leaq (%r10,%r14,1), %rdx + vmovdqu (%rcx), %ymm5 + vpxor %ymm5, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vmovdqu 32(%rcx), %ymm5 + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vmovdqu 64(%rcx), %ymm5 + vpxor %ymm5, %ymm2, %ymm2 + vmovdqu %ymm2, 64(%rdx) + vmovdqu 96(%rcx), %ymm5 + vpxor %ymm5, %ymm3, %ymm3 + vmovdqu %ymm3, 96(%rdx) + addl $0x80, %r14d + vmovdqu 96(%rsp), %ymm7 + vpermq $0x4e, %ymm7, %ymm7 + vmovdqu 64(%rsp), %ymm8 + vpermq $0x4e, %ymm8, %ymm8 + vmovdqu 32(%rsp), %ymm9 + vpermq $0x4e, %ymm9, %ymm9 + vmovdqu (%rsp), %ymm10 + vpermq $0x4e, %ymm10, %ymm10 + vbroadcasti128 L_vaes_aes_gcm_bswap_mask(%rip), %ymm6 + vpxor %ymm4, %ymm4, %ymm4 + vinserti128 $0x00, %xmm15, %ymm4, %ymm4 + vmovdqu (%r15), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpxor %ymm4, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm7, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm7, %ymm5, %ymm1 + vpclmulqdq $16, %ymm7, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm7, %ymm5, %ymm3 + vmovdqa %ymm0, %ymm11 + vpxor %ymm1, %ymm2, %ymm12 + vmovdqa %ymm3, %ymm13 + vmovdqu 32(%r15), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm8, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm8, %ymm5, %ymm1 + vpclmulqdq $16, %ymm8, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm8, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 64(%r15), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm9, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm9, %ymm5, %ymm1 + vpclmulqdq $16, %ymm9, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm9, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 96(%r15), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm10, %ymm5, 
%ymm0 + vpclmulqdq $0x01, %ymm10, %ymm5, %ymm1 + vpclmulqdq $16, %ymm10, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm10, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vpclmulqdq $0x01, %ymm11, %ymm14, %ymm5 + vpshufd $0x4e, %ymm11, %ymm11 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm11, %ymm12, %ymm12 + vpclmulqdq $0x01, %ymm12, %ymm14, %ymm5 + vpshufd $0x4e, %ymm12, %ymm12 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm12, %ymm13, %ymm13 + vextracti128 $0x01, %ymm13, %xmm0 + vpxor %xmm0, %xmm13, %xmm15 +L_AES_GCM_encrypt_update_vaes_after_128: + vmovdqu (%rsp), %xmm6 +L_AES_GCM_encrypt_update_vaes_done_128: + movl %r8d, %edx + cmpl %edx, %r14d + jge L_AES_GCM_encrypt_update_vaes_done_enc + movl %r8d, %r13d + andl $0xfffffff0, %r13d + cmpl %r13d, %r14d + jge L_AES_GCM_encrypt_update_vaes_last_block_done + vmovdqu (%r12), %xmm8 + vpshufb L_vaes_aes_gcm_bswap_epi64(%rip), %xmm8, %xmm7 + vpaddd L_vaes_aes_gcm_one(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, (%r12) + vpxor (%rdi), %xmm7, %xmm7 + vaesenc 16(%rdi), %xmm7, %xmm7 + vaesenc 32(%rdi), %xmm7, %xmm7 + vaesenc 48(%rdi), %xmm7, %xmm7 + vaesenc 64(%rdi), %xmm7, %xmm7 + vaesenc 80(%rdi), %xmm7, %xmm7 + vaesenc 96(%rdi), %xmm7, %xmm7 + vaesenc 112(%rdi), %xmm7, %xmm7 + vaesenc 128(%rdi), %xmm7, %xmm7 + vaesenc 144(%rdi), %xmm7, %xmm7 + cmpl $11, %esi + vmovdqa 160(%rdi), %xmm8 + jl L_AES_GCM_encrypt_update_vaes_aesenc_block_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 176(%rdi), %xmm7, %xmm7 + cmpl $13, %esi + vmovdqa 192(%rdi), %xmm8 + jl L_AES_GCM_encrypt_update_vaes_aesenc_block_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 208(%rdi), %xmm7, %xmm7 + vmovdqa 224(%rdi), %xmm8 +L_AES_GCM_encrypt_update_vaes_aesenc_block_last: + vaesenclast %xmm8, %xmm7, %xmm7 + vmovdqu (%r11,%r14,1), %xmm8 + vpxor %xmm8, %xmm7, %xmm7 + vmovdqu %xmm7, (%r10,%r14,1) + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + vpxor %xmm7, %xmm15, %xmm15 + addl $16, %r14d + 
cmpl %r13d, %r14d + jge L_AES_GCM_encrypt_update_vaes_last_block_ghash +L_AES_GCM_encrypt_update_vaes_last_block_start: + vmovdqu (%r11,%r14,1), %xmm12 + vmovdqu (%r12), %xmm8 + vpshufb L_vaes_aes_gcm_bswap_epi64(%rip), %xmm8, %xmm7 + vpaddd L_vaes_aes_gcm_one(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, (%r12) + vpxor (%rdi), %xmm7, %xmm7 + vpclmulqdq $16, %xmm6, %xmm15, %xmm9 + vaesenc 16(%rdi), %xmm7, %xmm7 + vaesenc 32(%rdi), %xmm7, %xmm7 + vpclmulqdq $0x01, %xmm6, %xmm15, %xmm10 + vaesenc 48(%rdi), %xmm7, %xmm7 + vaesenc 64(%rdi), %xmm7, %xmm7 + vpclmulqdq $0x00, %xmm6, %xmm15, %xmm11 + vaesenc 80(%rdi), %xmm7, %xmm7 + vpclmulqdq $0x11, %xmm6, %xmm15, %xmm1 + vaesenc 96(%rdi), %xmm7, %xmm7 + vpxor %xmm10, %xmm9, %xmm9 + vpslldq $8, %xmm9, %xmm2 + vpsrldq $8, %xmm9, %xmm9 + vaesenc 112(%rdi), %xmm7, %xmm7 + vpxor %xmm11, %xmm2, %xmm2 + vpxor %xmm9, %xmm1, %xmm3 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm0 + vpclmulqdq $16, %xmm0, %xmm2, %xmm10 + vaesenc 128(%rdi), %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm9 + vpxor %xmm10, %xmm9, %xmm9 + vpclmulqdq $16, %xmm0, %xmm9, %xmm10 + vaesenc 144(%rdi), %xmm7, %xmm7 + vpshufd $0x4e, %xmm9, %xmm9 + vpxor %xmm10, %xmm9, %xmm9 + vpxor %xmm3, %xmm9, %xmm15 + cmpl $11, %esi + vmovdqa 160(%rdi), %xmm8 + jl L_AES_GCM_encrypt_update_vaes_aesenc_gfmul_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 176(%rdi), %xmm7, %xmm7 + cmpl $13, %esi + vmovdqa 192(%rdi), %xmm8 + jl L_AES_GCM_encrypt_update_vaes_aesenc_gfmul_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 208(%rdi), %xmm7, %xmm7 + vmovdqa 224(%rdi), %xmm8 +L_AES_GCM_encrypt_update_vaes_aesenc_gfmul_last: + vaesenclast %xmm8, %xmm7, %xmm7 + vmovdqa %xmm12, %xmm0 + vpxor %xmm0, %xmm7, %xmm7 + vmovdqu %xmm7, (%r10,%r14,1) + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + addl $16, %r14d + vpxor %xmm7, %xmm15, %xmm15 + cmpl %r13d, %r14d + jl L_AES_GCM_encrypt_update_vaes_last_block_start +L_AES_GCM_encrypt_update_vaes_last_block_ghash: + # ghash_gfmul_red_avx + vpclmulqdq $0x00, 
%xmm6, %xmm15, %xmm7 + vpclmulqdq $0x01, %xmm6, %xmm15, %xmm8 + vpclmulqdq $16, %xmm6, %xmm15, %xmm9 + vpclmulqdq $0x11, %xmm6, %xmm15, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm15 +L_AES_GCM_encrypt_update_vaes_last_block_done: +L_AES_GCM_encrypt_update_vaes_done_enc: + vmovdqa %xmm15, (%r9) + vzeroupper + addq $0x210, %rsp + popq %rbx + popq %r15 + popq %r14 + popq %r12 + popq %r13 + repz retq +#ifndef __APPLE__ +.size AES_GCM_encrypt_update_vaes,.-AES_GCM_encrypt_update_vaes +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_GCM_encrypt_final_vaes +.type AES_GCM_encrypt_final_vaes,@function +.align 16 +AES_GCM_encrypt_final_vaes: +#else +.section __TEXT,__text +.globl _AES_GCM_encrypt_final_vaes +.p2align 4 +_AES_GCM_encrypt_final_vaes: +#endif /* __APPLE__ */ /* AES_GCM_encrypt_final_vaes: folds two lengths into the GHASH state, performs one more GHASH multiply and stores the resulting tag. Inputs as read by the code: rdi -> 16-byte state block (loaded, never written); rsi -> destination for the tag; edx = number of tag bytes to store; ecx and r8d = byte counts, each multiplied by 8 before use; r9 -> 16-byte block used as the GHASH multiplier (presumably the hash key H - confirm against the caller); first stack argument -> 16-byte block XORed into the result (presumably the encrypted initial counter - TODO confirm). The three blocks are loaded with vmovdqa, so they must be 16-byte aligned. */ + pushq %r13 /* r13 serves as the byte temporary in the partial-tag copy loop */ + movl %edx, %eax /* eax = tag byte count; edx is reused below */ + movl %ecx, %r10d /* stash first byte count */ + movl %r8d, %r11d /* stash second byte count; r8 is overwritten next */ + movq 16(%rsp), %r8 /* after the push: 0(%rsp) = saved r13, 8(%rsp) = return address, 16(%rsp) = first stack argument */ + subq $16, %rsp /* 16 bytes of scratch, used only when fewer or more than 16 tag bytes are requested */ + vmovdqa (%rdi), %xmm4 /* xmm4 = state block */ + vmovdqa (%r9), %xmm5 /* xmm5 = multiplier block */ + vmovdqa (%r8), %xmm6 /* xmm6 = block XORed in at the end */ + vpsrlq $63, %xmm5, %xmm8 /* top bit of each qword */ + vpsllq $0x01, %xmm5, %xmm7 + vpslldq $8, %xmm8, %xmm8 /* carry of the low qword moves into the high qword */ + vpor %xmm8, %xmm7, %xmm7 /* xmm7 = multiplier shifted left by one bit across all 128 bits */ + vpshufd $0xff, %xmm5, %xmm5 + vpsrad $31, %xmm5, %xmm5 /* all ones if the original top bit was set, else zero */ + vpand L_vaes_aes_gcm_mod2_128(%rip), %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 /* xmm5 = shifted multiplier, XORed with the mod2_128 constant when the shifted-out bit was 1 */ + movl %r10d, %edx /* 32-bit moves zero-extend into rdx and rcx */ + movl %r11d, %ecx + shlq $3, %rdx /* times 8; presumably bytes to bits */ + shlq $3, %rcx + vmovq %rdx, %xmm0 + vmovq %rcx, %xmm1 + vpunpcklqdq %xmm1, %xmm0, %xmm0 /* xmm0 = low qword: first count times 8, high qword: second count times 8 */ + vpxor %xmm0, %xmm4, %xmm4 /* fold the lengths into the state */ + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm5, %xmm4, %xmm7 + vpclmulqdq $0x01, %xmm5, %xmm4, %xmm8 + vpclmulqdq $16, %xmm5, %xmm4, %xmm9 + vpclmulqdq $0x11, %xmm5, %xmm4, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, 
%xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 /* xmm4 = reduced product of the state and the multiplier */ + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4 /* byte shuffle with the bswap mask constant */ + vpxor %xmm6, %xmm4, %xmm0 /* xmm0 = value to be stored as the tag */ + cmpl $16, %eax + je L_AES_GCM_encrypt_final_vaes_store_tag_16 /* exactly 16 bytes requested: one unaligned vector store */ + xorq %rcx, %rcx /* rcx = byte index */ + vmovdqu %xmm0, (%rsp) /* spill to scratch so single bytes can be read back */ +L_AES_GCM_encrypt_final_vaes_store_tag_loop: /* copies eax bytes, one per iteration. NOTE(review): the bound is tested after the copy, so a count of 0 would not stop at 0; presumably callers never pass 0 - confirm */ + movzbl (%rsp,%rcx,1), %r13d + movb %r13b, (%rsi,%rcx,1) + incl %ecx + cmpl %eax, %ecx + jne L_AES_GCM_encrypt_final_vaes_store_tag_loop + jmp L_AES_GCM_encrypt_final_vaes_store_tag_done +L_AES_GCM_encrypt_final_vaes_store_tag_16: + vmovdqu %xmm0, (%rsi) +L_AES_GCM_encrypt_final_vaes_store_tag_done: + vzeroupper /* zero the upper halves of the ymm registers before returning */ + addq $16, %rsp /* release the scratch area */ + popq %r13 + repz retq +#ifndef __APPLE__ +.size AES_GCM_encrypt_final_vaes,.-AES_GCM_encrypt_final_vaes +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_GCM_decrypt_update_vaes +.type AES_GCM_decrypt_update_vaes,@function +.align 16 +AES_GCM_decrypt_update_vaes: +#else +.section __TEXT,__text +.globl _AES_GCM_decrypt_update_vaes +.p2align 4 +_AES_GCM_decrypt_update_vaes: +#endif /* __APPLE__ */ + pushq %r13 + pushq %r12 + pushq %r14 + pushq %r15 + pushq %rbx + movq %rdx, %r10 + movq %rcx, %r11 + movq 48(%rsp), %rax + movq 56(%rsp), %r12 + subq $0x210, %rsp + vmovdqa (%r9), %xmm15 + vmovdqa (%rax), %xmm6 + vpsrlq $63, %xmm6, %xmm8 + vpsllq $0x01, %xmm6, %xmm7 + vpslldq $8, %xmm8, %xmm8 + vpor %xmm8, %xmm7, %xmm7 + vpshufd $0xff, %xmm6, %xmm6 + vpsrad $31, %xmm6, %xmm6 + vpand L_vaes_aes_gcm_mod2_128(%rip), %xmm6, %xmm6 + vpxor %xmm7, %xmm6, %xmm6 + xorl %r14d, %r14d + cmpl $0x80, %r8d + jl L_AES_GCM_decrypt_update_vaes_done_128 + vmovdqa %xmm15, %xmm2 + # H ^ 1 + vmovdqu %xmm6, (%rsp) + # H ^ 2 + vpclmulqdq $0x00, %xmm6, %xmm6, %xmm7 + vpclmulqdq $0x11, %xmm6, %xmm6, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + 
vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm0 + vmovdqu %xmm0, 16(%rsp) + # H ^ 3 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm6, %xmm0, %xmm7 + vpclmulqdq $0x01, %xmm6, %xmm0, %xmm8 + vpclmulqdq $16, %xmm6, %xmm0, %xmm9 + vpclmulqdq $0x11, %xmm6, %xmm0, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm1 + vmovdqu %xmm1, 32(%rsp) + # H ^ 4 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm3 + vmovdqu %xmm3, 48(%rsp) + # H ^ 5 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 64(%rsp) + # H ^ 6 + vpclmulqdq $0x00, %xmm1, %xmm1, %xmm7 + 
vpclmulqdq $0x11, %xmm1, %xmm1, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 80(%rsp) + # H ^ 7 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm1, %xmm3, %xmm7 + vpclmulqdq $0x01, %xmm1, %xmm3, %xmm8 + vpclmulqdq $16, %xmm1, %xmm3, %xmm9 + vpclmulqdq $0x11, %xmm1, %xmm3, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 96(%rsp) + # H ^ 8 + vpclmulqdq $0x00, %xmm3, %xmm3, %xmm7 + vpclmulqdq $0x11, %xmm3, %xmm3, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 112(%rsp) + cmpl $0x100, %r8d + jl L_AES_GCM_decrypt_update_vaes_no_ext + # H ^ 9 + vmovdqu 48(%rsp), %xmm0 + vmovdqu 64(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + 
vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 128(%rsp) + # H ^ 10 + vmovdqu 64(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 144(%rsp) + # H ^ 11 + vmovdqu 64(%rsp), %xmm0 + vmovdqu 80(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 160(%rsp) + # H ^ 12 + vmovdqu 80(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 176(%rsp) + # H ^ 13 + vmovdqu 80(%rsp), %xmm0 + vmovdqu 96(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + 
vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 192(%rsp) + # H ^ 14 + vmovdqu 96(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 208(%rsp) + # H ^ 15 + vmovdqu 96(%rsp), %xmm0 + vmovdqu 112(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 224(%rsp) + # H ^ 16 + vmovdqu 112(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm10 + vpxor %xmm8, %xmm8, %xmm8 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + 
vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm4 + vmovdqu %xmm4, 240(%rsp) + vmovdqu 224(%rsp), %ymm7 + vpermq $0x4e, %ymm7, %ymm7 + vmovdqu 192(%rsp), %ymm8 + vpermq $0x4e, %ymm8, %ymm8 + vmovdqu 160(%rsp), %ymm9 + vpermq $0x4e, %ymm9, %ymm9 + vmovdqu 128(%rsp), %ymm10 + vpermq $0x4e, %ymm10, %ymm10 + vmovdqu %ymm7, 256(%rsp) + vmovdqu %ymm8, 288(%rsp) + vmovdqu %ymm9, 320(%rsp) + vmovdqu %ymm10, 352(%rsp) + vmovdqu 96(%rsp), %ymm7 + vpermq $0x4e, %ymm7, %ymm7 + vmovdqu 64(%rsp), %ymm8 + vpermq $0x4e, %ymm8, %ymm8 + vmovdqu 32(%rsp), %ymm9 + vpermq $0x4e, %ymm9, %ymm9 + vmovdqu (%rsp), %ymm10 + vpermq $0x4e, %ymm10, %ymm10 + vmovdqu %ymm7, 384(%rsp) + vmovdqu %ymm8, 416(%rsp) + vmovdqu %ymm9, 448(%rsp) + vmovdqu %ymm10, 480(%rsp) +L_AES_GCM_decrypt_update_vaes_no_ext: + vbroadcasti128 L_vaes_aes_gcm_mod2_128(%rip), %ymm14 + cmpl $0x100, %r8d + jl L_AES_GCM_decrypt_update_vaes_after_256 + movl %r8d, %r13d + andl $0xffffff00, %r13d +L_AES_GCM_decrypt_update_vaes_loop_256: + # 256 bytes of input + leaq (%r11,%r14,1), %rbx + vbroadcasti128 L_vaes_aes_gcm_bswap_mask(%rip), %ymm6 + vpxor %ymm4, %ymm4, %ymm4 + vinserti128 $0x00, %xmm15, %ymm4, %ymm4 + vmovdqu 256(%rsp), %ymm7 + vmovdqu 288(%rsp), %ymm8 + vmovdqu 320(%rsp), %ymm9 + vmovdqu 352(%rsp), %ymm10 + vmovdqu (%rbx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpxor %ymm4, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm7, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm7, %ymm5, %ymm1 + vpclmulqdq $16, %ymm7, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm7, %ymm5, %ymm3 + vmovdqa %ymm0, %ymm11 + vpxor %ymm1, %ymm2, %ymm12 + vmovdqa %ymm3, %ymm13 + vmovdqu 32(%rbx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm8, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm8, %ymm5, %ymm1 + vpclmulqdq $16, %ymm8, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm8, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 64(%rbx), %ymm5 + vpshufb %ymm6, 
%ymm5, %ymm5 + vpclmulqdq $0x00, %ymm9, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm9, %ymm5, %ymm1 + vpclmulqdq $16, %ymm9, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm9, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 96(%rbx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm10, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm10, %ymm5, %ymm1 + vpclmulqdq $16, %ymm10, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm10, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 384(%rsp), %ymm7 + vmovdqu 416(%rsp), %ymm8 + vmovdqu 448(%rsp), %ymm9 + vmovdqu 480(%rsp), %ymm10 + vmovdqu 128(%rbx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm7, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm7, %ymm5, %ymm1 + vpclmulqdq $16, %ymm7, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm7, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 160(%rbx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm8, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm8, %ymm5, %ymm1 + vpclmulqdq $16, %ymm8, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm8, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 192(%rbx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm9, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm9, %ymm5, %ymm1 + vpclmulqdq $16, %ymm9, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm9, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 224(%rbx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm10, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm10, %ymm5, %ymm1 + vpclmulqdq $16, %ymm10, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm10, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor 
%ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vpclmulqdq $0x01, %ymm11, %ymm14, %ymm5 + vpshufd $0x4e, %ymm11, %ymm11 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm11, %ymm12, %ymm12 + vpclmulqdq $0x01, %ymm12, %ymm14, %ymm5 + vpshufd $0x4e, %ymm12, %ymm12 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm12, %ymm13, %ymm13 + vextracti128 $0x01, %ymm13, %xmm0 + vpxor %xmm0, %xmm13, %xmm15 + vbroadcasti128 L_vaes_aes_gcm_bswap_epi64(%rip), %ymm6 + vbroadcasti128 (%r12), %ymm4 + vpaddd L_vaes_aes_gcm_inc_y0(%rip), %ymm4, %ymm0 + vpshufb %ymm6, %ymm0, %ymm0 + vpaddd L_vaes_aes_gcm_inc_y1(%rip), %ymm4, %ymm1 + vpshufb %ymm6, %ymm1, %ymm1 + vpaddd L_vaes_aes_gcm_inc_y2(%rip), %ymm4, %ymm2 + vpshufb %ymm6, %ymm2, %ymm2 + vpaddd L_vaes_aes_gcm_inc_y3(%rip), %ymm4, %ymm3 + vpshufb %ymm6, %ymm3, %ymm3 + vmovdqu (%r12), %xmm7 + vpaddd L_vaes_aes_gcm_eight(%rip), %xmm7, %xmm7 + vmovdqu %xmm7, (%r12) + vbroadcasti128 (%rdi), %ymm4 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm4, %ymm2, %ymm2 + vpxor %ymm4, %ymm3, %ymm3 + vbroadcasti128 16(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 32(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 48(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 64(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 80(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 96(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + 
vbroadcasti128 112(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 128(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 144(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $11, %esi + vbroadcasti128 160(%rdi), %ymm4 + jl L_AES_GCM_decrypt_update_vaes_l1_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 176(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $13, %esi + vbroadcasti128 192(%rdi), %ymm4 + jl L_AES_GCM_decrypt_update_vaes_l1_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 208(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 224(%rdi), %ymm4 +L_AES_GCM_decrypt_update_vaes_l1_vaes_ctr8_last: + vaesenclast %ymm4, %ymm0, %ymm0 + vaesenclast %ymm4, %ymm1, %ymm1 + vaesenclast %ymm4, %ymm2, %ymm2 + vaesenclast %ymm4, %ymm3, %ymm3 + leaq (%r11,%r14,1), %rcx + leaq (%r10,%r14,1), %rdx + vmovdqu (%rcx), %ymm5 + vpxor %ymm5, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vmovdqu 32(%rcx), %ymm5 + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vmovdqu 64(%rcx), %ymm5 + vpxor %ymm5, %ymm2, %ymm2 + vmovdqu %ymm2, 64(%rdx) + vmovdqu 96(%rcx), %ymm5 + vpxor %ymm5, %ymm3, %ymm3 + vmovdqu %ymm3, 96(%rdx) + addl $0x80, %r14d + vbroadcasti128 (%r12), %ymm4 + vpaddd L_vaes_aes_gcm_inc_y0(%rip), %ymm4, %ymm0 + vpshufb %ymm6, %ymm0, %ymm0 + vpaddd L_vaes_aes_gcm_inc_y1(%rip), %ymm4, %ymm1 + vpshufb 
%ymm6, %ymm1, %ymm1 + vpaddd L_vaes_aes_gcm_inc_y2(%rip), %ymm4, %ymm2 + vpshufb %ymm6, %ymm2, %ymm2 + vpaddd L_vaes_aes_gcm_inc_y3(%rip), %ymm4, %ymm3 + vpshufb %ymm6, %ymm3, %ymm3 + vmovdqu (%r12), %xmm7 + vpaddd L_vaes_aes_gcm_eight(%rip), %xmm7, %xmm7 + vmovdqu %xmm7, (%r12) + vbroadcasti128 (%rdi), %ymm4 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm4, %ymm2, %ymm2 + vpxor %ymm4, %ymm3, %ymm3 + vbroadcasti128 16(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 32(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 48(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 64(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 80(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 96(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 112(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 128(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 144(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $11, %esi + vbroadcasti128 160(%rdi), %ymm4 + jl L_AES_GCM_decrypt_update_vaes_l2_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + 
vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 176(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $13, %esi + vbroadcasti128 192(%rdi), %ymm4 + jl L_AES_GCM_decrypt_update_vaes_l2_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 208(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 224(%rdi), %ymm4 +L_AES_GCM_decrypt_update_vaes_l2_vaes_ctr8_last: + vaesenclast %ymm4, %ymm0, %ymm0 + vaesenclast %ymm4, %ymm1, %ymm1 + vaesenclast %ymm4, %ymm2, %ymm2 + vaesenclast %ymm4, %ymm3, %ymm3 + leaq (%r11,%r14,1), %rcx + leaq (%r10,%r14,1), %rdx + vmovdqu (%rcx), %ymm5 + vpxor %ymm5, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vmovdqu 32(%rcx), %ymm5 + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vmovdqu 64(%rcx), %ymm5 + vpxor %ymm5, %ymm2, %ymm2 + vmovdqu %ymm2, 64(%rdx) + vmovdqu 96(%rcx), %ymm5 + vpxor %ymm5, %ymm3, %ymm3 + vmovdqu %ymm3, 96(%rdx) + addl $0x80, %r14d + cmpl %r13d, %r14d + jl L_AES_GCM_decrypt_update_vaes_loop_256 +L_AES_GCM_decrypt_update_vaes_after_256: + vmovdqu 96(%rsp), %ymm7 + vpermq $0x4e, %ymm7, %ymm7 + vmovdqu 64(%rsp), %ymm8 + vpermq $0x4e, %ymm8, %ymm8 + vmovdqu 32(%rsp), %ymm9 + vpermq $0x4e, %ymm9, %ymm9 + vmovdqu (%rsp), %ymm10 + vpermq $0x4e, %ymm10, %ymm10 + movl %r8d, %r13d + andl $0xffffff80, %r13d + cmpl %r13d, %r14d + jge L_AES_GCM_decrypt_update_vaes_after_128 + # 128 bytes of input + leaq (%r11,%r14,1), %rbx + vbroadcasti128 L_vaes_aes_gcm_bswap_mask(%rip), %ymm6 + vpxor %ymm4, %ymm4, %ymm4 + vinserti128 $0x00, %xmm15, %ymm4, %ymm4 + vmovdqu (%rbx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpxor %ymm4, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm7, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm7, %ymm5, %ymm1 + vpclmulqdq $16, %ymm7, %ymm5, %ymm2 + vpclmulqdq 
$0x11, %ymm7, %ymm5, %ymm3 + vmovdqa %ymm0, %ymm11 + vpxor %ymm1, %ymm2, %ymm12 + vmovdqa %ymm3, %ymm13 + vmovdqu 32(%rbx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm8, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm8, %ymm5, %ymm1 + vpclmulqdq $16, %ymm8, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm8, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 64(%rbx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm9, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm9, %ymm5, %ymm1 + vpclmulqdq $16, %ymm9, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm9, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vmovdqu 96(%rbx), %ymm5 + vpshufb %ymm6, %ymm5, %ymm5 + vpclmulqdq $0x00, %ymm10, %ymm5, %ymm0 + vpclmulqdq $0x01, %ymm10, %ymm5, %ymm1 + vpclmulqdq $16, %ymm10, %ymm5, %ymm2 + vpclmulqdq $0x11, %ymm10, %ymm5, %ymm3 + vpxor %ymm0, %ymm11, %ymm11 + vpxor %ymm1, %ymm12, %ymm12 + vpxor %ymm2, %ymm12, %ymm12 + vpxor %ymm3, %ymm13, %ymm13 + vpclmulqdq $0x01, %ymm11, %ymm14, %ymm5 + vpshufd $0x4e, %ymm11, %ymm11 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm11, %ymm12, %ymm12 + vpclmulqdq $0x01, %ymm12, %ymm14, %ymm5 + vpshufd $0x4e, %ymm12, %ymm12 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm12, %ymm13, %ymm13 + vextracti128 $0x01, %ymm13, %xmm0 + vpxor %xmm0, %xmm13, %xmm15 + vbroadcasti128 L_vaes_aes_gcm_bswap_epi64(%rip), %ymm6 + vbroadcasti128 (%r12), %ymm4 + vpaddd L_vaes_aes_gcm_inc_y0(%rip), %ymm4, %ymm0 + vpshufb %ymm6, %ymm0, %ymm0 + vpaddd L_vaes_aes_gcm_inc_y1(%rip), %ymm4, %ymm1 + vpshufb %ymm6, %ymm1, %ymm1 + vpaddd L_vaes_aes_gcm_inc_y2(%rip), %ymm4, %ymm2 + vpshufb %ymm6, %ymm2, %ymm2 + vpaddd L_vaes_aes_gcm_inc_y3(%rip), %ymm4, %ymm3 + vpshufb %ymm6, %ymm3, %ymm3 + vmovdqu (%r12), %xmm7 + vpaddd L_vaes_aes_gcm_eight(%rip), %xmm7, %xmm7 + vmovdqu %xmm7, (%r12) + vbroadcasti128 (%rdi), %ymm4 + vpxor 
%ymm4, %ymm0, %ymm0 + vpxor %ymm4, %ymm1, %ymm1 + vpxor %ymm4, %ymm2, %ymm2 + vpxor %ymm4, %ymm3, %ymm3 + vbroadcasti128 16(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 32(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 48(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 64(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 80(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 96(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 112(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 128(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 144(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $11, %esi + vbroadcasti128 160(%rdi), %ymm4 + jl L_AES_GCM_decrypt_update_vaes_t_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 176(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + cmpl $13, %esi + vbroadcasti128 192(%rdi), %ymm4 + jl L_AES_GCM_decrypt_update_vaes_t_vaes_ctr8_last + vaesenc %ymm4, %ymm0, %ymm0 + 
vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 208(%rdi), %ymm4 + vaesenc %ymm4, %ymm0, %ymm0 + vaesenc %ymm4, %ymm1, %ymm1 + vaesenc %ymm4, %ymm2, %ymm2 + vaesenc %ymm4, %ymm3, %ymm3 + vbroadcasti128 224(%rdi), %ymm4 +L_AES_GCM_decrypt_update_vaes_t_vaes_ctr8_last: + vaesenclast %ymm4, %ymm0, %ymm0 + vaesenclast %ymm4, %ymm1, %ymm1 + vaesenclast %ymm4, %ymm2, %ymm2 + vaesenclast %ymm4, %ymm3, %ymm3 + leaq (%r11,%r14,1), %rcx + leaq (%r10,%r14,1), %rdx + vmovdqu (%rcx), %ymm5 + vpxor %ymm5, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vmovdqu 32(%rcx), %ymm5 + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vmovdqu 64(%rcx), %ymm5 + vpxor %ymm5, %ymm2, %ymm2 + vmovdqu %ymm2, 64(%rdx) + vmovdqu 96(%rcx), %ymm5 + vpxor %ymm5, %ymm3, %ymm3 + vmovdqu %ymm3, 96(%rdx) + addl $0x80, %r14d +L_AES_GCM_decrypt_update_vaes_after_128: + vmovdqu (%rsp), %xmm6 +L_AES_GCM_decrypt_update_vaes_done_128: + movl %r8d, %edx + cmpl %edx, %r14d + jge L_AES_GCM_decrypt_update_vaes_done_dec + movl %r8d, %r13d + andl $0xfffffff0, %r13d + cmpl %r13d, %r14d + jge L_AES_GCM_decrypt_update_vaes_last_block_done +L_AES_GCM_decrypt_update_vaes_last_block_start: + vmovdqu (%r11,%r14,1), %xmm12 + vmovdqa %xmm6, %xmm0 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm12, %xmm1 + vpxor %xmm15, %xmm1, %xmm1 + vmovdqu (%r12), %xmm8 + vpshufb L_vaes_aes_gcm_bswap_epi64(%rip), %xmm8, %xmm7 + vpaddd L_vaes_aes_gcm_one(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, (%r12) + vpxor (%rdi), %xmm7, %xmm7 + vpclmulqdq $16, %xmm0, %xmm1, %xmm9 + vaesenc 16(%rdi), %xmm7, %xmm7 + vaesenc 32(%rdi), %xmm7, %xmm7 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm10 + vaesenc 48(%rdi), %xmm7, %xmm7 + vaesenc 64(%rdi), %xmm7, %xmm7 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm11 + vaesenc 80(%rdi), %xmm7, %xmm7 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm1 + vaesenc 96(%rdi), %xmm7, %xmm7 + vpxor %xmm10, %xmm9, %xmm9 + vpslldq $8, %xmm9, %xmm2 + vpsrldq $8, %xmm9, %xmm9 + vaesenc 112(%rdi), %xmm7, 
%xmm7 + vpxor %xmm11, %xmm2, %xmm2 + vpxor %xmm9, %xmm1, %xmm3 + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm0 + vpclmulqdq $16, %xmm0, %xmm2, %xmm10 + vaesenc 128(%rdi), %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm9 + vpxor %xmm10, %xmm9, %xmm9 + vpclmulqdq $16, %xmm0, %xmm9, %xmm10 + vaesenc 144(%rdi), %xmm7, %xmm7 + vpshufd $0x4e, %xmm9, %xmm9 + vpxor %xmm10, %xmm9, %xmm9 + vpxor %xmm3, %xmm9, %xmm15 + cmpl $11, %esi + vmovdqa 160(%rdi), %xmm8 + jl L_AES_GCM_decrypt_update_vaes_aesenc_gfmul_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 176(%rdi), %xmm7, %xmm7 + cmpl $13, %esi + vmovdqa 192(%rdi), %xmm8 + jl L_AES_GCM_decrypt_update_vaes_aesenc_gfmul_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 208(%rdi), %xmm7, %xmm7 + vmovdqa 224(%rdi), %xmm8 +L_AES_GCM_decrypt_update_vaes_aesenc_gfmul_last: + vaesenclast %xmm8, %xmm7, %xmm7 + vmovdqa %xmm12, %xmm0 + vpxor %xmm0, %xmm7, %xmm7 + vmovdqu %xmm7, (%r10,%r14,1) + addl $16, %r14d + cmpl %r13d, %r14d + jl L_AES_GCM_decrypt_update_vaes_last_block_start +L_AES_GCM_decrypt_update_vaes_last_block_done: +L_AES_GCM_decrypt_update_vaes_done_dec: + vmovdqa %xmm15, (%r9) + vzeroupper + addq $0x210, %rsp + popq %rbx + popq %r15 + popq %r14 + popq %r12 + popq %r13 + repz retq +#ifndef __APPLE__ +.size AES_GCM_decrypt_update_vaes,.-AES_GCM_decrypt_update_vaes +#endif /* __APPLE__ */ +#ifndef __APPLE__ /* AES_GCM_decrypt_final_vaes: completes the GHASH tag and checks it against a supplied tag. As read by the code below (register use is consistent with the System V AMD64 calling convention): %rdi -> 16-byte running GHASH value (aligned load); %rsi -> tag bytes to compare against; %edx = number of tag bytes to compare; %ecx and %r8d = two byte counts that are converted to bit counts and XORed into the hash (presumably the ciphertext and AAD lengths - confirm against the C prototype); %r9 -> 16-byte hash key H (aligned load); 7th argument (stack) -> 16 bytes XORed onto the final hash (presumably the encrypted initial counter block - confirm); 8th argument (stack) -> 32-bit result, written as 1 when the tags match and 0 otherwise. */ +.text +.globl AES_GCM_decrypt_final_vaes +.type AES_GCM_decrypt_final_vaes,@function +.align 16 +AES_GCM_decrypt_final_vaes: +#else +.section __TEXT,__text +.globl _AES_GCM_decrypt_final_vaes +.p2align 4 +_AES_GCM_decrypt_final_vaes: +#endif /* __APPLE__ */ + pushq %r13 + pushq %rbp + pushq %r12 /* three pushes: the stack arguments now start at 32(%rsp) */ + movl %edx, %eax /* tag byte count lives in %eax; %edx is reused below */ + movl %ecx, %r10d + movl %r8d, %r11d /* both byte counts parked so %ecx and %r8 can be reused */ + movq 32(%rsp), %r8 /* 7th argument */ + movq 40(%rsp), %rbp /* 8th argument: result pointer */ + subq $16, %rsp /* these 16 bytes are not addressed by any instruction in this function; the byte-wise compare path reserves its own */ + vmovdqa (%rdi), %xmm6 /* running GHASH value */ + vmovdqa (%r9), %xmm5 /* H */ + vmovdqa (%r8), %xmm15 /* block XORed onto the hash at the end */ + vpsrlq $63, %xmm5, %xmm8 /* this and the next 7 instructions: H shifted left one bit across all 128 bits, then XORed with the mod2_128 constant when the original top bit was set */ + vpsllq $0x01, %xmm5, %xmm7 + vpslldq $8, %xmm8, %xmm8 + vpor %xmm8, %xmm7, %xmm7 + vpshufd $0xff, %xmm5, %xmm5 + vpsrad $31, %xmm5,
%xmm5 + vpand L_vaes_aes_gcm_mod2_128(%rip), %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 + movl %r10d, %edx + movl %r11d, %ecx + shlq $3, %rdx /* bytes to bits */ + shlq $3, %rcx /* bytes to bits */ + vmovq %rdx, %xmm0 + vmovq %rcx, %xmm1 + vpunpcklqdq %xmm1, %xmm0, %xmm0 /* low quadword = bit count from the 4th argument, high quadword = bit count from the 5th argument */ + vpxor %xmm0, %xmm6, %xmm6 /* fold the length block into the hash */ + # ghash_gfmul_red_avx + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm7 /* four partial carry-less products of the hash and the shifted H */ + vpclmulqdq $0x01, %xmm5, %xmm6, %xmm8 + vpclmulqdq $16, %xmm5, %xmm6, %xmm9 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm10 + vpxor %xmm9, %xmm8, %xmm8 /* combine the two cross products */ + vmovdqa L_vaes_aes_gcm_mod2_128(%rip), %xmm9 /* reduction constant; two fold steps follow, leaving the result in xmm10 */ + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpxor %xmm11, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm8, %xmm10, %xmm10 + vmovdqa %xmm10, %xmm6 + vpshufb L_vaes_aes_gcm_bswap_mask(%rip), %xmm6, %xmm6 /* reorder bytes before the final XOR */ + vpxor %xmm15, %xmm6, %xmm0 /* xmm0 = computed tag */ + cmpl $16, %eax /* a 16-byte tag takes the vector compare below */ + je L_AES_GCM_decrypt_final_vaes_cmp_tag_16 + subq $16, %rsp + xorq %rcx, %rcx + xorq %r12, %r12 + vmovdqu %xmm0, (%rsp) /* spill the computed tag so it can be read byte by byte */ +L_AES_GCM_decrypt_final_vaes_cmp_tag_loop: + movzbl (%rsp,%rcx,1), %r13d + xorb (%rsi,%rcx,1), %r13b + orb %r13b, %r12b /* accumulate differences; no data-dependent branch inside the loop */ + incl %ecx + cmpl %eax, %ecx + jne L_AES_GCM_decrypt_final_vaes_cmp_tag_loop /* NOTE(review): the index is incremented before the compare, so a count of 0 would not exit after zero iterations, and a count above 16 would read past the 16 spilled bytes - presumably callers only pass 1..15 on this path; confirm */ + cmpb $0x00, %r12b + sete %r12b /* 1 when no byte differed */ + addq $16, %rsp + xorq %rcx, %rcx + jmp L_AES_GCM_decrypt_final_vaes_cmp_tag_done +L_AES_GCM_decrypt_final_vaes_cmp_tag_16: + vmovdqu (%rsi), %xmm1 + vpcmpeqb %xmm1, %xmm0, %xmm0 + vpmovmskb %xmm0, %rdx /* one mask bit per equal byte */ + # %%edx == 0xFFFF then return 1 else => return 0 + xorl %r12d, %r12d + cmpl $0xffff, %edx + sete %r12b +L_AES_GCM_decrypt_final_vaes_cmp_tag_done: + movl %r12d, (%rbp) /* store the match flag through the 8th argument */ + vzeroupper + addq $16, %rsp + popq %r12 + popq %rbp + popq %r13 + repz retq +#ifndef __APPLE__ +.size AES_GCM_decrypt_final_vaes,.-AES_GCM_decrypt_final_vaes +#endif /* __APPLE__ */ +#endif /* WOLFSSL_AESGCM_STREAM */ +#endif /* HAVE_INTEL_VAES */ +#ifdef HAVE_INTEL_AVX512 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef
__APPLE__ +.align 32 +#else +.p2align 5 +#endif /* __APPLE__ */ +L_avx512_aes_gcm_inc_z0: /* four 128-bit lanes; the upper quadword of lane k holds k (0..3). NOTE(review): the table is 64 bytes long but only 32-byte aligned - confirm that no alignment-checking 64-byte load targets this label or the three tables below */ +.quad 0x0000000000000000,0x0000000000000000 +.quad 0x0000000000000000,0x0000000000000001 +.quad 0x0000000000000000,0x0000000000000002 +.quad 0x0000000000000000,0x0000000000000003 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 32 +#else +.p2align 5 +#endif /* __APPLE__ */ +L_avx512_aes_gcm_inc_z1: /* same layout, upper quadwords 4..7 */ +.quad 0x0000000000000000,0x0000000000000004 +.quad 0x0000000000000000,0x0000000000000005 +.quad 0x0000000000000000,0x0000000000000006 +.quad 0x0000000000000000,0x0000000000000007 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 32 +#else +.p2align 5 +#endif /* __APPLE__ */ +L_avx512_aes_gcm_inc_z2: /* same layout, upper quadwords 8..11 */ +.quad 0x0000000000000000,0x0000000000000008 +.quad 0x0000000000000000,0x0000000000000009 +.quad 0x0000000000000000,0x000000000000000a +.quad 0x0000000000000000,0x000000000000000b +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 32 +#else +.p2align 5 +#endif /* __APPLE__ */ +L_avx512_aes_gcm_inc_z3: /* same layout, upper quadwords 12..15 */ +.quad 0x0000000000000000,0x000000000000000c +.quad 0x0000000000000000,0x000000000000000d +.quad 0x0000000000000000,0x000000000000000e +.quad 0x0000000000000000,0x000000000000000f +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_avx512_aes_gcm_sixteen: /* single 128-bit value, upper quadword = 16 */ +.quad 0x0000000000000000,0x0000000000000010 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_GCM_generate_m0_avx512_rev8: /* byte-shuffle mask whose byte i is 15 - i, i.e. a full 16-byte reversal */ +.quad 0x08090a0b0c0d0e0f,0x0001020304050607 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_GCM_generate_m0_avx512_mod2_128:
+.quad 0x0000000000000000,0xe100000000000000 /* upper quadword has 0xe1 in its top byte; XORed in after a one-bit right shift drops a set bit */ +#ifndef __APPLE__ /* GCM_generate_m0_avx512: fills a 512-byte table at %rsi from the 16-byte value at %rdi. Let v0 be the input and v1, v2, v3 the results of applying the shift step below one, two and three times (the step is a one-bit right shift of the byte-reversed value with a conditional XOR of the 0xe1 constant - presumably multiplication by x in the GHASH field; confirm). Entries 0..15 (16 bytes each, offsets 0..240) are the XOR combinations of v0, v1, v2, v3 selected by index bits 8, 4, 2, 1 respectively, with entry 0 all zero. Entries 16..31 (offsets 256..496) are entries 0..15 shifted right by four bits in byte-reversed order, without any reduction. Only xmm0-xmm10 and 128-bit VEX instructions are used in this routine. */ +.text +.globl GCM_generate_m0_avx512 +.type GCM_generate_m0_avx512,@function +.align 16 +GCM_generate_m0_avx512: +#else +.section __TEXT,__text +.globl _GCM_generate_m0_avx512 +.p2align 4 +_GCM_generate_m0_avx512: +#endif /* __APPLE__ */ + vmovdqu L_GCM_generate_m0_avx512_rev8(%rip), %xmm9 /* byte-reversal mask, kept for the whole routine */ + vmovdqu L_GCM_generate_m0_avx512_mod2_128(%rip), %xmm10 /* conditional XOR constant */ + vpxor %xmm8, %xmm8, %xmm8 + vmovdqu (%rdi), %xmm0 /* v0 = input */ + vmovdqu %xmm8, (%rsi) /* entry 0 = zero */ + vmovdqu %xmm0, %xmm8 /* NOTE(review): xmm8 is overwritten further down before it is read, so this copy appears to be dead */ + vpshufb %xmm9, %xmm0, %xmm0 /* work in byte-reversed order */ + vpsllq $63, %xmm0, %xmm5 /* this and the next 8 instructions: xmm1 = xmm0 shifted right one bit across all 128 bits, XORed with xmm10 when bit 0 of xmm0 was set */ + vpsrlq $0x01, %xmm0, %xmm4 + vpslldq $8, %xmm5, %xmm1 + vpsrldq $8, %xmm5, %xmm5 + vpshufd $0xff, %xmm1, %xmm1 + vpor %xmm5, %xmm4, %xmm4 + vpsrad $31, %xmm1, %xmm1 + vpand %xmm10, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpsllq $63, %xmm1, %xmm5 /* same step again: xmm2 from xmm1 */ + vpsrlq $0x01, %xmm1, %xmm4 + vpslldq $8, %xmm5, %xmm2 + vpsrldq $8, %xmm5, %xmm5 + vpshufd $0xff, %xmm2, %xmm2 + vpor %xmm5, %xmm4, %xmm4 + vpsrad $31, %xmm2, %xmm2 + vpand %xmm10, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpsllq $63, %xmm2, %xmm5 /* same step again: xmm3 from xmm2 */ + vpsrlq $0x01, %xmm2, %xmm4 + vpslldq $8, %xmm5, %xmm3 + vpsrldq $8, %xmm5, %xmm5 + vpshufd $0xff, %xmm3, %xmm3 + vpor %xmm5, %xmm4, %xmm4 + vpsrad $31, %xmm3, %xmm3 + vpand %xmm10, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpshufb %xmm9, %xmm3, %xmm3 /* these four shuffles restore the original byte order: xmm3 = v3, xmm2 = v2, xmm1 = v1, xmm0 = v0 */ + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm0, %xmm0 + vpxor %xmm2, %xmm3, %xmm8 /* v2 ^ v3 */ + vmovdqu %xmm3, 16(%rsi) /* entry 1 = v3 */ + vmovdqu %xmm2, 32(%rsi) /* entry 2 = v2 */ + vmovdqu %xmm8, 48(%rsi) /* entry 3 = v2 ^ v3 */ + vmovdqu %xmm1, 64(%rsi) /* entry 4 = v1 */ + vpxor %xmm1, %xmm3, %xmm4 + vpxor %xmm1, %xmm2, %xmm5 + vpxor %xmm1, %xmm8, %xmm6 + vmovdqu %xmm4, 80(%rsi) /* entries 5..7 = v1 XOR entries 1..3 */ + vmovdqu %xmm5, 96(%rsi) + vmovdqu %xmm6, 112(%rsi) + vmovdqu %xmm0, 128(%rsi) /* entry 8 = v0 */ + vpxor %xmm0, %xmm1, %xmm1 /* xmm1 = v0 ^ v1 from here on */ + vpxor %xmm0, %xmm3, %xmm4 + vpxor %xmm0, %xmm2, %xmm6 + vmovdqu %xmm4, 144(%rsi) /* entry 9 = v0 ^ v3 */ + vmovdqu %xmm6, 160(%rsi) /* entry 10 = v0 ^ v2 */ + vpxor %xmm6, %xmm3, %xmm6 + vmovdqu %xmm6, 176(%rsi) /* entry 11 = v0 ^ v2 ^ v3 */ + vmovdqu %xmm1, 192(%rsi) /* entry 12 = v0 ^ v1 */ + vpxor %xmm1, %xmm3, %xmm4 + vpxor
%xmm1, %xmm2, %xmm5 + vpxor %xmm1, %xmm8, %xmm6 + vmovdqu %xmm4, 208(%rsi) /* entries 13..15 = (v0 ^ v1) XOR entries 1..3 */ + vmovdqu %xmm5, 224(%rsi) + vmovdqu %xmm6, 240(%rsi) + vmovdqu (%rsi), %xmm0 /* second half, four entries per pass: reload entries 0..3, reverse bytes, shift right four bits across 128 bits, reverse back, store as entries 16..19 */ + vmovdqu 16(%rsi), %xmm1 + vmovdqu 32(%rsi), %xmm2 + vmovdqu 48(%rsi), %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vpsllq $60, %xmm0, %xmm4 /* low nibble of each quadword moved to its top */ + vpsllq $60, %xmm1, %xmm5 + vpsllq $60, %xmm2, %xmm6 + vpsllq $60, %xmm3, %xmm7 + vpsrlq $4, %xmm0, %xmm0 + vpsrlq $4, %xmm1, %xmm1 + vpsrlq $4, %xmm2, %xmm2 + vpsrlq $4, %xmm3, %xmm3 + vpsrldq $8, %xmm4, %xmm4 /* carries the high quadword's low nibble into the low quadword; the low quadword's own low nibble is dropped */ + vpsrldq $8, %xmm5, %xmm5 + vpsrldq $8, %xmm6, %xmm6 + vpsrldq $8, %xmm7, %xmm7 + vpor %xmm4, %xmm0, %xmm0 + vpor %xmm5, %xmm1, %xmm1 + vpor %xmm6, %xmm2, %xmm2 + vpor %xmm7, %xmm3, %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vmovdqu %xmm0, 256(%rsi) + vmovdqu %xmm1, 272(%rsi) + vmovdqu %xmm2, 288(%rsi) + vmovdqu %xmm3, 304(%rsi) + vmovdqu 64(%rsi), %xmm0 /* same pass for entries 4..7, stored as entries 20..23 */ + vmovdqu 80(%rsi), %xmm1 + vmovdqu 96(%rsi), %xmm2 + vmovdqu 112(%rsi), %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vpsllq $60, %xmm0, %xmm4 + vpsllq $60, %xmm1, %xmm5 + vpsllq $60, %xmm2, %xmm6 + vpsllq $60, %xmm3, %xmm7 + vpsrlq $4, %xmm0, %xmm0 + vpsrlq $4, %xmm1, %xmm1 + vpsrlq $4, %xmm2, %xmm2 + vpsrlq $4, %xmm3, %xmm3 + vpsrldq $8, %xmm4, %xmm4 + vpsrldq $8, %xmm5, %xmm5 + vpsrldq $8, %xmm6, %xmm6 + vpsrldq $8, %xmm7, %xmm7 + vpor %xmm4, %xmm0, %xmm0 + vpor %xmm5, %xmm1, %xmm1 + vpor %xmm6, %xmm2, %xmm2 + vpor %xmm7, %xmm3, %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vmovdqu %xmm0, 320(%rsi) + vmovdqu %xmm1, 336(%rsi) + vmovdqu %xmm2, 352(%rsi) + vmovdqu %xmm3, 368(%rsi) + vmovdqu 128(%rsi), %xmm0 /* same pass for entries 8..11, stored as entries 24..27 */ + vmovdqu 144(%rsi), %xmm1 + vmovdqu 160(%rsi), %xmm2 + vmovdqu 176(%rsi), %xmm3 +
vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vpsllq $60, %xmm0, %xmm4 + vpsllq $60, %xmm1, %xmm5 + vpsllq $60, %xmm2, %xmm6 + vpsllq $60, %xmm3, %xmm7 + vpsrlq $4, %xmm0, %xmm0 + vpsrlq $4, %xmm1, %xmm1 + vpsrlq $4, %xmm2, %xmm2 + vpsrlq $4, %xmm3, %xmm3 + vpsrldq $8, %xmm4, %xmm4 + vpsrldq $8, %xmm5, %xmm5 + vpsrldq $8, %xmm6, %xmm6 + vpsrldq $8, %xmm7, %xmm7 + vpor %xmm4, %xmm0, %xmm0 + vpor %xmm5, %xmm1, %xmm1 + vpor %xmm6, %xmm2, %xmm2 + vpor %xmm7, %xmm3, %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vmovdqu %xmm0, 384(%rsi) + vmovdqu %xmm1, 400(%rsi) + vmovdqu %xmm2, 416(%rsi) + vmovdqu %xmm3, 432(%rsi) + vmovdqu 192(%rsi), %xmm0 /* same pass for entries 12..15, stored as entries 28..31 */ + vmovdqu 208(%rsi), %xmm1 + vmovdqu 224(%rsi), %xmm2 + vmovdqu 240(%rsi), %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vpsllq $60, %xmm0, %xmm4 + vpsllq $60, %xmm1, %xmm5 + vpsllq $60, %xmm2, %xmm6 + vpsllq $60, %xmm3, %xmm7 + vpsrlq $4, %xmm0, %xmm0 + vpsrlq $4, %xmm1, %xmm1 + vpsrlq $4, %xmm2, %xmm2 + vpsrlq $4, %xmm3, %xmm3 + vpsrldq $8, %xmm4, %xmm4 + vpsrldq $8, %xmm5, %xmm5 + vpsrldq $8, %xmm6, %xmm6 + vpsrldq $8, %xmm7, %xmm7 + vpor %xmm4, %xmm0, %xmm0 + vpor %xmm5, %xmm1, %xmm1 + vpor %xmm6, %xmm2, %xmm2 + vpor %xmm7, %xmm3, %xmm3 + vpshufb %xmm9, %xmm0, %xmm0 + vpshufb %xmm9, %xmm1, %xmm1 + vpshufb %xmm9, %xmm2, %xmm2 + vpshufb %xmm9, %xmm3, %xmm3 + vmovdqu %xmm0, 448(%rsi) + vmovdqu %xmm1, 464(%rsi) + vmovdqu %xmm2, 480(%rsi) + vmovdqu %xmm3, 496(%rsi) /* last entry: the table ends at offset 512 */ + repz retq +#ifndef __APPLE__ +.size GCM_generate_m0_avx512,.-GCM_generate_m0_avx512 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_avx512_aes_gcm_one: /* 128-bit constant, upper quadword = 1 */ +.quad 0x0000000000000000,0x0000000000000001 +#ifndef
__APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_avx512_aes_gcm_two: /* 128-bit constant, upper quadword = 2; the labels three through eight below follow the same layout with their respective values */ +.quad 0x0000000000000000,0x0000000000000002 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_avx512_aes_gcm_three: +.quad 0x0000000000000000,0x0000000000000003 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_avx512_aes_gcm_four: +.quad 0x0000000000000000,0x0000000000000004 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_avx512_aes_gcm_five: +.quad 0x0000000000000000,0x0000000000000005 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_avx512_aes_gcm_six: +.quad 0x0000000000000000,0x0000000000000006 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_avx512_aes_gcm_seven: +.quad 0x0000000000000000,0x0000000000000007 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_avx512_aes_gcm_eight: +.quad 0x0000000000000000,0x0000000000000008 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_avx512_aes_gcm_bswap_epi64: /* byte-shuffle mask that reverses the bytes inside each 8-byte half separately */ +.quad 0x0001020304050607,0x08090a0b0c0d0e0f +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_avx512_aes_gcm_bswap_mask: +.quad
0x08090a0b0c0d0e0f,0x0001020304050607 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_avx512_aes_gcm_mod2_128: +.quad 0x0000000000000001,0xc200000000000000 +#ifndef __APPLE__ +.text +.globl AES_GCM_encrypt_avx512 +.type AES_GCM_encrypt_avx512,@function +.align 16 +AES_GCM_encrypt_avx512: +#else +.section __TEXT,__text +.globl _AES_GCM_encrypt_avx512 +.p2align 4 +_AES_GCM_encrypt_avx512: +#endif /* __APPLE__ */ + pushq %r13 + pushq %r12 + pushq %rbx + pushq %r14 + pushq %r15 + movq %rdx, %r12 + movq %rcx, %rax + movl 48(%rsp), %r11d + movl 56(%rsp), %ebx + movl 64(%rsp), %r14d + movq 72(%rsp), %r15 + movl 80(%rsp), %r10d + subq $0x440, %rsp + vpxor %xmm4, %xmm4, %xmm4 + vpxor %xmm6, %xmm6, %xmm6 + movl %ebx, %edx + cmpl $12, %edx + jne L_AES_GCM_encrypt_avx512_iv_not_12 + # # Calculate values when IV is 12 bytes + # Set counter based on IV + movl $0x1000000, %ecx + vmovq (%rax), %xmm4 + vpinsrd $2, 8(%rax), %xmm4, %xmm4 + vpinsrd $3, %ecx, %xmm4, %xmm4 + # H = Encrypt X(=0) and T = Encrypt counter + vmovdqa (%r15), %xmm5 + vpxor %xmm5, %xmm4, %xmm1 + vmovdqa 16(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 32(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 48(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 64(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 80(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 96(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 112(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 128(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 144(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm7 + jl 
L_AES_GCM_encrypt_avx512_calc_iv_12_last + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 176(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm7 + jl L_AES_GCM_encrypt_avx512_calc_iv_12_last + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 208(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 224(%r15), %xmm7 +L_AES_GCM_encrypt_avx512_calc_iv_12_last: + vaesenclast %xmm7, %xmm5, %xmm5 + vaesenclast %xmm7, %xmm1, %xmm1 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5 + vmovdqu %xmm1, 1040(%rsp) + jmp L_AES_GCM_encrypt_avx512_iv_done +L_AES_GCM_encrypt_avx512_iv_not_12: + # Calculate values when IV is not 12 bytes + # H = Encrypt X(=0) + vmovdqa (%r15), %xmm5 + vaesenc 16(%r15), %xmm5, %xmm5 + vaesenc 32(%r15), %xmm5, %xmm5 + vaesenc 48(%r15), %xmm5, %xmm5 + vaesenc 64(%r15), %xmm5, %xmm5 + vaesenc 80(%r15), %xmm5, %xmm5 + vaesenc 96(%r15), %xmm5, %xmm5 + vaesenc 112(%r15), %xmm5, %xmm5 + vaesenc 128(%r15), %xmm5, %xmm5 + vaesenc 144(%r15), %xmm5, %xmm5 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm9 + jl L_AES_GCM_encrypt_avx512_calc_iv_1_aesenc_avx_last + vaesenc %xmm9, %xmm5, %xmm5 + vaesenc 176(%r15), %xmm5, %xmm5 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm9 + jl L_AES_GCM_encrypt_avx512_calc_iv_1_aesenc_avx_last + vaesenc %xmm9, %xmm5, %xmm5 + vaesenc 208(%r15), %xmm5, %xmm5 + vmovdqa 224(%r15), %xmm9 +L_AES_GCM_encrypt_avx512_calc_iv_1_aesenc_avx_last: + vaesenclast %xmm9, %xmm5, %xmm5 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5 + # Calc counter + # Initialization vector + cmpl $0x00, %edx + movq $0x00, %rcx + je L_AES_GCM_encrypt_avx512_calc_iv_done + cmpl $16, %edx + jl L_AES_GCM_encrypt_avx512_calc_iv_lt16 + andl $0xfffffff0, %edx +L_AES_GCM_encrypt_avx512_calc_iv_16_loop: + vmovdqu (%rax,%rcx,1), %xmm8 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8 + vpxor %xmm8, %xmm4, %xmm4 + # 
ghash_gfmul_avx + vpshufd $0x4e, %xmm4, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm7 + vmovdqa %xmm3, %xmm4 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm4, %xmm4 + vpsrld $31, %xmm7, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm7, %xmm7 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm7, %xmm7 + vpor %xmm1, %xmm4, %xmm4 + vpslld $31, %xmm7, %xmm0 + vpslld $30, %xmm7, %xmm1 + vpslld $25, %xmm7, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm7, %xmm7 + vpsrld $0x01, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $7, %xmm7, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_encrypt_avx512_calc_iv_16_loop + movl %ebx, %edx + cmpl %edx, %ecx + je L_AES_GCM_encrypt_avx512_calc_iv_done +L_AES_GCM_encrypt_avx512_calc_iv_lt16: + subq $16, %rsp + vpxor %xmm8, %xmm8, %xmm8 + xorl %ebx, %ebx + vmovdqu %xmm8, (%rsp) +L_AES_GCM_encrypt_avx512_calc_iv_loop: + movzbl (%rax,%rcx,1), %r13d + movb %r13b, (%rsp,%rbx,1) + incl %ecx + incl %ebx + cmpl %edx, %ecx + jl L_AES_GCM_encrypt_avx512_calc_iv_loop + vmovdqu (%rsp), %xmm8 + addq $16, %rsp + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8 + vpxor %xmm8, %xmm4, %xmm4 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm4, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vpxor 
%xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm7 + vmovdqa %xmm3, %xmm4 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm4, %xmm4 + vpsrld $31, %xmm7, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm7, %xmm7 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm7, %xmm7 + vpor %xmm1, %xmm4, %xmm4 + vpslld $31, %xmm7, %xmm0 + vpslld $30, %xmm7, %xmm1 + vpslld $25, %xmm7, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm7, %xmm7 + vpsrld $0x01, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $7, %xmm7, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 +L_AES_GCM_encrypt_avx512_calc_iv_done: + # T = Encrypt counter + vpxor %xmm0, %xmm0, %xmm0 + shll $3, %edx + vmovq %rdx, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm4, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm7 + vmovdqa %xmm3, %xmm4 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm4, %xmm4 + vpsrld $31, %xmm7, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm7, %xmm7 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm7, %xmm7 + vpor %xmm1, %xmm4, %xmm4 + vpslld $31, %xmm7, %xmm0 + vpslld $30, %xmm7, %xmm1 + vpslld $25, %xmm7, %xmm2 + vpxor 
%xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm7, %xmm7 + vpsrld $0x01, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $7, %xmm7, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4 + # Encrypt counter + vmovdqa (%r15), %xmm8 + vpxor %xmm4, %xmm8, %xmm8 + vaesenc 16(%r15), %xmm8, %xmm8 + vaesenc 32(%r15), %xmm8, %xmm8 + vaesenc 48(%r15), %xmm8, %xmm8 + vaesenc 64(%r15), %xmm8, %xmm8 + vaesenc 80(%r15), %xmm8, %xmm8 + vaesenc 96(%r15), %xmm8, %xmm8 + vaesenc 112(%r15), %xmm8, %xmm8 + vaesenc 128(%r15), %xmm8, %xmm8 + vaesenc 144(%r15), %xmm8, %xmm8 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm9 + jl L_AES_GCM_encrypt_avx512_calc_iv_2_aesenc_avx_last + vaesenc %xmm9, %xmm8, %xmm8 + vaesenc 176(%r15), %xmm8, %xmm8 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm9 + jl L_AES_GCM_encrypt_avx512_calc_iv_2_aesenc_avx_last + vaesenc %xmm9, %xmm8, %xmm8 + vaesenc 208(%r15), %xmm8, %xmm8 + vmovdqa 224(%r15), %xmm9 +L_AES_GCM_encrypt_avx512_calc_iv_2_aesenc_avx_last: + vaesenclast %xmm9, %xmm8, %xmm8 + vmovdqu %xmm8, 1040(%rsp) +L_AES_GCM_encrypt_avx512_iv_done: + # Additional authentication data + movl %r11d, %edx + cmpl $0x00, %edx + je L_AES_GCM_encrypt_avx512_calc_aad_done + xorl %ecx, %ecx + cmpl $16, %edx + jl L_AES_GCM_encrypt_avx512_calc_aad_lt16 + andl $0xfffffff0, %edx +L_AES_GCM_encrypt_avx512_calc_aad_16_loop: + vmovdqu (%r12,%rcx,1), %xmm8 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8 + vpxor %xmm8, %xmm6, %xmm6 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm6, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm6, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm6, %xmm5, %xmm0 + vpxor %xmm6, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, 
%xmm1 + vmovdqa %xmm0, %xmm7 + vmovdqa %xmm3, %xmm6 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm6, %xmm6 + vpsrld $31, %xmm7, %xmm0 + vpsrld $31, %xmm6, %xmm1 + vpslld $0x01, %xmm7, %xmm7 + vpslld $0x01, %xmm6, %xmm6 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm6, %xmm6 + vpor %xmm0, %xmm7, %xmm7 + vpor %xmm1, %xmm6, %xmm6 + vpslld $31, %xmm7, %xmm0 + vpslld $30, %xmm7, %xmm1 + vpslld $25, %xmm7, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm7, %xmm7 + vpsrld $0x01, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $7, %xmm7, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm2, %xmm6, %xmm6 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_encrypt_avx512_calc_aad_16_loop + movl %r11d, %edx + cmpl %edx, %ecx + je L_AES_GCM_encrypt_avx512_calc_aad_done +L_AES_GCM_encrypt_avx512_calc_aad_lt16: + subq $16, %rsp + vpxor %xmm8, %xmm8, %xmm8 + xorl %ebx, %ebx + vmovdqu %xmm8, (%rsp) +L_AES_GCM_encrypt_avx512_calc_aad_loop: + movzbl (%r12,%rcx,1), %r13d + movb %r13b, (%rsp,%rbx,1) + incl %ecx + incl %ebx + cmpl %edx, %ecx + jl L_AES_GCM_encrypt_avx512_calc_aad_loop + vmovdqu (%rsp), %xmm8 + addq $16, %rsp + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8 + vpxor %xmm8, %xmm6, %xmm6 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm6, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm6, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm6, %xmm5, %xmm0 + vpxor %xmm6, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm7 + vmovdqa %xmm3, %xmm6 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm6, %xmm6 + vpsrld $31, %xmm7, %xmm0 
+ vpsrld $31, %xmm6, %xmm1 + vpslld $0x01, %xmm7, %xmm7 + vpslld $0x01, %xmm6, %xmm6 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm6, %xmm6 + vpor %xmm0, %xmm7, %xmm7 + vpor %xmm1, %xmm6, %xmm6 + vpslld $31, %xmm7, %xmm0 + vpslld $30, %xmm7, %xmm1 + vpslld $25, %xmm7, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm7, %xmm7 + vpsrld $0x01, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $7, %xmm7, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm2, %xmm6, %xmm6 +L_AES_GCM_encrypt_avx512_calc_aad_done: + # Calculate counter and H + vpsrlq $63, %xmm5, %xmm9 + vpsllq $0x01, %xmm5, %xmm8 + vpslldq $8, %xmm9, %xmm9 + vpor %xmm9, %xmm8, %xmm8 + vpshufd $0xff, %xmm5, %xmm5 + vpsrad $31, %xmm5, %xmm5 + vpshufb L_avx512_aes_gcm_bswap_epi64(%rip), %xmm4, %xmm4 + vpand L_avx512_aes_gcm_mod2_128(%rip), %xmm5, %xmm5 + vpaddd L_avx512_aes_gcm_one(%rip), %xmm4, %xmm4 + vpxor %xmm8, %xmm5, %xmm5 + vmovdqu %xmm4, 1024(%rsp) + xorl %ebx, %ebx + cmpl $0x100, %r9d + jl L_AES_GCM_encrypt_avx512_done_128 + vmovdqa %xmm6, %xmm2 + # H ^ 1 + vmovdqu %xmm5, (%rsp) + # H ^ 2 + vpclmulqdq $0x00, %xmm5, %xmm5, %xmm8 + vpclmulqdq $0x11, %xmm5, %xmm5, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm0 + vmovdqu %xmm0, 16(%rsp) + # H ^ 3 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm5, %xmm9 + vpxor %xmm5, %xmm9, %xmm9 + vpshufd $0x4e, %xmm0, %xmm10 + vpxor %xmm0, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm5, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm5, %xmm0, %xmm11 + vpclmulqdq 
$0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm1 + vmovdqu %xmm1, 32(%rsp) + # H ^ 4 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm3 + vmovdqu %xmm3, 48(%rsp) + # H ^ 5 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 64(%rsp) + # H ^ 6 + vpclmulqdq $0x00, %xmm1, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm1, %xmm1, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 80(%rsp) + # H ^ 7 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm1, %xmm9 + 
vpxor %xmm1, %xmm9, %xmm9 + vpshufd $0x4e, %xmm3, %xmm10 + vpxor %xmm3, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm1, %xmm3, %xmm8 + vpclmulqdq $0x11, %xmm1, %xmm3, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 96(%rsp) + # H ^ 8 + vpclmulqdq $0x00, %xmm3, %xmm3, %xmm8 + vpclmulqdq $0x11, %xmm3, %xmm3, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 112(%rsp) + # H ^ 9 + vmovdqu 48(%rsp), %xmm0 + vmovdqu 64(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 128(%rsp) + # H ^ 10 + vmovdqu 64(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + 
vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 144(%rsp) + # H ^ 11 + vmovdqu 64(%rsp), %xmm0 + vmovdqu 80(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 160(%rsp) + # H ^ 12 + vmovdqu 80(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 176(%rsp) + # H ^ 13 + vmovdqu 80(%rsp), %xmm0 + vmovdqu 96(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, 
%xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 192(%rsp) + # H ^ 14 + vmovdqu 96(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 208(%rsp) + # H ^ 15 + vmovdqu 96(%rsp), %xmm0 + vmovdqu 112(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 224(%rsp) + # H ^ 16 + vmovdqu 112(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 240(%rsp) + cmpl $0x200, %r9d + jl L_AES_GCM_encrypt_avx512_no_ext + # H ^ 17 + vmovdqu 112(%rsp), %xmm0 + vmovdqu 128(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq 
$0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 256(%rsp) + # H ^ 18 + vmovdqu 128(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 272(%rsp) + # H ^ 19 + vmovdqu 128(%rsp), %xmm0 + vmovdqu 144(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 288(%rsp) + # H ^ 20 + vmovdqu 144(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, 
%xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 304(%rsp) + # H ^ 21 + vmovdqu 144(%rsp), %xmm0 + vmovdqu 160(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 320(%rsp) + # H ^ 22 + vmovdqu 160(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 336(%rsp) + # H ^ 23 + vmovdqu 160(%rsp), %xmm0 + vmovdqu 176(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 
352(%rsp) + # H ^ 24 + vmovdqu 176(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 368(%rsp) + # H ^ 25 + vmovdqu 176(%rsp), %xmm0 + vmovdqu 192(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 384(%rsp) + # H ^ 26 + vmovdqu 192(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 400(%rsp) + # H ^ 27 + vmovdqu 192(%rsp), %xmm0 + vmovdqu 208(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + 
vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 416(%rsp) + # H ^ 28 + vmovdqu 208(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 432(%rsp) + # H ^ 29 + vmovdqu 208(%rsp), %xmm0 + vmovdqu 224(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 448(%rsp) + # H ^ 30 + vmovdqu 224(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu 
%xmm7, 464(%rsp) + # H ^ 31 + vmovdqu 224(%rsp), %xmm0 + vmovdqu 240(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 480(%rsp) + # H ^ 32 + vmovdqu 240(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 496(%rsp) +L_AES_GCM_encrypt_avx512_no_ext: + vbroadcasti32x4 L_avx512_aes_gcm_bswap_epi64(%rip), %zmm22 + vbroadcasti32x4 L_avx512_aes_gcm_bswap_mask(%rip), %zmm30 + vbroadcasti32x4 L_avx512_aes_gcm_mod2_128(%rip), %zmm31 + vbroadcasti32x4 (%r15), %zmm9 + vbroadcasti32x4 16(%r15), %zmm10 + vbroadcasti32x4 32(%r15), %zmm11 + vbroadcasti32x4 48(%r15), %zmm12 + vbroadcasti32x4 64(%r15), %zmm13 + vbroadcasti32x4 80(%r15), %zmm14 + vbroadcasti32x4 96(%r15), %zmm15 + vbroadcasti32x4 112(%r15), %zmm1 + vbroadcasti32x4 128(%r15), %zmm2 + vbroadcasti32x4 144(%r15), %zmm3 + cmpl $0x200, %r9d + jl L_AES_GCM_encrypt_avx512_no_windows + movl %r9d, %r13d + andl $0xfffffe00, %r13d + vmovdqu64 448(%rsp), %zmm23 + vshufi64x2 $27, %zmm23, %zmm23, %zmm23 + vmovdqu64 384(%rsp), %zmm24 + vshufi64x2 $27, %zmm24, %zmm24, %zmm24 
+ vmovdqu64 320(%rsp), %zmm25 + vshufi64x2 $27, %zmm25, %zmm25, %zmm25 + vmovdqu64 256(%rsp), %zmm26 + vshufi64x2 $27, %zmm26, %zmm26, %zmm26 + vmovdqu64 %zmm23, 512(%rsp) + vmovdqu64 %zmm24, 576(%rsp) + vmovdqu64 %zmm25, 640(%rsp) + vmovdqu64 %zmm26, 704(%rsp) + vmovdqu64 192(%rsp), %zmm23 + vshufi64x2 $27, %zmm23, %zmm23, %zmm23 + vmovdqu64 128(%rsp), %zmm24 + vshufi64x2 $27, %zmm24, %zmm24, %zmm24 + vmovdqu64 64(%rsp), %zmm25 + vshufi64x2 $27, %zmm25, %zmm25, %zmm25 + vmovdqu64 (%rsp), %zmm26 + vshufi64x2 $27, %zmm26, %zmm26, %zmm26 + vmovdqu64 %zmm23, 768(%rsp) + vmovdqu64 %zmm24, 832(%rsp) + vmovdqu64 %zmm25, 896(%rsp) + vmovdqu64 %zmm26, 960(%rsp) + # 512 bytes of input + leaq (%rsi,%rbx,1), %rcx + movq %rcx, 1056(%rsp) + vbroadcasti32x4 1024(%rsp), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu 1024(%rsp), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, 1024(%rsp) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc 
%zmm14, %zmm19, %zmm19 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %r10d + vbroadcasti32x4 160(%r15), %zmm20 + jl L_AES_GCM_encrypt_avx512_p1_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %r10d + vbroadcasti32x4 192(%r15), %zmm20 + jl L_AES_GCM_encrypt_avx512_p1_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%r15), %zmm20 +L_AES_GCM_encrypt_avx512_p1_avx512_ctr16_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%rdi,%rbx,1), %rcx + leaq (%rsi,%rbx,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + 
addl $0x100, %ebx + vbroadcasti32x4 1024(%rsp), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu 1024(%rsp), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, 1024(%rsp) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %r10d + vbroadcasti32x4 160(%r15), %zmm20 + jl L_AES_GCM_encrypt_avx512_p2_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc 
%zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %r10d + vbroadcasti32x4 192(%r15), %zmm20 + jl L_AES_GCM_encrypt_avx512_p2_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%r15), %zmm20 +L_AES_GCM_encrypt_avx512_p2_avx512_ctr16_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%rdi,%rbx,1), %rcx + leaq (%rsi,%rbx,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + addl $0x100, %ebx + cmpl %r13d, %ebx + jge L_AES_GCM_encrypt_avx512_last_win +L_AES_GCM_encrypt_avx512_win_loop: + leaq (%rsi,%rbx,1), %rcx + movq %rcx, 1072(%rsp) + movq 1056(%rsp), %r12 + vpxorq %zmm21, %zmm21, %zmm21 + vinserti32x4 $0x00, %xmm6, %zmm21, %zmm21 + vbroadcasti32x4 1024(%rsp), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu 1024(%rsp), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), 
%xmm8, %xmm8 + vmovdqu %xmm8, 1024(%rsp) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vmovdqu64 (%r12), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vpxorq %zmm21, %zmm31, %zmm31 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vpclmulqdq $0x00, 512(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 512(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 512(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 512(%rsp), %zmm31, %zmm26 + vmovdqa64 %zmm23, %zmm27 + vpxorq %zmm24, %zmm25, %zmm28 + vmovdqa64 %zmm26, %zmm29 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vmovdqu64 64(%r12), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vpclmulqdq $0x00, 576(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 576(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 576(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 576(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vmovdqu64 128(%r12), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vpclmulqdq $0x00, 640(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 640(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 640(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 640(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc 
%zmm15, %zmm19, %zmm19 + vmovdqu64 192(%r12), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vpclmulqdq $0x00, 704(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 704(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 704(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 704(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %r10d + vbroadcasti32x4 160(%r15), %zmm20 + jl L_AES_GCM_encrypt_avx512_a_il_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %r10d + vbroadcasti32x4 192(%r15), %zmm20 + jl L_AES_GCM_encrypt_avx512_a_il_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%r15), %zmm20 +L_AES_GCM_encrypt_avx512_a_il_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%rdi,%rbx,1), %rcx + leaq (%rsi,%rbx,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + 
vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + addl $0x100, %ebx + vbroadcasti32x4 1024(%rsp), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu 1024(%rsp), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, 1024(%rsp) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vmovdqu64 256(%r12), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vpclmulqdq $0x00, 768(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 768(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 768(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 768(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vmovdqu64 320(%r12), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vpclmulqdq $0x00, 832(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 832(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 832(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 832(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + 
vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vmovdqu64 384(%r12), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vpclmulqdq $0x00, 896(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 896(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 896(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 896(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vmovdqu64 448(%r12), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vpclmulqdq $0x00, 960(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 960(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 960(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 960(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %r10d + vbroadcasti32x4 160(%r15), %zmm20 + jl L_AES_GCM_encrypt_avx512_b_il_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %r10d + vbroadcasti32x4 192(%r15), %zmm20 + jl L_AES_GCM_encrypt_avx512_b_il_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, 
%zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%r15), %zmm20 +L_AES_GCM_encrypt_avx512_b_il_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%rdi,%rbx,1), %rcx + leaq (%rsi,%rbx,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + addl $0x100, %ebx + vbroadcasti32x4 L_avx512_aes_gcm_mod2_128(%rip), %zmm31 + vpclmulqdq $0x01, %zmm27, %zmm31, %zmm23 + vpshufd $0x4e, %zmm27, %zmm27 + vpternlogq $0x96, %zmm23, %zmm27, %zmm28 + vpclmulqdq $0x01, %zmm28, %zmm31, %zmm23 + vpshufd $0x4e, %zmm28, %zmm28 + vpternlogq $0x96, %zmm23, %zmm28, %zmm29 + vextracti32x4 $0x01, %zmm29, %xmm0 + vextracti32x4 $2, %zmm29, %xmm4 + vextracti32x4 $3, %zmm29, %xmm5 + vpxorq %xmm0, %xmm29, %xmm6 + vpternlogq $0x96, %xmm4, %xmm5, %xmm6 + movq 1072(%rsp), %rcx + movq %rcx, 1056(%rsp) + cmpl %r13d, %ebx + jl L_AES_GCM_encrypt_avx512_win_loop +L_AES_GCM_encrypt_avx512_last_win: + movq 1056(%rsp), %rcx + vpxorq %zmm20, %zmm20, %zmm20 + vinserti32x4 $0x00, %xmm6, %zmm20, %zmm20 + vmovdqu64 512(%rsp), %zmm23 + vmovdqu64 576(%rsp), %zmm24 + vmovdqu64 640(%rsp), %zmm25 + vmovdqu64 704(%rsp), %zmm26 + vmovdqu64 (%rcx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpxorq %zmm20, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17 + vpclmulqdq $16, %zmm23, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19 + vmovdqa64 %zmm16, %zmm27 + vpxorq %zmm17, 
%zmm18, %zmm28 + vmovdqa64 %zmm19, %zmm29 + vmovdqu64 64(%rcx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17 + vpclmulqdq $16, %zmm24, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 128(%rcx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17 + vpclmulqdq $16, %zmm25, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 192(%rcx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17 + vpclmulqdq $16, %zmm26, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 768(%rsp), %zmm23 + vmovdqu64 832(%rsp), %zmm24 + vmovdqu64 896(%rsp), %zmm25 + vmovdqu64 960(%rsp), %zmm26 + vmovdqu64 256(%rcx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17 + vpclmulqdq $16, %zmm23, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 320(%rcx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17 + vpclmulqdq $16, %zmm24, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 384(%rcx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm25, 
%zmm21, %zmm17 + vpclmulqdq $16, %zmm25, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 448(%rcx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17 + vpclmulqdq $16, %zmm26, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vpclmulqdq $0x01, %zmm27, %zmm31, %zmm21 + vpshufd $0x4e, %zmm27, %zmm27 + vpternlogq $0x96, %zmm21, %zmm27, %zmm28 + vpclmulqdq $0x01, %zmm28, %zmm31, %zmm21 + vpshufd $0x4e, %zmm28, %zmm28 + vpternlogq $0x96, %zmm21, %zmm28, %zmm29 + vextracti32x4 $0x01, %zmm29, %xmm0 + vextracti32x4 $2, %zmm29, %xmm4 + vextracti32x4 $3, %zmm29, %xmm5 + vpxorq %xmm0, %xmm29, %xmm6 + vpternlogq $0x96, %xmm4, %xmm5, %xmm6 +L_AES_GCM_encrypt_avx512_no_windows: + vmovdqu64 192(%rsp), %zmm23 + vshufi64x2 $27, %zmm23, %zmm23, %zmm23 + vmovdqu64 128(%rsp), %zmm24 + vshufi64x2 $27, %zmm24, %zmm24, %zmm24 + vmovdqu64 64(%rsp), %zmm25 + vshufi64x2 $27, %zmm25, %zmm25, %zmm25 + vmovdqu64 (%rsp), %zmm26 + vshufi64x2 $27, %zmm26, %zmm26, %zmm26 + movl %r9d, %r13d + andl $0xffffff00, %r13d + cmpl %r13d, %ebx + jge L_AES_GCM_encrypt_avx512_after_256 + # 256 bytes of input + vbroadcasti32x4 1024(%rsp), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu 1024(%rsp), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, 1024(%rsp) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + 
vpxorq %zmm9, %zmm19, %zmm19 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %r10d + vbroadcasti32x4 160(%r15), %zmm20 + jl L_AES_GCM_encrypt_avx512_pro_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %r10d + vbroadcasti32x4 192(%r15), %zmm20 + jl L_AES_GCM_encrypt_avx512_pro_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc 
%zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%r15), %zmm20 +L_AES_GCM_encrypt_avx512_pro_avx512_ctr16_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%rdi,%rbx,1), %rcx + leaq (%rsi,%rbx,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + movq %rdx, 1056(%rsp) + addl $0x100, %ebx + cmpl %r13d, %ebx + jge L_AES_GCM_encrypt_avx512_last_ghash +L_AES_GCM_encrypt_avx512_ghash_128: + vbroadcasti32x4 1024(%rsp), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu 1024(%rsp), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, 1024(%rsp) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vaesenc %zmm14, %zmm16, %zmm16 + 
vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %r10d + vbroadcasti32x4 160(%r15), %zmm20 + jl L_AES_GCM_encrypt_avx512_pip_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %r10d + vbroadcasti32x4 192(%r15), %zmm20 + jl L_AES_GCM_encrypt_avx512_pip_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%r15), %zmm20 +L_AES_GCM_encrypt_avx512_pip_avx512_ctr16_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%rdi,%rbx,1), %rcx + leaq (%rsi,%rbx,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 
192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + movq 1056(%rsp), %rcx + vpxorq %zmm20, %zmm20, %zmm20 + vinserti32x4 $0x00, %xmm6, %zmm20, %zmm20 + vmovdqu64 (%rcx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpxorq %zmm20, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17 + vpclmulqdq $16, %zmm23, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19 + vmovdqa64 %zmm16, %zmm27 + vpxorq %zmm17, %zmm18, %zmm28 + vmovdqa64 %zmm19, %zmm29 + vmovdqu64 64(%rcx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17 + vpclmulqdq $16, %zmm24, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 128(%rcx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17 + vpclmulqdq $16, %zmm25, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 192(%rcx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17 + vpclmulqdq $16, %zmm26, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vpclmulqdq $0x01, %zmm27, %zmm31, %zmm21 + vpshufd $0x4e, %zmm27, %zmm27 + vpternlogq $0x96, %zmm21, %zmm27, %zmm28 + vpclmulqdq $0x01, %zmm28, %zmm31, %zmm21 + vpshufd $0x4e, %zmm28, %zmm28 + vpternlogq $0x96, %zmm21, %zmm28, %zmm29 + vextracti32x4 $0x01, %zmm29, %xmm0 + vextracti32x4 $2, %zmm29, %xmm4 + vextracti32x4 $3, %zmm29, %xmm5 + vpxorq %xmm0, %xmm29, %xmm6 + vpternlogq $0x96, %xmm4, %xmm5, %xmm6 + movq %rdx, 1056(%rsp) + addl $0x100, 
%ebx + cmpl %r13d, %ebx + jl L_AES_GCM_encrypt_avx512_ghash_128 +L_AES_GCM_encrypt_avx512_last_ghash: + movq 1056(%rsp), %rcx + vpxorq %zmm20, %zmm20, %zmm20 + vinserti32x4 $0x00, %xmm6, %zmm20, %zmm20 + vmovdqu64 (%rcx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpxorq %zmm20, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17 + vpclmulqdq $16, %zmm23, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19 + vmovdqa64 %zmm16, %zmm27 + vpxorq %zmm17, %zmm18, %zmm28 + vmovdqa64 %zmm19, %zmm29 + vmovdqu64 64(%rcx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17 + vpclmulqdq $16, %zmm24, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 128(%rcx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17 + vpclmulqdq $16, %zmm25, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 192(%rcx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17 + vpclmulqdq $16, %zmm26, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vpclmulqdq $0x01, %zmm27, %zmm31, %zmm21 + vpshufd $0x4e, %zmm27, %zmm27 + vpternlogq $0x96, %zmm21, %zmm27, %zmm28 + vpclmulqdq $0x01, %zmm28, %zmm31, %zmm21 + vpshufd $0x4e, %zmm28, %zmm28 + vpternlogq $0x96, %zmm21, %zmm28, %zmm29 + vextracti32x4 $0x01, %zmm29, %xmm0 + vextracti32x4 $2, %zmm29, %xmm4 + vextracti32x4 $3, %zmm29, %xmm5 + vpxorq %xmm0, %xmm29, %xmm6 + vpternlogq $0x96, %xmm4, %xmm5, %xmm6 
+L_AES_GCM_encrypt_avx512_after_256: + vmovdqu (%rsp), %xmm5 +L_AES_GCM_encrypt_avx512_done_128: + movl %r9d, %edx + cmpl %edx, %ebx + jge L_AES_GCM_encrypt_avx512_done_enc + movl %r9d, %r13d + andl $0xfffffff0, %r13d + cmpl %r13d, %ebx + jge L_AES_GCM_encrypt_avx512_last_block_done + vmovdqu 1024(%rsp), %xmm9 + vpshufb L_avx512_aes_gcm_bswap_epi64(%rip), %xmm9, %xmm8 + vpaddd L_avx512_aes_gcm_one(%rip), %xmm9, %xmm9 + vmovdqu %xmm9, 1024(%rsp) + vpxor (%r15), %xmm8, %xmm8 + vaesenc 16(%r15), %xmm8, %xmm8 + vaesenc 32(%r15), %xmm8, %xmm8 + vaesenc 48(%r15), %xmm8, %xmm8 + vaesenc 64(%r15), %xmm8, %xmm8 + vaesenc 80(%r15), %xmm8, %xmm8 + vaesenc 96(%r15), %xmm8, %xmm8 + vaesenc 112(%r15), %xmm8, %xmm8 + vaesenc 128(%r15), %xmm8, %xmm8 + vaesenc 144(%r15), %xmm8, %xmm8 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm9 + jl L_AES_GCM_encrypt_avx512_aesenc_block_last + vaesenc %xmm9, %xmm8, %xmm8 + vaesenc 176(%r15), %xmm8, %xmm8 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm9 + jl L_AES_GCM_encrypt_avx512_aesenc_block_last + vaesenc %xmm9, %xmm8, %xmm8 + vaesenc 208(%r15), %xmm8, %xmm8 + vmovdqa 224(%r15), %xmm9 +L_AES_GCM_encrypt_avx512_aesenc_block_last: + vaesenclast %xmm9, %xmm8, %xmm8 + vmovdqu (%rdi,%rbx,1), %xmm9 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqu %xmm8, (%rsi,%rbx,1) + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8 + vpxor %xmm8, %xmm6, %xmm6 + addl $16, %ebx + cmpl %r13d, %ebx + jge L_AES_GCM_encrypt_avx512_last_block_ghash +L_AES_GCM_encrypt_avx512_last_block_start: + vmovdqu (%rdi,%rbx,1), %xmm13 + vmovdqu 1024(%rsp), %xmm9 + vpshufb L_avx512_aes_gcm_bswap_epi64(%rip), %xmm9, %xmm8 + vpaddd L_avx512_aes_gcm_one(%rip), %xmm9, %xmm9 + vmovdqu %xmm9, 1024(%rsp) + vpxor (%r15), %xmm8, %xmm8 + vpclmulqdq $16, %xmm5, %xmm6, %xmm10 + vaesenc 16(%r15), %xmm8, %xmm8 + vaesenc 32(%r15), %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm5, %xmm6, %xmm11 + vaesenc 48(%r15), %xmm8, %xmm8 + vaesenc 64(%r15), %xmm8, %xmm8 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm12 + vaesenc 
80(%r15), %xmm8, %xmm8 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm1 + vaesenc 96(%r15), %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpslldq $8, %xmm10, %xmm2 + vpsrldq $8, %xmm10, %xmm10 + vaesenc 112(%r15), %xmm8, %xmm8 + vpxor %xmm12, %xmm2, %xmm2 + vpxor %xmm10, %xmm1, %xmm3 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm0 + vpclmulqdq $16, %xmm0, %xmm2, %xmm11 + vaesenc 128(%r15), %xmm8, %xmm8 + vpshufd $0x4e, %xmm2, %xmm10 + vpxor %xmm11, %xmm10, %xmm10 + vpclmulqdq $16, %xmm0, %xmm10, %xmm11 + vaesenc 144(%r15), %xmm8, %xmm8 + vpshufd $0x4e, %xmm10, %xmm10 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm3, %xmm10, %xmm6 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm9 + jl L_AES_GCM_encrypt_avx512_aesenc_gfmul_last + vaesenc %xmm9, %xmm8, %xmm8 + vaesenc 176(%r15), %xmm8, %xmm8 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm9 + jl L_AES_GCM_encrypt_avx512_aesenc_gfmul_last + vaesenc %xmm9, %xmm8, %xmm8 + vaesenc 208(%r15), %xmm8, %xmm8 + vmovdqa 224(%r15), %xmm9 +L_AES_GCM_encrypt_avx512_aesenc_gfmul_last: + vaesenclast %xmm9, %xmm8, %xmm8 + vmovdqa %xmm13, %xmm0 + vpxor %xmm0, %xmm8, %xmm8 + vmovdqu %xmm8, (%rsi,%rbx,1) + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8 + addl $16, %ebx + vpxor %xmm8, %xmm6, %xmm6 + cmpl %r13d, %ebx + jl L_AES_GCM_encrypt_avx512_last_block_start +L_AES_GCM_encrypt_avx512_last_block_ghash: + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm5, %xmm9 + vpxor %xmm5, %xmm9, %xmm9 + vpshufd $0x4e, %xmm6, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm8 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm6 +L_AES_GCM_encrypt_avx512_last_block_done: + movl 
%r9d, %ecx + movl %ecx, %edx + andl $15, %ecx + jz L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_done + vmovdqu 1024(%rsp), %xmm4 + vpshufb L_avx512_aes_gcm_bswap_epi64(%rip), %xmm4, %xmm4 + vpxor (%r15), %xmm4, %xmm4 + vaesenc 16(%r15), %xmm4, %xmm4 + vaesenc 32(%r15), %xmm4, %xmm4 + vaesenc 48(%r15), %xmm4, %xmm4 + vaesenc 64(%r15), %xmm4, %xmm4 + vaesenc 80(%r15), %xmm4, %xmm4 + vaesenc 96(%r15), %xmm4, %xmm4 + vaesenc 112(%r15), %xmm4, %xmm4 + vaesenc 128(%r15), %xmm4, %xmm4 + vaesenc 144(%r15), %xmm4, %xmm4 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm9 + jl L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_aesenc_avx_last + vaesenc %xmm9, %xmm4, %xmm4 + vaesenc 176(%r15), %xmm4, %xmm4 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm9 + jl L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_aesenc_avx_last + vaesenc %xmm9, %xmm4, %xmm4 + vaesenc 208(%r15), %xmm4, %xmm4 + vmovdqa 224(%r15), %xmm9 +L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_aesenc_avx_last: + vaesenclast %xmm9, %xmm4, %xmm4 + subq $16, %rsp + xorl %ecx, %ecx + vmovdqu %xmm4, (%rsp) +L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_loop: + movzbl (%rdi,%rbx,1), %r13d + xorb (%rsp,%rcx,1), %r13b + movb %r13b, (%rsi,%rbx,1) + movb %r13b, (%rsp,%rcx,1) + incl %ebx + incl %ecx + cmpl %edx, %ebx + jl L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_loop + xorq %r13, %r13 + cmpl $16, %ecx + je L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_finish_enc +L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_byte_loop: + movb %r13b, (%rsp,%rcx,1) + incl %ecx + cmpl $16, %ecx + jl L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_byte_loop +L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_finish_enc: + vmovdqu (%rsp), %xmm4 + addq $16, %rsp + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4 + vpxor %xmm4, %xmm6, %xmm6 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm5, %xmm9 + vpxor %xmm5, %xmm9, %xmm9 + vpshufd $0x4e, %xmm6, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm8 + vpclmulqdq $0x11, %xmm5, 
%xmm6, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm6 +L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_done: +L_AES_GCM_encrypt_avx512_done_enc: + movl %r9d, %edx + movl %r11d, %ecx + shlq $3, %rdx + shlq $3, %rcx + vmovq %rdx, %xmm0 + vmovq %rcx, %xmm1 + vpunpcklqdq %xmm1, %xmm0, %xmm0 + vpxor %xmm0, %xmm6, %xmm6 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm5, %xmm9 + vpxor %xmm5, %xmm9, %xmm9 + vpshufd $0x4e, %xmm6, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm8 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm6 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm6, %xmm6 + vmovdqu 1040(%rsp), %xmm0 + vpxor %xmm6, %xmm0, %xmm0 + cmpl $16, %r14d + je L_AES_GCM_encrypt_avx512_store_tag_16 + xorq %rcx, %rcx + vmovdqu %xmm0, (%rsp) +L_AES_GCM_encrypt_avx512_store_tag_loop: + movzbl (%rsp,%rcx,1), %r13d + movb %r13b, (%r8,%rcx,1) + incl %ecx + cmpl %r14d, %ecx + jne L_AES_GCM_encrypt_avx512_store_tag_loop + jmp L_AES_GCM_encrypt_avx512_store_tag_done +L_AES_GCM_encrypt_avx512_store_tag_16: + vmovdqu %xmm0, (%r8) +L_AES_GCM_encrypt_avx512_store_tag_done: + vzeroupper + addq $0x440, %rsp + popq %r15 + popq %r14 + popq %rbx + popq %r12 + popq %r13 + repz retq +#ifndef __APPLE__ +.size AES_GCM_encrypt_avx512,.-AES_GCM_encrypt_avx512 +#endif /* 
__APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_GCM_decrypt_avx512 +.type AES_GCM_decrypt_avx512,@function +.align 16 +AES_GCM_decrypt_avx512: +#else +.section __TEXT,__text +.globl _AES_GCM_decrypt_avx512 +.p2align 4 +_AES_GCM_decrypt_avx512: +#endif /* __APPLE__ */ + pushq %r13 + pushq %r12 + pushq %rbx + pushq %r14 + pushq %r15 + pushq %rbp + movq %rdx, %r12 + movq %rcx, %rax + movl 56(%rsp), %r11d + movl 64(%rsp), %ebx + movl 72(%rsp), %r14d + movq 80(%rsp), %r15 + movl 88(%rsp), %r10d + movq 96(%rsp), %rbp + subq $0x420, %rsp + vpxor %xmm4, %xmm4, %xmm4 + vpxor %xmm6, %xmm6, %xmm6 + cmpl $12, %ebx + movl %ebx, %edx + jne L_AES_GCM_decrypt_avx512_iv_not_12 + # # Calculate values when IV is 12 bytes + # Set counter based on IV + movl $0x1000000, %ecx + vmovq (%rax), %xmm4 + vpinsrd $2, 8(%rax), %xmm4, %xmm4 + vpinsrd $3, %ecx, %xmm4, %xmm4 + # H = Encrypt X(=0) and T = Encrypt counter + vmovdqa (%r15), %xmm5 + vpxor %xmm5, %xmm4, %xmm1 + vmovdqa 16(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 32(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 48(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 64(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 80(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 96(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 112(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 128(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 144(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm7 + jl L_AES_GCM_decrypt_avx512_calc_iv_12_last + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 176(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + cmpl $13, %r10d + vmovdqa 
192(%r15), %xmm7 + jl L_AES_GCM_decrypt_avx512_calc_iv_12_last + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 208(%r15), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 224(%r15), %xmm7 +L_AES_GCM_decrypt_avx512_calc_iv_12_last: + vaesenclast %xmm7, %xmm5, %xmm5 + vaesenclast %xmm7, %xmm1, %xmm1 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5 + vmovdqu %xmm1, 1040(%rsp) + jmp L_AES_GCM_decrypt_avx512_iv_done +L_AES_GCM_decrypt_avx512_iv_not_12: + # Calculate values when IV is not 12 bytes + # H = Encrypt X(=0) + vmovdqa (%r15), %xmm5 + vaesenc 16(%r15), %xmm5, %xmm5 + vaesenc 32(%r15), %xmm5, %xmm5 + vaesenc 48(%r15), %xmm5, %xmm5 + vaesenc 64(%r15), %xmm5, %xmm5 + vaesenc 80(%r15), %xmm5, %xmm5 + vaesenc 96(%r15), %xmm5, %xmm5 + vaesenc 112(%r15), %xmm5, %xmm5 + vaesenc 128(%r15), %xmm5, %xmm5 + vaesenc 144(%r15), %xmm5, %xmm5 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm9 + jl L_AES_GCM_decrypt_avx512_calc_iv_1_aesenc_avx_last + vaesenc %xmm9, %xmm5, %xmm5 + vaesenc 176(%r15), %xmm5, %xmm5 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm9 + jl L_AES_GCM_decrypt_avx512_calc_iv_1_aesenc_avx_last + vaesenc %xmm9, %xmm5, %xmm5 + vaesenc 208(%r15), %xmm5, %xmm5 + vmovdqa 224(%r15), %xmm9 +L_AES_GCM_decrypt_avx512_calc_iv_1_aesenc_avx_last: + vaesenclast %xmm9, %xmm5, %xmm5 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5 + # Calc counter + # Initialization vector + cmpl $0x00, %edx + movq $0x00, %rcx + je L_AES_GCM_decrypt_avx512_calc_iv_done + cmpl $16, %edx + jl L_AES_GCM_decrypt_avx512_calc_iv_lt16 + andl $0xfffffff0, %edx +L_AES_GCM_decrypt_avx512_calc_iv_16_loop: + vmovdqu (%rax,%rcx,1), %xmm8 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8 + vpxor %xmm8, %xmm4, %xmm4 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm4, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + 
vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm7 + vmovdqa %xmm3, %xmm4 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm4, %xmm4 + vpsrld $31, %xmm7, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm7, %xmm7 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm7, %xmm7 + vpor %xmm1, %xmm4, %xmm4 + vpslld $31, %xmm7, %xmm0 + vpslld $30, %xmm7, %xmm1 + vpslld $25, %xmm7, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm7, %xmm7 + vpsrld $0x01, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $7, %xmm7, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_decrypt_avx512_calc_iv_16_loop + movl %ebx, %edx + cmpl %edx, %ecx + je L_AES_GCM_decrypt_avx512_calc_iv_done +L_AES_GCM_decrypt_avx512_calc_iv_lt16: + subq $16, %rsp + vpxor %xmm8, %xmm8, %xmm8 + xorl %ebx, %ebx + vmovdqu %xmm8, (%rsp) +L_AES_GCM_decrypt_avx512_calc_iv_loop: + movzbl (%rax,%rcx,1), %r13d + movb %r13b, (%rsp,%rbx,1) + incl %ecx + incl %ebx + cmpl %edx, %ecx + jl L_AES_GCM_decrypt_avx512_calc_iv_loop + vmovdqu (%rsp), %xmm8 + addq $16, %rsp + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8 + vpxor %xmm8, %xmm4, %xmm4 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm4, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm7 + vmovdqa %xmm3, %xmm4 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + 
vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm4, %xmm4 + vpsrld $31, %xmm7, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm7, %xmm7 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm7, %xmm7 + vpor %xmm1, %xmm4, %xmm4 + vpslld $31, %xmm7, %xmm0 + vpslld $30, %xmm7, %xmm1 + vpslld $25, %xmm7, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm7, %xmm7 + vpsrld $0x01, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $7, %xmm7, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 +L_AES_GCM_decrypt_avx512_calc_iv_done: + # T = Encrypt counter + vpxor %xmm0, %xmm0, %xmm0 + shll $3, %edx + vmovq %rdx, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm4, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm7 + vmovdqa %xmm3, %xmm4 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm4, %xmm4 + vpsrld $31, %xmm7, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm7, %xmm7 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm7, %xmm7 + vpor %xmm1, %xmm4, %xmm4 + vpslld $31, %xmm7, %xmm0 + vpslld $30, %xmm7, %xmm1 + vpslld $25, %xmm7, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm7, %xmm7 + vpsrld $0x01, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld 
$7, %xmm7, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4 + # Encrypt counter + vmovdqa (%r15), %xmm8 + vpxor %xmm4, %xmm8, %xmm8 + vaesenc 16(%r15), %xmm8, %xmm8 + vaesenc 32(%r15), %xmm8, %xmm8 + vaesenc 48(%r15), %xmm8, %xmm8 + vaesenc 64(%r15), %xmm8, %xmm8 + vaesenc 80(%r15), %xmm8, %xmm8 + vaesenc 96(%r15), %xmm8, %xmm8 + vaesenc 112(%r15), %xmm8, %xmm8 + vaesenc 128(%r15), %xmm8, %xmm8 + vaesenc 144(%r15), %xmm8, %xmm8 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm9 + jl L_AES_GCM_decrypt_avx512_calc_iv_2_aesenc_avx_last + vaesenc %xmm9, %xmm8, %xmm8 + vaesenc 176(%r15), %xmm8, %xmm8 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm9 + jl L_AES_GCM_decrypt_avx512_calc_iv_2_aesenc_avx_last + vaesenc %xmm9, %xmm8, %xmm8 + vaesenc 208(%r15), %xmm8, %xmm8 + vmovdqa 224(%r15), %xmm9 +L_AES_GCM_decrypt_avx512_calc_iv_2_aesenc_avx_last: + vaesenclast %xmm9, %xmm8, %xmm8 + vmovdqu %xmm8, 1040(%rsp) +L_AES_GCM_decrypt_avx512_iv_done: + # Additional authentication data + movl %r11d, %edx + cmpl $0x00, %edx + je L_AES_GCM_decrypt_avx512_calc_aad_done + xorl %ecx, %ecx + cmpl $16, %edx + jl L_AES_GCM_decrypt_avx512_calc_aad_lt16 + andl $0xfffffff0, %edx +L_AES_GCM_decrypt_avx512_calc_aad_16_loop: + vmovdqu (%r12,%rcx,1), %xmm8 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8 + vpxor %xmm8, %xmm6, %xmm6 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm6, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm6, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm6, %xmm5, %xmm0 + vpxor %xmm6, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm7 + vmovdqa %xmm3, %xmm6 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm6, %xmm6 + vpsrld $31, %xmm7, %xmm0 + vpsrld $31, %xmm6, %xmm1 + 
vpslld $0x01, %xmm7, %xmm7 + vpslld $0x01, %xmm6, %xmm6 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm6, %xmm6 + vpor %xmm0, %xmm7, %xmm7 + vpor %xmm1, %xmm6, %xmm6 + vpslld $31, %xmm7, %xmm0 + vpslld $30, %xmm7, %xmm1 + vpslld $25, %xmm7, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm7, %xmm7 + vpsrld $0x01, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $7, %xmm7, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm2, %xmm6, %xmm6 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_decrypt_avx512_calc_aad_16_loop + movl %r11d, %edx + cmpl %edx, %ecx + je L_AES_GCM_decrypt_avx512_calc_aad_done +L_AES_GCM_decrypt_avx512_calc_aad_lt16: + subq $16, %rsp + vpxor %xmm8, %xmm8, %xmm8 + xorl %ebx, %ebx + vmovdqu %xmm8, (%rsp) +L_AES_GCM_decrypt_avx512_calc_aad_loop: + movzbl (%r12,%rcx,1), %r13d + movb %r13b, (%rsp,%rbx,1) + incl %ecx + incl %ebx + cmpl %edx, %ecx + jl L_AES_GCM_decrypt_avx512_calc_aad_loop + vmovdqu (%rsp), %xmm8 + addq $16, %rsp + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8 + vpxor %xmm8, %xmm6, %xmm6 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm6, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm6, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm6, %xmm5, %xmm0 + vpxor %xmm6, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm7 + vmovdqa %xmm3, %xmm6 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm6, %xmm6 + vpsrld $31, %xmm7, %xmm0 + vpsrld $31, %xmm6, %xmm1 + vpslld $0x01, %xmm7, %xmm7 + vpslld $0x01, %xmm6, %xmm6 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm6, %xmm6 + vpor %xmm0, %xmm7, 
%xmm7 + vpor %xmm1, %xmm6, %xmm6 + vpslld $31, %xmm7, %xmm0 + vpslld $30, %xmm7, %xmm1 + vpslld $25, %xmm7, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm7, %xmm7 + vpsrld $0x01, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $7, %xmm7, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm2, %xmm6, %xmm6 +L_AES_GCM_decrypt_avx512_calc_aad_done: + # Calculate counter and H + vpsrlq $63, %xmm5, %xmm9 + vpsllq $0x01, %xmm5, %xmm8 + vpslldq $8, %xmm9, %xmm9 + vpor %xmm9, %xmm8, %xmm8 + vpshufd $0xff, %xmm5, %xmm5 + vpsrad $31, %xmm5, %xmm5 + vpshufb L_avx512_aes_gcm_bswap_epi64(%rip), %xmm4, %xmm4 + vpand L_avx512_aes_gcm_mod2_128(%rip), %xmm5, %xmm5 + vpaddd L_avx512_aes_gcm_one(%rip), %xmm4, %xmm4 + vpxor %xmm8, %xmm5, %xmm5 + vmovdqu %xmm4, 1024(%rsp) + xorl %ebx, %ebx + cmpl $0x100, %r9d + jl L_AES_GCM_decrypt_avx512_done_128 + vmovdqa %xmm6, %xmm2 + # H ^ 1 + vmovdqu %xmm5, (%rsp) + # H ^ 2 + vpclmulqdq $0x00, %xmm5, %xmm5, %xmm8 + vpclmulqdq $0x11, %xmm5, %xmm5, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm0 + vmovdqu %xmm0, 16(%rsp) + # H ^ 3 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm5, %xmm9 + vpxor %xmm5, %xmm9, %xmm9 + vpshufd $0x4e, %xmm0, %xmm10 + vpxor %xmm0, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm5, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm5, %xmm0, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, 
%xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm1 + vmovdqu %xmm1, 32(%rsp) + # H ^ 4 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm3 + vmovdqu %xmm3, 48(%rsp) + # H ^ 5 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 64(%rsp) + # H ^ 6 + vpclmulqdq $0x00, %xmm1, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm1, %xmm1, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 80(%rsp) + # H ^ 7 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm1, %xmm9 + vpxor %xmm1, %xmm9, %xmm9 + vpshufd $0x4e, %xmm3, %xmm10 + vpxor %xmm3, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm1, %xmm3, %xmm8 + vpclmulqdq $0x11, %xmm1, %xmm3, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq 
$0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 96(%rsp) + # H ^ 8 + vpclmulqdq $0x00, %xmm3, %xmm3, %xmm8 + vpclmulqdq $0x11, %xmm3, %xmm3, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 112(%rsp) + # H ^ 9 + vmovdqu 48(%rsp), %xmm0 + vmovdqu 64(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 128(%rsp) + # H ^ 10 + vmovdqu 64(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 144(%rsp) + # H ^ 11 + vmovdqu 
64(%rsp), %xmm0 + vmovdqu 80(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 160(%rsp) + # H ^ 12 + vmovdqu 80(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 176(%rsp) + # H ^ 13 + vmovdqu 80(%rsp), %xmm0 + vmovdqu 96(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 192(%rsp) + # H ^ 14 + vmovdqu 96(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + 
vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 208(%rsp) + # H ^ 15 + vmovdqu 96(%rsp), %xmm0 + vmovdqu 112(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 224(%rsp) + # H ^ 16 + vmovdqu 112(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 240(%rsp) + cmpl $0x200, %r9d + jl L_AES_GCM_decrypt_avx512_no_ext + # H ^ 17 + vmovdqu 112(%rsp), %xmm0 + vmovdqu 128(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, 
%xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 256(%rsp) + # H ^ 18 + vmovdqu 128(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 272(%rsp) + # H ^ 19 + vmovdqu 128(%rsp), %xmm0 + vmovdqu 144(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 288(%rsp) + # H ^ 20 + vmovdqu 144(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 304(%rsp) + # H ^ 21 + vmovdqu 144(%rsp), %xmm0 + vmovdqu 160(%rsp), %xmm1 + # ghash_gfmul_red_avx + 
vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 320(%rsp) + # H ^ 22 + vmovdqu 160(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 336(%rsp) + # H ^ 23 + vmovdqu 160(%rsp), %xmm0 + vmovdqu 176(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 352(%rsp) + # H ^ 24 + vmovdqu 176(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, 
%xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 368(%rsp) + # H ^ 25 + vmovdqu 176(%rsp), %xmm0 + vmovdqu 192(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 384(%rsp) + # H ^ 26 + vmovdqu 192(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 400(%rsp) + # H ^ 27 + vmovdqu 192(%rsp), %xmm0 + vmovdqu 208(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, 
%xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 416(%rsp) + # H ^ 28 + vmovdqu 208(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 432(%rsp) + # H ^ 29 + vmovdqu 208(%rsp), %xmm0 + vmovdqu 224(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 448(%rsp) + # H ^ 30 + vmovdqu 224(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 464(%rsp) + # H ^ 31 + vmovdqu 224(%rsp), %xmm0 + vmovdqu 240(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + 
vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 480(%rsp) + # H ^ 32 + vmovdqu 240(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 496(%rsp) +L_AES_GCM_decrypt_avx512_no_ext: + vbroadcasti32x4 L_avx512_aes_gcm_bswap_epi64(%rip), %zmm22 + vbroadcasti32x4 L_avx512_aes_gcm_bswap_mask(%rip), %zmm30 + vbroadcasti32x4 L_avx512_aes_gcm_mod2_128(%rip), %zmm31 + vbroadcasti32x4 (%r15), %zmm9 + vbroadcasti32x4 16(%r15), %zmm10 + vbroadcasti32x4 32(%r15), %zmm11 + vbroadcasti32x4 48(%r15), %zmm12 + vbroadcasti32x4 64(%r15), %zmm13 + vbroadcasti32x4 80(%r15), %zmm14 + vbroadcasti32x4 96(%r15), %zmm15 + vbroadcasti32x4 112(%r15), %zmm1 + vbroadcasti32x4 128(%r15), %zmm2 + vbroadcasti32x4 144(%r15), %zmm3 + cmpl $0x200, %r9d + jl L_AES_GCM_decrypt_avx512_no_windows + movl %r9d, %r13d + andl $0xfffffe00, %r13d + vmovdqu64 448(%rsp), %zmm23 + vshufi64x2 $27, %zmm23, %zmm23, %zmm23 + vmovdqu64 384(%rsp), %zmm24 + vshufi64x2 $27, %zmm24, %zmm24, %zmm24 + vmovdqu64 320(%rsp), %zmm25 + vshufi64x2 $27, %zmm25, %zmm25, %zmm25 + vmovdqu64 256(%rsp), %zmm26 + vshufi64x2 $27, %zmm26, %zmm26, %zmm26 + vmovdqu64 %zmm23, 512(%rsp) + vmovdqu64 %zmm24, 576(%rsp) + vmovdqu64 %zmm25, 
640(%rsp) + vmovdqu64 %zmm26, 704(%rsp) + vmovdqu64 192(%rsp), %zmm23 + vshufi64x2 $27, %zmm23, %zmm23, %zmm23 + vmovdqu64 128(%rsp), %zmm24 + vshufi64x2 $27, %zmm24, %zmm24, %zmm24 + vmovdqu64 64(%rsp), %zmm25 + vshufi64x2 $27, %zmm25, %zmm25, %zmm25 + vmovdqu64 (%rsp), %zmm26 + vshufi64x2 $27, %zmm26, %zmm26, %zmm26 + vmovdqu64 %zmm23, 768(%rsp) + vmovdqu64 %zmm24, 832(%rsp) + vmovdqu64 %zmm25, 896(%rsp) + vmovdqu64 %zmm26, 960(%rsp) + # 512 bytes of input + xorl %r12d, %r12d + leaq (%rdi,%rbx,1), %rax + vpxorq %zmm20, %zmm20, %zmm20 + vinserti32x4 $0x00, %xmm6, %zmm20, %zmm20 + vmovdqu64 512(%rsp), %zmm23 + vmovdqu64 576(%rsp), %zmm24 + vmovdqu64 640(%rsp), %zmm25 + vmovdqu64 704(%rsp), %zmm26 + vmovdqu64 (%rax), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpxorq %zmm20, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17 + vpclmulqdq $16, %zmm23, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19 + vmovdqa64 %zmm16, %zmm27 + vpxorq %zmm17, %zmm18, %zmm28 + vmovdqa64 %zmm19, %zmm29 + vmovdqu64 64(%rax), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17 + vpclmulqdq $16, %zmm24, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 128(%rax), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17 + vpclmulqdq $16, %zmm25, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 192(%rax), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17 + vpclmulqdq $16, %zmm26, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + 
vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 768(%rsp), %zmm23 + vmovdqu64 832(%rsp), %zmm24 + vmovdqu64 896(%rsp), %zmm25 + vmovdqu64 960(%rsp), %zmm26 + vmovdqu64 256(%rax), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17 + vpclmulqdq $16, %zmm23, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 320(%rax), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17 + vpclmulqdq $16, %zmm24, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 384(%rax), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17 + vpclmulqdq $16, %zmm25, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 448(%rax), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17 + vpclmulqdq $16, %zmm26, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vpclmulqdq $0x01, %zmm27, %zmm31, %zmm21 + vpshufd $0x4e, %zmm27, %zmm27 + vpternlogq $0x96, %zmm21, %zmm27, %zmm28 + vpclmulqdq $0x01, %zmm28, %zmm31, %zmm21 + vpshufd $0x4e, %zmm28, %zmm28 + vpternlogq $0x96, %zmm21, %zmm28, %zmm29 + vextracti32x4 $0x01, %zmm29, %xmm0 + vextracti32x4 $2, %zmm29, %xmm4 + vextracti32x4 $3, %zmm29, %xmm5 + vpxorq %xmm0, %xmm29, %xmm6 + vpternlogq $0x96, %xmm4, %xmm5, %xmm6 + addl $0x200, %ebx + cmpl %r13d, 
%ebx + jge L_AES_GCM_decrypt_avx512_last_aes +L_AES_GCM_decrypt_avx512_win_loop: + leaq (%rdi,%rbx,1), %rax + vpxorq %zmm21, %zmm21, %zmm21 + vinserti32x4 $0x00, %xmm6, %zmm21, %zmm21 + vbroadcasti32x4 1024(%rsp), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu 1024(%rsp), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, 1024(%rsp) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vmovdqu64 (%rax), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vpxorq %zmm21, %zmm31, %zmm31 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vpclmulqdq $0x00, 512(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 512(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 512(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 512(%rsp), %zmm31, %zmm26 + vmovdqa64 %zmm23, %zmm27 + vpxorq %zmm24, %zmm25, %zmm28 + vmovdqa64 %zmm26, %zmm29 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vmovdqu64 64(%rax), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vpclmulqdq $0x00, 576(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 576(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 576(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 576(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, 
%zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vmovdqu64 128(%rax), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vpclmulqdq $0x00, 640(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 640(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 640(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 640(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vmovdqu64 192(%rax), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vpclmulqdq $0x00, 704(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 704(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 704(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 704(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %r10d + vbroadcasti32x4 160(%r15), %zmm20 + jl L_AES_GCM_decrypt_avx512_a_il_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %r10d + vbroadcasti32x4 192(%r15), %zmm20 + jl L_AES_GCM_decrypt_avx512_a_il_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc 
%zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%r15), %zmm20 +L_AES_GCM_decrypt_avx512_a_il_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + addl $0x100, %r12d + vbroadcasti32x4 1024(%rsp), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu 1024(%rsp), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, 1024(%rsp) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vmovdqu64 256(%rax), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vpclmulqdq $0x00, 768(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 768(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 768(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 768(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc 
%zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vmovdqu64 320(%rax), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vpclmulqdq $0x00, 832(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 832(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 832(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 832(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vmovdqu64 384(%rax), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vpclmulqdq $0x00, 896(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 896(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 896(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 896(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vmovdqu64 448(%rax), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vpclmulqdq $0x00, 960(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 960(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 960(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 960(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + 
vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %r10d + vbroadcasti32x4 160(%r15), %zmm20 + jl L_AES_GCM_decrypt_avx512_b_il_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %r10d + vbroadcasti32x4 192(%r15), %zmm20 + jl L_AES_GCM_decrypt_avx512_b_il_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%r15), %zmm20 +L_AES_GCM_decrypt_avx512_b_il_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + addl $0x100, %r12d + vbroadcasti32x4 L_avx512_aes_gcm_mod2_128(%rip), %zmm31 + vpclmulqdq $0x01, %zmm27, %zmm31, %zmm23 + vpshufd $0x4e, %zmm27, %zmm27 + vpternlogq $0x96, %zmm23, %zmm27, %zmm28 + vpclmulqdq $0x01, %zmm28, %zmm31, %zmm23 + vpshufd $0x4e, %zmm28, %zmm28 + vpternlogq $0x96, %zmm23, %zmm28, %zmm29 + vextracti32x4 $0x01, %zmm29, %xmm0 + vextracti32x4 $2, %zmm29, %xmm4 + vextracti32x4 $3, %zmm29, %xmm5 + vpxorq %xmm0, %xmm29, %xmm6 + vpternlogq $0x96, %xmm4, %xmm5, %xmm6 + addl $0x200, %ebx + 
cmpl %r13d, %ebx + jl L_AES_GCM_decrypt_avx512_win_loop +L_AES_GCM_decrypt_avx512_last_aes: + vbroadcasti32x4 1024(%rsp), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu 1024(%rsp), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, 1024(%rsp) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %r10d + vbroadcasti32x4 160(%r15), %zmm20 + jl L_AES_GCM_decrypt_avx512_l1_avx512_ctr16_last + 
vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %r10d + vbroadcasti32x4 192(%r15), %zmm20 + jl L_AES_GCM_decrypt_avx512_l1_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%r15), %zmm20 +L_AES_GCM_decrypt_avx512_l1_avx512_ctr16_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + addl $0x100, %r12d + vbroadcasti32x4 1024(%rsp), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu 1024(%rsp), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, 1024(%rsp) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + 
vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %r10d + vbroadcasti32x4 160(%r15), %zmm20 + jl L_AES_GCM_decrypt_avx512_l2_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %r10d + vbroadcasti32x4 192(%r15), %zmm20 + jl L_AES_GCM_decrypt_avx512_l2_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + 
vbroadcasti32x4 224(%r15), %zmm20 +L_AES_GCM_decrypt_avx512_l2_avx512_ctr16_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + addl $0x100, %r12d +L_AES_GCM_decrypt_avx512_no_windows: + vmovdqu64 192(%rsp), %zmm23 + vshufi64x2 $27, %zmm23, %zmm23, %zmm23 + vmovdqu64 128(%rsp), %zmm24 + vshufi64x2 $27, %zmm24, %zmm24, %zmm24 + vmovdqu64 64(%rsp), %zmm25 + vshufi64x2 $27, %zmm25, %zmm25, %zmm25 + vmovdqu64 (%rsp), %zmm26 + vshufi64x2 $27, %zmm26, %zmm26, %zmm26 + movl %r9d, %r13d + andl $0xffffff00, %r13d + cmpl %r13d, %ebx + jge L_AES_GCM_decrypt_avx512_after_256 + # 256 bytes of input + leaq (%rdi,%rbx,1), %rax + vpxorq %zmm20, %zmm20, %zmm20 + vinserti32x4 $0x00, %xmm6, %zmm20, %zmm20 + vmovdqu64 (%rax), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpxorq %zmm20, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17 + vpclmulqdq $16, %zmm23, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19 + vmovdqa64 %zmm16, %zmm27 + vpxorq %zmm17, %zmm18, %zmm28 + vmovdqa64 %zmm19, %zmm29 + vmovdqu64 64(%rax), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17 + vpclmulqdq $16, %zmm24, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 128(%rax), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm25, %zmm21, 
%zmm16 + vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17 + vpclmulqdq $16, %zmm25, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 192(%rax), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17 + vpclmulqdq $16, %zmm26, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vpclmulqdq $0x01, %zmm27, %zmm31, %zmm21 + vpshufd $0x4e, %zmm27, %zmm27 + vpternlogq $0x96, %zmm21, %zmm27, %zmm28 + vpclmulqdq $0x01, %zmm28, %zmm31, %zmm21 + vpshufd $0x4e, %zmm28, %zmm28 + vpternlogq $0x96, %zmm21, %zmm28, %zmm29 + vextracti32x4 $0x01, %zmm29, %xmm0 + vextracti32x4 $2, %zmm29, %xmm4 + vextracti32x4 $3, %zmm29, %xmm5 + vpxorq %xmm0, %xmm29, %xmm6 + vpternlogq $0x96, %xmm4, %xmm5, %xmm6 + vbroadcasti32x4 1024(%rsp), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu 1024(%rsp), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, 1024(%rsp) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, 
%zmm19, %zmm19 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %r10d + vbroadcasti32x4 160(%r15), %zmm20 + jl L_AES_GCM_decrypt_avx512_t_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %r10d + vbroadcasti32x4 192(%r15), %zmm20 + jl L_AES_GCM_decrypt_avx512_t_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%r15), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%r15), %zmm20 +L_AES_GCM_decrypt_avx512_t_avx512_ctr16_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%rdi,%rbx,1), %rcx + leaq (%rsi,%rbx,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), 
%zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + addl $0x100, %ebx +L_AES_GCM_decrypt_avx512_after_256: + vmovdqu (%rsp), %xmm5 +L_AES_GCM_decrypt_avx512_done_128: + movl %r9d, %edx + cmpl %edx, %ebx + jge L_AES_GCM_decrypt_avx512_done_dec + movl %r9d, %r13d + andl $0xfffffff0, %r13d + cmpl %r13d, %ebx + jge L_AES_GCM_decrypt_avx512_last_block_done +L_AES_GCM_decrypt_avx512_last_block_start: + vmovdqu (%rdi,%rbx,1), %xmm13 + vmovdqa %xmm5, %xmm0 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm13, %xmm1 + vpxor %xmm6, %xmm1, %xmm1 + vmovdqu 1024(%rsp), %xmm9 + vpshufb L_avx512_aes_gcm_bswap_epi64(%rip), %xmm9, %xmm8 + vpaddd L_avx512_aes_gcm_one(%rip), %xmm9, %xmm9 + vmovdqu %xmm9, 1024(%rsp) + vpxor (%r15), %xmm8, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm10 + vaesenc 16(%r15), %xmm8, %xmm8 + vaesenc 32(%r15), %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm11 + vaesenc 48(%r15), %xmm8, %xmm8 + vaesenc 64(%r15), %xmm8, %xmm8 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm12 + vaesenc 80(%r15), %xmm8, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm1 + vaesenc 96(%r15), %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpslldq $8, %xmm10, %xmm2 + vpsrldq $8, %xmm10, %xmm10 + vaesenc 112(%r15), %xmm8, %xmm8 + vpxor %xmm12, %xmm2, %xmm2 + vpxor %xmm10, %xmm1, %xmm3 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm0 + vpclmulqdq $16, %xmm0, %xmm2, %xmm11 + vaesenc 128(%r15), %xmm8, %xmm8 + vpshufd $0x4e, %xmm2, %xmm10 + vpxor %xmm11, %xmm10, %xmm10 + vpclmulqdq $16, %xmm0, %xmm10, %xmm11 + vaesenc 144(%r15), %xmm8, %xmm8 + vpshufd $0x4e, %xmm10, %xmm10 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm3, %xmm10, %xmm6 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm9 + jl L_AES_GCM_decrypt_avx512_aesenc_gfmul_last + vaesenc %xmm9, %xmm8, %xmm8 + vaesenc 176(%r15), %xmm8, %xmm8 + cmpl $13, %r10d 
+ vmovdqa 192(%r15), %xmm9 + jl L_AES_GCM_decrypt_avx512_aesenc_gfmul_last + vaesenc %xmm9, %xmm8, %xmm8 + vaesenc 208(%r15), %xmm8, %xmm8 + vmovdqa 224(%r15), %xmm9 +L_AES_GCM_decrypt_avx512_aesenc_gfmul_last: + vaesenclast %xmm9, %xmm8, %xmm8 + vmovdqa %xmm13, %xmm0 + vpxor %xmm0, %xmm8, %xmm8 + vmovdqu %xmm8, (%rsi,%rbx,1) + addl $16, %ebx + cmpl %r13d, %ebx + jl L_AES_GCM_decrypt_avx512_last_block_start +L_AES_GCM_decrypt_avx512_last_block_done: + movl %r9d, %ecx + movl %ecx, %edx + andl $15, %ecx + jz L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_done + vmovdqu 1024(%rsp), %xmm4 + vpshufb L_avx512_aes_gcm_bswap_epi64(%rip), %xmm4, %xmm4 + vpxor (%r15), %xmm4, %xmm4 + vaesenc 16(%r15), %xmm4, %xmm4 + vaesenc 32(%r15), %xmm4, %xmm4 + vaesenc 48(%r15), %xmm4, %xmm4 + vaesenc 64(%r15), %xmm4, %xmm4 + vaesenc 80(%r15), %xmm4, %xmm4 + vaesenc 96(%r15), %xmm4, %xmm4 + vaesenc 112(%r15), %xmm4, %xmm4 + vaesenc 128(%r15), %xmm4, %xmm4 + vaesenc 144(%r15), %xmm4, %xmm4 + cmpl $11, %r10d + vmovdqa 160(%r15), %xmm9 + jl L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_aesenc_avx_last + vaesenc %xmm9, %xmm4, %xmm4 + vaesenc 176(%r15), %xmm4, %xmm4 + cmpl $13, %r10d + vmovdqa 192(%r15), %xmm9 + jl L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_aesenc_avx_last + vaesenc %xmm9, %xmm4, %xmm4 + vaesenc 208(%r15), %xmm4, %xmm4 + vmovdqa 224(%r15), %xmm9 +L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_aesenc_avx_last: + vaesenclast %xmm9, %xmm4, %xmm4 + subq $32, %rsp + xorl %ecx, %ecx + vmovdqu %xmm4, (%rsp) + vpxor %xmm0, %xmm0, %xmm0 + vmovdqu %xmm0, 16(%rsp) +L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_loop: + movzbl (%rdi,%rbx,1), %r13d + movb %r13b, 16(%rsp,%rcx,1) + xorb (%rsp,%rcx,1), %r13b + movb %r13b, (%rsi,%rbx,1) + incl %ebx + incl %ecx + cmpl %edx, %ebx + jl L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_loop + vmovdqu 16(%rsp), %xmm4 + addq $32, %rsp + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4 + vpxor %xmm4, %xmm6, %xmm6 + # ghash_gfmul_red_avx + 
vpshufd $0x4e, %xmm5, %xmm9 + vpxor %xmm5, %xmm9, %xmm9 + vpshufd $0x4e, %xmm6, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm8 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm6 +L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_done: +L_AES_GCM_decrypt_avx512_done_dec: + movl %r9d, %edx + movl %r11d, %ecx + shlq $3, %rdx + shlq $3, %rcx + vmovq %rdx, %xmm0 + vmovq %rcx, %xmm1 + vpunpcklqdq %xmm1, %xmm0, %xmm0 + vpxor %xmm0, %xmm6, %xmm6 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm5, %xmm9 + vpxor %xmm5, %xmm9, %xmm9 + vpshufd $0x4e, %xmm6, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm8 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm6 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm6, %xmm6 + vmovdqu 1040(%rsp), %xmm0 + vpxor %xmm6, %xmm0, %xmm0 + cmpl $16, %r14d + je L_AES_GCM_decrypt_avx512_cmp_tag_16 + subq $16, %rsp + xorq %rcx, %rcx + xorq %rbx, %rbx + vmovdqu %xmm0, (%rsp) +L_AES_GCM_decrypt_avx512_cmp_tag_loop: + movzbl (%rsp,%rcx,1), %r13d + xorb (%r8,%rcx,1), %r13b + orb %r13b, %bl + incl %ecx + cmpl %r14d, %ecx + jne L_AES_GCM_decrypt_avx512_cmp_tag_loop + cmpb $0x00, %bl + sete %bl + addq $16, %rsp + xorq %rcx, %rcx + jmp L_AES_GCM_decrypt_avx512_cmp_tag_done 
+L_AES_GCM_decrypt_avx512_cmp_tag_16: + vmovdqu (%r8), %xmm1 + vpcmpeqb %xmm1, %xmm0, %xmm0 + vpmovmskb %xmm0, %rdx + # %%edx == 0xFFFF then return 1 else => return 0 + xorl %ebx, %ebx + cmpl $0xffff, %edx + sete %bl +L_AES_GCM_decrypt_avx512_cmp_tag_done: + movl %ebx, (%rbp) + vzeroupper + addq $0x420, %rsp + popq %rbp + popq %r15 + popq %r14 + popq %rbx + popq %r12 + popq %r13 + repz retq +#ifndef __APPLE__ +.size AES_GCM_decrypt_avx512,.-AES_GCM_decrypt_avx512 +#endif /* __APPLE__ */ +#ifdef WOLFSSL_AESGCM_STREAM +#ifndef __APPLE__ +.text +.globl AES_GCM_init_avx512 +.type AES_GCM_init_avx512,@function +.align 16 +AES_GCM_init_avx512: +#else +.section __TEXT,__text +.globl _AES_GCM_init_avx512 +.p2align 4 +_AES_GCM_init_avx512: +#endif /* __APPLE__ */ /* AES_GCM_init_avx512: derives the hash key and the initial counter values from an IV. Inputs as the code reads them: rdi = AES round keys, 16 bytes apart; esi = round count (compared with 11 and 13); rdx = IV bytes; ecx = IV length in bytes; r8, r9 and the first stack argument = 16-byte output buffers written with aligned stores. Outputs: (%r8) receives the byte-shuffled encryption of the all-zero block, (%r9) the shuffled counter block after adding L_avx512_aes_gcm_one, and the stack-passed pointer the encrypted counter block. Only xmm registers are touched and there is no vzeroupper before returning. NOTE(review): the C-level parameter names are not visible in this file; confirm against the prototype. */ + pushq %r12 + pushq %r13 + movq %rdx, %r10 + movl %ecx, %r11d + movq 24(%rsp), %rax /* first stack-passed argument: two pushes plus the return address put it at offset 24 */ + subq $16, %rsp + vpxor %xmm4, %xmm4, %xmm4 + movl %r11d, %edx + cmpl $12, %edx + jne L_AES_GCM_init_avx512_iv_not_12 + # # Calculate values when IV is 12 bytes + # Set counter based on IV + movl $0x1000000, %ecx + vmovq (%r10), %xmm4 + vpinsrd $2, 8(%r10), %xmm4, %xmm4 + vpinsrd $3, %ecx, %xmm4, %xmm4 /* xmm4 = the 12 IV bytes followed by bytes 00 00 00 01 */ + # H = Encrypt X(=0) and T = Encrypt counter + vmovdqa (%rdi), %xmm5 + vpxor %xmm5, %xmm4, %xmm1 /* xmm5 starts as round key 0 alone (zero block XOR key); xmm1 = counter block XOR round key 0; both are run through the rounds together */ + vmovdqa 16(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 32(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 48(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 64(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 80(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 96(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 112(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 128(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 144(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5
+ vaesenc %xmm6, %xmm1, %xmm1 + cmpl $11, %esi /* round count: below 11 finishes after 10 rounds, below 13 after 12, otherwise 14; the vmovdqa between cmpl and jl does not alter the flags */ + vmovdqa 160(%rdi), %xmm6 + jl L_AES_GCM_init_avx512_calc_iv_12_last + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 176(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + cmpl $13, %esi + vmovdqa 192(%rdi), %xmm6 + jl L_AES_GCM_init_avx512_calc_iv_12_last + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 208(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 224(%rdi), %xmm6 +L_AES_GCM_init_avx512_calc_iv_12_last: + vaesenclast %xmm6, %xmm5, %xmm5 + vaesenclast %xmm6, %xmm1, %xmm1 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5 + vmovdqu %xmm1, %xmm15 + jmp L_AES_GCM_init_avx512_iv_done +L_AES_GCM_init_avx512_iv_not_12: + # Calculate values when IV is not 12 bytes + # H = Encrypt X(=0) + vmovdqa (%rdi), %xmm5 + vaesenc 16(%rdi), %xmm5, %xmm5 + vaesenc 32(%rdi), %xmm5, %xmm5 + vaesenc 48(%rdi), %xmm5, %xmm5 + vaesenc 64(%rdi), %xmm5, %xmm5 + vaesenc 80(%rdi), %xmm5, %xmm5 + vaesenc 96(%rdi), %xmm5, %xmm5 + vaesenc 112(%rdi), %xmm5, %xmm5 + vaesenc 128(%rdi), %xmm5, %xmm5 + vaesenc 144(%rdi), %xmm5, %xmm5 + cmpl $11, %esi + vmovdqa 160(%rdi), %xmm8 + jl L_AES_GCM_init_avx512_calc_iv_1_aesenc_avx_last + vaesenc %xmm8, %xmm5, %xmm5 + vaesenc 176(%rdi), %xmm5, %xmm5 + cmpl $13, %esi + vmovdqa 192(%rdi), %xmm8 + jl L_AES_GCM_init_avx512_calc_iv_1_aesenc_avx_last + vaesenc %xmm8, %xmm5, %xmm5 + vaesenc 208(%rdi), %xmm5, %xmm5 + vmovdqa 224(%rdi), %xmm8 +L_AES_GCM_init_avx512_calc_iv_1_aesenc_avx_last: + vaesenclast %xmm8, %xmm5, %xmm5 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5 + # Calc counter + # Initialization vector + cmpl $0x00, %edx + movq $0x00, %rcx /* movq leaves the flags of the cmpl untouched for the je below; rcx is the byte offset into the IV */ + je L_AES_GCM_init_avx512_calc_iv_done + cmpl $16, %edx + jl L_AES_GCM_init_avx512_calc_iv_lt16 + andl $0xfffffff0, %edx /* edx = IV length rounded down to a multiple of 16 */ +L_AES_GCM_init_avx512_calc_iv_16_loop: + vmovdqu (%r10,%rcx,1), %xmm7 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7
+ vpxor %xmm7, %xmm4, %xmm4 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm4, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 /* three carry-less multiplies: high halves into xmm3, low halves into xmm0, and (high XOR low) of each operand into xmm1, from which the middle term is recovered by the two XORs that follow */ + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm6 + vmovdqa %xmm3, %xmm4 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm6, %xmm6 + vpxor %xmm1, %xmm4, %xmm4 + vpsrld $31, %xmm6, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm6, %xmm6 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm6, %xmm6 + vpor %xmm1, %xmm4, %xmm4 + vpslld $31, %xmm6, %xmm0 + vpslld $30, %xmm6, %xmm1 + vpslld $25, %xmm6, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm6, %xmm6 + vpsrld $0x01, %xmm6, %xmm2 + vpsrld $2, %xmm6, %xmm3 + vpsrld $7, %xmm6, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm6, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_init_avx512_calc_iv_16_loop + movl %r11d, %edx /* restore the full IV length; if it equals ecx there is no partial trailing block */ + cmpl %edx, %ecx + je L_AES_GCM_init_avx512_calc_iv_done +L_AES_GCM_init_avx512_calc_iv_lt16: + subq $16, %rsp + vpxor %xmm7, %xmm7, %xmm7 + xorl %r13d, %r13d + vmovdqu %xmm7, (%rsp) /* zeroed 16-byte scratch, so the trailing partial block copied in below ends up zero-padded */ +L_AES_GCM_init_avx512_calc_iv_loop: + movzbl (%r10,%rcx,1), %r12d + movb %r12b, (%rsp,%r13,1) + incl %ecx + incl %r13d + cmpl %edx, %ecx + jl L_AES_GCM_init_avx512_calc_iv_loop + vmovdqu (%rsp), %xmm7 + addq $16, %rsp + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + vpxor %xmm7, %xmm4, %xmm4 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm4, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpxor %xmm4,
%xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm6 + vmovdqa %xmm3, %xmm4 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm6, %xmm6 + vpxor %xmm1, %xmm4, %xmm4 + vpsrld $31, %xmm6, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm6, %xmm6 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm6, %xmm6 + vpor %xmm1, %xmm4, %xmm4 + vpslld $31, %xmm6, %xmm0 + vpslld $30, %xmm6, %xmm1 + vpslld $25, %xmm6, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm6, %xmm6 + vpsrld $0x01, %xmm6, %xmm2 + vpsrld $2, %xmm6, %xmm3 + vpsrld $7, %xmm6, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm6, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 +L_AES_GCM_init_avx512_calc_iv_done: + # T = Encrypt counter + vpxor %xmm0, %xmm0, %xmm0 + shll $3, %edx + vmovq %rdx, %xmm0 /* edx holds the IV length in bytes on every path here; shifted left by 3 it becomes the length in bits, which is XORed into the running hash before one more multiply */ + vpxor %xmm0, %xmm4, %xmm4 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm4, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm6 + vmovdqa %xmm3, %xmm4 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm6, %xmm6 + vpxor %xmm1, %xmm4, %xmm4 + vpsrld $31, %xmm6, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm6, %xmm6 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm6, %xmm6 + vpor %xmm1, %xmm4, %xmm4 + vpslld $31, %xmm6, %xmm0 + vpslld $30, %xmm6, %xmm1 + vpslld $25,
%xmm6, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm6, %xmm6 + vpsrld $0x01, %xmm6, %xmm2 + vpsrld $2, %xmm6, %xmm3 + vpsrld $7, %xmm6, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm6, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4 + # Encrypt counter + vmovdqa (%rdi), %xmm7 + vpxor %xmm4, %xmm7, %xmm7 + vaesenc 16(%rdi), %xmm7, %xmm7 + vaesenc 32(%rdi), %xmm7, %xmm7 + vaesenc 48(%rdi), %xmm7, %xmm7 + vaesenc 64(%rdi), %xmm7, %xmm7 + vaesenc 80(%rdi), %xmm7, %xmm7 + vaesenc 96(%rdi), %xmm7, %xmm7 + vaesenc 112(%rdi), %xmm7, %xmm7 + vaesenc 128(%rdi), %xmm7, %xmm7 + vaesenc 144(%rdi), %xmm7, %xmm7 + cmpl $11, %esi + vmovdqa 160(%rdi), %xmm8 + jl L_AES_GCM_init_avx512_calc_iv_2_aesenc_avx_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 176(%rdi), %xmm7, %xmm7 + cmpl $13, %esi + vmovdqa 192(%rdi), %xmm8 + jl L_AES_GCM_init_avx512_calc_iv_2_aesenc_avx_last + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 208(%rdi), %xmm7, %xmm7 + vmovdqa 224(%rdi), %xmm8 +L_AES_GCM_init_avx512_calc_iv_2_aesenc_avx_last: + vaesenclast %xmm8, %xmm7, %xmm7 + vmovdqu %xmm7, %xmm15 +L_AES_GCM_init_avx512_iv_done: + vmovdqa %xmm15, (%rax) /* both IV paths meet here with the encrypted counter block in xmm15, the hash key in xmm5 and the counter block in xmm4; the three stores use vmovdqa, so (%rax), (%r8) and (%r9) must be 16-byte aligned */ + vpshufb L_avx512_aes_gcm_bswap_epi64(%rip), %xmm4, %xmm4 + vpaddd L_avx512_aes_gcm_one(%rip), %xmm4, %xmm4 + vmovdqa %xmm5, (%r8) + vmovdqa %xmm4, (%r9) + addq $16, %rsp + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size AES_GCM_init_avx512,.-AES_GCM_init_avx512 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_GCM_aad_update_avx512 +.type AES_GCM_aad_update_avx512,@function +.align 16 +AES_GCM_aad_update_avx512: +#else +.section __TEXT,__text +.globl _AES_GCM_aad_update_avx512 +.p2align 4 +_AES_GCM_aad_update_avx512: +#endif /* __APPLE__ */ /* AES_GCM_aad_update_avx512: folds the bytes at rdi, 16 at a time up to the count in esi, into the 16-byte value at (%rdx), multiplying by the 16-byte value at (%rcx) after each block. (%rdx) and (%rcx) are read with aligned moves; the data at rdi may be unaligned. */ + movq %rcx, %rax + vmovdqa (%rdx), %xmm5 + vmovdqa (%rax), %xmm6 + xorl %ecx, %ecx
+L_AES_GCM_aad_update_avx512_16_loop: /* per 16-byte block at (%rdi,%rcx): byte-shuffle it, XOR it into the running value in xmm5, then multiply xmm5 by xmm6 */ + vmovdqu (%rdi,%rcx,1), %xmm7 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + vpxor %xmm7, %xmm5, %xmm5 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm5, %xmm1 + vpshufd $0x4e, %xmm6, %xmm2 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm3 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm0 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm6, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm4 + vmovdqa %xmm3, %xmm5 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm5, %xmm5 + vpsrld $31, %xmm4, %xmm0 + vpsrld $31, %xmm5, %xmm1 + vpslld $0x01, %xmm4, %xmm4 + vpslld $0x01, %xmm5, %xmm5 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm5, %xmm5 + vpor %xmm0, %xmm4, %xmm4 + vpor %xmm1, %xmm5, %xmm5 + vpslld $31, %xmm4, %xmm0 + vpslld $30, %xmm4, %xmm1 + vpslld $25, %xmm4, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm3 + vpsrld $7, %xmm4, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + addl $16, %ecx + cmpl %esi, %ecx + jl L_AES_GCM_aad_update_avx512_16_loop /* the length test sits after the body, so the first 16 bytes are always read and hashed whatever esi holds */ + vmovdqa %xmm5, (%rdx) /* write the updated value back over the one loaded from (%rdx) at entry */ + repz retq +#ifndef __APPLE__ +.size AES_GCM_aad_update_avx512,.-AES_GCM_aad_update_avx512 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_GCM_encrypt_block_avx512 +.type AES_GCM_encrypt_block_avx512,@function +.align 16 +AES_GCM_encrypt_block_avx512: +#else +.section __TEXT,__text +.globl _AES_GCM_encrypt_block_avx512 +.p2align 4 +_AES_GCM_encrypt_block_avx512: +#endif /* __APPLE__ */ /* AES_GCM_encrypt_block_avx512: encrypts one counter block and XORs it with 16 input bytes. As read by the code: rdi = AES round keys, esi = round count (compared with 11 and 13), rdx = 16-byte output, rcx = 16-byte input, r8 = counter block, which is updated in place by adding L_avx512_aes_gcm_one. Input, output and counter use unaligned moves. */ + movq %rdx, %r10 + movq %rcx, %r11 + vmovdqu (%r8), %xmm1 + vpshufb L_avx512_aes_gcm_bswap_epi64(%rip), %xmm1, %xmm0 + vpaddd
L_avx512_aes_gcm_one(%rip), %xmm1, %xmm1 + vmovdqu %xmm1, (%r8) /* the incremented counter is written back; the shuffled copy of the previous value in xmm0 is what gets encrypted */ + vpxor (%rdi), %xmm0, %xmm0 + vaesenc 16(%rdi), %xmm0, %xmm0 + vaesenc 32(%rdi), %xmm0, %xmm0 + vaesenc 48(%rdi), %xmm0, %xmm0 + vaesenc 64(%rdi), %xmm0, %xmm0 + vaesenc 80(%rdi), %xmm0, %xmm0 + vaesenc 96(%rdi), %xmm0, %xmm0 + vaesenc 112(%rdi), %xmm0, %xmm0 + vaesenc 128(%rdi), %xmm0, %xmm0 + vaesenc 144(%rdi), %xmm0, %xmm0 + cmpl $11, %esi /* round count: below 11 finishes after 10 rounds, below 13 after 12, otherwise 14 */ + vmovdqa 160(%rdi), %xmm1 + jl L_AES_GCM_encrypt_block_avx512_aesenc_block_last + vaesenc %xmm1, %xmm0, %xmm0 + vaesenc 176(%rdi), %xmm0, %xmm0 + cmpl $13, %esi + vmovdqa 192(%rdi), %xmm1 + jl L_AES_GCM_encrypt_block_avx512_aesenc_block_last + vaesenc %xmm1, %xmm0, %xmm0 + vaesenc 208(%rdi), %xmm0, %xmm0 + vmovdqa 224(%rdi), %xmm1 +L_AES_GCM_encrypt_block_avx512_aesenc_block_last: + vaesenclast %xmm1, %xmm0, %xmm0 + vmovdqu (%r11), %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vmovdqu %xmm0, (%r10) + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 /* NOTE(review): this shuffled xmm0 is not stored anywhere before the return; confirm whether any caller relies on it */ + vzeroupper + repz retq +#ifndef __APPLE__ +.size AES_GCM_encrypt_block_avx512,.-AES_GCM_encrypt_block_avx512 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_GCM_ghash_block_avx512 +.type AES_GCM_ghash_block_avx512,@function +.align 16 +AES_GCM_ghash_block_avx512: +#else +.section __TEXT,__text +.globl _AES_GCM_ghash_block_avx512 +.p2align 4 +_AES_GCM_ghash_block_avx512: +#endif /* __APPLE__ */ /* AES_GCM_ghash_block_avx512: byte-shuffles the 16 bytes at (%rdi), XORs them into the 16-byte value at (%rsi), multiplies by the 16-byte value at (%rdx) and stores the result back to (%rsi). (%rsi) and (%rdx) are accessed with aligned moves; (%rdi) may be unaligned. */ + vmovdqa (%rsi), %xmm4 + vmovdqa (%rdx), %xmm5 + vmovdqu (%rdi), %xmm7 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + vpxor %xmm7, %xmm4, %xmm4 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm4, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm6 + vmovdqa %xmm3, %xmm4 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm6, %xmm6 + vpxor %xmm1, %xmm4,
%xmm4 + vpsrld $31, %xmm6, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm6, %xmm6 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm6, %xmm6 + vpor %xmm1, %xmm4, %xmm4 /* the product held as xmm4 (high half) and xmm6 (low half) has now been shifted left by one bit, with the carries moved across the 32-bit lanes */ + vpslld $31, %xmm6, %xmm0 + vpslld $30, %xmm6, %xmm1 + vpslld $25, %xmm6, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm6, %xmm6 + vpsrld $0x01, %xmm6, %xmm2 + vpsrld $2, %xmm6, %xmm3 + vpsrld $7, %xmm6, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm6, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 /* low half folded into the high half using shifts by 31, 30, 25 and 1, 2, 7; presumably the GHASH field reduction, confirm against the generator script */ + vmovdqa %xmm4, (%rsi) /* result replaces the 16 bytes loaded from (%rsi) at entry */ + vzeroupper + repz retq +#ifndef __APPLE__ +.size AES_GCM_ghash_block_avx512,.-AES_GCM_ghash_block_avx512 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_GCM_encrypt_update_avx512 +.type AES_GCM_encrypt_update_avx512,@function +.align 16 +AES_GCM_encrypt_update_avx512: +#else +.section __TEXT,__text +.globl _AES_GCM_encrypt_update_avx512 +.p2align 4 +_AES_GCM_encrypt_update_avx512: +#endif /* __APPLE__ */ + pushq %r13 + pushq %r12 + pushq %r14 + pushq %r15 + pushq %rbx + pushq %rbp + movq %rdx, %r10 + movq %rcx, %r11 + movq 56(%rsp), %rax + movq 64(%rsp), %r12 + subq $0x410, %rsp + vmovdqa (%r9), %xmm6 + vmovdqa (%rax), %xmm5 + vpsrlq $63, %xmm5, %xmm9 + vpsllq $0x01, %xmm5, %xmm8 + vpslldq $8, %xmm9, %xmm9 + vpor %xmm9, %xmm8, %xmm8 + vpshufd $0xff, %xmm5, %xmm5 + vpsrad $31, %xmm5, %xmm5 + vpand L_avx512_aes_gcm_mod2_128(%rip), %xmm5, %xmm5 + vpxor %xmm8, %xmm5, %xmm5 + xorl %r14d, %r14d + cmpl $0x100, %r8d + jl L_AES_GCM_encrypt_update_avx512_done_128 + vmovdqa %xmm6, %xmm2 + # H ^ 1 + vmovdqu %xmm5, (%rsp) + # H ^ 2 + vpclmulqdq $0x00, %xmm5, %xmm5, %xmm8 + vpclmulqdq $0x11, %xmm5, %xmm5, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01,
%xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm0 + vmovdqu %xmm0, 16(%rsp) + # H ^ 3 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm5, %xmm9 + vpxor %xmm5, %xmm9, %xmm9 + vpshufd $0x4e, %xmm0, %xmm10 + vpxor %xmm0, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm5, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm5, %xmm0, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm1 + vmovdqu %xmm1, 32(%rsp) + # H ^ 4 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm3 + vmovdqu %xmm3, 48(%rsp) + # H ^ 5 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 64(%rsp) + # H ^ 6 
+ vpclmulqdq $0x00, %xmm1, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm1, %xmm1, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 80(%rsp) + # H ^ 7 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm1, %xmm9 + vpxor %xmm1, %xmm9, %xmm9 + vpshufd $0x4e, %xmm3, %xmm10 + vpxor %xmm3, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm1, %xmm3, %xmm8 + vpclmulqdq $0x11, %xmm1, %xmm3, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 96(%rsp) + # H ^ 8 + vpclmulqdq $0x00, %xmm3, %xmm3, %xmm8 + vpclmulqdq $0x11, %xmm3, %xmm3, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 112(%rsp) + # H ^ 9 + vmovdqu 48(%rsp), %xmm0 + vmovdqu 64(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd 
$0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 128(%rsp) + # H ^ 10 + vmovdqu 64(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 144(%rsp) + # H ^ 11 + vmovdqu 64(%rsp), %xmm0 + vmovdqu 80(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 160(%rsp) + # H ^ 12 + vmovdqu 80(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 176(%rsp) + # H ^ 13 + vmovdqu 80(%rsp), %xmm0 + vmovdqu 96(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor 
%xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 192(%rsp) + # H ^ 14 + vmovdqu 96(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 208(%rsp) + # H ^ 15 + vmovdqu 96(%rsp), %xmm0 + vmovdqu 112(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 224(%rsp) + # H ^ 16 + vmovdqu 112(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, 
%xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 240(%rsp) + cmpl $0x200, %r8d + jl L_AES_GCM_encrypt_update_avx512_no_ext + # H ^ 17 + vmovdqu 112(%rsp), %xmm0 + vmovdqu 128(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 256(%rsp) + # H ^ 18 + vmovdqu 128(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 272(%rsp) + # H ^ 19 + vmovdqu 128(%rsp), %xmm0 + vmovdqu 144(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + 
vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 288(%rsp) + # H ^ 20 + vmovdqu 144(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 304(%rsp) + # H ^ 21 + vmovdqu 144(%rsp), %xmm0 + vmovdqu 160(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 320(%rsp) + # H ^ 22 + vmovdqu 160(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 336(%rsp) + # H ^ 23 + vmovdqu 160(%rsp), %xmm0 + vmovdqu 176(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor 
%xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 352(%rsp) + # H ^ 24 + vmovdqu 176(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 368(%rsp) + # H ^ 25 + vmovdqu 176(%rsp), %xmm0 + vmovdqu 192(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 384(%rsp) + # H ^ 26 + vmovdqu 192(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + 
vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 400(%rsp) + # H ^ 27 + vmovdqu 192(%rsp), %xmm0 + vmovdqu 208(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 416(%rsp) + # H ^ 28 + vmovdqu 208(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 432(%rsp) + # H ^ 29 + vmovdqu 208(%rsp), %xmm0 + vmovdqu 224(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa 
%xmm11, %xmm7 + vmovdqu %xmm7, 448(%rsp) + # H ^ 30 + vmovdqu 224(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 464(%rsp) + # H ^ 31 + vmovdqu 224(%rsp), %xmm0 + vmovdqu 240(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 480(%rsp) + # H ^ 32 + vmovdqu 240(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 496(%rsp) +L_AES_GCM_encrypt_update_avx512_no_ext: + vbroadcasti32x4 L_avx512_aes_gcm_bswap_epi64(%rip), %zmm22 + vbroadcasti32x4 L_avx512_aes_gcm_bswap_mask(%rip), %zmm30 + vbroadcasti32x4 L_avx512_aes_gcm_mod2_128(%rip), %zmm31 + vbroadcasti32x4 (%rdi), %zmm9 + vbroadcasti32x4 16(%rdi), %zmm10 + vbroadcasti32x4 
32(%rdi), %zmm11 + vbroadcasti32x4 48(%rdi), %zmm12 + vbroadcasti32x4 64(%rdi), %zmm13 + vbroadcasti32x4 80(%rdi), %zmm14 + vbroadcasti32x4 96(%rdi), %zmm15 + vbroadcasti32x4 112(%rdi), %zmm1 + vbroadcasti32x4 128(%rdi), %zmm2 + vbroadcasti32x4 144(%rdi), %zmm3 + cmpl $0x200, %r8d + jl L_AES_GCM_encrypt_update_avx512_no_windows + movl %r8d, %ebp + andl $0xfffffe00, %ebp + vmovdqu64 448(%rsp), %zmm23 + vshufi64x2 $27, %zmm23, %zmm23, %zmm23 + vmovdqu64 384(%rsp), %zmm24 + vshufi64x2 $27, %zmm24, %zmm24, %zmm24 + vmovdqu64 320(%rsp), %zmm25 + vshufi64x2 $27, %zmm25, %zmm25, %zmm25 + vmovdqu64 256(%rsp), %zmm26 + vshufi64x2 $27, %zmm26, %zmm26, %zmm26 + vmovdqu64 %zmm23, 512(%rsp) + vmovdqu64 %zmm24, 576(%rsp) + vmovdqu64 %zmm25, 640(%rsp) + vmovdqu64 %zmm26, 704(%rsp) + vmovdqu64 192(%rsp), %zmm23 + vshufi64x2 $27, %zmm23, %zmm23, %zmm23 + vmovdqu64 128(%rsp), %zmm24 + vshufi64x2 $27, %zmm24, %zmm24, %zmm24 + vmovdqu64 64(%rsp), %zmm25 + vshufi64x2 $27, %zmm25, %zmm25, %zmm25 + vmovdqu64 (%rsp), %zmm26 + vshufi64x2 $27, %zmm26, %zmm26, %zmm26 + vmovdqu64 %zmm23, 768(%rsp) + vmovdqu64 %zmm24, 832(%rsp) + vmovdqu64 %zmm25, 896(%rsp) + vmovdqu64 %zmm26, 960(%rsp) + # 512 bytes of input + leaq (%r10,%r14,1), %r15 + vbroadcasti32x4 (%r12), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu (%r12), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, (%r12) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vaesenc %zmm11, %zmm16, 
%zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %esi + vbroadcasti32x4 160(%rdi), %zmm20 + jl L_AES_GCM_encrypt_update_avx512_p1_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %esi + vbroadcasti32x4 192(%rdi), %zmm20 + jl L_AES_GCM_encrypt_update_avx512_p1_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%rdi), %zmm20 +L_AES_GCM_encrypt_update_avx512_p1_avx512_ctr16_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, 
%zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%r11,%r14,1), %rcx + leaq (%r10,%r14,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + addl $0x100, %r14d + vbroadcasti32x4 (%r12), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu (%r12), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, (%r12) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, 
%zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %esi + vbroadcasti32x4 160(%rdi), %zmm20 + jl L_AES_GCM_encrypt_update_avx512_p2_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %esi + vbroadcasti32x4 192(%rdi), %zmm20 + jl L_AES_GCM_encrypt_update_avx512_p2_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%rdi), %zmm20 +L_AES_GCM_encrypt_update_avx512_p2_avx512_ctr16_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%r11,%r14,1), %rcx + leaq (%r10,%r14,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + addl $0x100, %r14d + cmpl %ebp, %r14d + jge L_AES_GCM_encrypt_update_avx512_last_win +L_AES_GCM_encrypt_update_avx512_win_loop: + leaq (%r10,%r14,1), %rbx + vpxorq %zmm21, %zmm21, %zmm21 + vinserti32x4 $0x00, %xmm6, 
%zmm21, %zmm21 + vbroadcasti32x4 (%r12), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu (%r12), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, (%r12) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vmovdqu64 (%r15), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vpxorq %zmm21, %zmm31, %zmm31 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vpclmulqdq $0x00, 512(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 512(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 512(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 512(%rsp), %zmm31, %zmm26 + vmovdqa64 %zmm23, %zmm27 + vpxorq %zmm24, %zmm25, %zmm28 + vmovdqa64 %zmm26, %zmm29 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vmovdqu64 64(%r15), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vpclmulqdq $0x00, 576(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 576(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 576(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 576(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vmovdqu64 128(%r15), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + 
vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vpclmulqdq $0x00, 640(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 640(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 640(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 640(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vmovdqu64 192(%r15), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vpclmulqdq $0x00, 704(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 704(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 704(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 704(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %esi + vbroadcasti32x4 160(%rdi), %zmm20 + jl L_AES_GCM_encrypt_update_avx512_a_il_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %esi + vbroadcasti32x4 192(%rdi), %zmm20 + jl L_AES_GCM_encrypt_update_avx512_a_il_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc 
%zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%rdi), %zmm20 +L_AES_GCM_encrypt_update_avx512_a_il_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%r11,%r14,1), %rcx + leaq (%r10,%r14,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + addl $0x100, %r14d + vbroadcasti32x4 (%r12), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu (%r12), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, (%r12) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vmovdqu64 256(%r15), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vpclmulqdq $0x00, 768(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 768(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 768(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 768(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vmovdqu64 320(%r15), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm12, 
%zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vpclmulqdq $0x00, 832(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 832(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 832(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 832(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vmovdqu64 384(%r15), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vpclmulqdq $0x00, 896(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 896(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 896(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 896(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vmovdqu64 448(%r15), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vpclmulqdq $0x00, 960(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 960(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 960(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 960(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %esi + vbroadcasti32x4 160(%rdi), %zmm20 + jl L_AES_GCM_encrypt_update_avx512_b_il_last + vaesenc 
%zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %esi + vbroadcasti32x4 192(%rdi), %zmm20 + jl L_AES_GCM_encrypt_update_avx512_b_il_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%rdi), %zmm20 +L_AES_GCM_encrypt_update_avx512_b_il_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%r11,%r14,1), %rcx + leaq (%r10,%r14,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + addl $0x100, %r14d + vbroadcasti32x4 L_avx512_aes_gcm_mod2_128(%rip), %zmm31 + vpclmulqdq $0x01, %zmm27, %zmm31, %zmm23 + vpshufd $0x4e, %zmm27, %zmm27 + vpternlogq $0x96, %zmm23, %zmm27, %zmm28 + vpclmulqdq $0x01, %zmm28, %zmm31, %zmm23 + vpshufd $0x4e, %zmm28, %zmm28 + vpternlogq $0x96, %zmm23, %zmm28, %zmm29 + vextracti32x4 $0x01, %zmm29, %xmm0 + vextracti32x4 $2, %zmm29, %xmm4 + vextracti32x4 $3, %zmm29, %xmm5 + vpxorq %xmm0, %xmm29, %xmm6 + vpternlogq $0x96, %xmm4, %xmm5, %xmm6 + movq %rbx, %r15 + cmpl %ebp, %r14d + jl L_AES_GCM_encrypt_update_avx512_win_loop +L_AES_GCM_encrypt_update_avx512_last_win: + vpxorq %zmm20, %zmm20, %zmm20 + vinserti32x4 
$0x00, %xmm6, %zmm20, %zmm20 + vmovdqu64 512(%rsp), %zmm23 + vmovdqu64 576(%rsp), %zmm24 + vmovdqu64 640(%rsp), %zmm25 + vmovdqu64 704(%rsp), %zmm26 + vmovdqu64 (%r15), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpxorq %zmm20, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17 + vpclmulqdq $16, %zmm23, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19 + vmovdqa64 %zmm16, %zmm27 + vpxorq %zmm17, %zmm18, %zmm28 + vmovdqa64 %zmm19, %zmm29 + vmovdqu64 64(%r15), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17 + vpclmulqdq $16, %zmm24, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 128(%r15), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17 + vpclmulqdq $16, %zmm25, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 192(%r15), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17 + vpclmulqdq $16, %zmm26, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 768(%rsp), %zmm23 + vmovdqu64 832(%rsp), %zmm24 + vmovdqu64 896(%rsp), %zmm25 + vmovdqu64 960(%rsp), %zmm26 + vmovdqu64 256(%r15), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17 + vpclmulqdq $16, %zmm23, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 320(%r15), 
%zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17 + vpclmulqdq $16, %zmm24, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 384(%r15), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17 + vpclmulqdq $16, %zmm25, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 448(%r15), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17 + vpclmulqdq $16, %zmm26, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vpclmulqdq $0x01, %zmm27, %zmm31, %zmm21 + vpshufd $0x4e, %zmm27, %zmm27 + vpternlogq $0x96, %zmm21, %zmm27, %zmm28 + vpclmulqdq $0x01, %zmm28, %zmm31, %zmm21 + vpshufd $0x4e, %zmm28, %zmm28 + vpternlogq $0x96, %zmm21, %zmm28, %zmm29 + vextracti32x4 $0x01, %zmm29, %xmm0 + vextracti32x4 $2, %zmm29, %xmm4 + vextracti32x4 $3, %zmm29, %xmm5 + vpxorq %xmm0, %xmm29, %xmm6 + vpternlogq $0x96, %xmm4, %xmm5, %xmm6 +L_AES_GCM_encrypt_update_avx512_no_windows: + vmovdqu64 192(%rsp), %zmm23 + vshufi64x2 $27, %zmm23, %zmm23, %zmm23 + vmovdqu64 128(%rsp), %zmm24 + vshufi64x2 $27, %zmm24, %zmm24, %zmm24 + vmovdqu64 64(%rsp), %zmm25 + vshufi64x2 $27, %zmm25, %zmm25, %zmm25 + vmovdqu64 (%rsp), %zmm26 + vshufi64x2 $27, %zmm26, %zmm26, %zmm26 + movl %r8d, %r13d + andl $0xffffff00, %r13d + cmpl %r13d, %r14d + jge L_AES_GCM_encrypt_update_avx512_after_256 + # 256 bytes of input + vbroadcasti32x4 (%r12), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd 
L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu (%r12), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, (%r12) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %esi + vbroadcasti32x4 160(%rdi), %zmm20 + jl L_AES_GCM_encrypt_update_avx512_pro_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, 
%zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %esi + vbroadcasti32x4 192(%rdi), %zmm20 + jl L_AES_GCM_encrypt_update_avx512_pro_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%rdi), %zmm20 +L_AES_GCM_encrypt_update_avx512_pro_avx512_ctr16_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%r11,%r14,1), %rcx + leaq (%r10,%r14,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + movq %rdx, %r15 + addl $0x100, %r14d + cmpl %r13d, %r14d + jge L_AES_GCM_encrypt_update_avx512_last_ghash +L_AES_GCM_encrypt_update_avx512_ghash_128: + vbroadcasti32x4 (%r12), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu (%r12), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, (%r12) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, 
%zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %esi + vbroadcasti32x4 160(%rdi), %zmm20 + jl L_AES_GCM_encrypt_update_avx512_pip_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %esi + vbroadcasti32x4 192(%rdi), %zmm20 + jl L_AES_GCM_encrypt_update_avx512_pip_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%rdi), %zmm20 +L_AES_GCM_encrypt_update_avx512_pip_avx512_ctr16_last: 
+ vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%r11,%r14,1), %rcx + leaq (%r10,%r14,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + vpxorq %zmm20, %zmm20, %zmm20 + vinserti32x4 $0x00, %xmm6, %zmm20, %zmm20 + vmovdqu64 (%r15), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpxorq %zmm20, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17 + vpclmulqdq $16, %zmm23, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19 + vmovdqa64 %zmm16, %zmm27 + vpxorq %zmm17, %zmm18, %zmm28 + vmovdqa64 %zmm19, %zmm29 + vmovdqu64 64(%r15), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17 + vpclmulqdq $16, %zmm24, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 128(%r15), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17 + vpclmulqdq $16, %zmm25, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 192(%r15), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17 + vpclmulqdq $16, %zmm26, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, 
%zmm29 + vpclmulqdq $0x01, %zmm27, %zmm31, %zmm21 + vpshufd $0x4e, %zmm27, %zmm27 + vpternlogq $0x96, %zmm21, %zmm27, %zmm28 + vpclmulqdq $0x01, %zmm28, %zmm31, %zmm21 + vpshufd $0x4e, %zmm28, %zmm28 + vpternlogq $0x96, %zmm21, %zmm28, %zmm29 + vextracti32x4 $0x01, %zmm29, %xmm0 + vextracti32x4 $2, %zmm29, %xmm4 + vextracti32x4 $3, %zmm29, %xmm5 + vpxorq %xmm0, %xmm29, %xmm6 + vpternlogq $0x96, %xmm4, %xmm5, %xmm6 + movq %rdx, %r15 + addl $0x100, %r14d + cmpl %r13d, %r14d + jl L_AES_GCM_encrypt_update_avx512_ghash_128 +L_AES_GCM_encrypt_update_avx512_last_ghash: + vpxorq %zmm20, %zmm20, %zmm20 + vinserti32x4 $0x00, %xmm6, %zmm20, %zmm20 + vmovdqu64 (%r15), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpxorq %zmm20, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17 + vpclmulqdq $16, %zmm23, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19 + vmovdqa64 %zmm16, %zmm27 + vpxorq %zmm17, %zmm18, %zmm28 + vmovdqa64 %zmm19, %zmm29 + vmovdqu64 64(%r15), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17 + vpclmulqdq $16, %zmm24, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 128(%r15), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17 + vpclmulqdq $16, %zmm25, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 192(%r15), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17 + vpclmulqdq $16, %zmm26, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, 
%zmm29, %zmm29 + vpclmulqdq $0x01, %zmm27, %zmm31, %zmm21 + vpshufd $0x4e, %zmm27, %zmm27 + vpternlogq $0x96, %zmm21, %zmm27, %zmm28 + vpclmulqdq $0x01, %zmm28, %zmm31, %zmm21 + vpshufd $0x4e, %zmm28, %zmm28 + vpternlogq $0x96, %zmm21, %zmm28, %zmm29 + vextracti32x4 $0x01, %zmm29, %xmm0 + vextracti32x4 $2, %zmm29, %xmm4 + vextracti32x4 $3, %zmm29, %xmm5 + vpxorq %xmm0, %xmm29, %xmm6 + vpternlogq $0x96, %xmm4, %xmm5, %xmm6 +L_AES_GCM_encrypt_update_avx512_after_256: + vmovdqu (%rsp), %xmm5 +L_AES_GCM_encrypt_update_avx512_done_128: + movl %r8d, %edx + cmpl %edx, %r14d + jge L_AES_GCM_encrypt_update_avx512_done_enc + movl %r8d, %r13d + andl $0xfffffff0, %r13d + cmpl %r13d, %r14d + jge L_AES_GCM_encrypt_update_avx512_last_block_done + vmovdqu (%r12), %xmm9 + vpshufb L_avx512_aes_gcm_bswap_epi64(%rip), %xmm9, %xmm8 + vpaddd L_avx512_aes_gcm_one(%rip), %xmm9, %xmm9 + vmovdqu %xmm9, (%r12) + vpxor (%rdi), %xmm8, %xmm8 + vaesenc 16(%rdi), %xmm8, %xmm8 + vaesenc 32(%rdi), %xmm8, %xmm8 + vaesenc 48(%rdi), %xmm8, %xmm8 + vaesenc 64(%rdi), %xmm8, %xmm8 + vaesenc 80(%rdi), %xmm8, %xmm8 + vaesenc 96(%rdi), %xmm8, %xmm8 + vaesenc 112(%rdi), %xmm8, %xmm8 + vaesenc 128(%rdi), %xmm8, %xmm8 + vaesenc 144(%rdi), %xmm8, %xmm8 + cmpl $11, %esi + vmovdqa 160(%rdi), %xmm9 + jl L_AES_GCM_encrypt_update_avx512_aesenc_block_last + vaesenc %xmm9, %xmm8, %xmm8 + vaesenc 176(%rdi), %xmm8, %xmm8 + cmpl $13, %esi + vmovdqa 192(%rdi), %xmm9 + jl L_AES_GCM_encrypt_update_avx512_aesenc_block_last + vaesenc %xmm9, %xmm8, %xmm8 + vaesenc 208(%rdi), %xmm8, %xmm8 + vmovdqa 224(%rdi), %xmm9 +L_AES_GCM_encrypt_update_avx512_aesenc_block_last: + vaesenclast %xmm9, %xmm8, %xmm8 + vmovdqu (%r11,%r14,1), %xmm9 + vpxor %xmm9, %xmm8, %xmm8 + vmovdqu %xmm8, (%r10,%r14,1) + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8 + vpxor %xmm8, %xmm6, %xmm6 + addl $16, %r14d + cmpl %r13d, %r14d + jge L_AES_GCM_encrypt_update_avx512_last_block_ghash +L_AES_GCM_encrypt_update_avx512_last_block_start: + vmovdqu 
(%r11,%r14,1), %xmm13 + vmovdqu (%r12), %xmm9 + vpshufb L_avx512_aes_gcm_bswap_epi64(%rip), %xmm9, %xmm8 + vpaddd L_avx512_aes_gcm_one(%rip), %xmm9, %xmm9 + vmovdqu %xmm9, (%r12) + vpxor (%rdi), %xmm8, %xmm8 + vpclmulqdq $16, %xmm5, %xmm6, %xmm10 + vaesenc 16(%rdi), %xmm8, %xmm8 + vaesenc 32(%rdi), %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm5, %xmm6, %xmm11 + vaesenc 48(%rdi), %xmm8, %xmm8 + vaesenc 64(%rdi), %xmm8, %xmm8 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm12 + vaesenc 80(%rdi), %xmm8, %xmm8 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm1 + vaesenc 96(%rdi), %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpslldq $8, %xmm10, %xmm2 + vpsrldq $8, %xmm10, %xmm10 + vaesenc 112(%rdi), %xmm8, %xmm8 + vpxor %xmm12, %xmm2, %xmm2 + vpxor %xmm10, %xmm1, %xmm3 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm0 + vpclmulqdq $16, %xmm0, %xmm2, %xmm11 + vaesenc 128(%rdi), %xmm8, %xmm8 + vpshufd $0x4e, %xmm2, %xmm10 + vpxor %xmm11, %xmm10, %xmm10 + vpclmulqdq $16, %xmm0, %xmm10, %xmm11 + vaesenc 144(%rdi), %xmm8, %xmm8 + vpshufd $0x4e, %xmm10, %xmm10 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm3, %xmm10, %xmm6 + cmpl $11, %esi + vmovdqa 160(%rdi), %xmm9 + jl L_AES_GCM_encrypt_update_avx512_aesenc_gfmul_last + vaesenc %xmm9, %xmm8, %xmm8 + vaesenc 176(%rdi), %xmm8, %xmm8 + cmpl $13, %esi + vmovdqa 192(%rdi), %xmm9 + jl L_AES_GCM_encrypt_update_avx512_aesenc_gfmul_last + vaesenc %xmm9, %xmm8, %xmm8 + vaesenc 208(%rdi), %xmm8, %xmm8 + vmovdqa 224(%rdi), %xmm9 +L_AES_GCM_encrypt_update_avx512_aesenc_gfmul_last: + vaesenclast %xmm9, %xmm8, %xmm8 + vmovdqa %xmm13, %xmm0 + vpxor %xmm0, %xmm8, %xmm8 + vmovdqu %xmm8, (%r10,%r14,1) + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8 + addl $16, %r14d + vpxor %xmm8, %xmm6, %xmm6 + cmpl %r13d, %r14d + jl L_AES_GCM_encrypt_update_avx512_last_block_start +L_AES_GCM_encrypt_update_avx512_last_block_ghash: + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm5, %xmm9 + vpxor %xmm5, %xmm9, %xmm9 + vpshufd $0x4e, %xmm6, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + 
vpclmulqdq $0x00, %xmm5, %xmm6, %xmm8 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm6 +L_AES_GCM_encrypt_update_avx512_last_block_done: +L_AES_GCM_encrypt_update_avx512_done_enc: + vmovdqa %xmm6, (%r9) + vzeroupper + addq $0x410, %rsp + popq %rbp + popq %rbx + popq %r15 + popq %r14 + popq %r12 + popq %r13 + repz retq +#ifndef __APPLE__ +.size AES_GCM_encrypt_update_avx512,.-AES_GCM_encrypt_update_avx512 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_GCM_encrypt_final_avx512 +.type AES_GCM_encrypt_final_avx512,@function +.align 16 +AES_GCM_encrypt_final_avx512: +#else +.section __TEXT,__text +.globl _AES_GCM_encrypt_final_avx512 +.p2align 4 +_AES_GCM_encrypt_final_avx512: +#endif /* __APPLE__ */ + pushq %r13 /* AES_GCM_encrypt_final_avx512: XORs a block built from two counts into the 16 bytes at (%rdi), multiplies that by the prepared value from (%r9) with reduction (ghash_gfmul_red_avx), byte-shuffles the product, XORs it with the 16 bytes behind the stack-passed pointer and stores eax bytes of the result (the tag) at (%rsi). Register roles assume the System V AMD64 argument order; TODO confirm against the C prototype. r13 is saved here because the byte copy loop uses it as a temporary. */ + movl %edx, %eax /* eax = 3rd integer argument: number of tag bytes to store (compared with 16 and used as the copy loop bound below) */ + movl %ecx, %r10d /* r10d = 4th integer argument, saved because ecx and edx are reused below */ + movl %r8d, %r11d /* r11d = 5th integer argument, saved because r8 is overwritten by the next instruction */ + movq 16(%rsp), %r8 /* r8 = first stack-passed argument (a pointer); offset 16 skips the saved r13 and the return address */ + subq $16, %rsp /* reserve 16 bytes of scratch, used only by the partial tag copy */ + vmovdqa (%rdi), %xmm4 /* xmm4 = 16 bytes at (%rdi), presumably the running GHASH state; vmovdqa needs a 16-byte aligned address */ + vmovdqa (%r9), %xmm5 /* xmm5 = 16 bytes at (%r9), presumably the hash key H - TODO confirm */ + vmovdqa (%r8), %xmm6 /* xmm6 = 16 bytes at the stack-passed pointer; XORed into the result at the end, presumably the encrypted initial counter block - TODO confirm */ + vpsrlq $63, %xmm5, %xmm8 + vpsllq $0x01, %xmm5, %xmm7 + vpslldq $8, %xmm8, %xmm8 + vpor %xmm8, %xmm7, %xmm7 /* xmm7 = xmm5 shifted left by one bit as a single 128-bit value: this and the three instructions above carry bit 63 into bit 64 */ + vpshufd $0xff, %xmm5, %xmm5 + vpsrad $31, %xmm5, %xmm5 /* together with the vpshufd above: every bit of xmm5 now equals its original bit 127 */ + vpand L_avx512_aes_gcm_mod2_128(%rip), %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 /* xmm5 = shifted value, XORed with the mod2_128 constant only when bit 127 was set */ + movl %r10d, %edx + movl %r11d, %ecx + shlq $3, %rdx + shlq $3, %rcx /* both saved counts are multiplied by 8, presumably converting byte lengths to bit lengths */ + vmovq %rdx, %xmm0 + vmovq %rcx, %xmm1 + vpunpcklqdq %xmm1, %xmm0, %xmm0 /* xmm0: low qword = 4th argument times 8, high qword = 5th argument times 8 */ + vpxor %xmm0, %xmm4, %xmm4 /* fold that length block into xmm4 before the multiply */ + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm5, %xmm8 + vpxor %xmm5, %xmm8, %xmm8 /* vpshufd 0x4e swaps the 64-bit halves, so each qword of xmm8 = low ^ high qword of xmm5 */ + vpshufd $0x4e, %xmm4, %xmm9 + vpxor %xmm4, %xmm9, %xmm9 /* same for xmm4: each qword of xmm9 = low ^ high qword of xmm4 */ + vpclmulqdq $0x00, %xmm5, %xmm4, %xmm7 /* xmm7 = low qword times low qword */ + vpclmulqdq $0x11, %xmm5, %xmm4, %xmm10 /* xmm10 = high qword times high qword */ + vpclmulqdq $0x00, %xmm9, %xmm8, %xmm8 /* Karatsuba cross product: (low ^ high) of one operand times (low ^ high) of the other */ + vpternlogq $0x96, %xmm7, %xmm10, %xmm8 /* imm 0x96 is a three-way XOR: xmm8 = xmm8 ^ xmm10 ^ xmm7, the middle term */ +
vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm9 /* xmm9 = mod2_128 constant used by both folds below */ + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpternlogq $0x96, %xmm11, %xmm7, %xmm8 /* first fold: half-swapped low product and its carry-less product with the constant are XORed into the middle term */ + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm11, %xmm8, %xmm10 /* second fold, same pattern, into the high product */ + vmovdqa %xmm10, %xmm4 /* xmm4 = reduced 128-bit product */ + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4 /* byte shuffle with the bswap mask constant, which is defined outside this view */ + vpxor %xmm6, %xmm4, %xmm0 /* xmm0 = value to store as the tag */ + cmpl $16, %eax + je L_AES_GCM_encrypt_final_avx512_store_tag_16 /* exactly 16 bytes requested: one unaligned 16-byte store */ + xorq %rcx, %rcx /* rcx = byte index, starting at 0 */ + vmovdqu %xmm0, (%rsp) /* spill the tag to the stack scratch so it can be copied byte by byte */ +L_AES_GCM_encrypt_final_avx512_store_tag_loop: + movzbl (%rsp,%rcx,1), %r13d + movb %r13b, (%rsi,%rcx,1) + incl %ecx + cmpl %eax, %ecx + jne L_AES_GCM_encrypt_final_avx512_store_tag_loop /* copies eax bytes from the scratch to (%rsi). NOTE(review): the bound is tested only after a byte has been copied, so eax == 0 would keep copying until ecx wraps; presumably callers never pass 0 - confirm */ + jmp L_AES_GCM_encrypt_final_avx512_store_tag_done +L_AES_GCM_encrypt_final_avx512_store_tag_16: + vmovdqu %xmm0, (%rsi) +L_AES_GCM_encrypt_final_avx512_store_tag_done: + vzeroupper + addq $16, %rsp /* release the 16-byte scratch */ + popq %r13 + repz retq +#ifndef __APPLE__ +.size AES_GCM_encrypt_final_avx512,.-AES_GCM_encrypt_final_avx512 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_GCM_decrypt_update_avx512 +.type AES_GCM_decrypt_update_avx512,@function +.align 16 +AES_GCM_decrypt_update_avx512: +#else +.section __TEXT,__text +.globl _AES_GCM_decrypt_update_avx512 +.p2align 4 +_AES_GCM_decrypt_update_avx512: +#endif /* __APPLE__ */ + pushq %r13 + pushq %r12 + pushq %r14 + pushq %r15 + pushq %rbx + movq %rdx, %r10 + movq %rcx, %r11 + movq 48(%rsp), %rax + movq 56(%rsp), %r12 + subq $0x410, %rsp + vmovdqa (%r9), %xmm6 + vmovdqa (%rax), %xmm5 + vpsrlq $63, %xmm5, %xmm9 + vpsllq $0x01, %xmm5, %xmm8 + vpslldq $8, %xmm9, %xmm9 + vpor %xmm9, %xmm8, %xmm8 + vpshufd $0xff, %xmm5, %xmm5 + vpsrad $31, %xmm5, %xmm5 + vpand L_avx512_aes_gcm_mod2_128(%rip), %xmm5, %xmm5 + vpxor %xmm8, %xmm5, %xmm5 + xorl %r14d, %r14d + cmpl $0x100, %r8d + jl L_AES_GCM_decrypt_update_avx512_done_128 + vmovdqa %xmm6, %xmm2 + # H ^ 1 + vmovdqu %xmm5, (%rsp) + # H ^ 2 + vpclmulqdq $0x00, %xmm5, %xmm5, %xmm8 + vpclmulqdq $0x11, %xmm5, %xmm5, %xmm11
+ vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm0 + vmovdqu %xmm0, 16(%rsp) + # H ^ 3 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm5, %xmm9 + vpxor %xmm5, %xmm9, %xmm9 + vpshufd $0x4e, %xmm0, %xmm10 + vpxor %xmm0, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm5, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm5, %xmm0, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm1 + vmovdqu %xmm1, 32(%rsp) + # H ^ 4 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm3 + vmovdqu %xmm3, 48(%rsp) + # H ^ 5 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + 
vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 64(%rsp) + # H ^ 6 + vpclmulqdq $0x00, %xmm1, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm1, %xmm1, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 80(%rsp) + # H ^ 7 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm1, %xmm9 + vpxor %xmm1, %xmm9, %xmm9 + vpshufd $0x4e, %xmm3, %xmm10 + vpxor %xmm3, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm1, %xmm3, %xmm8 + vpclmulqdq $0x11, %xmm1, %xmm3, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 96(%rsp) + # H ^ 8 + vpclmulqdq $0x00, %xmm3, %xmm3, %xmm8 + vpclmulqdq $0x11, %xmm3, %xmm3, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 112(%rsp) + # H ^ 9 + vmovdqu 48(%rsp), %xmm0 + vmovdqu 64(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + 
vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 128(%rsp) + # H ^ 10 + vmovdqu 64(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 144(%rsp) + # H ^ 11 + vmovdqu 64(%rsp), %xmm0 + vmovdqu 80(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 160(%rsp) + # H ^ 12 + vmovdqu 80(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 176(%rsp) + # H ^ 13 + vmovdqu 
80(%rsp), %xmm0 + vmovdqu 96(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 192(%rsp) + # H ^ 14 + vmovdqu 96(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 208(%rsp) + # H ^ 15 + vmovdqu 96(%rsp), %xmm0 + vmovdqu 112(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 224(%rsp) + # H ^ 16 + vmovdqu 112(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + 
vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 240(%rsp) + cmpl $0x200, %r8d + jl L_AES_GCM_decrypt_update_avx512_no_ext + # H ^ 17 + vmovdqu 112(%rsp), %xmm0 + vmovdqu 128(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 256(%rsp) + # H ^ 18 + vmovdqu 128(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 272(%rsp) + # H ^ 19 + vmovdqu 128(%rsp), %xmm0 + vmovdqu 144(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq 
$0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 288(%rsp) + # H ^ 20 + vmovdqu 144(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 304(%rsp) + # H ^ 21 + vmovdqu 144(%rsp), %xmm0 + vmovdqu 160(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 320(%rsp) + # H ^ 22 + vmovdqu 160(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 336(%rsp) + # H ^ 23 + vmovdqu 160(%rsp), %xmm0 + vmovdqu 176(%rsp), %xmm1 + # 
ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 352(%rsp) + # H ^ 24 + vmovdqu 176(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 368(%rsp) + # H ^ 25 + vmovdqu 176(%rsp), %xmm0 + vmovdqu 192(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 384(%rsp) + # H ^ 26 + vmovdqu 192(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), 
%xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 400(%rsp) + # H ^ 27 + vmovdqu 192(%rsp), %xmm0 + vmovdqu 208(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 416(%rsp) + # H ^ 28 + vmovdqu 208(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 432(%rsp) + # H ^ 29 + vmovdqu 208(%rsp), %xmm0 + vmovdqu 224(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + 
vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 448(%rsp) + # H ^ 30 + vmovdqu 224(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 464(%rsp) + # H ^ 31 + vmovdqu 224(%rsp), %xmm0 + vmovdqu 240(%rsp), %xmm1 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm0, %xmm9 + vpxor %xmm0, %xmm9, %xmm9 + vpshufd $0x4e, %xmm1, %xmm10 + vpxor %xmm1, %xmm10, %xmm10 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm11 + vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 + vpternlogq $0x96, %xmm8, %xmm11, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 480(%rsp) + # H ^ 32 + vmovdqu 240(%rsp), %xmm0 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm11 + vpxor %xmm9, %xmm9, %xmm9 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm10 + vpclmulqdq $0x01, %xmm8, %xmm10, %xmm12 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm12, %xmm8, %xmm9 + vpclmulqdq $0x01, %xmm9, %xmm10, %xmm12 + vpshufd $0x4e, %xmm9, %xmm9 + vpternlogq $0x96, %xmm12, %xmm9, %xmm11 + vmovdqa %xmm11, %xmm7 + vmovdqu %xmm7, 496(%rsp) +L_AES_GCM_decrypt_update_avx512_no_ext: + vbroadcasti32x4 L_avx512_aes_gcm_bswap_epi64(%rip), %zmm22 + vbroadcasti32x4 L_avx512_aes_gcm_bswap_mask(%rip), %zmm30 + vbroadcasti32x4 
L_avx512_aes_gcm_mod2_128(%rip), %zmm31 + vbroadcasti32x4 (%rdi), %zmm9 + vbroadcasti32x4 16(%rdi), %zmm10 + vbroadcasti32x4 32(%rdi), %zmm11 + vbroadcasti32x4 48(%rdi), %zmm12 + vbroadcasti32x4 64(%rdi), %zmm13 + vbroadcasti32x4 80(%rdi), %zmm14 + vbroadcasti32x4 96(%rdi), %zmm15 + vbroadcasti32x4 112(%rdi), %zmm1 + vbroadcasti32x4 128(%rdi), %zmm2 + vbroadcasti32x4 144(%rdi), %zmm3 + cmpl $0x200, %r8d + jl L_AES_GCM_decrypt_update_avx512_no_windows + movl %r8d, %r13d + andl $0xfffffe00, %r13d + vmovdqu64 448(%rsp), %zmm23 + vshufi64x2 $27, %zmm23, %zmm23, %zmm23 + vmovdqu64 384(%rsp), %zmm24 + vshufi64x2 $27, %zmm24, %zmm24, %zmm24 + vmovdqu64 320(%rsp), %zmm25 + vshufi64x2 $27, %zmm25, %zmm25, %zmm25 + vmovdqu64 256(%rsp), %zmm26 + vshufi64x2 $27, %zmm26, %zmm26, %zmm26 + vmovdqu64 %zmm23, 512(%rsp) + vmovdqu64 %zmm24, 576(%rsp) + vmovdqu64 %zmm25, 640(%rsp) + vmovdqu64 %zmm26, 704(%rsp) + vmovdqu64 192(%rsp), %zmm23 + vshufi64x2 $27, %zmm23, %zmm23, %zmm23 + vmovdqu64 128(%rsp), %zmm24 + vshufi64x2 $27, %zmm24, %zmm24, %zmm24 + vmovdqu64 64(%rsp), %zmm25 + vshufi64x2 $27, %zmm25, %zmm25, %zmm25 + vmovdqu64 (%rsp), %zmm26 + vshufi64x2 $27, %zmm26, %zmm26, %zmm26 + vmovdqu64 %zmm23, 768(%rsp) + vmovdqu64 %zmm24, 832(%rsp) + vmovdqu64 %zmm25, 896(%rsp) + vmovdqu64 %zmm26, 960(%rsp) + # 512 bytes of input + xorl %r15d, %r15d + leaq (%r11,%r14,1), %rbx + vpxorq %zmm20, %zmm20, %zmm20 + vinserti32x4 $0x00, %xmm6, %zmm20, %zmm20 + vmovdqu64 512(%rsp), %zmm23 + vmovdqu64 576(%rsp), %zmm24 + vmovdqu64 640(%rsp), %zmm25 + vmovdqu64 704(%rsp), %zmm26 + vmovdqu64 (%rbx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpxorq %zmm20, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17 + vpclmulqdq $16, %zmm23, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19 + vmovdqa64 %zmm16, %zmm27 + vpxorq %zmm17, %zmm18, %zmm28 + vmovdqa64 %zmm19, %zmm29 + vmovdqu64 64(%rbx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, 
%zmm24, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17 + vpclmulqdq $16, %zmm24, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 128(%rbx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17 + vpclmulqdq $16, %zmm25, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 192(%rbx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17 + vpclmulqdq $16, %zmm26, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 768(%rsp), %zmm23 + vmovdqu64 832(%rsp), %zmm24 + vmovdqu64 896(%rsp), %zmm25 + vmovdqu64 960(%rsp), %zmm26 + vmovdqu64 256(%rbx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17 + vpclmulqdq $16, %zmm23, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 320(%rbx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17 + vpclmulqdq $16, %zmm24, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 384(%rbx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17 + vpclmulqdq $16, %zmm25, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, 
%zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 448(%rbx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17 + vpclmulqdq $16, %zmm26, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vpclmulqdq $0x01, %zmm27, %zmm31, %zmm21 + vpshufd $0x4e, %zmm27, %zmm27 + vpternlogq $0x96, %zmm21, %zmm27, %zmm28 + vpclmulqdq $0x01, %zmm28, %zmm31, %zmm21 + vpshufd $0x4e, %zmm28, %zmm28 + vpternlogq $0x96, %zmm21, %zmm28, %zmm29 + vextracti32x4 $0x01, %zmm29, %xmm0 + vextracti32x4 $2, %zmm29, %xmm4 + vextracti32x4 $3, %zmm29, %xmm5 + vpxorq %xmm0, %xmm29, %xmm6 + vpternlogq $0x96, %xmm4, %xmm5, %xmm6 + addl $0x200, %r14d + cmpl %r13d, %r14d + jge L_AES_GCM_decrypt_update_avx512_last_aes +L_AES_GCM_decrypt_update_avx512_win_loop: + leaq (%r11,%r14,1), %rbx + vpxorq %zmm21, %zmm21, %zmm21 + vinserti32x4 $0x00, %xmm6, %zmm21, %zmm21 + vbroadcasti32x4 (%r12), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu (%r12), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, (%r12) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vmovdqu64 (%rbx), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vpxorq %zmm21, %zmm31, %zmm31 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vpclmulqdq $0x00, 512(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 512(%rsp), %zmm31, %zmm24 + 
vpclmulqdq $16, 512(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 512(%rsp), %zmm31, %zmm26 + vmovdqa64 %zmm23, %zmm27 + vpxorq %zmm24, %zmm25, %zmm28 + vmovdqa64 %zmm26, %zmm29 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vmovdqu64 64(%rbx), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vpclmulqdq $0x00, 576(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 576(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 576(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 576(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vmovdqu64 128(%rbx), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vpclmulqdq $0x00, 640(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 640(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 640(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 640(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vmovdqu64 192(%rbx), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vpclmulqdq $0x00, 704(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 704(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 704(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 704(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq 
%zmm26, %zmm29, %zmm29 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %esi + vbroadcasti32x4 160(%rdi), %zmm20 + jl L_AES_GCM_decrypt_update_avx512_a_il_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %esi + vbroadcasti32x4 192(%rdi), %zmm20 + jl L_AES_GCM_decrypt_update_avx512_a_il_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%rdi), %zmm20 +L_AES_GCM_decrypt_update_avx512_a_il_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%r11,%r15,1), %rcx + leaq (%r10,%r15,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + addl $0x100, %r15d + vbroadcasti32x4 (%r12), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, 
%zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu (%r12), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, (%r12) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vmovdqu64 256(%rbx), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vpclmulqdq $0x00, 768(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 768(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 768(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 768(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vmovdqu64 320(%rbx), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vpclmulqdq $0x00, 832(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 832(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 832(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 832(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vmovdqu64 384(%rbx), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vpclmulqdq $0x00, 896(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 896(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 896(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 896(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + 
vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vmovdqu64 448(%rbx), %zmm31 + vpshufb %zmm30, %zmm31, %zmm31 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vpclmulqdq $0x00, 960(%rsp), %zmm31, %zmm23 + vpclmulqdq $0x01, 960(%rsp), %zmm31, %zmm24 + vpclmulqdq $16, 960(%rsp), %zmm31, %zmm25 + vpclmulqdq $0x11, 960(%rsp), %zmm31, %zmm26 + vpxorq %zmm23, %zmm27, %zmm27 + vpternlogq $0x96, %zmm24, %zmm25, %zmm28 + vpxorq %zmm26, %zmm29, %zmm29 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %esi + vbroadcasti32x4 160(%rdi), %zmm20 + jl L_AES_GCM_decrypt_update_avx512_b_il_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %esi + vbroadcasti32x4 192(%rdi), %zmm20 + jl L_AES_GCM_decrypt_update_avx512_b_il_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%rdi), %zmm20 +L_AES_GCM_decrypt_update_avx512_b_il_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%r11,%r15,1), %rcx + 
leaq (%r10,%r15,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + addl $0x100, %r15d + vbroadcasti32x4 L_avx512_aes_gcm_mod2_128(%rip), %zmm31 + vpclmulqdq $0x01, %zmm27, %zmm31, %zmm23 + vpshufd $0x4e, %zmm27, %zmm27 + vpternlogq $0x96, %zmm23, %zmm27, %zmm28 + vpclmulqdq $0x01, %zmm28, %zmm31, %zmm23 + vpshufd $0x4e, %zmm28, %zmm28 + vpternlogq $0x96, %zmm23, %zmm28, %zmm29 + vextracti32x4 $0x01, %zmm29, %xmm0 + vextracti32x4 $2, %zmm29, %xmm4 + vextracti32x4 $3, %zmm29, %xmm5 + vpxorq %xmm0, %xmm29, %xmm6 + vpternlogq $0x96, %xmm4, %xmm5, %xmm6 + addl $0x200, %r14d + cmpl %r13d, %r14d + jl L_AES_GCM_decrypt_update_avx512_win_loop +L_AES_GCM_decrypt_update_avx512_last_aes: + vbroadcasti32x4 (%r12), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu (%r12), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, (%r12) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, 
%zmm19, %zmm19 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %esi + vbroadcasti32x4 160(%rdi), %zmm20 + jl L_AES_GCM_decrypt_update_avx512_l1_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %esi + vbroadcasti32x4 192(%rdi), %zmm20 + jl L_AES_GCM_decrypt_update_avx512_l1_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%rdi), %zmm20 +L_AES_GCM_decrypt_update_avx512_l1_avx512_ctr16_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%r11,%r15,1), %rcx + leaq (%r10,%r15,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + 
vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + addl $0x100, %r15d + vbroadcasti32x4 (%r12), %zmm20 + vpaddd L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu (%r12), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, (%r12) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc 
%zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %esi + vbroadcasti32x4 160(%rdi), %zmm20 + jl L_AES_GCM_decrypt_update_avx512_l2_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 176(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %esi + vbroadcasti32x4 192(%rdi), %zmm20 + jl L_AES_GCM_decrypt_update_avx512_l2_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%rdi), %zmm20 +L_AES_GCM_decrypt_update_avx512_l2_avx512_ctr16_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%r11,%r15,1), %rcx + leaq (%r10,%r15,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + addl $0x100, %r15d +L_AES_GCM_decrypt_update_avx512_no_windows: + vmovdqu64 192(%rsp), %zmm23 + vshufi64x2 $27, %zmm23, %zmm23, %zmm23 + vmovdqu64 128(%rsp), %zmm24 + vshufi64x2 $27, %zmm24, %zmm24, %zmm24 + vmovdqu64 64(%rsp), %zmm25 + vshufi64x2 $27, %zmm25, %zmm25, %zmm25 + vmovdqu64 (%rsp), %zmm26 + vshufi64x2 $27, %zmm26, %zmm26, %zmm26 + movl %r8d, %r13d + andl $0xffffff00, %r13d + cmpl %r13d, %r14d + jge 
L_AES_GCM_decrypt_update_avx512_after_256 + # 256 bytes of input + leaq (%r11,%r14,1), %rbx + vpxorq %zmm20, %zmm20, %zmm20 + vinserti32x4 $0x00, %xmm6, %zmm20, %zmm20 + vmovdqu64 (%rbx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpxorq %zmm20, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm23, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm23, %zmm21, %zmm17 + vpclmulqdq $16, %zmm23, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm23, %zmm21, %zmm19 + vmovdqa64 %zmm16, %zmm27 + vpxorq %zmm17, %zmm18, %zmm28 + vmovdqa64 %zmm19, %zmm29 + vmovdqu64 64(%rbx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm24, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm24, %zmm21, %zmm17 + vpclmulqdq $16, %zmm24, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm24, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 128(%rbx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm25, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm25, %zmm21, %zmm17 + vpclmulqdq $16, %zmm25, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm25, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vmovdqu64 192(%rbx), %zmm21 + vpshufb %zmm30, %zmm21, %zmm21 + vpclmulqdq $0x00, %zmm26, %zmm21, %zmm16 + vpclmulqdq $0x01, %zmm26, %zmm21, %zmm17 + vpclmulqdq $16, %zmm26, %zmm21, %zmm18 + vpclmulqdq $0x11, %zmm26, %zmm21, %zmm19 + vpxorq %zmm16, %zmm27, %zmm27 + vpternlogq $0x96, %zmm17, %zmm18, %zmm28 + vpxorq %zmm19, %zmm29, %zmm29 + vpclmulqdq $0x01, %zmm27, %zmm31, %zmm21 + vpshufd $0x4e, %zmm27, %zmm27 + vpternlogq $0x96, %zmm21, %zmm27, %zmm28 + vpclmulqdq $0x01, %zmm28, %zmm31, %zmm21 + vpshufd $0x4e, %zmm28, %zmm28 + vpternlogq $0x96, %zmm21, %zmm28, %zmm29 + vextracti32x4 $0x01, %zmm29, %xmm0 + vextracti32x4 $2, %zmm29, %xmm4 + vextracti32x4 $3, %zmm29, %xmm5 + vpxorq %xmm0, %xmm29, %xmm6 + vpternlogq $0x96, %xmm4, %xmm5, %xmm6 + vbroadcasti32x4 (%r12), %zmm20 + vpaddd 
L_avx512_aes_gcm_inc_z0(%rip), %zmm20, %zmm16 + vpshufb %zmm22, %zmm16, %zmm16 + vpaddd L_avx512_aes_gcm_inc_z1(%rip), %zmm20, %zmm17 + vpshufb %zmm22, %zmm17, %zmm17 + vpaddd L_avx512_aes_gcm_inc_z2(%rip), %zmm20, %zmm18 + vpshufb %zmm22, %zmm18, %zmm18 + vpaddd L_avx512_aes_gcm_inc_z3(%rip), %zmm20, %zmm19 + vpshufb %zmm22, %zmm19, %zmm19 + vmovdqu (%r12), %xmm8 + vpaddd L_avx512_aes_gcm_sixteen(%rip), %xmm8, %xmm8 + vmovdqu %xmm8, (%r12) + vpxorq %zmm9, %zmm16, %zmm16 + vpxorq %zmm9, %zmm17, %zmm17 + vpxorq %zmm9, %zmm18, %zmm18 + vpxorq %zmm9, %zmm19, %zmm19 + vaesenc %zmm10, %zmm16, %zmm16 + vaesenc %zmm10, %zmm17, %zmm17 + vaesenc %zmm10, %zmm18, %zmm18 + vaesenc %zmm10, %zmm19, %zmm19 + vaesenc %zmm11, %zmm16, %zmm16 + vaesenc %zmm11, %zmm17, %zmm17 + vaesenc %zmm11, %zmm18, %zmm18 + vaesenc %zmm11, %zmm19, %zmm19 + vaesenc %zmm12, %zmm16, %zmm16 + vaesenc %zmm12, %zmm17, %zmm17 + vaesenc %zmm12, %zmm18, %zmm18 + vaesenc %zmm12, %zmm19, %zmm19 + vaesenc %zmm13, %zmm16, %zmm16 + vaesenc %zmm13, %zmm17, %zmm17 + vaesenc %zmm13, %zmm18, %zmm18 + vaesenc %zmm13, %zmm19, %zmm19 + vaesenc %zmm14, %zmm16, %zmm16 + vaesenc %zmm14, %zmm17, %zmm17 + vaesenc %zmm14, %zmm18, %zmm18 + vaesenc %zmm14, %zmm19, %zmm19 + vaesenc %zmm15, %zmm16, %zmm16 + vaesenc %zmm15, %zmm17, %zmm17 + vaesenc %zmm15, %zmm18, %zmm18 + vaesenc %zmm15, %zmm19, %zmm19 + vaesenc %zmm1, %zmm16, %zmm16 + vaesenc %zmm1, %zmm17, %zmm17 + vaesenc %zmm1, %zmm18, %zmm18 + vaesenc %zmm1, %zmm19, %zmm19 + vaesenc %zmm2, %zmm16, %zmm16 + vaesenc %zmm2, %zmm17, %zmm17 + vaesenc %zmm2, %zmm18, %zmm18 + vaesenc %zmm2, %zmm19, %zmm19 + vaesenc %zmm3, %zmm16, %zmm16 + vaesenc %zmm3, %zmm17, %zmm17 + vaesenc %zmm3, %zmm18, %zmm18 + vaesenc %zmm3, %zmm19, %zmm19 + cmpl $11, %esi + vbroadcasti32x4 160(%rdi), %zmm20 + jl L_AES_GCM_decrypt_update_avx512_t_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + 
vbroadcasti32x4 176(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + cmpl $13, %esi + vbroadcasti32x4 192(%rdi), %zmm20 + jl L_AES_GCM_decrypt_update_avx512_t_avx512_ctr16_last + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 208(%rdi), %zmm20 + vaesenc %zmm20, %zmm16, %zmm16 + vaesenc %zmm20, %zmm17, %zmm17 + vaesenc %zmm20, %zmm18, %zmm18 + vaesenc %zmm20, %zmm19, %zmm19 + vbroadcasti32x4 224(%rdi), %zmm20 +L_AES_GCM_decrypt_update_avx512_t_avx512_ctr16_last: + vaesenclast %zmm20, %zmm16, %zmm16 + vaesenclast %zmm20, %zmm17, %zmm17 + vaesenclast %zmm20, %zmm18, %zmm18 + vaesenclast %zmm20, %zmm19, %zmm19 + leaq (%r11,%r14,1), %rcx + leaq (%r10,%r14,1), %rdx + vmovdqu64 (%rcx), %zmm21 + vpxorq %zmm21, %zmm16, %zmm16 + vmovdqu64 %zmm16, (%rdx) + vmovdqu64 64(%rcx), %zmm21 + vpxorq %zmm21, %zmm17, %zmm17 + vmovdqu64 %zmm17, 64(%rdx) + vmovdqu64 128(%rcx), %zmm21 + vpxorq %zmm21, %zmm18, %zmm18 + vmovdqu64 %zmm18, 128(%rdx) + vmovdqu64 192(%rcx), %zmm21 + vpxorq %zmm21, %zmm19, %zmm19 + vmovdqu64 %zmm19, 192(%rdx) + addl $0x100, %r14d +L_AES_GCM_decrypt_update_avx512_after_256: + vmovdqu (%rsp), %xmm5 +L_AES_GCM_decrypt_update_avx512_done_128: + movl %r8d, %edx + cmpl %edx, %r14d + jge L_AES_GCM_decrypt_update_avx512_done_dec + movl %r8d, %r13d + andl $0xfffffff0, %r13d + cmpl %r13d, %r14d + jge L_AES_GCM_decrypt_update_avx512_last_block_done +L_AES_GCM_decrypt_update_avx512_last_block_start: + vmovdqu (%r11,%r14,1), %xmm13 + vmovdqa %xmm5, %xmm0 + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm13, %xmm1 + vpxor %xmm6, %xmm1, %xmm1 + vmovdqu (%r12), %xmm9 + vpshufb L_avx512_aes_gcm_bswap_epi64(%rip), %xmm9, %xmm8 + vpaddd L_avx512_aes_gcm_one(%rip), %xmm9, %xmm9 + vmovdqu %xmm9, (%r12) + vpxor (%rdi), %xmm8, %xmm8 + vpclmulqdq $16, %xmm0, %xmm1, %xmm10 + vaesenc 
16(%rdi), %xmm8, %xmm8 + vaesenc 32(%rdi), %xmm8, %xmm8 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm11 + vaesenc 48(%rdi), %xmm8, %xmm8 + vaesenc 64(%rdi), %xmm8, %xmm8 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm12 + vaesenc 80(%rdi), %xmm8, %xmm8 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm1 + vaesenc 96(%rdi), %xmm8, %xmm8 + vpxor %xmm11, %xmm10, %xmm10 + vpslldq $8, %xmm10, %xmm2 + vpsrldq $8, %xmm10, %xmm10 + vaesenc 112(%rdi), %xmm8, %xmm8 + vpxor %xmm12, %xmm2, %xmm2 + vpxor %xmm10, %xmm1, %xmm3 + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm0 + vpclmulqdq $16, %xmm0, %xmm2, %xmm11 + vaesenc 128(%rdi), %xmm8, %xmm8 + vpshufd $0x4e, %xmm2, %xmm10 + vpxor %xmm11, %xmm10, %xmm10 + vpclmulqdq $16, %xmm0, %xmm10, %xmm11 + vaesenc 144(%rdi), %xmm8, %xmm8 + vpshufd $0x4e, %xmm10, %xmm10 + vpxor %xmm11, %xmm10, %xmm10 + vpxor %xmm3, %xmm10, %xmm6 + cmpl $11, %esi + vmovdqa 160(%rdi), %xmm9 + jl L_AES_GCM_decrypt_update_avx512_aesenc_gfmul_last + vaesenc %xmm9, %xmm8, %xmm8 + vaesenc 176(%rdi), %xmm8, %xmm8 + cmpl $13, %esi + vmovdqa 192(%rdi), %xmm9 + jl L_AES_GCM_decrypt_update_avx512_aesenc_gfmul_last + vaesenc %xmm9, %xmm8, %xmm8 + vaesenc 208(%rdi), %xmm8, %xmm8 + vmovdqa 224(%rdi), %xmm9 +L_AES_GCM_decrypt_update_avx512_aesenc_gfmul_last: + vaesenclast %xmm9, %xmm8, %xmm8 + vmovdqa %xmm13, %xmm0 + vpxor %xmm0, %xmm8, %xmm8 + vmovdqu %xmm8, (%r10,%r14,1) + addl $16, %r14d + cmpl %r13d, %r14d + jl L_AES_GCM_decrypt_update_avx512_last_block_start +L_AES_GCM_decrypt_update_avx512_last_block_done: +L_AES_GCM_decrypt_update_avx512_done_dec: + vmovdqa %xmm6, (%r9) + vzeroupper + addq $0x410, %rsp + popq %rbx + popq %r15 + popq %r14 + popq %r12 + popq %r13 + repz retq +#ifndef __APPLE__ +.size AES_GCM_decrypt_update_avx512,.-AES_GCM_decrypt_update_avx512 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_GCM_decrypt_final_avx512 +.type AES_GCM_decrypt_final_avx512,@function +.align 16 +AES_GCM_decrypt_final_avx512: /* Folds two lengths into the 16-byte state at (%rdi), multiplies it by the value derived from (%r9), XORs in the block at the first stack argument, then compares the result with the bytes at (%rsi) and stores 1 (equal) or 0 through the second stack argument */ +#else +.section __TEXT,__text +.globl
_AES_GCM_decrypt_final_avx512 +.p2align 4 +_AES_GCM_decrypt_final_avx512: /* same entry point, underscore-prefixed for the __APPLE__ build */ +#endif /* __APPLE__ */ + pushq %r13 /* r13, rbp and r12 are used as scratch below and popped again before returning */ + pushq %rbp + pushq %r12 + movl %edx, %eax /* eax = number of tag bytes to compare (third argument) */ + movl %ecx, %r10d /* r10d and r11d hold the fourth and fifth arguments, the two lengths folded in below */ + movl %r8d, %r11d + movq 32(%rsp), %r8 /* first stack argument: 24 bytes of pushes plus the return address sit below it */ + movq 40(%rsp), %rbp /* second stack argument: pointer that receives the 0/1 result */ + subq $16, %rsp + vmovdqa (%rdi), %xmm6 /* aligned 16-byte loads: xmm6 from (%rdi), xmm5 from (%r9), xmm15 from the first stack argument */ + vmovdqa (%r9), %xmm5 + vmovdqa (%r8), %xmm15 + vpsrlq $63, %xmm5, %xmm8 + vpsllq $0x01, %xmm5, %xmm7 + vpslldq $8, %xmm8, %xmm8 + vpor %xmm8, %xmm7, %xmm7 /* xmm7 = xmm5 shifted left by one bit across the full 128 bits */ + vpshufd $0xff, %xmm5, %xmm5 + vpsrad $31, %xmm5, %xmm5 /* all-ones when the top bit of the original xmm5 was set, else zero */ + vpand L_avx512_aes_gcm_mod2_128(%rip), %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 /* xmm5 = shifted value, XORed with the mod2_128 constant when the top bit was set */ + movl %r10d, %edx + movl %r11d, %ecx + shlq $3, %rdx /* both lengths are multiplied by 8 - presumably byte counts converted to bit counts; confirm against the caller */ + shlq $3, %rcx + vmovq %rdx, %xmm0 + vmovq %rcx, %xmm1 + vpunpcklqdq %xmm1, %xmm0, %xmm0 /* low qword = fourth argument times 8, high qword = fifth argument times 8 */ + vpxor %xmm0, %xmm6, %xmm6 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm5, %xmm8 + vpxor %xmm5, %xmm8, %xmm8 + vpshufd $0x4e, %xmm6, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm7 /* low halves multiplied */ + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm10 /* high halves multiplied */ + vpclmulqdq $0x00, %xmm9, %xmm8, %xmm8 /* product of the XORed halves */ + vpternlogq $0x96, %xmm7, %xmm10, %xmm8 /* immediate 0x96 selects a three-way XOR */ + vmovdqa L_avx512_aes_gcm_mod2_128(%rip), %xmm9 + vpclmulqdq $0x01, %xmm7, %xmm9, %xmm11 + vpshufd $0x4e, %xmm7, %xmm7 + vpternlogq $0x96, %xmm11, %xmm7, %xmm8 + vpclmulqdq $0x01, %xmm8, %xmm9, %xmm11 + vpshufd $0x4e, %xmm8, %xmm8 + vpternlogq $0x96, %xmm11, %xmm8, %xmm10 + vmovdqa %xmm10, %xmm6 /* xmm6 = reduced 128-bit product */ + vpshufb L_avx512_aes_gcm_bswap_mask(%rip), %xmm6, %xmm6 + vpxor %xmm15, %xmm6, %xmm0 /* xmm0 = computed tag that is checked against (%rsi) */ + cmpl $16, %eax + je L_AES_GCM_decrypt_final_avx512_cmp_tag_16 /* a 16-byte tag uses the vector compare; any other length uses the byte loop */ + subq $16, %rsp + xorq %rcx, %rcx + xorq %r12, %r12 + vmovdqu %xmm0, (%rsp) /* spill the computed tag so it can be read one byte at a time */ +L_AES_GCM_decrypt_final_avx512_cmp_tag_loop: /* NOTE(review): the count is tested after the first byte, so a length of 0 is not handled here - presumably callers reject it; confirm */ + movzbl (%rsp,%rcx,1), %r13d + xorb (%rsi,%rcx,1), %r13b + orb %r13b, %r12b /* accumulate every byte difference; there is no early exit on a mismatch */ + incl %ecx + cmpl %eax, %ecx + jne L_AES_GCM_decrypt_final_avx512_cmp_tag_loop + cmpb $0x00, %r12b + sete %r12b /* r12 = 1 when no byte differed, else 0 */ + addq $16, %rsp + xorq %rcx, %rcx + jmp L_AES_GCM_decrypt_final_avx512_cmp_tag_done +L_AES_GCM_decrypt_final_avx512_cmp_tag_16: + vmovdqu (%rsi), %xmm1 + vpcmpeqb %xmm1, %xmm0, %xmm0 +
vpmovmskb %xmm0, %rdx + # %%edx == 0xFFFF then return 1 else => return 0 + xorl %r12d, %r12d + cmpl $0xffff, %edx + sete %r12b +L_AES_GCM_decrypt_final_avx512_cmp_tag_done: + movl %r12d, (%rbp) /* store the 0/1 result as a 32-bit value through the second stack argument */ + vzeroupper + addq $16, %rsp /* undo the subq at entry before restoring the pushed registers */ + popq %r12 + popq %rbp + popq %r13 + repz retq +#ifndef __APPLE__ +.size AES_GCM_decrypt_final_avx512,.-AES_GCM_decrypt_final_avx512 +#endif /* __APPLE__ */ +#endif /* WOLFSSL_AESGCM_STREAM */ +#endif /* HAVE_INTEL_AVX512 */ #endif /* WOLFSSL_X86_64_BUILD */ #if defined(__linux__) && defined(__ELF__) diff --git a/wolfcrypt/src/aes_gcm_asm.asm b/wolfcrypt/src/aes_gcm_asm.asm index d222bc14478..34f68476310 100644 --- a/wolfcrypt/src/aes_gcm_asm.asm +++ b/wolfcrypt/src/aes_gcm_asm.asm @@ -171,10 +171,10 @@ GCM_generate_m0_aesni PROC por xmm1, xmm5 por xmm2, xmm6 por xmm3, xmm7 - vpshufb xmm0, xmm0, xmm9 - vpshufb xmm1, xmm1, xmm9 - vpshufb xmm2, xmm2, xmm9 - vpshufb xmm3, xmm3, xmm9 + pshufb xmm0, xmm9 + pshufb xmm1, xmm9 + pshufb xmm2, xmm9 + pshufb xmm3, xmm9 movdqu OWORD PTR [rdx+256], xmm0 movdqu OWORD PTR [rdx+272], xmm1 movdqu OWORD PTR [rdx+288], xmm2 @@ -207,10 +207,10 @@ GCM_generate_m0_aesni PROC por xmm1, xmm5 por xmm2, xmm6 por xmm3, xmm7 - vpshufb xmm0, xmm0, xmm9 - vpshufb xmm1, xmm1, xmm9 - vpshufb xmm2, xmm2, xmm9 - vpshufb xmm3, xmm3, xmm9 + pshufb xmm0, xmm9 + pshufb xmm1, xmm9 + pshufb xmm2, xmm9 + pshufb xmm3, xmm9 movdqu OWORD PTR [rdx+320], xmm0 movdqu OWORD PTR [rdx+336], xmm1 movdqu OWORD PTR [rdx+352], xmm2 @@ -243,10 +243,10 @@ GCM_generate_m0_aesni PROC por xmm1, xmm5 por xmm2, xmm6 por xmm3, xmm7 - vpshufb xmm0, xmm0, xmm9 - vpshufb xmm1, xmm1, xmm9 - vpshufb xmm2, xmm2, xmm9 - vpshufb xmm3, xmm3, xmm9 + pshufb xmm0, xmm9 + pshufb xmm1, xmm9 + pshufb xmm2, xmm9 + pshufb xmm3, xmm9 movdqu OWORD PTR [rdx+384], xmm0 movdqu OWORD PTR [rdx+400], xmm1 movdqu OWORD PTR [rdx+416], xmm2 @@ -279,10 +279,10 @@ GCM_generate_m0_aesni PROC por xmm1, xmm5 por xmm2, xmm6 por xmm3, xmm7 - vpshufb xmm0, xmm0, xmm9 - vpshufb xmm1, xmm1,
xmm9 - vpshufb xmm2, xmm2, xmm9 - vpshufb xmm3, xmm3, xmm9 + pshufb xmm0, xmm9 + pshufb xmm1, xmm9 + pshufb xmm2, xmm9 + pshufb xmm3, xmm9 movdqu OWORD PTR [rdx+448], xmm0 movdqu OWORD PTR [rdx+464], xmm1 movdqu OWORD PTR [rdx+480], xmm2 @@ -16518,4 +16518,14153 @@ L_AES_GCM_decrypt_final_avx2_cmp_tag_done: AES_GCM_decrypt_final_avx2 ENDP _TEXT ENDS ENDIF +IFDEF HAVE_INTEL_VAES +_DATA SEGMENT +ALIGN 16 +L_vaes_aes_gcm_inc_y0 QWORD \ + 0000000000000000h, 0000000000000000h, + 0000000000000000h, 0000000000000001h +ptr_L_vaes_aes_gcm_inc_y0 QWORD L_vaes_aes_gcm_inc_y0 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_vaes_aes_gcm_inc_y1 QWORD \ + 0000000000000000h, 0000000000000002h, + 0000000000000000h, 0000000000000003h +ptr_L_vaes_aes_gcm_inc_y1 QWORD L_vaes_aes_gcm_inc_y1 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_vaes_aes_gcm_inc_y2 QWORD \ + 0000000000000000h, 0000000000000004h, + 0000000000000000h, 0000000000000005h +ptr_L_vaes_aes_gcm_inc_y2 QWORD L_vaes_aes_gcm_inc_y2 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_vaes_aes_gcm_inc_y3 QWORD \ + 0000000000000000h, 0000000000000006h, + 0000000000000000h, 0000000000000007h +ptr_L_vaes_aes_gcm_inc_y3 QWORD L_vaes_aes_gcm_inc_y3 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_GCM_generate_m0_vaes_rev8 QWORD \ + 08090a0b0c0d0e0fh, 0001020304050607h +ptr_L_GCM_generate_m0_vaes_rev8 QWORD L_GCM_generate_m0_vaes_rev8 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_GCM_generate_m0_vaes_mod2_128 QWORD \ + 0000000000000000h, 0e100000000000000h +ptr_L_GCM_generate_m0_vaes_mod2_128 QWORD L_GCM_generate_m0_vaes_mod2_128 +_DATA ENDS +_TEXT SEGMENT READONLY PARA +GCM_generate_m0_vaes PROC ; Fills the 512 bytes at [rdx] from the 16-byte block at [rcx]: 16 entries of XOR combinations, then the same 16 entries shifted right by 4 bits + sub rsp, 80 ; room for the five XMM registers saved below (xmm6 to xmm10) + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu xmm9, OWORD PTR L_GCM_generate_m0_vaes_rev8 ; shuffle mask that reverses all 16 bytes + vmovdqu xmm10, OWORD PTR L_GCM_generate_m0_vaes_mod2_128 ; constant XORed in when a bit is shifted out below + vpxor xmm8, xmm8, xmm8 + vmovdqu xmm0, OWORD PTR [rcx] + vmovdqu OWORD PTR
[rdx], xmm8 ; table entry 0 is all zero + vmovdqu xmm8, xmm0 ; NOTE(review): xmm8 is overwritten by the vpxor xmm8, xmm3, xmm2 below before it is read + vpshufb xmm0, xmm0, xmm9 ; byte-reverse so the shifts below act on one 128-bit value + vpsllq xmm5, xmm0, 63 + vpsrlq xmm4, xmm0, 1 + vpslldq xmm1, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vpshufd xmm1, xmm1, 255 + vpor xmm4, xmm4, xmm5 ; xmm4 = xmm0 shifted right by one bit across 128 bits + vpsrad xmm1, xmm1, 31 ; all-ones when the bit shifted out of xmm0 was set, else zero + vpand xmm1, xmm1, xmm10 + vpxor xmm1, xmm1, xmm4 ; xmm1 = shifted xmm0, XORed with the mod2_128 constant when a bit was shifted out + vpsllq xmm5, xmm1, 63 + vpsrlq xmm4, xmm1, 1 + vpslldq xmm2, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vpshufd xmm2, xmm2, 255 + vpor xmm4, xmm4, xmm5 + vpsrad xmm2, xmm2, 31 + vpand xmm2, xmm2, xmm10 + vpxor xmm2, xmm2, xmm4 ; xmm2 = the same shift-and-reduce step applied to xmm1 + vpsllq xmm5, xmm2, 63 + vpsrlq xmm4, xmm2, 1 + vpslldq xmm3, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vpshufd xmm3, xmm3, 255 + vpor xmm4, xmm4, xmm5 + vpsrad xmm3, xmm3, 31 + vpand xmm3, xmm3, xmm10 + vpxor xmm3, xmm3, xmm4 ; xmm3 = the same shift-and-reduce step applied to xmm2 + vpshufb xmm3, xmm3, xmm9 ; restore the original byte order of all four values + vpshufb xmm2, xmm2, xmm9 + vpshufb xmm1, xmm1, xmm9 + vpshufb xmm0, xmm0, xmm9 + vpxor xmm8, xmm3, xmm2 + vmovdqu OWORD PTR [rdx+16], xmm3 ; entry 1 = xmm3, entry 2 = xmm2, entry 4 = xmm1, entry 8 = xmm0; the other entries are XORs of these + vmovdqu OWORD PTR [rdx+32], xmm2 + vmovdqu OWORD PTR [rdx+48], xmm8 + vmovdqu OWORD PTR [rdx+64], xmm1 + vpxor xmm4, xmm3, xmm1 + vpxor xmm5, xmm2, xmm1 + vpxor xmm6, xmm8, xmm1 + vmovdqu OWORD PTR [rdx+80], xmm4 + vmovdqu OWORD PTR [rdx+96], xmm5 + vmovdqu OWORD PTR [rdx+112], xmm6 + vmovdqu OWORD PTR [rdx+128], xmm0 + vpxor xmm1, xmm1, xmm0 ; from here xmm1 holds xmm1 XOR xmm0 for entries 12 to 15 + vpxor xmm4, xmm3, xmm0 + vpxor xmm6, xmm2, xmm0 + vmovdqu OWORD PTR [rdx+144], xmm4 + vmovdqu OWORD PTR [rdx+160], xmm6 + vpxor xmm6, xmm3, xmm6 + vmovdqu OWORD PTR [rdx+176], xmm6 + vmovdqu OWORD PTR [rdx+192], xmm1 + vpxor xmm4, xmm3, xmm1 + vpxor xmm5, xmm2, xmm1 + vpxor xmm6, xmm8, xmm1 + vmovdqu OWORD PTR [rdx+208], xmm4 + vmovdqu OWORD PTR [rdx+224], xmm5 + vmovdqu OWORD PTR [rdx+240], xmm6 ; entry 15 = XOR of all four values + vmovdqu xmm0, OWORD PTR [rdx] ; second half: reload entries 0 to 3 and derive entries 16 to 19 from them + vmovdqu xmm1, OWORD PTR [rdx+16] + vmovdqu xmm2, OWORD PTR [rdx+32] + vmovdqu xmm3, OWORD PTR [rdx+48] + vpshufb xmm0, xmm0, xmm9 + vpshufb xmm1, xmm1, xmm9 + vpshufb xmm2, xmm2, xmm9 + vpshufb xmm3, xmm3, xmm9 + vpsllq xmm4, xmm0, 60 ; low 4 bits of each qword, moved to the top of that qword + vpsllq xmm5, xmm1, 60 + vpsllq xmm6, xmm2, 60 + vpsllq xmm7, xmm3, 60 + vpsrlq xmm0, xmm0,
4 + vpsrlq xmm1, xmm1, 4 + vpsrlq xmm2, xmm2, 4 + vpsrlq xmm3, xmm3, 4 + vpsrldq xmm4, xmm4, 8 ; keep only the bits carried from the high qword into the low qword + vpsrldq xmm5, xmm5, 8 + vpsrldq xmm6, xmm6, 8 + vpsrldq xmm7, xmm7, 8 + vpor xmm0, xmm0, xmm4 ; 128-bit right shift by 4 with no reduction; bits shifted out of the low end are dropped + vpor xmm1, xmm1, xmm5 + vpor xmm2, xmm2, xmm6 + vpor xmm3, xmm3, xmm7 + vpshufb xmm0, xmm0, xmm9 + vpshufb xmm1, xmm1, xmm9 + vpshufb xmm2, xmm2, xmm9 + vpshufb xmm3, xmm3, xmm9 + vmovdqu OWORD PTR [rdx+256], xmm0 ; entries 16 to 19 + vmovdqu OWORD PTR [rdx+272], xmm1 + vmovdqu OWORD PTR [rdx+288], xmm2 + vmovdqu OWORD PTR [rdx+304], xmm3 + vmovdqu xmm0, OWORD PTR [rdx+64] ; same 4-bit shift for entries 4 to 7, stored as entries 20 to 23 + vmovdqu xmm1, OWORD PTR [rdx+80] + vmovdqu xmm2, OWORD PTR [rdx+96] + vmovdqu xmm3, OWORD PTR [rdx+112] + vpshufb xmm0, xmm0, xmm9 + vpshufb xmm1, xmm1, xmm9 + vpshufb xmm2, xmm2, xmm9 + vpshufb xmm3, xmm3, xmm9 + vpsllq xmm4, xmm0, 60 + vpsllq xmm5, xmm1, 60 + vpsllq xmm6, xmm2, 60 + vpsllq xmm7, xmm3, 60 + vpsrlq xmm0, xmm0, 4 + vpsrlq xmm1, xmm1, 4 + vpsrlq xmm2, xmm2, 4 + vpsrlq xmm3, xmm3, 4 + vpsrldq xmm4, xmm4, 8 + vpsrldq xmm5, xmm5, 8 + vpsrldq xmm6, xmm6, 8 + vpsrldq xmm7, xmm7, 8 + vpor xmm0, xmm0, xmm4 + vpor xmm1, xmm1, xmm5 + vpor xmm2, xmm2, xmm6 + vpor xmm3, xmm3, xmm7 + vpshufb xmm0, xmm0, xmm9 + vpshufb xmm1, xmm1, xmm9 + vpshufb xmm2, xmm2, xmm9 + vpshufb xmm3, xmm3, xmm9 + vmovdqu OWORD PTR [rdx+320], xmm0 + vmovdqu OWORD PTR [rdx+336], xmm1 + vmovdqu OWORD PTR [rdx+352], xmm2 + vmovdqu OWORD PTR [rdx+368], xmm3 + vmovdqu xmm0, OWORD PTR [rdx+128] ; same 4-bit shift for entries 8 to 11, stored as entries 24 to 27 + vmovdqu xmm1, OWORD PTR [rdx+144] + vmovdqu xmm2, OWORD PTR [rdx+160] + vmovdqu xmm3, OWORD PTR [rdx+176] + vpshufb xmm0, xmm0, xmm9 + vpshufb xmm1, xmm1, xmm9 + vpshufb xmm2, xmm2, xmm9 + vpshufb xmm3, xmm3, xmm9 + vpsllq xmm4, xmm0, 60 + vpsllq xmm5, xmm1, 60 + vpsllq xmm6, xmm2, 60 + vpsllq xmm7, xmm3, 60 + vpsrlq xmm0, xmm0, 4 + vpsrlq xmm1, xmm1, 4 + vpsrlq xmm2, xmm2, 4 + vpsrlq xmm3, xmm3, 4 + vpsrldq xmm4, xmm4, 8 + vpsrldq xmm5, xmm5, 8 + vpsrldq xmm6, xmm6, 8 + vpsrldq xmm7, xmm7, 8 + vpor xmm0, xmm0, xmm4 + vpor xmm1, xmm1, xmm5 + vpor xmm2, xmm2,
xmm6 + vpor xmm3, xmm3, xmm7 + vpshufb xmm0, xmm0, xmm9 + vpshufb xmm1, xmm1, xmm9 + vpshufb xmm2, xmm2, xmm9 + vpshufb xmm3, xmm3, xmm9 + vmovdqu OWORD PTR [rdx+384], xmm0 + vmovdqu OWORD PTR [rdx+400], xmm1 + vmovdqu OWORD PTR [rdx+416], xmm2 + vmovdqu OWORD PTR [rdx+432], xmm3 + vmovdqu xmm0, OWORD PTR [rdx+192] ; same 4-bit shift for entries 12 to 15, stored as entries 28 to 31 + vmovdqu xmm1, OWORD PTR [rdx+208] + vmovdqu xmm2, OWORD PTR [rdx+224] + vmovdqu xmm3, OWORD PTR [rdx+240] + vpshufb xmm0, xmm0, xmm9 + vpshufb xmm1, xmm1, xmm9 + vpshufb xmm2, xmm2, xmm9 + vpshufb xmm3, xmm3, xmm9 + vpsllq xmm4, xmm0, 60 + vpsllq xmm5, xmm1, 60 + vpsllq xmm6, xmm2, 60 + vpsllq xmm7, xmm3, 60 + vpsrlq xmm0, xmm0, 4 + vpsrlq xmm1, xmm1, 4 + vpsrlq xmm2, xmm2, 4 + vpsrlq xmm3, xmm3, 4 + vpsrldq xmm4, xmm4, 8 + vpsrldq xmm5, xmm5, 8 + vpsrldq xmm6, xmm6, 8 + vpsrldq xmm7, xmm7, 8 + vpor xmm0, xmm0, xmm4 + vpor xmm1, xmm1, xmm5 + vpor xmm2, xmm2, xmm6 + vpor xmm3, xmm3, xmm7 + vpshufb xmm0, xmm0, xmm9 + vpshufb xmm1, xmm1, xmm9 + vpshufb xmm2, xmm2, xmm9 + vpshufb xmm3, xmm3, xmm9 + vmovdqu OWORD PTR [rdx+448], xmm0 + vmovdqu OWORD PTR [rdx+464], xmm1 + vmovdqu OWORD PTR [rdx+480], xmm2 + vmovdqu OWORD PTR [rdx+496], xmm3 + vmovdqu xmm6, OWORD PTR [rsp] ; restore the five XMM registers saved on entry + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + add rsp, 80 + ret +GCM_generate_m0_vaes ENDP +_TEXT ENDS +_DATA SEGMENT +ALIGN 16 +L_vaes_aes_gcm_one QWORD \ + 0000000000000000h, 0000000000000001h +ptr_L_vaes_aes_gcm_one QWORD L_vaes_aes_gcm_one +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_vaes_aes_gcm_two QWORD \ + 0000000000000000h, 0000000000000002h +ptr_L_vaes_aes_gcm_two QWORD L_vaes_aes_gcm_two +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_vaes_aes_gcm_three QWORD \ + 0000000000000000h, 0000000000000003h +ptr_L_vaes_aes_gcm_three QWORD L_vaes_aes_gcm_three +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_vaes_aes_gcm_four QWORD \ + 0000000000000000h, 0000000000000004h +ptr_L_vaes_aes_gcm_four QWORD
L_vaes_aes_gcm_four +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_vaes_aes_gcm_five QWORD \ + 0000000000000000h, 0000000000000005h +ptr_L_vaes_aes_gcm_five QWORD L_vaes_aes_gcm_five +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_vaes_aes_gcm_six QWORD \ + 0000000000000000h, 0000000000000006h +ptr_L_vaes_aes_gcm_six QWORD L_vaes_aes_gcm_six +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_vaes_aes_gcm_seven QWORD \ + 0000000000000000h, 0000000000000007h +ptr_L_vaes_aes_gcm_seven QWORD L_vaes_aes_gcm_seven +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_vaes_aes_gcm_eight QWORD \ + 0000000000000000h, 0000000000000008h +ptr_L_vaes_aes_gcm_eight QWORD L_vaes_aes_gcm_eight +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_vaes_aes_gcm_bswap_epi64 QWORD \ + 0001020304050607h, 08090a0b0c0d0e0fh +ptr_L_vaes_aes_gcm_bswap_epi64 QWORD L_vaes_aes_gcm_bswap_epi64 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_vaes_aes_gcm_bswap_mask QWORD \ + 08090a0b0c0d0e0fh, 0001020304050607h +ptr_L_vaes_aes_gcm_bswap_mask QWORD L_vaes_aes_gcm_bswap_mask +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_vaes_aes_gcm_mod2_128 QWORD \ + 0000000000000001h, 0c200000000000000h +ptr_L_vaes_aes_gcm_mod2_128 QWORD L_vaes_aes_gcm_mod2_128 +_DATA ENDS +_TEXT SEGMENT READONLY PARA +AES_GCM_encrypt_vaes PROC + push r13 + push rdi + push rsi + push r12 + push rbx + push r14 + push r15 + mov rdi, rcx + mov rsi, rdx + mov r12, r8 + mov rax, r9 + mov r8, QWORD PTR [rsp+96] + mov r9d, DWORD PTR [rsp+104] + mov r11d, DWORD PTR [rsp+112] + mov ebx, DWORD PTR [rsp+120] + mov r14d, DWORD PTR [rsp+128] + mov r15, QWORD PTR [rsp+136] + mov r10d, DWORD PTR [rsp+144] + sub rsp, 720 + vmovdqu OWORD PTR [rsp+560], xmm6 + vmovdqu OWORD PTR [rsp+576], xmm7 + vmovdqu OWORD PTR [rsp+592], xmm8 + vmovdqu OWORD PTR [rsp+608], xmm9 + vmovdqu OWORD PTR [rsp+624], xmm10 + vmovdqu OWORD PTR [rsp+640], xmm11 + vmovdqu OWORD PTR [rsp+656], xmm12 + vmovdqu OWORD PTR [rsp+672], xmm13 + vmovdqu OWORD PTR [rsp+688], xmm14 + vmovdqu OWORD PTR [rsp+704], xmm15 + vpxor xmm5, xmm5, xmm5 + 
vpxor xmm15, xmm15, xmm15 + mov edx, ebx + cmp edx, 12 + jne L_AES_GCM_encrypt_vaes_iv_not_12 + ; # Calculate values when IV is 12 bytes + ; Set counter based on IV + mov ecx, 16777216 + vmovq xmm5, QWORD PTR [rax] + vpinsrd xmm5, xmm5, DWORD PTR [rax+8], 2 + vpinsrd xmm5, xmm5, ecx, 3 + ; H = Encrypt X(=0) and T = Encrypt counter + vmovdqa xmm6, OWORD PTR [r15] + vpxor xmm1, xmm5, xmm6 + vmovdqa xmm4, OWORD PTR [r15+16] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+32] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+48] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+64] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+80] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+96] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+112] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+128] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+144] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + cmp r10d, 11 + vmovdqa xmm4, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_vaes_calc_iv_12_last + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+176] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + cmp r10d, 13 + vmovdqa xmm4, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_vaes_calc_iv_12_last + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+208] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+224] +L_AES_GCM_encrypt_vaes_calc_iv_12_last: + vaesenclast xmm6, xmm6, xmm4 + vaesenclast xmm1, xmm1, xmm4 + vpshufb xmm6, xmm6, OWORD PTR L_vaes_aes_gcm_bswap_mask + vmovdqu OWORD PTR [rsp+528], xmm1 + jmp L_AES_GCM_encrypt_vaes_iv_done +L_AES_GCM_encrypt_vaes_iv_not_12: + ; Calculate values 
when IV is not 12 bytes + ; H = Encrypt X(=0) + vmovdqa xmm6, OWORD PTR [r15] + vaesenc xmm6, xmm6, [r15+16] + vaesenc xmm6, xmm6, [r15+32] + vaesenc xmm6, xmm6, [r15+48] + vaesenc xmm6, xmm6, [r15+64] + vaesenc xmm6, xmm6, [r15+80] + vaesenc xmm6, xmm6, [r15+96] + vaesenc xmm6, xmm6, [r15+112] + vaesenc xmm6, xmm6, [r15+128] + vaesenc xmm6, xmm6, [r15+144] + cmp r10d, 11 + vmovdqa xmm8, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_vaes_calc_iv_1_aesenc_avx_last + vaesenc xmm6, xmm6, xmm8 + vaesenc xmm6, xmm6, [r15+176] + cmp r10d, 13 + vmovdqa xmm8, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_vaes_calc_iv_1_aesenc_avx_last + vaesenc xmm6, xmm6, xmm8 + vaesenc xmm6, xmm6, [r15+208] + vmovdqa xmm8, OWORD PTR [r15+224] +L_AES_GCM_encrypt_vaes_calc_iv_1_aesenc_avx_last: + vaesenclast xmm6, xmm6, xmm8 + vpshufb xmm6, xmm6, OWORD PTR L_vaes_aes_gcm_bswap_mask + ; Calc counter + ; Initialization vector + cmp edx, 0 + mov rcx, 0 + je L_AES_GCM_encrypt_vaes_calc_iv_done + cmp edx, 16 + jl L_AES_GCM_encrypt_vaes_calc_iv_lt16 + and edx, 4294967280 +L_AES_GCM_encrypt_vaes_calc_iv_16_loop: + vmovdqu xmm7, OWORD PTR [rax+rcx] + vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm7 + ; ghash_gfmul_avx + vpshufd xmm1, xmm5, 78 + vpshufd xmm2, xmm6, 78 + vpclmulqdq xmm3, xmm6, xmm5, 17 + vpclmulqdq xmm0, xmm6, xmm5, 0 + vpxor xmm1, xmm1, xmm5 + vpxor xmm2, xmm2, xmm6 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm4, xmm0 + vmovdqa xmm5, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm5, xmm5, xmm1 + vpsrld xmm0, xmm4, 31 + vpsrld xmm1, xmm5, 31 + vpslld xmm4, xmm4, 1 + vpslld xmm5, xmm5, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm5, xmm5, xmm2 + vpor xmm4, xmm4, xmm0 + vpor xmm5, xmm5, xmm1 + vpslld xmm0, xmm4, 31 + vpslld xmm1, xmm4, 30 + vpslld xmm2, xmm4, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, 
xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm4, xmm4, xmm0 + vpsrld xmm2, xmm4, 1 + vpsrld xmm3, xmm4, 2 + vpsrld xmm0, xmm4, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm4 + vpxor xmm5, xmm5, xmm2 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_encrypt_vaes_calc_iv_16_loop + mov edx, ebx + cmp ecx, edx + je L_AES_GCM_encrypt_vaes_calc_iv_done +L_AES_GCM_encrypt_vaes_calc_iv_lt16: + sub rsp, 16 + vpxor xmm7, xmm7, xmm7 + xor ebx, ebx + vmovdqu OWORD PTR [rsp], xmm7 +L_AES_GCM_encrypt_vaes_calc_iv_loop: + movzx r13d, BYTE PTR [rax+rcx] + mov BYTE PTR [rsp+rbx], r13b + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_encrypt_vaes_calc_iv_loop + vmovdqu xmm7, OWORD PTR [rsp] + add rsp, 16 + vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm7 + ; ghash_gfmul_avx + vpshufd xmm1, xmm5, 78 + vpshufd xmm2, xmm6, 78 + vpclmulqdq xmm3, xmm6, xmm5, 17 + vpclmulqdq xmm0, xmm6, xmm5, 0 + vpxor xmm1, xmm1, xmm5 + vpxor xmm2, xmm2, xmm6 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm4, xmm0 + vmovdqa xmm5, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm5, xmm5, xmm1 + vpsrld xmm0, xmm4, 31 + vpsrld xmm1, xmm5, 31 + vpslld xmm4, xmm4, 1 + vpslld xmm5, xmm5, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm5, xmm5, xmm2 + vpor xmm4, xmm4, xmm0 + vpor xmm5, xmm5, xmm1 + vpslld xmm0, xmm4, 31 + vpslld xmm1, xmm4, 30 + vpslld xmm2, xmm4, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm4, xmm4, xmm0 + vpsrld xmm2, xmm4, 1 + vpsrld xmm3, xmm4, 2 + vpsrld xmm0, xmm4, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm4 + vpxor xmm5, xmm5, xmm2 +L_AES_GCM_encrypt_vaes_calc_iv_done: + ; T = Encrypt counter + vpxor xmm0, xmm0, xmm0 + 
shl edx, 3 + vmovq xmm0, rdx + vpxor xmm5, xmm5, xmm0 + ; ghash_gfmul_avx + vpshufd xmm1, xmm5, 78 + vpshufd xmm2, xmm6, 78 + vpclmulqdq xmm3, xmm6, xmm5, 17 + vpclmulqdq xmm0, xmm6, xmm5, 0 + vpxor xmm1, xmm1, xmm5 + vpxor xmm2, xmm2, xmm6 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm4, xmm0 + vmovdqa xmm5, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm5, xmm5, xmm1 + vpsrld xmm0, xmm4, 31 + vpsrld xmm1, xmm5, 31 + vpslld xmm4, xmm4, 1 + vpslld xmm5, xmm5, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm5, xmm5, xmm2 + vpor xmm4, xmm4, xmm0 + vpor xmm5, xmm5, xmm1 + vpslld xmm0, xmm4, 31 + vpslld xmm1, xmm4, 30 + vpslld xmm2, xmm4, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm4, xmm4, xmm0 + vpsrld xmm2, xmm4, 1 + vpsrld xmm3, xmm4, 2 + vpsrld xmm0, xmm4, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm4 + vpxor xmm5, xmm5, xmm2 + vpshufb xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_bswap_mask + ; Encrypt counter + vmovdqa xmm7, OWORD PTR [r15] + vpxor xmm7, xmm7, xmm5 + vaesenc xmm7, xmm7, [r15+16] + vaesenc xmm7, xmm7, [r15+32] + vaesenc xmm7, xmm7, [r15+48] + vaesenc xmm7, xmm7, [r15+64] + vaesenc xmm7, xmm7, [r15+80] + vaesenc xmm7, xmm7, [r15+96] + vaesenc xmm7, xmm7, [r15+112] + vaesenc xmm7, xmm7, [r15+128] + vaesenc xmm7, xmm7, [r15+144] + cmp r10d, 11 + vmovdqa xmm8, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_vaes_calc_iv_2_aesenc_avx_last + vaesenc xmm7, xmm7, xmm8 + vaesenc xmm7, xmm7, [r15+176] + cmp r10d, 13 + vmovdqa xmm8, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_vaes_calc_iv_2_aesenc_avx_last + vaesenc xmm7, xmm7, xmm8 + vaesenc xmm7, xmm7, [r15+208] + vmovdqa xmm8, OWORD PTR [r15+224] +L_AES_GCM_encrypt_vaes_calc_iv_2_aesenc_avx_last: + vaesenclast xmm7, xmm7, xmm8 + vmovdqu OWORD PTR 
[rsp+528], xmm7 +L_AES_GCM_encrypt_vaes_iv_done: + ; Additional authentication data + mov edx, r11d + cmp edx, 0 + je L_AES_GCM_encrypt_vaes_calc_aad_done + xor ecx, ecx + cmp edx, 16 + jl L_AES_GCM_encrypt_vaes_calc_aad_lt16 + and edx, 4294967280 +L_AES_GCM_encrypt_vaes_calc_aad_16_loop: + vmovdqu xmm7, OWORD PTR [r12+rcx] + vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask + vpxor xmm15, xmm15, xmm7 + ; ghash_gfmul_avx + vpshufd xmm1, xmm15, 78 + vpshufd xmm2, xmm6, 78 + vpclmulqdq xmm3, xmm6, xmm15, 17 + vpclmulqdq xmm0, xmm6, xmm15, 0 + vpxor xmm1, xmm1, xmm15 + vpxor xmm2, xmm2, xmm6 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm4, xmm0 + vmovdqa xmm15, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm15, xmm15, xmm1 + vpsrld xmm0, xmm4, 31 + vpsrld xmm1, xmm15, 31 + vpslld xmm4, xmm4, 1 + vpslld xmm15, xmm15, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm15, xmm15, xmm2 + vpor xmm4, xmm4, xmm0 + vpor xmm15, xmm15, xmm1 + vpslld xmm0, xmm4, 31 + vpslld xmm1, xmm4, 30 + vpslld xmm2, xmm4, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm4, xmm4, xmm0 + vpsrld xmm2, xmm4, 1 + vpsrld xmm3, xmm4, 2 + vpsrld xmm0, xmm4, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm4 + vpxor xmm15, xmm15, xmm2 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_encrypt_vaes_calc_aad_16_loop + mov edx, r11d + cmp ecx, edx + je L_AES_GCM_encrypt_vaes_calc_aad_done +L_AES_GCM_encrypt_vaes_calc_aad_lt16: + sub rsp, 16 + vpxor xmm7, xmm7, xmm7 + xor ebx, ebx + vmovdqu OWORD PTR [rsp], xmm7 +L_AES_GCM_encrypt_vaes_calc_aad_loop: + movzx r13d, BYTE PTR [r12+rcx] + mov BYTE PTR [rsp+rbx], r13b + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_encrypt_vaes_calc_aad_loop + vmovdqu xmm7, OWORD PTR [rsp] + add rsp, 16 + vpshufb 
xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask + vpxor xmm15, xmm15, xmm7 + ; ghash_gfmul_avx + vpshufd xmm1, xmm15, 78 + vpshufd xmm2, xmm6, 78 + vpclmulqdq xmm3, xmm6, xmm15, 17 + vpclmulqdq xmm0, xmm6, xmm15, 0 + vpxor xmm1, xmm1, xmm15 + vpxor xmm2, xmm2, xmm6 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm4, xmm0 + vmovdqa xmm15, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm15, xmm15, xmm1 + vpsrld xmm0, xmm4, 31 + vpsrld xmm1, xmm15, 31 + vpslld xmm4, xmm4, 1 + vpslld xmm15, xmm15, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm15, xmm15, xmm2 + vpor xmm4, xmm4, xmm0 + vpor xmm15, xmm15, xmm1 + vpslld xmm0, xmm4, 31 + vpslld xmm1, xmm4, 30 + vpslld xmm2, xmm4, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm4, xmm4, xmm0 + vpsrld xmm2, xmm4, 1 + vpsrld xmm3, xmm4, 2 + vpsrld xmm0, xmm4, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm4 + vpxor xmm15, xmm15, xmm2 +L_AES_GCM_encrypt_vaes_calc_aad_done: + ; Calculate counter and H + vpsrlq xmm8, xmm6, 63 + vpsllq xmm7, xmm6, 1 + vpslldq xmm8, xmm8, 8 + vpor xmm7, xmm7, xmm8 + vpshufd xmm6, xmm6, 255 + vpsrad xmm6, xmm6, 31 + vpshufb xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_bswap_epi64 + vpand xmm6, xmm6, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpaddd xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_one + vpxor xmm6, xmm6, xmm7 + vmovdqu OWORD PTR [rsp+512], xmm5 + xor ebx, ebx + cmp r9d, 128 + jl L_AES_GCM_encrypt_vaes_done_128 + vmovdqa xmm2, xmm15 + ; H ^ 1 + vmovdqu OWORD PTR [rsp], xmm6 + ; H ^ 2 + vpclmulqdq xmm7, xmm6, xmm6, 0 + vpclmulqdq xmm10, xmm6, xmm6, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq 
xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm0, xmm10 + vmovdqu OWORD PTR [rsp+16], xmm0 + ; H ^ 3 + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm0, xmm6, 0 + vpclmulqdq xmm8, xmm0, xmm6, 1 + vpclmulqdq xmm9, xmm0, xmm6, 16 + vpclmulqdq xmm10, xmm0, xmm6, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm1, xmm10 + vmovdqu OWORD PTR [rsp+32], xmm1 + ; H ^ 4 + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm3, xmm10 + vmovdqu OWORD PTR [rsp+48], xmm3 + ; H ^ 5 + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+64], xmm4 + ; H ^ 6 + vpclmulqdq xmm7, xmm1, xmm1, 0 + vpclmulqdq xmm10, xmm1, xmm1, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor 
xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+80], xmm4 + ; H ^ 7 + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm3, xmm1, 0 + vpclmulqdq xmm8, xmm3, xmm1, 1 + vpclmulqdq xmm9, xmm3, xmm1, 16 + vpclmulqdq xmm10, xmm3, xmm1, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+96], xmm4 + ; H ^ 8 + vpclmulqdq xmm7, xmm3, xmm3, 0 + vpclmulqdq xmm10, xmm3, xmm3, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+112], xmm4 + cmp r9d, 256 + jl L_AES_GCM_encrypt_vaes_no_ext + ; H ^ 9 + vmovdqu xmm0, OWORD PTR [rsp+48] + vmovdqu xmm1, OWORD PTR [rsp+64] + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+128], xmm4 + ; H ^ 10 + vmovdqu xmm0, OWORD PTR [rsp+64] + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq 
xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+144], xmm4 + ; H ^ 11 + vmovdqu xmm0, OWORD PTR [rsp+64] + vmovdqu xmm1, OWORD PTR [rsp+80] + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+160], xmm4 + ; H ^ 12 + vmovdqu xmm0, OWORD PTR [rsp+80] + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+176], xmm4 + ; H ^ 13 + vmovdqu xmm0, OWORD PTR [rsp+80] + vmovdqu xmm1, OWORD PTR [rsp+96] + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+192], xmm4 + ; H ^ 14 + vmovdqu xmm0, OWORD PTR [rsp+96] + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR 
L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+208], xmm4 + ; H ^ 15 + vmovdqu xmm0, OWORD PTR [rsp+96] + vmovdqu xmm1, OWORD PTR [rsp+112] + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+224], xmm4 + ; H ^ 16 + vmovdqu xmm0, OWORD PTR [rsp+112] + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+240], xmm4 + vmovdqu ymm7, YMMWORD PTR [rsp+224] + vpermq ymm7, ymm7, 78 + vmovdqu ymm8, YMMWORD PTR [rsp+192] + vpermq ymm8, ymm8, 78 + vmovdqu ymm9, YMMWORD PTR [rsp+160] + vpermq ymm9, ymm9, 78 + vmovdqu ymm10, YMMWORD PTR [rsp+128] + vpermq ymm10, ymm10, 78 + vmovdqu YMMWORD PTR [rsp+256], ymm7 + vmovdqu YMMWORD PTR [rsp+288], ymm8 + vmovdqu YMMWORD PTR [rsp+320], ymm9 + vmovdqu YMMWORD PTR [rsp+352], ymm10 + vmovdqu ymm7, YMMWORD PTR [rsp+96] + vpermq ymm7, ymm7, 78 + vmovdqu ymm8, YMMWORD PTR [rsp+64] + vpermq ymm8, ymm8, 78 + vmovdqu ymm9, YMMWORD PTR [rsp+32] + vpermq ymm9, ymm9, 78 + vmovdqu ymm10, YMMWORD PTR [rsp] + 
vpermq ymm10, ymm10, 78 + vmovdqu YMMWORD PTR [rsp+384], ymm7 + vmovdqu YMMWORD PTR [rsp+416], ymm8 + vmovdqu YMMWORD PTR [rsp+448], ymm9 + vmovdqu YMMWORD PTR [rsp+480], ymm10 +L_AES_GCM_encrypt_vaes_no_ext: + vbroadcasti128 ymm14, ptr_L_vaes_aes_gcm_mod2_128 + cmp r9d, 256 + jl L_AES_GCM_encrypt_vaes_after_256 + mov r13d, r9d + and r13d, 4294967040 +L_AES_GCM_encrypt_vaes_loop_256: + ; 256 bytes of input + lea rcx, QWORD PTR [rsi+rbx] + mov QWORD PTR [rsp+544], rcx + vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64 + vbroadcasti128 ymm4, [rsp+512] + vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0 + vpshufb ymm0, ymm0, ymm6 + vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1 + vpshufb ymm1, ymm1, ymm6 + vpaddd ymm2, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y2 + vpshufb ymm2, ymm2, ymm6 + vpaddd ymm3, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y3 + vpshufb ymm3, ymm3, ymm6 + vmovdqu xmm7, OWORD PTR [rsp+512] + vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight + vmovdqu OWORD PTR [rsp+512], xmm7 + vbroadcasti128 ymm4, [r15] + vpxor ymm0, ymm0, ymm4 + vpxor ymm1, ymm1, ymm4 + vpxor ymm2, ymm2, ymm4 + vpxor ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+16] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+32] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+48] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+64] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+80] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+96] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 
ymm4, [r15+112] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+128] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+144] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r10d, 11 + vbroadcasti128 ymm4, [r15+160] + jl L_AES_GCM_encrypt_vaes_p1_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+176] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r10d, 13 + vbroadcasti128 ymm4, [r15+192] + jl L_AES_GCM_encrypt_vaes_p1_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+208] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+224] +L_AES_GCM_encrypt_vaes_p1_vaes_ctr8_last: + vaesenclast ymm0, ymm0, ymm4 + vaesenclast ymm1, ymm1, ymm4 + vaesenclast ymm2, ymm2, ymm4 + vaesenclast ymm3, ymm3, ymm4 + lea rcx, QWORD PTR [rdi+rbx] + lea rdx, QWORD PTR [rsi+rbx] + vmovdqu ymm5, YMMWORD PTR [rcx] + vpxor ymm0, ymm0, ymm5 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu ymm5, YMMWORD PTR [rcx+32] + vpxor ymm1, ymm1, ymm5 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vmovdqu ymm5, YMMWORD PTR [rcx+64] + vpxor ymm2, ymm2, ymm5 + vmovdqu YMMWORD PTR [rdx+64], ymm2 + vmovdqu ymm5, YMMWORD PTR [rcx+96] + vpxor ymm3, ymm3, ymm5 + vmovdqu YMMWORD PTR [rdx+96], ymm3 + add ebx, 128 + vbroadcasti128 ymm4, [rsp+512] + vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0 + vpshufb ymm0, ymm0, ymm6 + vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1 + vpshufb ymm1, ymm1, ymm6 + vpaddd ymm2, ymm4, YMMWORD PTR 
L_vaes_aes_gcm_inc_y2 + vpshufb ymm2, ymm2, ymm6 + vpaddd ymm3, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y3 + vpshufb ymm3, ymm3, ymm6 + vmovdqu xmm7, OWORD PTR [rsp+512] + vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight + vmovdqu OWORD PTR [rsp+512], xmm7 + vbroadcasti128 ymm4, [r15] + vpxor ymm0, ymm0, ymm4 + vpxor ymm1, ymm1, ymm4 + vpxor ymm2, ymm2, ymm4 + vpxor ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+16] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+32] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+48] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+64] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+80] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+96] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+112] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+128] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+144] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r10d, 11 + vbroadcasti128 ymm4, [r15+160] + jl L_AES_GCM_encrypt_vaes_p2_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+176] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp 
r10d, 13 + vbroadcasti128 ymm4, [r15+192] + jl L_AES_GCM_encrypt_vaes_p2_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+208] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+224] +L_AES_GCM_encrypt_vaes_p2_vaes_ctr8_last: + vaesenclast ymm0, ymm0, ymm4 + vaesenclast ymm1, ymm1, ymm4 + vaesenclast ymm2, ymm2, ymm4 + vaesenclast ymm3, ymm3, ymm4 + lea rcx, QWORD PTR [rdi+rbx] + lea rdx, QWORD PTR [rsi+rbx] + vmovdqu ymm5, YMMWORD PTR [rcx] + vpxor ymm0, ymm0, ymm5 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu ymm5, YMMWORD PTR [rcx+32] + vpxor ymm1, ymm1, ymm5 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vmovdqu ymm5, YMMWORD PTR [rcx+64] + vpxor ymm2, ymm2, ymm5 + vmovdqu YMMWORD PTR [rdx+64], ymm2 + vmovdqu ymm5, YMMWORD PTR [rcx+96] + vpxor ymm3, ymm3, ymm5 + vmovdqu YMMWORD PTR [rdx+96], ymm3 + add ebx, 128 + vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_mask + mov rcx, QWORD PTR [rsp+544] + vpxor ymm4, ymm4, ymm4 + vinserti128 ymm4, ymm4, xmm15, 0 + vmovdqu ymm7, YMMWORD PTR [rsp+256] + vmovdqu ymm8, YMMWORD PTR [rsp+288] + vmovdqu ymm9, YMMWORD PTR [rsp+320] + vmovdqu ymm10, YMMWORD PTR [rsp+352] + vmovdqu ymm5, YMMWORD PTR [rcx] + vpshufb ymm5, ymm5, ymm6 + vpxor ymm5, ymm5, ymm4 + vpclmulqdq ymm0, ymm5, ymm7, 0 + vpclmulqdq ymm1, ymm5, ymm7, 1 + vpclmulqdq ymm2, ymm5, ymm7, 16 + vpclmulqdq ymm3, ymm5, ymm7, 17 + vmovdqa ymm11, ymm0 + vpxor ymm12, ymm2, ymm1 + vmovdqa ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rcx+32] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm8, 0 + vpclmulqdq ymm1, ymm5, ymm8, 1 + vpclmulqdq ymm2, ymm5, ymm8, 16 + vpclmulqdq ymm3, ymm5, ymm8, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rcx+64] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm9, 0 + 
vpclmulqdq ymm1, ymm5, ymm9, 1 + vpclmulqdq ymm2, ymm5, ymm9, 16 + vpclmulqdq ymm3, ymm5, ymm9, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rcx+96] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm10, 0 + vpclmulqdq ymm1, ymm5, ymm10, 1 + vpclmulqdq ymm2, ymm5, ymm10, 16 + vpclmulqdq ymm3, ymm5, ymm10, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm7, YMMWORD PTR [rsp+384] + vmovdqu ymm8, YMMWORD PTR [rsp+416] + vmovdqu ymm9, YMMWORD PTR [rsp+448] + vmovdqu ymm10, YMMWORD PTR [rsp+480] + vmovdqu ymm5, YMMWORD PTR [rcx+128] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm7, 0 + vpclmulqdq ymm1, ymm5, ymm7, 1 + vpclmulqdq ymm2, ymm5, ymm7, 16 + vpclmulqdq ymm3, ymm5, ymm7, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rcx+160] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm8, 0 + vpclmulqdq ymm1, ymm5, ymm8, 1 + vpclmulqdq ymm2, ymm5, ymm8, 16 + vpclmulqdq ymm3, ymm5, ymm8, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rcx+192] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm9, 0 + vpclmulqdq ymm1, ymm5, ymm9, 1 + vpclmulqdq ymm2, ymm5, ymm9, 16 + vpclmulqdq ymm3, ymm5, ymm9, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rcx+224] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm10, 0 + vpclmulqdq ymm1, ymm5, ymm10, 1 + vpclmulqdq ymm2, ymm5, ymm10, 16 + vpclmulqdq ymm3, ymm5, ymm10, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vpclmulqdq ymm5, ymm14, ymm11, 1 + vpshufd ymm11, ymm11, 78 + vpxor ymm12, ymm12, ymm5 + vpxor 
ymm12, ymm12, ymm11 + vpclmulqdq ymm5, ymm14, ymm12, 1 + vpshufd ymm12, ymm12, 78 + vpxor ymm13, ymm13, ymm5 + vpxor ymm13, ymm13, ymm12 + vextracti128 xmm0, ymm13, 1 + vpxor xmm15, xmm13, xmm0 + cmp ebx, r13d + jl L_AES_GCM_encrypt_vaes_loop_256 +L_AES_GCM_encrypt_vaes_after_256: + mov r13d, r9d + and r13d, 4294967168 + cmp ebx, r13d + jge L_AES_GCM_encrypt_vaes_after_128 + ; 128 bytes of input + lea rcx, QWORD PTR [rsi+rbx] + mov QWORD PTR [rsp+544], rcx + vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64 + vbroadcasti128 ymm4, [rsp+512] + vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0 + vpshufb ymm0, ymm0, ymm6 + vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1 + vpshufb ymm1, ymm1, ymm6 + vpaddd ymm2, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y2 + vpshufb ymm2, ymm2, ymm6 + vpaddd ymm3, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y3 + vpshufb ymm3, ymm3, ymm6 + vmovdqu xmm7, OWORD PTR [rsp+512] + vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight + vmovdqu OWORD PTR [rsp+512], xmm7 + vbroadcasti128 ymm4, [r15] + vpxor ymm0, ymm0, ymm4 + vpxor ymm1, ymm1, ymm4 + vpxor ymm2, ymm2, ymm4 + vpxor ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+16] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+32] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+48] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+64] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+80] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+96] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, 
[r15+112] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+128] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+144] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r10d, 11 + vbroadcasti128 ymm4, [r15+160] + jl L_AES_GCM_encrypt_vaes_8_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+176] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r10d, 13 + vbroadcasti128 ymm4, [r15+192] + jl L_AES_GCM_encrypt_vaes_8_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+208] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+224] +L_AES_GCM_encrypt_vaes_8_vaes_ctr8_last: + vaesenclast ymm0, ymm0, ymm4 + vaesenclast ymm1, ymm1, ymm4 + vaesenclast ymm2, ymm2, ymm4 + vaesenclast ymm3, ymm3, ymm4 + lea rcx, QWORD PTR [rdi+rbx] + lea rdx, QWORD PTR [rsi+rbx] + vmovdqu ymm5, YMMWORD PTR [rcx] + vpxor ymm0, ymm0, ymm5 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu ymm5, YMMWORD PTR [rcx+32] + vpxor ymm1, ymm1, ymm5 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vmovdqu ymm5, YMMWORD PTR [rcx+64] + vpxor ymm2, ymm2, ymm5 + vmovdqu YMMWORD PTR [rdx+64], ymm2 + vmovdqu ymm5, YMMWORD PTR [rcx+96] + vpxor ymm3, ymm3, ymm5 + vmovdqu YMMWORD PTR [rdx+96], ymm3 + add ebx, 128 + vmovdqu ymm7, YMMWORD PTR [rsp+96] + vpermq ymm7, ymm7, 78 + vmovdqu ymm8, YMMWORD PTR [rsp+64] + vpermq ymm8, ymm8, 78 + vmovdqu ymm9, YMMWORD PTR [rsp+32] + vpermq ymm9, ymm9, 78 + vmovdqu ymm10, YMMWORD PTR [rsp] + vpermq ymm10, ymm10, 78 + 
vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_mask + mov rcx, QWORD PTR [rsp+544] + vpxor ymm4, ymm4, ymm4 + vinserti128 ymm4, ymm4, xmm15, 0 + vmovdqu ymm5, YMMWORD PTR [rcx] + vpshufb ymm5, ymm5, ymm6 + vpxor ymm5, ymm5, ymm4 + vpclmulqdq ymm0, ymm5, ymm7, 0 + vpclmulqdq ymm1, ymm5, ymm7, 1 + vpclmulqdq ymm2, ymm5, ymm7, 16 + vpclmulqdq ymm3, ymm5, ymm7, 17 + vmovdqa ymm11, ymm0 + vpxor ymm12, ymm2, ymm1 + vmovdqa ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rcx+32] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm8, 0 + vpclmulqdq ymm1, ymm5, ymm8, 1 + vpclmulqdq ymm2, ymm5, ymm8, 16 + vpclmulqdq ymm3, ymm5, ymm8, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rcx+64] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm9, 0 + vpclmulqdq ymm1, ymm5, ymm9, 1 + vpclmulqdq ymm2, ymm5, ymm9, 16 + vpclmulqdq ymm3, ymm5, ymm9, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rcx+96] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm10, 0 + vpclmulqdq ymm1, ymm5, ymm10, 1 + vpclmulqdq ymm2, ymm5, ymm10, 16 + vpclmulqdq ymm3, ymm5, ymm10, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vpclmulqdq ymm5, ymm14, ymm11, 1 + vpshufd ymm11, ymm11, 78 + vpxor ymm12, ymm12, ymm5 + vpxor ymm12, ymm12, ymm11 + vpclmulqdq ymm5, ymm14, ymm12, 1 + vpshufd ymm12, ymm12, 78 + vpxor ymm13, ymm13, ymm5 + vpxor ymm13, ymm13, ymm12 + vextracti128 xmm0, ymm13, 1 + vpxor xmm15, xmm13, xmm0 +L_AES_GCM_encrypt_vaes_after_128: + vmovdqu xmm6, OWORD PTR [rsp] +L_AES_GCM_encrypt_vaes_done_128: + mov edx, r9d + cmp ebx, edx + jge L_AES_GCM_encrypt_vaes_done_enc + mov r13d, r9d + and r13d, 4294967280 + cmp ebx, r13d + jge L_AES_GCM_encrypt_vaes_last_block_done + vmovdqu xmm8, OWORD PTR [rsp+512] + vpshufb xmm7, xmm8, OWORD PTR 
L_vaes_aes_gcm_bswap_epi64 + vpaddd xmm8, xmm8, OWORD PTR L_vaes_aes_gcm_one + vmovdqu OWORD PTR [rsp+512], xmm8 + vpxor xmm7, xmm7, [r15] + vaesenc xmm7, xmm7, [r15+16] + vaesenc xmm7, xmm7, [r15+32] + vaesenc xmm7, xmm7, [r15+48] + vaesenc xmm7, xmm7, [r15+64] + vaesenc xmm7, xmm7, [r15+80] + vaesenc xmm7, xmm7, [r15+96] + vaesenc xmm7, xmm7, [r15+112] + vaesenc xmm7, xmm7, [r15+128] + vaesenc xmm7, xmm7, [r15+144] + cmp r10d, 11 + vmovdqa xmm8, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_vaes_aesenc_block_last + vaesenc xmm7, xmm7, xmm8 + vaesenc xmm7, xmm7, [r15+176] + cmp r10d, 13 + vmovdqa xmm8, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_vaes_aesenc_block_last + vaesenc xmm7, xmm7, xmm8 + vaesenc xmm7, xmm7, [r15+208] + vmovdqa xmm8, OWORD PTR [r15+224] +L_AES_GCM_encrypt_vaes_aesenc_block_last: + vaesenclast xmm7, xmm7, xmm8 + vmovdqu xmm8, OWORD PTR [rdi+rbx] + vpxor xmm7, xmm7, xmm8 + vmovdqu OWORD PTR [rsi+rbx], xmm7 + vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask + vpxor xmm15, xmm15, xmm7 + add ebx, 16 + cmp ebx, r13d + jge L_AES_GCM_encrypt_vaes_last_block_ghash +L_AES_GCM_encrypt_vaes_last_block_start: + vmovdqu xmm12, OWORD PTR [rdi+rbx] + vmovdqu xmm8, OWORD PTR [rsp+512] + vpshufb xmm7, xmm8, OWORD PTR L_vaes_aes_gcm_bswap_epi64 + vpaddd xmm8, xmm8, OWORD PTR L_vaes_aes_gcm_one + vmovdqu OWORD PTR [rsp+512], xmm8 + vpxor xmm7, xmm7, [r15] + vpclmulqdq xmm9, xmm15, xmm6, 16 + vaesenc xmm7, xmm7, [r15+16] + vaesenc xmm7, xmm7, [r15+32] + vpclmulqdq xmm10, xmm15, xmm6, 1 + vaesenc xmm7, xmm7, [r15+48] + vaesenc xmm7, xmm7, [r15+64] + vpclmulqdq xmm11, xmm15, xmm6, 0 + vaesenc xmm7, xmm7, [r15+80] + vpclmulqdq xmm1, xmm15, xmm6, 17 + vaesenc xmm7, xmm7, [r15+96] + vpxor xmm9, xmm9, xmm10 + vpslldq xmm2, xmm9, 8 + vpsrldq xmm9, xmm9, 8 + vaesenc xmm7, xmm7, [r15+112] + vpxor xmm2, xmm2, xmm11 + vpxor xmm3, xmm1, xmm9 + vmovdqa xmm0, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm10, xmm2, xmm0, 16 + vaesenc xmm7, xmm7, [r15+128] + vpshufd 
xmm9, xmm2, 78 + vpxor xmm9, xmm9, xmm10 + vpclmulqdq xmm10, xmm9, xmm0, 16 + vaesenc xmm7, xmm7, [r15+144] + vpshufd xmm9, xmm9, 78 + vpxor xmm9, xmm9, xmm10 + vpxor xmm15, xmm9, xmm3 + cmp r10d, 11 + vmovdqa xmm8, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_vaes_aesenc_gfmul_last + vaesenc xmm7, xmm7, xmm8 + vaesenc xmm7, xmm7, [r15+176] + cmp r10d, 13 + vmovdqa xmm8, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_vaes_aesenc_gfmul_last + vaesenc xmm7, xmm7, xmm8 + vaesenc xmm7, xmm7, [r15+208] + vmovdqa xmm8, OWORD PTR [r15+224] +L_AES_GCM_encrypt_vaes_aesenc_gfmul_last: + vaesenclast xmm7, xmm7, xmm8 + vmovdqa xmm0, xmm12 + vpxor xmm7, xmm7, xmm0 + vmovdqu OWORD PTR [rsi+rbx], xmm7 + vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask + add ebx, 16 + vpxor xmm15, xmm15, xmm7 + cmp ebx, r13d + jl L_AES_GCM_encrypt_vaes_last_block_start +L_AES_GCM_encrypt_vaes_last_block_ghash: + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm15, xmm6, 0 + vpclmulqdq xmm8, xmm15, xmm6, 1 + vpclmulqdq xmm9, xmm15, xmm6, 16 + vpclmulqdq xmm10, xmm15, xmm6, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm15, xmm10 +L_AES_GCM_encrypt_vaes_last_block_done: + mov ecx, r9d + mov edx, ecx + and ecx, 15 + jz L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_done + vmovdqu xmm5, OWORD PTR [rsp+512] + vpshufb xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_bswap_epi64 + vpxor xmm5, xmm5, [r15] + vaesenc xmm5, xmm5, [r15+16] + vaesenc xmm5, xmm5, [r15+32] + vaesenc xmm5, xmm5, [r15+48] + vaesenc xmm5, xmm5, [r15+64] + vaesenc xmm5, xmm5, [r15+80] + vaesenc xmm5, xmm5, [r15+96] + vaesenc xmm5, xmm5, [r15+112] + vaesenc xmm5, xmm5, [r15+128] + vaesenc xmm5, xmm5, [r15+144] + cmp r10d, 11 + vmovdqa xmm8, OWORD PTR [r15+160] + jl 
L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_aesenc_avx_last + vaesenc xmm5, xmm5, xmm8 + vaesenc xmm5, xmm5, [r15+176] + cmp r10d, 13 + vmovdqa xmm8, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_aesenc_avx_last + vaesenc xmm5, xmm5, xmm8 + vaesenc xmm5, xmm5, [r15+208] + vmovdqa xmm8, OWORD PTR [r15+224] +L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_aesenc_avx_last: + vaesenclast xmm5, xmm5, xmm8 + sub rsp, 16 + xor ecx, ecx + vmovdqu OWORD PTR [rsp], xmm5 +L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_loop: + movzx r13d, BYTE PTR [rdi+rbx] + xor r13b, BYTE PTR [rsp+rcx] + mov BYTE PTR [rsi+rbx], r13b + mov BYTE PTR [rsp+rcx], r13b + inc ebx + inc ecx + cmp ebx, edx + jl L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_loop + xor r13, r13 + cmp ecx, 16 + je L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_finish_enc +L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_byte_loop: + mov BYTE PTR [rsp+rcx], r13b + inc ecx + cmp ecx, 16 + jl L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_byte_loop +L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_finish_enc: + vmovdqu xmm5, OWORD PTR [rsp] + add rsp, 16 + vpshufb xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_bswap_mask + vpxor xmm15, xmm15, xmm5 + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm15, xmm6, 0 + vpclmulqdq xmm8, xmm15, xmm6, 1 + vpclmulqdq xmm9, xmm15, xmm6, 16 + vpclmulqdq xmm10, xmm15, xmm6, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm15, xmm10 +L_AES_GCM_encrypt_vaes_aesenc_last15_enc_avx_done: +L_AES_GCM_encrypt_vaes_done_enc: + mov edx, r9d + mov ecx, r11d + shl rdx, 3 + shl rcx, 3 + vmovq xmm0, rdx + vmovq xmm1, rcx + vpunpcklqdq xmm0, xmm0, xmm1 + vpxor xmm15, xmm15, xmm0 + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm15, xmm6, 0 + vpclmulqdq 
xmm8, xmm15, xmm6, 1 + vpclmulqdq xmm9, xmm15, xmm6, 16 + vpclmulqdq xmm10, xmm15, xmm6, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm15, xmm10 + vpshufb xmm15, xmm15, OWORD PTR L_vaes_aes_gcm_bswap_mask + vmovdqu xmm0, OWORD PTR [rsp+528] + vpxor xmm0, xmm0, xmm15 + cmp r14d, 16 + je L_AES_GCM_encrypt_vaes_store_tag_16 + xor rcx, rcx + vmovdqu OWORD PTR [rsp], xmm0 +L_AES_GCM_encrypt_vaes_store_tag_loop: + movzx r13d, BYTE PTR [rsp+rcx] + mov BYTE PTR [r8+rcx], r13b + inc ecx + cmp ecx, r14d + jne L_AES_GCM_encrypt_vaes_store_tag_loop + jmp L_AES_GCM_encrypt_vaes_store_tag_done +L_AES_GCM_encrypt_vaes_store_tag_16: + vmovdqu OWORD PTR [r8], xmm0 +L_AES_GCM_encrypt_vaes_store_tag_done: + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp+560] + vmovdqu xmm7, OWORD PTR [rsp+576] + vmovdqu xmm8, OWORD PTR [rsp+592] + vmovdqu xmm9, OWORD PTR [rsp+608] + vmovdqu xmm10, OWORD PTR [rsp+624] + vmovdqu xmm11, OWORD PTR [rsp+640] + vmovdqu xmm12, OWORD PTR [rsp+656] + vmovdqu xmm13, OWORD PTR [rsp+672] + vmovdqu xmm14, OWORD PTR [rsp+688] + vmovdqu xmm15, OWORD PTR [rsp+704] + add rsp, 720 + pop r15 + pop r14 + pop rbx + pop r12 + pop rsi + pop rdi + pop r13 + ret +AES_GCM_encrypt_vaes ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +AES_GCM_decrypt_vaes PROC + push r13 + push rdi + push rsi + push r12 + push rbx + push r14 + push r15 + push rbp + mov rdi, rcx + mov rsi, rdx + mov r12, r8 + mov rax, r9 + mov r8, QWORD PTR [rsp+104] + mov r9d, DWORD PTR [rsp+112] + mov r11d, DWORD PTR [rsp+120] + mov ebx, DWORD PTR [rsp+128] + mov r14d, DWORD PTR [rsp+136] + mov r15, QWORD PTR [rsp+144] + mov r10d, DWORD PTR [rsp+152] + mov rbp, QWORD PTR [rsp+160] + sub rsp, 704 + vmovdqu OWORD PTR [rsp+544], xmm6 + vmovdqu OWORD PTR 
[rsp+560], xmm7 + vmovdqu OWORD PTR [rsp+576], xmm8 + vmovdqu OWORD PTR [rsp+592], xmm9 + vmovdqu OWORD PTR [rsp+608], xmm10 + vmovdqu OWORD PTR [rsp+624], xmm11 + vmovdqu OWORD PTR [rsp+640], xmm12 + vmovdqu OWORD PTR [rsp+656], xmm13 + vmovdqu OWORD PTR [rsp+672], xmm14 + vmovdqu OWORD PTR [rsp+688], xmm15 + vpxor xmm5, xmm5, xmm5 + vpxor xmm15, xmm15, xmm15 + cmp ebx, 12 + mov edx, ebx + jne L_AES_GCM_decrypt_vaes_iv_not_12 + ; # Calculate values when IV is 12 bytes + ; Set counter based on IV + mov ecx, 16777216 + vmovq xmm5, QWORD PTR [rax] + vpinsrd xmm5, xmm5, DWORD PTR [rax+8], 2 + vpinsrd xmm5, xmm5, ecx, 3 + ; H = Encrypt X(=0) and T = Encrypt counter + vmovdqa xmm6, OWORD PTR [r15] + vpxor xmm1, xmm5, xmm6 + vmovdqa xmm4, OWORD PTR [r15+16] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+32] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+48] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+64] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+80] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+96] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+112] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+128] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+144] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + cmp r10d, 11 + vmovdqa xmm4, OWORD PTR [r15+160] + jl L_AES_GCM_decrypt_vaes_calc_iv_12_last + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+176] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + cmp r10d, 13 + vmovdqa xmm4, OWORD PTR [r15+192] + jl L_AES_GCM_decrypt_vaes_calc_iv_12_last + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+208] + vaesenc xmm6, xmm6, xmm4 + vaesenc xmm1, 
xmm1, xmm4 + vmovdqa xmm4, OWORD PTR [r15+224] +L_AES_GCM_decrypt_vaes_calc_iv_12_last: + vaesenclast xmm6, xmm6, xmm4 + vaesenclast xmm1, xmm1, xmm4 + vpshufb xmm6, xmm6, OWORD PTR L_vaes_aes_gcm_bswap_mask + vmovdqu OWORD PTR [rsp+528], xmm1 + jmp L_AES_GCM_decrypt_vaes_iv_done +L_AES_GCM_decrypt_vaes_iv_not_12: + ; Calculate values when IV is not 12 bytes + ; H = Encrypt X(=0) + vmovdqa xmm6, OWORD PTR [r15] + vaesenc xmm6, xmm6, [r15+16] + vaesenc xmm6, xmm6, [r15+32] + vaesenc xmm6, xmm6, [r15+48] + vaesenc xmm6, xmm6, [r15+64] + vaesenc xmm6, xmm6, [r15+80] + vaesenc xmm6, xmm6, [r15+96] + vaesenc xmm6, xmm6, [r15+112] + vaesenc xmm6, xmm6, [r15+128] + vaesenc xmm6, xmm6, [r15+144] + cmp r10d, 11 + vmovdqa xmm8, OWORD PTR [r15+160] + jl L_AES_GCM_decrypt_vaes_calc_iv_1_aesenc_avx_last + vaesenc xmm6, xmm6, xmm8 + vaesenc xmm6, xmm6, [r15+176] + cmp r10d, 13 + vmovdqa xmm8, OWORD PTR [r15+192] + jl L_AES_GCM_decrypt_vaes_calc_iv_1_aesenc_avx_last + vaesenc xmm6, xmm6, xmm8 + vaesenc xmm6, xmm6, [r15+208] + vmovdqa xmm8, OWORD PTR [r15+224] +L_AES_GCM_decrypt_vaes_calc_iv_1_aesenc_avx_last: + vaesenclast xmm6, xmm6, xmm8 + vpshufb xmm6, xmm6, OWORD PTR L_vaes_aes_gcm_bswap_mask + ; Calc counter + ; Initialization vector + cmp edx, 0 + mov rcx, 0 + je L_AES_GCM_decrypt_vaes_calc_iv_done + cmp edx, 16 + jl L_AES_GCM_decrypt_vaes_calc_iv_lt16 + and edx, 4294967280 +L_AES_GCM_decrypt_vaes_calc_iv_16_loop: + vmovdqu xmm7, OWORD PTR [rax+rcx] + vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm7 + ; ghash_gfmul_avx + vpshufd xmm1, xmm5, 78 + vpshufd xmm2, xmm6, 78 + vpclmulqdq xmm3, xmm6, xmm5, 17 + vpclmulqdq xmm0, xmm6, xmm5, 0 + vpxor xmm1, xmm1, xmm5 + vpxor xmm2, xmm2, xmm6 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm4, xmm0 + vmovdqa xmm5, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm5, xmm5, xmm1 + vpsrld xmm0, xmm4, 31 + vpsrld xmm1, 
xmm5, 31 + vpslld xmm4, xmm4, 1 + vpslld xmm5, xmm5, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm5, xmm5, xmm2 + vpor xmm4, xmm4, xmm0 + vpor xmm5, xmm5, xmm1 + vpslld xmm0, xmm4, 31 + vpslld xmm1, xmm4, 30 + vpslld xmm2, xmm4, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm4, xmm4, xmm0 + vpsrld xmm2, xmm4, 1 + vpsrld xmm3, xmm4, 2 + vpsrld xmm0, xmm4, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm4 + vpxor xmm5, xmm5, xmm2 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_decrypt_vaes_calc_iv_16_loop + mov edx, ebx + cmp ecx, edx + je L_AES_GCM_decrypt_vaes_calc_iv_done +L_AES_GCM_decrypt_vaes_calc_iv_lt16: + sub rsp, 16 + vpxor xmm7, xmm7, xmm7 + xor ebx, ebx + vmovdqu OWORD PTR [rsp], xmm7 +L_AES_GCM_decrypt_vaes_calc_iv_loop: + movzx r13d, BYTE PTR [rax+rcx] + mov BYTE PTR [rsp+rbx], r13b + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_decrypt_vaes_calc_iv_loop + vmovdqu xmm7, OWORD PTR [rsp] + add rsp, 16 + vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm7 + ; ghash_gfmul_avx + vpshufd xmm1, xmm5, 78 + vpshufd xmm2, xmm6, 78 + vpclmulqdq xmm3, xmm6, xmm5, 17 + vpclmulqdq xmm0, xmm6, xmm5, 0 + vpxor xmm1, xmm1, xmm5 + vpxor xmm2, xmm2, xmm6 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm4, xmm0 + vmovdqa xmm5, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm5, xmm5, xmm1 + vpsrld xmm0, xmm4, 31 + vpsrld xmm1, xmm5, 31 + vpslld xmm4, xmm4, 1 + vpslld xmm5, xmm5, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm5, xmm5, xmm2 + vpor xmm4, xmm4, xmm0 + vpor xmm5, xmm5, xmm1 + vpslld xmm0, xmm4, 31 + vpslld xmm1, xmm4, 30 + vpslld xmm2, xmm4, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 
4 + vpslldq xmm0, xmm0, 12 + vpxor xmm4, xmm4, xmm0 + vpsrld xmm2, xmm4, 1 + vpsrld xmm3, xmm4, 2 + vpsrld xmm0, xmm4, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm4 + vpxor xmm5, xmm5, xmm2 +L_AES_GCM_decrypt_vaes_calc_iv_done: + ; T = Encrypt counter + vpxor xmm0, xmm0, xmm0 + shl edx, 3 + vmovq xmm0, rdx + vpxor xmm5, xmm5, xmm0 + ; ghash_gfmul_avx + vpshufd xmm1, xmm5, 78 + vpshufd xmm2, xmm6, 78 + vpclmulqdq xmm3, xmm6, xmm5, 17 + vpclmulqdq xmm0, xmm6, xmm5, 0 + vpxor xmm1, xmm1, xmm5 + vpxor xmm2, xmm2, xmm6 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm4, xmm0 + vmovdqa xmm5, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm5, xmm5, xmm1 + vpsrld xmm0, xmm4, 31 + vpsrld xmm1, xmm5, 31 + vpslld xmm4, xmm4, 1 + vpslld xmm5, xmm5, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm5, xmm5, xmm2 + vpor xmm4, xmm4, xmm0 + vpor xmm5, xmm5, xmm1 + vpslld xmm0, xmm4, 31 + vpslld xmm1, xmm4, 30 + vpslld xmm2, xmm4, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm4, xmm4, xmm0 + vpsrld xmm2, xmm4, 1 + vpsrld xmm3, xmm4, 2 + vpsrld xmm0, xmm4, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm4 + vpxor xmm5, xmm5, xmm2 + vpshufb xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_bswap_mask + ; Encrypt counter + vmovdqa xmm7, OWORD PTR [r15] + vpxor xmm7, xmm7, xmm5 + vaesenc xmm7, xmm7, [r15+16] + vaesenc xmm7, xmm7, [r15+32] + vaesenc xmm7, xmm7, [r15+48] + vaesenc xmm7, xmm7, [r15+64] + vaesenc xmm7, xmm7, [r15+80] + vaesenc xmm7, xmm7, [r15+96] + vaesenc xmm7, xmm7, [r15+112] + vaesenc xmm7, xmm7, [r15+128] + vaesenc xmm7, xmm7, [r15+144] + cmp r10d, 11 + vmovdqa xmm8, OWORD PTR [r15+160] + jl L_AES_GCM_decrypt_vaes_calc_iv_2_aesenc_avx_last + vaesenc xmm7, xmm7, xmm8 + 
vaesenc xmm7, xmm7, [r15+176] + cmp r10d, 13 + vmovdqa xmm8, OWORD PTR [r15+192] + jl L_AES_GCM_decrypt_vaes_calc_iv_2_aesenc_avx_last + vaesenc xmm7, xmm7, xmm8 + vaesenc xmm7, xmm7, [r15+208] + vmovdqa xmm8, OWORD PTR [r15+224] +L_AES_GCM_decrypt_vaes_calc_iv_2_aesenc_avx_last: + vaesenclast xmm7, xmm7, xmm8 + vmovdqu OWORD PTR [rsp+528], xmm7 +L_AES_GCM_decrypt_vaes_iv_done: + ; Additional authentication data + mov edx, r11d + cmp edx, 0 + je L_AES_GCM_decrypt_vaes_calc_aad_done + xor ecx, ecx + cmp edx, 16 + jl L_AES_GCM_decrypt_vaes_calc_aad_lt16 + and edx, 4294967280 +L_AES_GCM_decrypt_vaes_calc_aad_16_loop: + vmovdqu xmm7, OWORD PTR [r12+rcx] + vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask + vpxor xmm15, xmm15, xmm7 + ; ghash_gfmul_avx + vpshufd xmm1, xmm15, 78 + vpshufd xmm2, xmm6, 78 + vpclmulqdq xmm3, xmm6, xmm15, 17 + vpclmulqdq xmm0, xmm6, xmm15, 0 + vpxor xmm1, xmm1, xmm15 + vpxor xmm2, xmm2, xmm6 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm4, xmm0 + vmovdqa xmm15, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm15, xmm15, xmm1 + vpsrld xmm0, xmm4, 31 + vpsrld xmm1, xmm15, 31 + vpslld xmm4, xmm4, 1 + vpslld xmm15, xmm15, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm15, xmm15, xmm2 + vpor xmm4, xmm4, xmm0 + vpor xmm15, xmm15, xmm1 + vpslld xmm0, xmm4, 31 + vpslld xmm1, xmm4, 30 + vpslld xmm2, xmm4, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm4, xmm4, xmm0 + vpsrld xmm2, xmm4, 1 + vpsrld xmm3, xmm4, 2 + vpsrld xmm0, xmm4, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm4 + vpxor xmm15, xmm15, xmm2 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_decrypt_vaes_calc_aad_16_loop + mov edx, r11d + cmp ecx, edx + je L_AES_GCM_decrypt_vaes_calc_aad_done 
+L_AES_GCM_decrypt_vaes_calc_aad_lt16: + sub rsp, 16 + vpxor xmm7, xmm7, xmm7 + xor ebx, ebx + vmovdqu OWORD PTR [rsp], xmm7 +L_AES_GCM_decrypt_vaes_calc_aad_loop: + movzx r13d, BYTE PTR [r12+rcx] + mov BYTE PTR [rsp+rbx], r13b + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_decrypt_vaes_calc_aad_loop + vmovdqu xmm7, OWORD PTR [rsp] + add rsp, 16 + vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask + vpxor xmm15, xmm15, xmm7 + ; ghash_gfmul_avx + vpshufd xmm1, xmm15, 78 + vpshufd xmm2, xmm6, 78 + vpclmulqdq xmm3, xmm6, xmm15, 17 + vpclmulqdq xmm0, xmm6, xmm15, 0 + vpxor xmm1, xmm1, xmm15 + vpxor xmm2, xmm2, xmm6 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm4, xmm0 + vmovdqa xmm15, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm15, xmm15, xmm1 + vpsrld xmm0, xmm4, 31 + vpsrld xmm1, xmm15, 31 + vpslld xmm4, xmm4, 1 + vpslld xmm15, xmm15, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm15, xmm15, xmm2 + vpor xmm4, xmm4, xmm0 + vpor xmm15, xmm15, xmm1 + vpslld xmm0, xmm4, 31 + vpslld xmm1, xmm4, 30 + vpslld xmm2, xmm4, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm4, xmm4, xmm0 + vpsrld xmm2, xmm4, 1 + vpsrld xmm3, xmm4, 2 + vpsrld xmm0, xmm4, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm4 + vpxor xmm15, xmm15, xmm2 +L_AES_GCM_decrypt_vaes_calc_aad_done: + ; Calculate counter and H + vpsrlq xmm8, xmm6, 63 + vpsllq xmm7, xmm6, 1 + vpslldq xmm8, xmm8, 8 + vpor xmm7, xmm7, xmm8 + vpshufd xmm6, xmm6, 255 + vpsrad xmm6, xmm6, 31 + vpshufb xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_bswap_epi64 + vpand xmm6, xmm6, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpaddd xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_one + vpxor xmm6, xmm6, xmm7 + vmovdqu OWORD PTR [rsp+512], xmm5 + xor ebx, ebx + cmp r9d, 128 + jl 
L_AES_GCM_decrypt_vaes_done_128 + vmovdqa xmm2, xmm15 + ; H ^ 1 + vmovdqu OWORD PTR [rsp], xmm6 + ; H ^ 2 + vpclmulqdq xmm7, xmm6, xmm6, 0 + vpclmulqdq xmm10, xmm6, xmm6, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm0, xmm10 + vmovdqu OWORD PTR [rsp+16], xmm0 + ; H ^ 3 + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm0, xmm6, 0 + vpclmulqdq xmm8, xmm0, xmm6, 1 + vpclmulqdq xmm9, xmm0, xmm6, 16 + vpclmulqdq xmm10, xmm0, xmm6, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm1, xmm10 + vmovdqu OWORD PTR [rsp+32], xmm1 + ; H ^ 4 + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm3, xmm10 + vmovdqu OWORD PTR [rsp+48], xmm3 + ; H ^ 5 + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR 
[rsp+64], xmm4 + ; H ^ 6 + vpclmulqdq xmm7, xmm1, xmm1, 0 + vpclmulqdq xmm10, xmm1, xmm1, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+80], xmm4 + ; H ^ 7 + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm3, xmm1, 0 + vpclmulqdq xmm8, xmm3, xmm1, 1 + vpclmulqdq xmm9, xmm3, xmm1, 16 + vpclmulqdq xmm10, xmm3, xmm1, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+96], xmm4 + ; H ^ 8 + vpclmulqdq xmm7, xmm3, xmm3, 0 + vpclmulqdq xmm10, xmm3, xmm3, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+112], xmm4 + cmp r9d, 256 + jl L_AES_GCM_decrypt_vaes_no_ext + ; H ^ 9 + vmovdqu xmm0, OWORD PTR [rsp+48] + vmovdqu xmm1, OWORD PTR [rsp+64] + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + 
vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+128], xmm4 + ; H ^ 10 + vmovdqu xmm0, OWORD PTR [rsp+64] + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+144], xmm4 + ; H ^ 11 + vmovdqu xmm0, OWORD PTR [rsp+64] + vmovdqu xmm1, OWORD PTR [rsp+80] + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+160], xmm4 + ; H ^ 12 + vmovdqu xmm0, OWORD PTR [rsp+80] + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+176], xmm4 + ; H ^ 13 + vmovdqu xmm0, OWORD PTR [rsp+80] + vmovdqu xmm1, OWORD PTR [rsp+96] + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + 
vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+192], xmm4 + ; H ^ 14 + vmovdqu xmm0, OWORD PTR [rsp+96] + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+208], xmm4 + ; H ^ 15 + vmovdqu xmm0, OWORD PTR [rsp+96] + vmovdqu xmm1, OWORD PTR [rsp+112] + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+224], xmm4 + ; H ^ 16 + vmovdqu xmm0, OWORD PTR [rsp+112] + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+240], xmm4 + vmovdqu ymm7, YMMWORD PTR [rsp+224] + vpermq ymm7, ymm7, 78 + vmovdqu ymm8, YMMWORD PTR [rsp+192] + vpermq ymm8, ymm8, 78 + vmovdqu ymm9, YMMWORD PTR [rsp+160] + vpermq ymm9, ymm9, 78 + vmovdqu ymm10, YMMWORD PTR [rsp+128] + vpermq ymm10, ymm10, 78 + vmovdqu 
YMMWORD PTR [rsp+256], ymm7 + vmovdqu YMMWORD PTR [rsp+288], ymm8 + vmovdqu YMMWORD PTR [rsp+320], ymm9 + vmovdqu YMMWORD PTR [rsp+352], ymm10 + vmovdqu ymm7, YMMWORD PTR [rsp+96] + vpermq ymm7, ymm7, 78 + vmovdqu ymm8, YMMWORD PTR [rsp+64] + vpermq ymm8, ymm8, 78 + vmovdqu ymm9, YMMWORD PTR [rsp+32] + vpermq ymm9, ymm9, 78 + vmovdqu ymm10, YMMWORD PTR [rsp] + vpermq ymm10, ymm10, 78 + vmovdqu YMMWORD PTR [rsp+384], ymm7 + vmovdqu YMMWORD PTR [rsp+416], ymm8 + vmovdqu YMMWORD PTR [rsp+448], ymm9 + vmovdqu YMMWORD PTR [rsp+480], ymm10 +L_AES_GCM_decrypt_vaes_no_ext: + vbroadcasti128 ymm14, ptr_L_vaes_aes_gcm_mod2_128 + cmp r9d, 256 + jl L_AES_GCM_decrypt_vaes_after_256 + mov r13d, r9d + and r13d, 4294967040 +L_AES_GCM_decrypt_vaes_loop_256: + ; 256 bytes of input + lea rax, QWORD PTR [rdi+rbx] + vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_mask + vpxor ymm4, ymm4, ymm4 + vinserti128 ymm4, ymm4, xmm15, 0 + vmovdqu ymm7, YMMWORD PTR [rsp+256] + vmovdqu ymm8, YMMWORD PTR [rsp+288] + vmovdqu ymm9, YMMWORD PTR [rsp+320] + vmovdqu ymm10, YMMWORD PTR [rsp+352] + vmovdqu ymm5, YMMWORD PTR [rax] + vpshufb ymm5, ymm5, ymm6 + vpxor ymm5, ymm5, ymm4 + vpclmulqdq ymm0, ymm5, ymm7, 0 + vpclmulqdq ymm1, ymm5, ymm7, 1 + vpclmulqdq ymm2, ymm5, ymm7, 16 + vpclmulqdq ymm3, ymm5, ymm7, 17 + vmovdqa ymm11, ymm0 + vpxor ymm12, ymm2, ymm1 + vmovdqa ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rax+32] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm8, 0 + vpclmulqdq ymm1, ymm5, ymm8, 1 + vpclmulqdq ymm2, ymm5, ymm8, 16 + vpclmulqdq ymm3, ymm5, ymm8, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rax+64] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm9, 0 + vpclmulqdq ymm1, ymm5, ymm9, 1 + vpclmulqdq ymm2, ymm5, ymm9, 16 + vpclmulqdq ymm3, ymm5, ymm9, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR 
[rax+96] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm10, 0 + vpclmulqdq ymm1, ymm5, ymm10, 1 + vpclmulqdq ymm2, ymm5, ymm10, 16 + vpclmulqdq ymm3, ymm5, ymm10, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm7, YMMWORD PTR [rsp+384] + vmovdqu ymm8, YMMWORD PTR [rsp+416] + vmovdqu ymm9, YMMWORD PTR [rsp+448] + vmovdqu ymm10, YMMWORD PTR [rsp+480] + vmovdqu ymm5, YMMWORD PTR [rax+128] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm7, 0 + vpclmulqdq ymm1, ymm5, ymm7, 1 + vpclmulqdq ymm2, ymm5, ymm7, 16 + vpclmulqdq ymm3, ymm5, ymm7, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rax+160] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm8, 0 + vpclmulqdq ymm1, ymm5, ymm8, 1 + vpclmulqdq ymm2, ymm5, ymm8, 16 + vpclmulqdq ymm3, ymm5, ymm8, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rax+192] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm9, 0 + vpclmulqdq ymm1, ymm5, ymm9, 1 + vpclmulqdq ymm2, ymm5, ymm9, 16 + vpclmulqdq ymm3, ymm5, ymm9, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rax+224] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm10, 0 + vpclmulqdq ymm1, ymm5, ymm10, 1 + vpclmulqdq ymm2, ymm5, ymm10, 16 + vpclmulqdq ymm3, ymm5, ymm10, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vpclmulqdq ymm5, ymm14, ymm11, 1 + vpshufd ymm11, ymm11, 78 + vpxor ymm12, ymm12, ymm5 + vpxor ymm12, ymm12, ymm11 + vpclmulqdq ymm5, ymm14, ymm12, 1 + vpshufd ymm12, ymm12, 78 + vpxor ymm13, ymm13, ymm5 + vpxor ymm13, ymm13, ymm12 + vextracti128 xmm0, ymm13, 1 + vpxor xmm15, xmm13, xmm0 + vbroadcasti128 ymm6, 
ptr_L_vaes_aes_gcm_bswap_epi64 + vbroadcasti128 ymm4, [rsp+512] + vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0 + vpshufb ymm0, ymm0, ymm6 + vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1 + vpshufb ymm1, ymm1, ymm6 + vpaddd ymm2, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y2 + vpshufb ymm2, ymm2, ymm6 + vpaddd ymm3, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y3 + vpshufb ymm3, ymm3, ymm6 + vmovdqu xmm7, OWORD PTR [rsp+512] + vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight + vmovdqu OWORD PTR [rsp+512], xmm7 + vbroadcasti128 ymm4, [r15] + vpxor ymm0, ymm0, ymm4 + vpxor ymm1, ymm1, ymm4 + vpxor ymm2, ymm2, ymm4 + vpxor ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+16] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+32] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+48] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+64] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+80] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+96] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+112] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+128] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+144] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r10d, 11 + vbroadcasti128 ymm4, [r15+160] + jl 
L_AES_GCM_decrypt_vaes_l1_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+176] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r10d, 13 + vbroadcasti128 ymm4, [r15+192] + jl L_AES_GCM_decrypt_vaes_l1_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+208] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+224] +L_AES_GCM_decrypt_vaes_l1_vaes_ctr8_last: + vaesenclast ymm0, ymm0, ymm4 + vaesenclast ymm1, ymm1, ymm4 + vaesenclast ymm2, ymm2, ymm4 + vaesenclast ymm3, ymm3, ymm4 + lea rcx, QWORD PTR [rdi+rbx] + lea rdx, QWORD PTR [rsi+rbx] + vmovdqu ymm5, YMMWORD PTR [rcx] + vpxor ymm0, ymm0, ymm5 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu ymm5, YMMWORD PTR [rcx+32] + vpxor ymm1, ymm1, ymm5 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vmovdqu ymm5, YMMWORD PTR [rcx+64] + vpxor ymm2, ymm2, ymm5 + vmovdqu YMMWORD PTR [rdx+64], ymm2 + vmovdqu ymm5, YMMWORD PTR [rcx+96] + vpxor ymm3, ymm3, ymm5 + vmovdqu YMMWORD PTR [rdx+96], ymm3 + add ebx, 128 + vbroadcasti128 ymm4, [rsp+512] + vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0 + vpshufb ymm0, ymm0, ymm6 + vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1 + vpshufb ymm1, ymm1, ymm6 + vpaddd ymm2, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y2 + vpshufb ymm2, ymm2, ymm6 + vpaddd ymm3, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y3 + vpshufb ymm3, ymm3, ymm6 + vmovdqu xmm7, OWORD PTR [rsp+512] + vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight + vmovdqu OWORD PTR [rsp+512], xmm7 + vbroadcasti128 ymm4, [r15] + vpxor ymm0, ymm0, ymm4 + vpxor ymm1, ymm1, ymm4 + vpxor ymm2, ymm2, ymm4 + vpxor ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+16] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 
+ vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+32] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+48] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+64] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+80] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+96] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+112] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+128] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+144] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r10d, 11 + vbroadcasti128 ymm4, [r15+160] + jl L_AES_GCM_decrypt_vaes_l2_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+176] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r10d, 13 + vbroadcasti128 ymm4, [r15+192] + jl L_AES_GCM_decrypt_vaes_l2_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+208] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+224] +L_AES_GCM_decrypt_vaes_l2_vaes_ctr8_last: + vaesenclast ymm0, ymm0, ymm4 + vaesenclast ymm1, ymm1, 
ymm4 + vaesenclast ymm2, ymm2, ymm4 + vaesenclast ymm3, ymm3, ymm4 + lea rcx, QWORD PTR [rdi+rbx] + lea rdx, QWORD PTR [rsi+rbx] + vmovdqu ymm5, YMMWORD PTR [rcx] + vpxor ymm0, ymm0, ymm5 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu ymm5, YMMWORD PTR [rcx+32] + vpxor ymm1, ymm1, ymm5 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vmovdqu ymm5, YMMWORD PTR [rcx+64] + vpxor ymm2, ymm2, ymm5 + vmovdqu YMMWORD PTR [rdx+64], ymm2 + vmovdqu ymm5, YMMWORD PTR [rcx+96] + vpxor ymm3, ymm3, ymm5 + vmovdqu YMMWORD PTR [rdx+96], ymm3 + add ebx, 128 + cmp ebx, r13d + jl L_AES_GCM_decrypt_vaes_loop_256 +L_AES_GCM_decrypt_vaes_after_256: + vmovdqu ymm7, YMMWORD PTR [rsp+96] + vpermq ymm7, ymm7, 78 + vmovdqu ymm8, YMMWORD PTR [rsp+64] + vpermq ymm8, ymm8, 78 + vmovdqu ymm9, YMMWORD PTR [rsp+32] + vpermq ymm9, ymm9, 78 + vmovdqu ymm10, YMMWORD PTR [rsp] + vpermq ymm10, ymm10, 78 + mov r13d, r9d + and r13d, 4294967168 + cmp ebx, r13d + jge L_AES_GCM_decrypt_vaes_after_128 + ; 128 bytes of input + lea rax, QWORD PTR [rdi+rbx] + vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_mask + vpxor ymm4, ymm4, ymm4 + vinserti128 ymm4, ymm4, xmm15, 0 + vmovdqu ymm5, YMMWORD PTR [rax] + vpshufb ymm5, ymm5, ymm6 + vpxor ymm5, ymm5, ymm4 + vpclmulqdq ymm0, ymm5, ymm7, 0 + vpclmulqdq ymm1, ymm5, ymm7, 1 + vpclmulqdq ymm2, ymm5, ymm7, 16 + vpclmulqdq ymm3, ymm5, ymm7, 17 + vmovdqa ymm11, ymm0 + vpxor ymm12, ymm2, ymm1 + vmovdqa ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rax+32] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm8, 0 + vpclmulqdq ymm1, ymm5, ymm8, 1 + vpclmulqdq ymm2, ymm5, ymm8, 16 + vpclmulqdq ymm3, ymm5, ymm8, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rax+64] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm9, 0 + vpclmulqdq ymm1, ymm5, ymm9, 1 + vpclmulqdq ymm2, ymm5, ymm9, 16 + vpclmulqdq ymm3, ymm5, ymm9, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, 
ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rax+96] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm10, 0 + vpclmulqdq ymm1, ymm5, ymm10, 1 + vpclmulqdq ymm2, ymm5, ymm10, 16 + vpclmulqdq ymm3, ymm5, ymm10, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vpclmulqdq ymm5, ymm14, ymm11, 1 + vpshufd ymm11, ymm11, 78 + vpxor ymm12, ymm12, ymm5 + vpxor ymm12, ymm12, ymm11 + vpclmulqdq ymm5, ymm14, ymm12, 1 + vpshufd ymm12, ymm12, 78 + vpxor ymm13, ymm13, ymm5 + vpxor ymm13, ymm13, ymm12 + vextracti128 xmm0, ymm13, 1 + vpxor xmm15, xmm13, xmm0 + vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64 + vbroadcasti128 ymm4, [rsp+512] + vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0 + vpshufb ymm0, ymm0, ymm6 + vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1 + vpshufb ymm1, ymm1, ymm6 + vpaddd ymm2, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y2 + vpshufb ymm2, ymm2, ymm6 + vpaddd ymm3, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y3 + vpshufb ymm3, ymm3, ymm6 + vmovdqu xmm7, OWORD PTR [rsp+512] + vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight + vmovdqu OWORD PTR [rsp+512], xmm7 + vbroadcasti128 ymm4, [r15] + vpxor ymm0, ymm0, ymm4 + vpxor ymm1, ymm1, ymm4 + vpxor ymm2, ymm2, ymm4 + vpxor ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+16] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+32] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+48] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+64] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+80] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, 
ymm4 + vbroadcasti128 ymm4, [r15+96] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+112] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+128] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+144] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r10d, 11 + vbroadcasti128 ymm4, [r15+160] + jl L_AES_GCM_decrypt_vaes_t_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+176] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r10d, 13 + vbroadcasti128 ymm4, [r15+192] + jl L_AES_GCM_decrypt_vaes_t_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+208] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [r15+224] +L_AES_GCM_decrypt_vaes_t_vaes_ctr8_last: + vaesenclast ymm0, ymm0, ymm4 + vaesenclast ymm1, ymm1, ymm4 + vaesenclast ymm2, ymm2, ymm4 + vaesenclast ymm3, ymm3, ymm4 + lea rcx, QWORD PTR [rdi+rbx] + lea rdx, QWORD PTR [rsi+rbx] + vmovdqu ymm5, YMMWORD PTR [rcx] + vpxor ymm0, ymm0, ymm5 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu ymm5, YMMWORD PTR [rcx+32] + vpxor ymm1, ymm1, ymm5 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vmovdqu ymm5, YMMWORD PTR [rcx+64] + vpxor ymm2, ymm2, ymm5 + vmovdqu YMMWORD PTR [rdx+64], ymm2 + vmovdqu ymm5, YMMWORD PTR [rcx+96] + vpxor ymm3, ymm3, ymm5 + vmovdqu YMMWORD PTR [rdx+96], ymm3 + add ebx, 128 +L_AES_GCM_decrypt_vaes_after_128: + vmovdqu xmm6, OWORD PTR [rsp] 
+L_AES_GCM_decrypt_vaes_done_128: + mov edx, r9d + cmp ebx, edx + jge L_AES_GCM_decrypt_vaes_done_dec + mov r13d, r9d + and r13d, 4294967280 + cmp ebx, r13d + jge L_AES_GCM_decrypt_vaes_last_block_done +L_AES_GCM_decrypt_vaes_last_block_start: + vmovdqu xmm12, OWORD PTR [rdi+rbx] + vmovdqa xmm0, xmm6 + vpshufb xmm1, xmm12, OWORD PTR L_vaes_aes_gcm_bswap_mask + vpxor xmm1, xmm1, xmm15 + vmovdqu xmm8, OWORD PTR [rsp+512] + vpshufb xmm7, xmm8, OWORD PTR L_vaes_aes_gcm_bswap_epi64 + vpaddd xmm8, xmm8, OWORD PTR L_vaes_aes_gcm_one + vmovdqu OWORD PTR [rsp+512], xmm8 + vpxor xmm7, xmm7, [r15] + vpclmulqdq xmm9, xmm1, xmm0, 16 + vaesenc xmm7, xmm7, [r15+16] + vaesenc xmm7, xmm7, [r15+32] + vpclmulqdq xmm10, xmm1, xmm0, 1 + vaesenc xmm7, xmm7, [r15+48] + vaesenc xmm7, xmm7, [r15+64] + vpclmulqdq xmm11, xmm1, xmm0, 0 + vaesenc xmm7, xmm7, [r15+80] + vpclmulqdq xmm1, xmm1, xmm0, 17 + vaesenc xmm7, xmm7, [r15+96] + vpxor xmm9, xmm9, xmm10 + vpslldq xmm2, xmm9, 8 + vpsrldq xmm9, xmm9, 8 + vaesenc xmm7, xmm7, [r15+112] + vpxor xmm2, xmm2, xmm11 + vpxor xmm3, xmm1, xmm9 + vmovdqa xmm0, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm10, xmm2, xmm0, 16 + vaesenc xmm7, xmm7, [r15+128] + vpshufd xmm9, xmm2, 78 + vpxor xmm9, xmm9, xmm10 + vpclmulqdq xmm10, xmm9, xmm0, 16 + vaesenc xmm7, xmm7, [r15+144] + vpshufd xmm9, xmm9, 78 + vpxor xmm9, xmm9, xmm10 + vpxor xmm15, xmm9, xmm3 + cmp r10d, 11 + vmovdqa xmm8, OWORD PTR [r15+160] + jl L_AES_GCM_decrypt_vaes_aesenc_gfmul_last + vaesenc xmm7, xmm7, xmm8 + vaesenc xmm7, xmm7, [r15+176] + cmp r10d, 13 + vmovdqa xmm8, OWORD PTR [r15+192] + jl L_AES_GCM_decrypt_vaes_aesenc_gfmul_last + vaesenc xmm7, xmm7, xmm8 + vaesenc xmm7, xmm7, [r15+208] + vmovdqa xmm8, OWORD PTR [r15+224] +L_AES_GCM_decrypt_vaes_aesenc_gfmul_last: + vaesenclast xmm7, xmm7, xmm8 + vmovdqa xmm0, xmm12 + vpxor xmm7, xmm7, xmm0 + vmovdqu OWORD PTR [rsi+rbx], xmm7 + add ebx, 16 + cmp ebx, r13d + jl L_AES_GCM_decrypt_vaes_last_block_start 
+L_AES_GCM_decrypt_vaes_last_block_done: + mov ecx, r9d + mov edx, ecx + and ecx, 15 + jz L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_done + vmovdqu xmm5, OWORD PTR [rsp+512] + vpshufb xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_bswap_epi64 + vpxor xmm5, xmm5, [r15] + vaesenc xmm5, xmm5, [r15+16] + vaesenc xmm5, xmm5, [r15+32] + vaesenc xmm5, xmm5, [r15+48] + vaesenc xmm5, xmm5, [r15+64] + vaesenc xmm5, xmm5, [r15+80] + vaesenc xmm5, xmm5, [r15+96] + vaesenc xmm5, xmm5, [r15+112] + vaesenc xmm5, xmm5, [r15+128] + vaesenc xmm5, xmm5, [r15+144] + cmp r10d, 11 + vmovdqa xmm8, OWORD PTR [r15+160] + jl L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_aesenc_avx_last + vaesenc xmm5, xmm5, xmm8 + vaesenc xmm5, xmm5, [r15+176] + cmp r10d, 13 + vmovdqa xmm8, OWORD PTR [r15+192] + jl L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_aesenc_avx_last + vaesenc xmm5, xmm5, xmm8 + vaesenc xmm5, xmm5, [r15+208] + vmovdqa xmm8, OWORD PTR [r15+224] +L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_aesenc_avx_last: + vaesenclast xmm5, xmm5, xmm8 + sub rsp, 32 + xor ecx, ecx + vmovdqu OWORD PTR [rsp], xmm5 + vpxor xmm0, xmm0, xmm0 + vmovdqu OWORD PTR [rsp+16], xmm0 +L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_loop: + movzx r13d, BYTE PTR [rdi+rbx] + mov BYTE PTR [rsp+rcx+16], r13b + xor r13b, BYTE PTR [rsp+rcx] + mov BYTE PTR [rsi+rbx], r13b + inc ebx + inc ecx + cmp ebx, edx + jl L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_loop + vmovdqu xmm5, OWORD PTR [rsp+16] + add rsp, 32 + vpshufb xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_bswap_mask + vpxor xmm15, xmm15, xmm5 + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm15, xmm6, 0 + vpclmulqdq xmm8, xmm15, xmm6, 1 + vpclmulqdq xmm9, xmm15, xmm6, 16 + vpclmulqdq xmm10, xmm15, xmm6, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor 
xmm10, xmm10, xmm8 + vmovdqa xmm15, xmm10 +L_AES_GCM_decrypt_vaes_aesenc_last15_dec_avx_done: +L_AES_GCM_decrypt_vaes_done_dec: + mov edx, r9d + mov ecx, r11d + shl rdx, 3 + shl rcx, 3 + vmovq xmm0, rdx + vmovq xmm1, rcx + vpunpcklqdq xmm0, xmm0, xmm1 + vpxor xmm15, xmm15, xmm0 + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm15, xmm6, 0 + vpclmulqdq xmm8, xmm15, xmm6, 1 + vpclmulqdq xmm9, xmm15, xmm6, 16 + vpclmulqdq xmm10, xmm15, xmm6, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm15, xmm10 + vpshufb xmm15, xmm15, OWORD PTR L_vaes_aes_gcm_bswap_mask + vmovdqu xmm0, OWORD PTR [rsp+528] + vpxor xmm0, xmm0, xmm15 + cmp r14d, 16 + je L_AES_GCM_decrypt_vaes_cmp_tag_16 + sub rsp, 16 + xor rcx, rcx + xor rbx, rbx + vmovdqu OWORD PTR [rsp], xmm0 +L_AES_GCM_decrypt_vaes_cmp_tag_loop: + movzx r13d, BYTE PTR [rsp+rcx] + xor r13b, BYTE PTR [r8+rcx] + or bl, r13b + inc ecx + cmp ecx, r14d + jne L_AES_GCM_decrypt_vaes_cmp_tag_loop + cmp bl, 0 + sete bl + add rsp, 16 + xor rcx, rcx + jmp L_AES_GCM_decrypt_vaes_cmp_tag_done +L_AES_GCM_decrypt_vaes_cmp_tag_16: + vmovdqu xmm1, OWORD PTR [r8] + vpcmpeqb xmm0, xmm0, xmm1 + vpmovmskb rdx, xmm0 + ; %%edx == 0xFFFF then return 1 else => return 0 + xor ebx, ebx + cmp edx, 65535 + sete bl +L_AES_GCM_decrypt_vaes_cmp_tag_done: + mov DWORD PTR [rbp], ebx + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp+544] + vmovdqu xmm7, OWORD PTR [rsp+560] + vmovdqu xmm8, OWORD PTR [rsp+576] + vmovdqu xmm9, OWORD PTR [rsp+592] + vmovdqu xmm10, OWORD PTR [rsp+608] + vmovdqu xmm11, OWORD PTR [rsp+624] + vmovdqu xmm12, OWORD PTR [rsp+640] + vmovdqu xmm13, OWORD PTR [rsp+656] + vmovdqu xmm14, OWORD PTR [rsp+672] + vmovdqu xmm15, OWORD PTR [rsp+688] + add rsp, 704 + pop rbp + pop r15 + pop r14 + pop 
rbx
+        pop r12
+        pop rsi
+        pop rdi
+        pop r13
+        ret
+AES_GCM_decrypt_vaes ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_init_vaes PROC ; GCM setup: computes H, the encrypted initial counter block T and the next counter value from the round keys and the IV
+        push rdi ; rdi, rsi, r12, r13 are used as scratch and restored by the pops before ret
+        push rsi
+        push r12
+        push r13
+        mov rdi, rcx ; rdi = 1st argument: base of the round keys ([rdi], [rdi+16], ... feed vaesenc below)
+        mov rsi, rdx ; rsi = 2nd argument: round count (esi is tested against 11 and 13)
+        mov r10, r8 ; r10 = 3rd argument: IV bytes
+        mov r11d, r9d ; r11d = 4th argument: IV length in bytes
+        mov rax, QWORD PTR [rsp+72] ; 5th argument (4 pushes + return address + 32-byte home space = 72): receives H
+        mov r8, QWORD PTR [rsp+80] ; 6th argument: receives the incremented counter
+        mov r9, QWORD PTR [rsp+88] ; 7th argument: receives T, the encrypted initial counter block
+        sub rsp, 80 ; [rsp+16 .. rsp+79] hold the xmm registers saved below
+        vmovdqu OWORD PTR [rsp+16], xmm6 ; xmm6, xmm7, xmm8, xmm15 are overwritten below and reloaded before ret
+        vmovdqu OWORD PTR [rsp+32], xmm7
+        vmovdqu OWORD PTR [rsp+48], xmm8
+        vmovdqu OWORD PTR [rsp+64], xmm15
+        vpxor xmm4, xmm4, xmm4 ; xmm4 = 0: accumulator used when the IV is not 12 bytes
+        mov edx, r11d
+        cmp edx, 12
+        jne L_AES_GCM_init_vaes_iv_not_12
+        ; # Calculate values when IV is 12 bytes
+        ; Set counter based on IV
+        mov ecx, 16777216 ; 0x01000000: stored little-endian this is the bytes 00 00 00 01
+        vmovq xmm4, QWORD PTR [r10] ; IV bytes 0..7
+        vpinsrd xmm4, xmm4, DWORD PTR [r10+8], 2 ; IV bytes 8..11
+        vpinsrd xmm4, xmm4, ecx, 3 ; counter block = IV || 00 00 00 01
+        ; H = Encrypt X(=0) and T = Encrypt counter
+        vmovdqa xmm5, OWORD PTR [rdi] ; xmm5 = round key 0, i.e. (0 XOR key 0): first step of encrypting the zero block
+        vpxor xmm1, xmm4, xmm5 ; xmm1 = counter block XOR round key 0
+        vmovdqa xmm6, OWORD PTR [rdi+16] ; each round key is loaded once and applied to both blocks
+        vaesenc xmm5, xmm5, xmm6
+        vaesenc xmm1, xmm1, xmm6
+        vmovdqa xmm6, OWORD PTR [rdi+32]
+        vaesenc xmm5, xmm5, xmm6
+        vaesenc xmm1, xmm1, xmm6
+        vmovdqa xmm6, OWORD PTR [rdi+48]
+        vaesenc xmm5, xmm5, xmm6
+        vaesenc xmm1, xmm1, xmm6
+        vmovdqa xmm6, OWORD PTR [rdi+64]
+        vaesenc xmm5, xmm5, xmm6
+        vaesenc xmm1, xmm1, xmm6
+        vmovdqa xmm6, OWORD PTR [rdi+80]
+        vaesenc xmm5, xmm5, xmm6
+        vaesenc xmm1, xmm1, xmm6
+        vmovdqa xmm6, OWORD PTR [rdi+96]
+        vaesenc xmm5, xmm5, xmm6
+        vaesenc xmm1, xmm1, xmm6
+        vmovdqa xmm6, OWORD PTR [rdi+112]
+        vaesenc xmm5, xmm5, xmm6
+        vaesenc xmm1, xmm1, xmm6
+        vmovdqa xmm6, OWORD PTR [rdi+128]
+        vaesenc xmm5, xmm5, xmm6
+        vaesenc xmm1, xmm1, xmm6
+        vmovdqa xmm6, OWORD PTR [rdi+144]
+        vaesenc xmm5, xmm5, xmm6
+        vaesenc xmm1, xmm1, xmm6
+        cmp esi, 11 ; fewer than 11 rounds: the key at [rdi+160] is the last one
+        vmovdqa xmm6, OWORD PTR [rdi+160] ; vmovdqa does not change the flags set by cmp
+        jl L_AES_GCM_init_vaes_calc_iv_12_last
+        vaesenc xmm5, xmm5, xmm6
+        vaesenc xmm1, xmm1, xmm6
+        vmovdqa xmm6, OWORD PTR [rdi+176]
+        vaesenc xmm5, xmm5, xmm6
+        vaesenc xmm1, xmm1, xmm6
+        cmp esi, 13 ; fewer than 13 rounds: the key at [rdi+192] is the last one
+        vmovdqa xmm6, OWORD PTR [rdi+192]
+        jl L_AES_GCM_init_vaes_calc_iv_12_last
+        vaesenc xmm5, xmm5, xmm6
+        vaesenc xmm1, xmm1, xmm6
+        vmovdqa xmm6, OWORD PTR [rdi+208]
+        vaesenc xmm5, xmm5, xmm6
+        vaesenc xmm1, xmm1, xmm6
+        vmovdqa xmm6, OWORD PTR [rdi+224]
+L_AES_GCM_init_vaes_calc_iv_12_last:
+        vaesenclast xmm5, xmm5, xmm6 ; xmm6 holds whichever round key was selected as the final one
+        vaesenclast xmm1, xmm1, xmm6
+        vpshufb xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_bswap_mask ; shuffle H with bswap_mask (constant defined outside this view)
+        vmovdqu xmm15, xmm1 ; xmm15 = T
+        jmp L_AES_GCM_init_vaes_iv_done
+L_AES_GCM_init_vaes_iv_not_12:
+        ; Calculate values when IV is not 12 bytes
+        ; H = Encrypt X(=0)
+        vmovdqa xmm5, OWORD PTR [rdi] ; zero block XOR round key 0 is round key 0 itself
+        vaesenc xmm5, xmm5, [rdi+16]
+        vaesenc xmm5, xmm5, [rdi+32]
+        vaesenc xmm5, xmm5, [rdi+48]
+        vaesenc xmm5, xmm5, [rdi+64]
+        vaesenc xmm5, xmm5, [rdi+80]
+        vaesenc xmm5, xmm5, [rdi+96]
+        vaesenc xmm5, xmm5, [rdi+112]
+        vaesenc xmm5, xmm5, [rdi+128]
+        vaesenc xmm5, xmm5, [rdi+144]
+        cmp esi, 11 ; same round-count dispatch as in the 12-byte path
+        vmovdqa xmm8, OWORD PTR [rdi+160]
+        jl L_AES_GCM_init_vaes_calc_iv_1_aesenc_avx_last
+        vaesenc xmm5, xmm5, xmm8
+        vaesenc xmm5, xmm5, [rdi+176]
+        cmp esi, 13
+        vmovdqa xmm8, OWORD PTR [rdi+192]
+        jl L_AES_GCM_init_vaes_calc_iv_1_aesenc_avx_last
+        vaesenc xmm5, xmm5, xmm8
+        vaesenc xmm5, xmm5, [rdi+208]
+        vmovdqa xmm8, OWORD PTR [rdi+224]
+L_AES_GCM_init_vaes_calc_iv_1_aesenc_avx_last:
+        vaesenclast xmm5, xmm5, xmm8
+        vpshufb xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_bswap_mask ; shuffle H with bswap_mask (constant defined outside this view)
+        ; Calc counter
+        ; Initialization vector
+        cmp edx, 0 ; zero-length IV: only the length block is hashed
+        mov rcx, 0 ; rcx = byte offset into the IV; mov leaves the flags from cmp untouched
+        je L_AES_GCM_init_vaes_calc_iv_done
+        cmp edx, 16
+        jl L_AES_GCM_init_vaes_calc_iv_lt16
+        and edx, 4294967280 ; 0xFFFFFFF0: length rounded down to whole 16-byte blocks
+L_AES_GCM_init_vaes_calc_iv_16_loop:
+        vmovdqu xmm7, OWORD PTR [r10+rcx] ; next 16 IV bytes
+        vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask
+        vpxor xmm4, xmm4, xmm7 ; XOR the block into the accumulator
+        ; ghash_gfmul_avx: xmm4 = xmm4 * xmm5
+        vpshufd xmm1, xmm4, 78 ; 78 = 01001110b: swap the two 64-bit halves
+        vpshufd xmm2, xmm5, 78
+        vpclmulqdq xmm3, xmm5, xmm4, 17 ; 0x11: high qword x high qword
+        vpclmulqdq xmm0, xmm5, xmm4, 0 ; low qword x low qword
+        vpxor xmm1, xmm1, xmm4 ; (high XOR low) of the accumulator
+        vpxor xmm2, xmm2, xmm5 ; (high XOR low) of H
+        vpclmulqdq xmm1, xmm1, xmm2, 0 ; Karatsuba middle product
+        vpxor xmm1, xmm1, xmm0
+        vpxor xmm1, xmm1, xmm3 ; xmm1 = cross term
+        vmovdqa xmm6, xmm0 ; xmm6 = low 128 bits of the product
+        vmovdqa xmm4, xmm3 ; xmm4 = high 128 bits of the product
+        vpslldq xmm2, xmm1, 8
+        vpsrldq xmm1, xmm1, 8
+        vpxor xmm6, xmm6, xmm2 ; add the cross term across the 128-bit boundary
+        vpxor xmm4, xmm4, xmm1
+        vpsrld xmm0, xmm6, 31 ; collect the top bit of every dword as a carry
+        vpsrld xmm1, xmm4, 31
+        vpslld xmm6, xmm6, 1 ; shift the 256-bit product left by one bit
+        vpslld xmm4, xmm4, 1
+        vpsrldq xmm2, xmm0, 12 ; carry out of the low half into the high half
+        vpslldq xmm0, xmm0, 4
+        vpslldq xmm1, xmm1, 4
+        vpor xmm4, xmm4, xmm2
+        vpor xmm6, xmm6, xmm0
+        vpor xmm4, xmm4, xmm1
+        vpslld xmm0, xmm6, 31 ; reduction: shifts by 31/30/25, then by 1/2/7, fold xmm6 into xmm4
+        vpslld xmm1, xmm6, 30
+        vpslld xmm2, xmm6, 25
+        vpxor xmm0, xmm0, xmm1
+        vpxor xmm0, xmm0, xmm2
+        vmovdqa xmm1, xmm0
+        vpsrldq xmm1, xmm1, 4
+        vpslldq xmm0, xmm0, 12
+        vpxor xmm6, xmm6, xmm0
+        vpsrld xmm2, xmm6, 1
+        vpsrld xmm3, xmm6, 2
+        vpsrld xmm0, xmm6, 7
+        vpxor xmm2, xmm2, xmm3
+        vpxor xmm2, xmm2, xmm0
+        vpxor xmm2, xmm2, xmm1
+        vpxor xmm2, xmm2, xmm6
+        vpxor xmm4, xmm4, xmm2 ; xmm4 = reduced 128-bit result
+        add ecx, 16
+        cmp ecx, edx
+        jl L_AES_GCM_init_vaes_calc_iv_16_loop
+        mov edx, r11d ; restore the full IV length
+        cmp ecx, edx
+        je L_AES_GCM_init_vaes_calc_iv_done ; length was a multiple of 16: no partial block
+L_AES_GCM_init_vaes_calc_iv_lt16:
+        sub rsp, 16 ; 16-byte scratch buffer for the partial block
+        vpxor xmm7, xmm7, xmm7
+        xor r13d, r13d ; r13 = write offset into the scratch buffer
+        vmovdqu OWORD PTR [rsp], xmm7 ; zero the buffer so the partial block is zero-padded
+L_AES_GCM_init_vaes_calc_iv_loop:
+        movzx r12d, BYTE PTR [r10+rcx] ; copy the remaining IV bytes one at a time
+        mov BYTE PTR [rsp+r13], r12b
+        inc ecx
+        inc r13d
+        cmp ecx, edx
+        jl L_AES_GCM_init_vaes_calc_iv_loop
+        vmovdqu xmm7, OWORD PTR [rsp]
+        add rsp, 16
+        vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask
+        vpxor xmm4, xmm4, xmm7
+        ; ghash_gfmul_avx: xmm4 = xmm4 * xmm5, same instruction sequence as in the 16-byte loop
+        vpshufd xmm1, xmm4, 78
+        vpshufd xmm2, xmm5, 78
+        vpclmulqdq xmm3, xmm5, xmm4, 17
+        vpclmulqdq xmm0, xmm5, xmm4, 0
+        vpxor xmm1, xmm1, xmm4
+        vpxor xmm2, xmm2, xmm5
+        vpclmulqdq xmm1, xmm1, xmm2, 0
+        vpxor xmm1, xmm1, xmm0
+        vpxor xmm1, xmm1, xmm3
+        vmovdqa xmm6, xmm0
+        vmovdqa xmm4, xmm3
+        vpslldq xmm2, xmm1, 8
+        vpsrldq xmm1, xmm1, 8
+        vpxor xmm6, xmm6, xmm2
+        vpxor xmm4, xmm4, xmm1
+        vpsrld xmm0, xmm6, 31
+        vpsrld xmm1, xmm4, 31
+        vpslld xmm6, xmm6, 1
+        vpslld xmm4, xmm4, 1
+        vpsrldq xmm2, xmm0, 12
+        vpslldq xmm0, xmm0, 4
+        vpslldq xmm1, xmm1, 4
+        vpor xmm4, xmm4, xmm2
+        vpor xmm6, xmm6, xmm0
+        vpor xmm4, xmm4, xmm1
+        vpslld xmm0, xmm6, 31
+        vpslld xmm1, xmm6, 30
+        vpslld xmm2, xmm6, 25
+        vpxor xmm0, xmm0, xmm1
+        vpxor xmm0, xmm0, xmm2
+        vmovdqa xmm1, xmm0
+        vpsrldq xmm1, xmm1, 4
+        vpslldq xmm0, xmm0, 12
+        vpxor xmm6, xmm6, xmm0
+        vpsrld xmm2, xmm6, 1
+        vpsrld xmm3, xmm6, 2
+        vpsrld xmm0, xmm6, 7
+        vpxor xmm2, xmm2, xmm3
+        vpxor xmm2, xmm2, xmm0
+        vpxor xmm2, xmm2, xmm1
+        vpxor xmm2, xmm2, xmm6
+        vpxor xmm4, xmm4, xmm2
+L_AES_GCM_init_vaes_calc_iv_done:
+        ; T = Encrypt counter
+        vpxor xmm0, xmm0, xmm0
+        shl edx, 3 ; IV length in bits; the 32-bit shift also clears the upper half of rdx
+        vmovq xmm0, rdx
+        vpxor xmm4, xmm4, xmm0 ; XOR the length block into the accumulator
+        ; ghash_gfmul_avx: xmm4 = xmm4 * xmm5, same instruction sequence as in the 16-byte loop
+        vpshufd xmm1, xmm4, 78
+        vpshufd xmm2, xmm5, 78
+        vpclmulqdq xmm3, xmm5, xmm4, 17
+        vpclmulqdq xmm0, xmm5, xmm4, 0
+        vpxor xmm1, xmm1, xmm4
+        vpxor xmm2, xmm2, xmm5
+        vpclmulqdq xmm1, xmm1, xmm2, 0
+        vpxor xmm1, xmm1, xmm0
+        vpxor xmm1, xmm1, xmm3
+        vmovdqa xmm6, xmm0
+        vmovdqa xmm4, xmm3
+        vpslldq xmm2, xmm1, 8
+        vpsrldq xmm1, xmm1, 8
+        vpxor xmm6, xmm6, xmm2
+        vpxor xmm4, xmm4, xmm1
+        vpsrld xmm0, xmm6, 31
+        vpsrld xmm1, xmm4, 31
+        vpslld xmm6, xmm6, 1
+        vpslld xmm4, xmm4, 1
+        vpsrldq xmm2, xmm0, 12
+        vpslldq xmm0, xmm0, 4
+        vpslldq xmm1, xmm1, 4
+        vpor xmm4, xmm4, xmm2
+        vpor xmm6, xmm6, xmm0
+        vpor xmm4, xmm4, xmm1
+        vpslld xmm0, xmm6, 31
+        vpslld xmm1, xmm6, 30
+        vpslld xmm2, xmm6, 25
+        vpxor xmm0, xmm0, xmm1
+        vpxor xmm0, xmm0, xmm2
+        vmovdqa xmm1, xmm0
+        vpsrldq xmm1, xmm1, 4
+        vpslldq xmm0, xmm0, 12
+        vpxor xmm6, xmm6, xmm0
+        vpsrld xmm2, xmm6, 1
+        vpsrld xmm3, xmm6, 2
+        vpsrld xmm0, xmm6, 7
+        vpxor xmm2, xmm2, xmm3
+        vpxor xmm2, xmm2, xmm0
+        vpxor xmm2, xmm2, xmm1
+        vpxor xmm2, xmm2, xmm6
+        vpxor xmm4, xmm4, xmm2
+        vpshufb xmm4, xmm4, OWORD PTR L_vaes_aes_gcm_bswap_mask ; shuffle the hash result back: xmm4 is the counter block that gets encrypted
+        ; Encrypt counter
+        vmovdqa xmm7, OWORD PTR [rdi]
+        vpxor xmm7, xmm7, xmm4 ; counter block XOR round key 0
+        vaesenc xmm7, xmm7, [rdi+16]
+        vaesenc xmm7, xmm7, [rdi+32]
+        vaesenc xmm7, xmm7, [rdi+48]
+        vaesenc xmm7, xmm7, [rdi+64]
+        vaesenc xmm7, xmm7, [rdi+80]
+        vaesenc xmm7, xmm7, [rdi+96]
+        vaesenc xmm7, xmm7, [rdi+112]
+        vaesenc xmm7, xmm7, [rdi+128]
+        vaesenc xmm7, xmm7, [rdi+144]
+        cmp esi, 11
+        vmovdqa xmm8, OWORD PTR [rdi+160]
+        jl L_AES_GCM_init_vaes_calc_iv_2_aesenc_avx_last
+        vaesenc xmm7, xmm7, xmm8
+        vaesenc xmm7, xmm7, [rdi+176]
+        cmp esi, 13
+        vmovdqa xmm8, OWORD PTR [rdi+192]
+        jl L_AES_GCM_init_vaes_calc_iv_2_aesenc_avx_last
+        vaesenc xmm7, xmm7, xmm8
+        vaesenc xmm7, xmm7, [rdi+208]
+        vmovdqa xmm8, OWORD PTR [rdi+224]
+L_AES_GCM_init_vaes_calc_iv_2_aesenc_avx_last:
+        vaesenclast xmm7, xmm7, xmm8
+        vmovdqu xmm15, xmm7 ; xmm15 = T
+L_AES_GCM_init_vaes_iv_done:
+        vmovdqa OWORD PTR [r9], xmm15 ; store T; vmovdqa needs a 16-byte aligned destination
+        vpshufb xmm4, xmm4, OWORD PTR L_vaes_aes_gcm_bswap_epi64 ; shuffle the counter block with bswap_epi64 (constant defined outside this view)
+        vpaddd xmm4, xmm4, OWORD PTR L_vaes_aes_gcm_one ; add the 'one' constant: counter for the first data block
+        vmovdqa OWORD PTR [rax], xmm5 ; store H (aligned store)
+        vmovdqa OWORD PTR [r8], xmm4 ; store the incremented counter (aligned store)
+        vmovdqu xmm6, OWORD PTR [rsp+16] ; reload the xmm registers saved on entry
+        vmovdqu xmm7, OWORD PTR [rsp+32]
+        vmovdqu xmm8, OWORD PTR [rsp+48]
+        vmovdqu xmm15, OWORD PTR [rsp+64]
+        add rsp, 80
+        pop r13
+        pop r12
+        pop rsi
+        pop rdi
+        ret
+AES_GCM_init_vaes ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_aad_update_vaes PROC ; Folds 16-byte input blocks into the 128-bit value at [r8], multiplying by the value at [r9] after each block
+        mov rax, rcx ; rax = 1st argument: input bytes (rcx is reused as the byte offset)
+        sub rsp, 32 ; save area for xmm6 and xmm7
+        vmovdqu OWORD PTR [rsp], xmm6
+        vmovdqu OWORD PTR [rsp+16], xmm7
+        vmovdqa xmm5, OWORD PTR [r8] ; xmm5 = running hash value (3rd argument, aligned load)
+        vmovdqa xmm6, OWORD PTR [r9] ; xmm6 = multiplier (4th argument, aligned load) - presumably H; confirm against the caller
+        xor ecx, ecx ; byte offset = 0
+L_AES_GCM_aad_update_vaes_16_loop: ; the body runs before the length in edx is tested, so one block is always consumed
+        vmovdqu xmm7, OWORD PTR [rax+rcx] ; next 16 input bytes
+        vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask
+        vpxor xmm5, xmm5, xmm7 ; XOR the block into the running value
+        ; ghash_gfmul_avx: xmm5 = xmm5 * xmm6
+        vpshufd xmm1, xmm5, 78 ; 78 = 01001110b: swap the two 64-bit halves
+        vpshufd xmm2, xmm6, 78
+        vpclmulqdq xmm3, xmm6, xmm5, 17 ; 0x11: high qword x high qword
+        vpclmulqdq xmm0, xmm6, xmm5, 0 ; low qword x low qword
+        vpxor xmm1, xmm1, xmm5
+        vpxor xmm2, xmm2, xmm6
+        vpclmulqdq xmm1, xmm1, xmm2, 0 ; Karatsuba middle product
+        vpxor xmm1, xmm1, xmm0
+        vpxor xmm1, xmm1, xmm3
+        vmovdqa xmm4, xmm0 ; xmm4 = low 128 bits of the product
+        vmovdqa xmm5, xmm3 ; xmm5 = high 128 bits of the product
+        vpslldq xmm2, xmm1, 8
+        vpsrldq xmm1, xmm1, 8
+        vpxor xmm4, xmm4, xmm2
+        vpxor xmm5, xmm5, xmm1
+        vpsrld xmm0, xmm4, 31 ; collect the top bit of every dword as a carry
+        vpsrld xmm1, xmm5, 31
+        vpslld xmm4, xmm4, 1 ; shift the 256-bit product left by one bit
+        vpslld xmm5, xmm5, 1
+        vpsrldq xmm2, xmm0, 12
+        vpslldq xmm0, xmm0, 4
+        vpslldq xmm1, xmm1, 4
+        vpor xmm5, xmm5, xmm2
+        vpor xmm4, xmm4, xmm0
+        vpor xmm5, xmm5, xmm1
+        vpslld xmm0, xmm4, 31
+        vpslld xmm1, xmm4, 30
+
vpslld xmm2, xmm4, 25
+        vpxor xmm0, xmm0, xmm1
+        vpxor xmm0, xmm0, xmm2
+        vmovdqa xmm1, xmm0
+        vpsrldq xmm1, xmm1, 4
+        vpslldq xmm0, xmm0, 12
+        vpxor xmm4, xmm4, xmm0
+        vpsrld xmm2, xmm4, 1
+        vpsrld xmm3, xmm4, 2
+        vpsrld xmm0, xmm4, 7
+        vpxor xmm2, xmm2, xmm3
+        vpxor xmm2, xmm2, xmm0
+        vpxor xmm2, xmm2, xmm1
+        vpxor xmm2, xmm2, xmm4
+        vpxor xmm5, xmm5, xmm2 ; xmm5 = updated running value
+        add ecx, 16 ; advance the byte offset by one block
+        cmp ecx, edx ; edx = 2nd argument: byte count - presumably a non-zero multiple of 16; confirm against the caller
+        jl L_AES_GCM_aad_update_vaes_16_loop
+        vmovdqa OWORD PTR [r8], xmm5 ; write the running value back (aligned store)
+        vmovdqu xmm6, OWORD PTR [rsp] ; reload the xmm registers saved on entry
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        add rsp, 32
+        ret
+AES_GCM_aad_update_vaes ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_encrypt_block_vaes PROC ; Encrypts the counter block, XORs it with 16 input bytes, writes 16 output bytes and advances the stored counter
+        mov r10, r8 ; r10 = 3rd argument: output bytes
+        mov r11, r9 ; r11 = 4th argument: input bytes
+        mov rax, QWORD PTR [rsp+40] ; 5th argument (return address + 32-byte home space = 40): counter block
+        vmovdqu xmm1, OWORD PTR [rax]
+        vpshufb xmm0, xmm1, OWORD PTR L_vaes_aes_gcm_bswap_epi64 ; xmm0 = counter shuffled with bswap_epi64 (constant defined outside this view)
+        vpaddd xmm1, xmm1, OWORD PTR L_vaes_aes_gcm_one ; advance the counter by the 'one' constant
+        vmovdqu OWORD PTR [rax], xmm1 ; store the advanced counter for the next call
+        vpxor xmm0, xmm0, [rcx] ; rcx = 1st argument: round keys; XOR with round key 0
+        vaesenc xmm0, xmm0, [rcx+16]
+        vaesenc xmm0, xmm0, [rcx+32]
+        vaesenc xmm0, xmm0, [rcx+48]
+        vaesenc xmm0, xmm0, [rcx+64]
+        vaesenc xmm0, xmm0, [rcx+80]
+        vaesenc xmm0, xmm0, [rcx+96]
+        vaesenc xmm0, xmm0, [rcx+112]
+        vaesenc xmm0, xmm0, [rcx+128]
+        vaesenc xmm0, xmm0, [rcx+144]
+        cmp edx, 11 ; edx = 2nd argument: round count; fewer than 11 means [rcx+160] is the last key
+        vmovdqa xmm1, OWORD PTR [rcx+160] ; vmovdqa does not change the flags set by cmp
+        jl L_AES_GCM_encrypt_block_vaes_aesenc_block_last
+        vaesenc xmm0, xmm0, xmm1
+        vaesenc xmm0, xmm0, [rcx+176]
+        cmp edx, 13 ; fewer than 13 rounds: [rcx+192] is the last key
+        vmovdqa xmm1, OWORD PTR [rcx+192]
+        jl L_AES_GCM_encrypt_block_vaes_aesenc_block_last
+        vaesenc xmm0, xmm0, xmm1
+        vaesenc xmm0, xmm0, [rcx+208]
+        vmovdqa xmm1, OWORD PTR [rcx+224]
+L_AES_GCM_encrypt_block_vaes_aesenc_block_last:
+        vaesenclast xmm0, xmm0, xmm1 ; xmm1 holds whichever round key was selected as the final one
+        vmovdqu xmm1, OWORD PTR [r11] ; load 16 input bytes
+        vpxor xmm0, xmm0, xmm1 ; encrypted counter XOR input
+        vmovdqu OWORD PTR [r10], xmm0 ; write 16 output bytes
+        vpshufb xmm0, xmm0, OWORD PTR L_vaes_aes_gcm_bswap_mask ; NOTE(review): no later instruction in this procedure reads xmm0 - confirm whether a caller relies on it
+        vzeroupper
+        ret
+AES_GCM_encrypt_block_vaes ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_ghash_block_vaes PROC ; One hash step: [rdx] = ([rdx] XOR shuffled 16 bytes at [rcx]) * [r8]
+        sub rsp, 32 ; save area for xmm6 and xmm7
+        vmovdqu OWORD PTR [rsp], xmm6
+        vmovdqu OWORD PTR [rsp+16], xmm7
+        vmovdqa xmm4, OWORD PTR [rdx] ; xmm4 = running hash value (2nd argument, aligned load)
+        vmovdqa xmm5, OWORD PTR [r8] ; xmm5 = multiplier (3rd argument, aligned load) - presumably H; confirm against the caller
+        vmovdqu xmm7, OWORD PTR [rcx] ; 16 data bytes (1st argument)
+        vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask
+        vpxor xmm4, xmm4, xmm7 ; XOR the block into the running value
+        ; ghash_gfmul_avx: xmm4 = xmm4 * xmm5
+        vpshufd xmm1, xmm4, 78 ; 78 = 01001110b: swap the two 64-bit halves
+        vpshufd xmm2, xmm5, 78
+        vpclmulqdq xmm3, xmm5, xmm4, 17 ; 0x11: high qword x high qword
+        vpclmulqdq xmm0, xmm5, xmm4, 0 ; low qword x low qword
+        vpxor xmm1, xmm1, xmm4
+        vpxor xmm2, xmm2, xmm5
+        vpclmulqdq xmm1, xmm1, xmm2, 0 ; Karatsuba middle product
+        vpxor xmm1, xmm1, xmm0
+        vpxor xmm1, xmm1, xmm3
+        vmovdqa xmm6, xmm0 ; xmm6 = low 128 bits of the product
+        vmovdqa xmm4, xmm3 ; xmm4 = high 128 bits of the product
+        vpslldq xmm2, xmm1, 8
+        vpsrldq xmm1, xmm1, 8
+        vpxor xmm6, xmm6, xmm2
+        vpxor xmm4, xmm4, xmm1
+        vpsrld xmm0, xmm6, 31 ; collect the top bit of every dword as a carry
+        vpsrld xmm1, xmm4, 31
+        vpslld xmm6, xmm6, 1 ; shift the 256-bit product left by one bit
+        vpslld xmm4, xmm4, 1
+        vpsrldq xmm2, xmm0, 12
+        vpslldq xmm0, xmm0, 4
+        vpslldq xmm1, xmm1, 4
+        vpor xmm4, xmm4, xmm2
+        vpor xmm6, xmm6, xmm0
+        vpor xmm4, xmm4, xmm1
+        vpslld xmm0, xmm6, 31 ; reduction: shifts by 31/30/25, then by 1/2/7, fold xmm6 into xmm4
+        vpslld xmm1, xmm6, 30
+        vpslld xmm2, xmm6, 25
+        vpxor xmm0, xmm0, xmm1
+        vpxor xmm0, xmm0, xmm2
+        vmovdqa xmm1, xmm0
+        vpsrldq xmm1, xmm1, 4
+        vpslldq xmm0, xmm0, 12
+        vpxor xmm6, xmm6, xmm0
+        vpsrld xmm2, xmm6, 1
+        vpsrld xmm3, xmm6, 2
+        vpsrld xmm0, xmm6, 7
+        vpxor xmm2, xmm2, xmm3
+        vpxor xmm2, xmm2, xmm0
+        vpxor xmm2, xmm2, xmm1
+        vpxor xmm2, xmm2, xmm6
+        vpxor xmm4, xmm4, xmm2
+        vmovdqa OWORD PTR [rdx], xmm4 ; write the updated value back (aligned store)
+        vzeroupper
+        vmovdqu xmm6, OWORD PTR [rsp] ; reload the xmm registers saved on entry
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        add rsp, 32
+        ret
+AES_GCM_ghash_block_vaes ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_encrypt_update_vaes PROC
+        push r13
+        push r12
+        push r14
+        push r15
+        push rdi
+        push rsi
+        push rbx
+        mov rax, rcx
+        mov r10, r8
+        mov r8d, edx
+        mov r11, r9
+        mov r9d, DWORD PTR [rsp+96]
+        mov r12, QWORD PTR [rsp+104]
+        mov r14, QWORD PTR [rsp+112]
+        mov r15, QWORD PTR [rsp+120]
+        sub rsp, 688
+        vmovdqu OWORD PTR [rsp+528], xmm6
+        vmovdqu OWORD PTR [rsp+544], xmm7
+        vmovdqu OWORD PTR [rsp+560], xmm8
+        vmovdqu OWORD PTR [rsp+576], xmm9
+        vmovdqu OWORD PTR [rsp+592], xmm10
+        vmovdqu OWORD PTR [rsp+608], xmm11
+        vmovdqu OWORD PTR [rsp+624], xmm12
+        vmovdqu
OWORD PTR [rsp+640], xmm13 + vmovdqu OWORD PTR [rsp+656], xmm14 + vmovdqu OWORD PTR [rsp+672], xmm15 + vmovdqa xmm15, OWORD PTR [r12] + vmovdqa xmm6, OWORD PTR [r14] + vpsrlq xmm8, xmm6, 63 + vpsllq xmm7, xmm6, 1 + vpslldq xmm8, xmm8, 8 + vpor xmm7, xmm7, xmm8 + vpshufd xmm6, xmm6, 255 + vpsrad xmm6, xmm6, 31 + vpand xmm6, xmm6, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpxor xmm6, xmm6, xmm7 + xor edi, edi + cmp r9d, 128 + jl L_AES_GCM_encrypt_update_vaes_done_128 + vmovdqa xmm2, xmm15 + ; H ^ 1 + vmovdqu OWORD PTR [rsp], xmm6 + ; H ^ 2 + vpclmulqdq xmm7, xmm6, xmm6, 0 + vpclmulqdq xmm10, xmm6, xmm6, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm0, xmm10 + vmovdqu OWORD PTR [rsp+16], xmm0 + ; H ^ 3 + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm0, xmm6, 0 + vpclmulqdq xmm8, xmm0, xmm6, 1 + vpclmulqdq xmm9, xmm0, xmm6, 16 + vpclmulqdq xmm10, xmm0, xmm6, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm1, xmm10 + vmovdqu OWORD PTR [rsp+32], xmm1 + ; H ^ 4 + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm3, xmm10 + vmovdqu OWORD PTR [rsp+48], xmm3 + ; H ^ 5 + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + 
vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+64], xmm4 + ; H ^ 6 + vpclmulqdq xmm7, xmm1, xmm1, 0 + vpclmulqdq xmm10, xmm1, xmm1, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+80], xmm4 + ; H ^ 7 + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm3, xmm1, 0 + vpclmulqdq xmm8, xmm3, xmm1, 1 + vpclmulqdq xmm9, xmm3, xmm1, 16 + vpclmulqdq xmm10, xmm3, xmm1, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+96], xmm4 + ; H ^ 8 + vpclmulqdq xmm7, xmm3, xmm3, 0 + vpclmulqdq xmm10, xmm3, xmm3, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+112], xmm4 + cmp r9d, 256 + jl L_AES_GCM_encrypt_update_vaes_no_ext + ; H ^ 9 + vmovdqu xmm0, OWORD PTR [rsp+48] + vmovdqu xmm1, OWORD PTR [rsp+64] + ; 
ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+128], xmm4 + ; H ^ 10 + vmovdqu xmm0, OWORD PTR [rsp+64] + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+144], xmm4 + ; H ^ 11 + vmovdqu xmm0, OWORD PTR [rsp+64] + vmovdqu xmm1, OWORD PTR [rsp+80] + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+160], xmm4 + ; H ^ 12 + vmovdqu xmm0, OWORD PTR [rsp+80] + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + 
vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+176], xmm4 + ; H ^ 13 + vmovdqu xmm0, OWORD PTR [rsp+80] + vmovdqu xmm1, OWORD PTR [rsp+96] + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+192], xmm4 + ; H ^ 14 + vmovdqu xmm0, OWORD PTR [rsp+96] + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+208], xmm4 + ; H ^ 15 + vmovdqu xmm0, OWORD PTR [rsp+96] + vmovdqu xmm1, OWORD PTR [rsp+112] + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+224], xmm4 + ; H ^ 16 + vmovdqu xmm0, OWORD PTR [rsp+112] + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + 
vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+240], xmm4 + vmovdqu ymm7, YMMWORD PTR [rsp+224] + vpermq ymm7, ymm7, 78 + vmovdqu ymm8, YMMWORD PTR [rsp+192] + vpermq ymm8, ymm8, 78 + vmovdqu ymm9, YMMWORD PTR [rsp+160] + vpermq ymm9, ymm9, 78 + vmovdqu ymm10, YMMWORD PTR [rsp+128] + vpermq ymm10, ymm10, 78 + vmovdqu YMMWORD PTR [rsp+256], ymm7 + vmovdqu YMMWORD PTR [rsp+288], ymm8 + vmovdqu YMMWORD PTR [rsp+320], ymm9 + vmovdqu YMMWORD PTR [rsp+352], ymm10 + vmovdqu ymm7, YMMWORD PTR [rsp+96] + vpermq ymm7, ymm7, 78 + vmovdqu ymm8, YMMWORD PTR [rsp+64] + vpermq ymm8, ymm8, 78 + vmovdqu ymm9, YMMWORD PTR [rsp+32] + vpermq ymm9, ymm9, 78 + vmovdqu ymm10, YMMWORD PTR [rsp] + vpermq ymm10, ymm10, 78 + vmovdqu YMMWORD PTR [rsp+384], ymm7 + vmovdqu YMMWORD PTR [rsp+416], ymm8 + vmovdqu YMMWORD PTR [rsp+448], ymm9 + vmovdqu YMMWORD PTR [rsp+480], ymm10 +L_AES_GCM_encrypt_update_vaes_no_ext: + vbroadcasti128 ymm14, ptr_L_vaes_aes_gcm_mod2_128 + cmp r9d, 256 + jl L_AES_GCM_encrypt_update_vaes_after_256 + mov r13d, r9d + and r13d, 4294967040 +L_AES_GCM_encrypt_update_vaes_loop_256: + ; 256 bytes of input + lea rsi, QWORD PTR [r10+rdi] + vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64 + vbroadcasti128 ymm4, [r15] + vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0 + vpshufb ymm0, ymm0, ymm6 + vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1 + vpshufb ymm1, ymm1, ymm6 + vpaddd ymm2, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y2 + vpshufb ymm2, ymm2, ymm6 + vpaddd ymm3, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y3 + vpshufb ymm3, ymm3, ymm6 + vmovdqu xmm7, OWORD PTR [r15] + vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight + vmovdqu OWORD PTR [r15], xmm7 + vbroadcasti128 ymm4, [rax] + vpxor ymm0, ymm0, ymm4 + vpxor ymm1, ymm1, ymm4 + vpxor ymm2, ymm2, ymm4 + vpxor ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+16] + vaesenc ymm0, ymm0, ymm4 + 
vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+32] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+48] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+64] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+80] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+96] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+112] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+128] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+144] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r8d, 11 + vbroadcasti128 ymm4, [rax+160] + jl L_AES_GCM_encrypt_update_vaes_p1_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+176] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r8d, 13 + vbroadcasti128 ymm4, [rax+192] + jl L_AES_GCM_encrypt_update_vaes_p1_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+208] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+224] +L_AES_GCM_encrypt_update_vaes_p1_vaes_ctr8_last: + vaesenclast 
ymm0, ymm0, ymm4 + vaesenclast ymm1, ymm1, ymm4 + vaesenclast ymm2, ymm2, ymm4 + vaesenclast ymm3, ymm3, ymm4 + lea rcx, QWORD PTR [r11+rdi] + lea rdx, QWORD PTR [r10+rdi] + vmovdqu ymm5, YMMWORD PTR [rcx] + vpxor ymm0, ymm0, ymm5 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu ymm5, YMMWORD PTR [rcx+32] + vpxor ymm1, ymm1, ymm5 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vmovdqu ymm5, YMMWORD PTR [rcx+64] + vpxor ymm2, ymm2, ymm5 + vmovdqu YMMWORD PTR [rdx+64], ymm2 + vmovdqu ymm5, YMMWORD PTR [rcx+96] + vpxor ymm3, ymm3, ymm5 + vmovdqu YMMWORD PTR [rdx+96], ymm3 + add edi, 128 + vbroadcasti128 ymm4, [r15] + vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0 + vpshufb ymm0, ymm0, ymm6 + vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1 + vpshufb ymm1, ymm1, ymm6 + vpaddd ymm2, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y2 + vpshufb ymm2, ymm2, ymm6 + vpaddd ymm3, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y3 + vpshufb ymm3, ymm3, ymm6 + vmovdqu xmm7, OWORD PTR [r15] + vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight + vmovdqu OWORD PTR [r15], xmm7 + vbroadcasti128 ymm4, [rax] + vpxor ymm0, ymm0, ymm4 + vpxor ymm1, ymm1, ymm4 + vpxor ymm2, ymm2, ymm4 + vpxor ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+16] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+32] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+48] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+64] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+80] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+96] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + 
vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+112] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+128] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+144] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r8d, 11 + vbroadcasti128 ymm4, [rax+160] + jl L_AES_GCM_encrypt_update_vaes_p2_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+176] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r8d, 13 + vbroadcasti128 ymm4, [rax+192] + jl L_AES_GCM_encrypt_update_vaes_p2_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+208] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+224] +L_AES_GCM_encrypt_update_vaes_p2_vaes_ctr8_last: + vaesenclast ymm0, ymm0, ymm4 + vaesenclast ymm1, ymm1, ymm4 + vaesenclast ymm2, ymm2, ymm4 + vaesenclast ymm3, ymm3, ymm4 + lea rcx, QWORD PTR [r11+rdi] + lea rdx, QWORD PTR [r10+rdi] + vmovdqu ymm5, YMMWORD PTR [rcx] + vpxor ymm0, ymm0, ymm5 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu ymm5, YMMWORD PTR [rcx+32] + vpxor ymm1, ymm1, ymm5 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vmovdqu ymm5, YMMWORD PTR [rcx+64] + vpxor ymm2, ymm2, ymm5 + vmovdqu YMMWORD PTR [rdx+64], ymm2 + vmovdqu ymm5, YMMWORD PTR [rcx+96] + vpxor ymm3, ymm3, ymm5 + vmovdqu YMMWORD PTR [rdx+96], ymm3 + add edi, 128 + vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_mask + vpxor ymm4, ymm4, ymm4 + vinserti128 ymm4, ymm4, xmm15, 0 + vmovdqu ymm7, YMMWORD PTR [rsp+256] + vmovdqu ymm8, YMMWORD PTR 
[rsp+288] + vmovdqu ymm9, YMMWORD PTR [rsp+320] + vmovdqu ymm10, YMMWORD PTR [rsp+352] + vmovdqu ymm5, YMMWORD PTR [rsi] + vpshufb ymm5, ymm5, ymm6 + vpxor ymm5, ymm5, ymm4 + vpclmulqdq ymm0, ymm5, ymm7, 0 + vpclmulqdq ymm1, ymm5, ymm7, 1 + vpclmulqdq ymm2, ymm5, ymm7, 16 + vpclmulqdq ymm3, ymm5, ymm7, 17 + vmovdqa ymm11, ymm0 + vpxor ymm12, ymm2, ymm1 + vmovdqa ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rsi+32] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm8, 0 + vpclmulqdq ymm1, ymm5, ymm8, 1 + vpclmulqdq ymm2, ymm5, ymm8, 16 + vpclmulqdq ymm3, ymm5, ymm8, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rsi+64] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm9, 0 + vpclmulqdq ymm1, ymm5, ymm9, 1 + vpclmulqdq ymm2, ymm5, ymm9, 16 + vpclmulqdq ymm3, ymm5, ymm9, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rsi+96] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm10, 0 + vpclmulqdq ymm1, ymm5, ymm10, 1 + vpclmulqdq ymm2, ymm5, ymm10, 16 + vpclmulqdq ymm3, ymm5, ymm10, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm7, YMMWORD PTR [rsp+384] + vmovdqu ymm8, YMMWORD PTR [rsp+416] + vmovdqu ymm9, YMMWORD PTR [rsp+448] + vmovdqu ymm10, YMMWORD PTR [rsp+480] + vmovdqu ymm5, YMMWORD PTR [rsi+128] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm7, 0 + vpclmulqdq ymm1, ymm5, ymm7, 1 + vpclmulqdq ymm2, ymm5, ymm7, 16 + vpclmulqdq ymm3, ymm5, ymm7, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rsi+160] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm8, 0 + vpclmulqdq ymm1, ymm5, ymm8, 1 + vpclmulqdq ymm2, ymm5, ymm8, 16 + vpclmulqdq ymm3, ymm5, ymm8, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, 
ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rsi+192] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm9, 0 + vpclmulqdq ymm1, ymm5, ymm9, 1 + vpclmulqdq ymm2, ymm5, ymm9, 16 + vpclmulqdq ymm3, ymm5, ymm9, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rsi+224] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm10, 0 + vpclmulqdq ymm1, ymm5, ymm10, 1 + vpclmulqdq ymm2, ymm5, ymm10, 16 + vpclmulqdq ymm3, ymm5, ymm10, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vpclmulqdq ymm5, ymm14, ymm11, 1 + vpshufd ymm11, ymm11, 78 + vpxor ymm12, ymm12, ymm5 + vpxor ymm12, ymm12, ymm11 + vpclmulqdq ymm5, ymm14, ymm12, 1 + vpshufd ymm12, ymm12, 78 + vpxor ymm13, ymm13, ymm5 + vpxor ymm13, ymm13, ymm12 + vextracti128 xmm0, ymm13, 1 + vpxor xmm15, xmm13, xmm0 + cmp edi, r13d + jl L_AES_GCM_encrypt_update_vaes_loop_256 +L_AES_GCM_encrypt_update_vaes_after_256: + mov r13d, r9d + and r13d, 4294967168 + cmp edi, r13d + jge L_AES_GCM_encrypt_update_vaes_after_128 + ; 128 bytes of input + lea rsi, QWORD PTR [r10+rdi] + vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64 + vbroadcasti128 ymm4, [r15] + vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0 + vpshufb ymm0, ymm0, ymm6 + vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1 + vpshufb ymm1, ymm1, ymm6 + vpaddd ymm2, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y2 + vpshufb ymm2, ymm2, ymm6 + vpaddd ymm3, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y3 + vpshufb ymm3, ymm3, ymm6 + vmovdqu xmm7, OWORD PTR [r15] + vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight + vmovdqu OWORD PTR [r15], xmm7 + vbroadcasti128 ymm4, [rax] + vpxor ymm0, ymm0, ymm4 + vpxor ymm1, ymm1, ymm4 + vpxor ymm2, ymm2, ymm4 + vpxor ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+16] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, 
ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+32] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+48] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+64] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+80] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+96] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+112] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+128] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+144] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r8d, 11 + vbroadcasti128 ymm4, [rax+160] + jl L_AES_GCM_encrypt_update_vaes_8_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+176] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r8d, 13 + vbroadcasti128 ymm4, [rax+192] + jl L_AES_GCM_encrypt_update_vaes_8_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+208] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+224] +L_AES_GCM_encrypt_update_vaes_8_vaes_ctr8_last: + vaesenclast ymm0, ymm0, ymm4 + vaesenclast ymm1, ymm1, ymm4 + 
vaesenclast ymm2, ymm2, ymm4 + vaesenclast ymm3, ymm3, ymm4 + lea rcx, QWORD PTR [r11+rdi] + lea rdx, QWORD PTR [r10+rdi] + vmovdqu ymm5, YMMWORD PTR [rcx] + vpxor ymm0, ymm0, ymm5 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu ymm5, YMMWORD PTR [rcx+32] + vpxor ymm1, ymm1, ymm5 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vmovdqu ymm5, YMMWORD PTR [rcx+64] + vpxor ymm2, ymm2, ymm5 + vmovdqu YMMWORD PTR [rdx+64], ymm2 + vmovdqu ymm5, YMMWORD PTR [rcx+96] + vpxor ymm3, ymm3, ymm5 + vmovdqu YMMWORD PTR [rdx+96], ymm3 + add edi, 128 + vmovdqu ymm7, YMMWORD PTR [rsp+96] + vpermq ymm7, ymm7, 78 + vmovdqu ymm8, YMMWORD PTR [rsp+64] + vpermq ymm8, ymm8, 78 + vmovdqu ymm9, YMMWORD PTR [rsp+32] + vpermq ymm9, ymm9, 78 + vmovdqu ymm10, YMMWORD PTR [rsp] + vpermq ymm10, ymm10, 78 + vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_mask + vpxor ymm4, ymm4, ymm4 + vinserti128 ymm4, ymm4, xmm15, 0 + vmovdqu ymm5, YMMWORD PTR [rsi] + vpshufb ymm5, ymm5, ymm6 + vpxor ymm5, ymm5, ymm4 + vpclmulqdq ymm0, ymm5, ymm7, 0 + vpclmulqdq ymm1, ymm5, ymm7, 1 + vpclmulqdq ymm2, ymm5, ymm7, 16 + vpclmulqdq ymm3, ymm5, ymm7, 17 + vmovdqa ymm11, ymm0 + vpxor ymm12, ymm2, ymm1 + vmovdqa ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rsi+32] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm8, 0 + vpclmulqdq ymm1, ymm5, ymm8, 1 + vpclmulqdq ymm2, ymm5, ymm8, 16 + vpclmulqdq ymm3, ymm5, ymm8, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rsi+64] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm9, 0 + vpclmulqdq ymm1, ymm5, ymm9, 1 + vpclmulqdq ymm2, ymm5, ymm9, 16 + vpclmulqdq ymm3, ymm5, ymm9, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rsi+96] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm10, 0 + vpclmulqdq ymm1, ymm5, ymm10, 1 + vpclmulqdq ymm2, ymm5, ymm10, 16 + vpclmulqdq ymm3, ymm5, ymm10, 17 + vpxor 
ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vpclmulqdq ymm5, ymm14, ymm11, 1 + vpshufd ymm11, ymm11, 78 + vpxor ymm12, ymm12, ymm5 + vpxor ymm12, ymm12, ymm11 + vpclmulqdq ymm5, ymm14, ymm12, 1 + vpshufd ymm12, ymm12, 78 + vpxor ymm13, ymm13, ymm5 + vpxor ymm13, ymm13, ymm12 + vextracti128 xmm0, ymm13, 1 + vpxor xmm15, xmm13, xmm0 +L_AES_GCM_encrypt_update_vaes_after_128: + vmovdqu xmm6, OWORD PTR [rsp] +L_AES_GCM_encrypt_update_vaes_done_128: + mov edx, r9d + cmp edi, edx + jge L_AES_GCM_encrypt_update_vaes_done_enc + mov r13d, r9d + and r13d, 4294967280 + cmp edi, r13d + jge L_AES_GCM_encrypt_update_vaes_last_block_done + vmovdqu xmm8, OWORD PTR [r15] + vpshufb xmm7, xmm8, OWORD PTR L_vaes_aes_gcm_bswap_epi64 + vpaddd xmm8, xmm8, OWORD PTR L_vaes_aes_gcm_one + vmovdqu OWORD PTR [r15], xmm8 + vpxor xmm7, xmm7, [rax] + vaesenc xmm7, xmm7, [rax+16] + vaesenc xmm7, xmm7, [rax+32] + vaesenc xmm7, xmm7, [rax+48] + vaesenc xmm7, xmm7, [rax+64] + vaesenc xmm7, xmm7, [rax+80] + vaesenc xmm7, xmm7, [rax+96] + vaesenc xmm7, xmm7, [rax+112] + vaesenc xmm7, xmm7, [rax+128] + vaesenc xmm7, xmm7, [rax+144] + cmp r8d, 11 + vmovdqa xmm8, OWORD PTR [rax+160] + jl L_AES_GCM_encrypt_update_vaes_aesenc_block_last + vaesenc xmm7, xmm7, xmm8 + vaesenc xmm7, xmm7, [rax+176] + cmp r8d, 13 + vmovdqa xmm8, OWORD PTR [rax+192] + jl L_AES_GCM_encrypt_update_vaes_aesenc_block_last + vaesenc xmm7, xmm7, xmm8 + vaesenc xmm7, xmm7, [rax+208] + vmovdqa xmm8, OWORD PTR [rax+224] +L_AES_GCM_encrypt_update_vaes_aesenc_block_last: + vaesenclast xmm7, xmm7, xmm8 + vmovdqu xmm8, OWORD PTR [r11+rdi] + vpxor xmm7, xmm7, xmm8 + vmovdqu OWORD PTR [r10+rdi], xmm7 + vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask + vpxor xmm15, xmm15, xmm7 + add edi, 16 + cmp edi, r13d + jge L_AES_GCM_encrypt_update_vaes_last_block_ghash +L_AES_GCM_encrypt_update_vaes_last_block_start: + vmovdqu xmm12, OWORD PTR [r11+rdi] + vmovdqu xmm8, OWORD PTR [r15] + 
vpshufb xmm7, xmm8, OWORD PTR L_vaes_aes_gcm_bswap_epi64 + vpaddd xmm8, xmm8, OWORD PTR L_vaes_aes_gcm_one + vmovdqu OWORD PTR [r15], xmm8 + vpxor xmm7, xmm7, [rax] + vpclmulqdq xmm9, xmm15, xmm6, 16 + vaesenc xmm7, xmm7, [rax+16] + vaesenc xmm7, xmm7, [rax+32] + vpclmulqdq xmm10, xmm15, xmm6, 1 + vaesenc xmm7, xmm7, [rax+48] + vaesenc xmm7, xmm7, [rax+64] + vpclmulqdq xmm11, xmm15, xmm6, 0 + vaesenc xmm7, xmm7, [rax+80] + vpclmulqdq xmm1, xmm15, xmm6, 17 + vaesenc xmm7, xmm7, [rax+96] + vpxor xmm9, xmm9, xmm10 + vpslldq xmm2, xmm9, 8 + vpsrldq xmm9, xmm9, 8 + vaesenc xmm7, xmm7, [rax+112] + vpxor xmm2, xmm2, xmm11 + vpxor xmm3, xmm1, xmm9 + vmovdqa xmm0, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm10, xmm2, xmm0, 16 + vaesenc xmm7, xmm7, [rax+128] + vpshufd xmm9, xmm2, 78 + vpxor xmm9, xmm9, xmm10 + vpclmulqdq xmm10, xmm9, xmm0, 16 + vaesenc xmm7, xmm7, [rax+144] + vpshufd xmm9, xmm9, 78 + vpxor xmm9, xmm9, xmm10 + vpxor xmm15, xmm9, xmm3 + cmp r8d, 11 + vmovdqa xmm8, OWORD PTR [rax+160] + jl L_AES_GCM_encrypt_update_vaes_aesenc_gfmul_last + vaesenc xmm7, xmm7, xmm8 + vaesenc xmm7, xmm7, [rax+176] + cmp r8d, 13 + vmovdqa xmm8, OWORD PTR [rax+192] + jl L_AES_GCM_encrypt_update_vaes_aesenc_gfmul_last + vaesenc xmm7, xmm7, xmm8 + vaesenc xmm7, xmm7, [rax+208] + vmovdqa xmm8, OWORD PTR [rax+224] +L_AES_GCM_encrypt_update_vaes_aesenc_gfmul_last: + vaesenclast xmm7, xmm7, xmm8 + vmovdqa xmm0, xmm12 + vpxor xmm7, xmm7, xmm0 + vmovdqu OWORD PTR [r10+rdi], xmm7 + vpshufb xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_bswap_mask + add edi, 16 + vpxor xmm15, xmm15, xmm7 + cmp edi, r13d + jl L_AES_GCM_encrypt_update_vaes_last_block_start +L_AES_GCM_encrypt_update_vaes_last_block_ghash: + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm15, xmm6, 0 + vpclmulqdq xmm8, xmm15, xmm6, 1 + vpclmulqdq xmm9, xmm15, xmm6, 16 + vpclmulqdq xmm10, xmm15, xmm6, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + 
vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm15, xmm10 +L_AES_GCM_encrypt_update_vaes_last_block_done: +L_AES_GCM_encrypt_update_vaes_done_enc: + vmovdqa OWORD PTR [r12], xmm15 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp+528] + vmovdqu xmm7, OWORD PTR [rsp+544] + vmovdqu xmm8, OWORD PTR [rsp+560] + vmovdqu xmm9, OWORD PTR [rsp+576] + vmovdqu xmm10, OWORD PTR [rsp+592] + vmovdqu xmm11, OWORD PTR [rsp+608] + vmovdqu xmm12, OWORD PTR [rsp+624] + vmovdqu xmm13, OWORD PTR [rsp+640] + vmovdqu xmm14, OWORD PTR [rsp+656] + vmovdqu xmm15, OWORD PTR [rsp+672] + add rsp, 688 + pop rbx + pop rsi + pop rdi + pop r15 + pop r14 + pop r12 + pop r13 + ret +AES_GCM_encrypt_update_vaes ENDP +_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+ ; AES_GCM_encrypt_final_vaes
+ ;
+ ; Final step of the GCM encrypt path: XORs a two-length block into a
+ ; 16-byte hash value, multiplies the result by a (pre-shifted) 16-byte
+ ; key value with carry-less multiplication and reduction, byte-shuffles
+ ; it, XORs it with a third 16-byte block and stores r8d bytes of the
+ ; result to the output buffer.
+ ;
+ ; Inputs as read by the code below (rcx/rdx/r8/r9 plus stack slots and
+ ; the xmm6-xmm13 save are consistent with the Microsoft x64 calling
+ ; convention; stack offsets are relative to rsp after the three pushes):
+ ;   rcx      -> 16-byte block loaded into xmm4
+ ;               (presumably the running GHASH state - confirm with caller)
+ ;   rdx      -> output buffer that receives the tag bytes
+ ;   r8d      =  number of tag bytes to store (16 takes the fast path)
+ ;   r9d      =  first length; multiplied by 8, low qword of the length block
+ ;   [rsp+64] =  second length (DWORD); multiplied by 8, high qword
+ ;               (presumably byte counts turned into bit counts - confirm)
+ ;   [rsp+72] -> 16-byte block loaded into xmm5 (presumably the hash key H)
+ ;   [rsp+80] -> 16-byte block XORed into the final value
+ ;               (presumably the encrypted initial counter - confirm)
+ ; The three 16-byte blocks are read with vmovdqa, so the pointers must be
+ ; 16-byte aligned.
+AES_GCM_encrypt_final_vaes PROC
+ ; r13, r12 and r14 are used as scratch below, so save them first.
+ push r13
+ push r12
+ push r14
+ ; Move the register arguments out of rcx/rdx/r9, which are reused later.
+ mov rax, rcx
+ mov r10d, r9d
+ mov r9, rdx
+ ; Fetch the stack arguments before rsp is adjusted for the local frame.
+ mov r11d, DWORD PTR [rsp+64]
+ mov r12, QWORD PTR [rsp+72]
+ mov r14, QWORD PTR [rsp+80]
+ ; Local frame: [rsp..rsp+15] is scratch for a partial tag,
+ ; [rsp+16..rsp+143] holds the saved xmm6-xmm13.
+ sub rsp, 144
+ vmovdqu OWORD PTR [rsp+16], xmm6
+ vmovdqu OWORD PTR [rsp+32], xmm7
+ vmovdqu OWORD PTR [rsp+48], xmm8
+ vmovdqu OWORD PTR [rsp+64], xmm9
+ vmovdqu OWORD PTR [rsp+80], xmm10
+ vmovdqu OWORD PTR [rsp+96], xmm11
+ vmovdqu OWORD PTR [rsp+112], xmm12
+ vmovdqu OWORD PTR [rsp+128], xmm13
+ ; Load the three 16-byte inputs.
+ vmovdqa xmm4, OWORD PTR [rax]
+ vmovdqa xmm5, OWORD PTR [r12]
+ vmovdqa xmm6, OWORD PTR [r14]
+ ; xmm7 = xmm5 shifted left by one bit as a 128-bit value: each qword is
+ ; shifted, and the bit that falls out of the low qword is carried into
+ ; the high qword.
+ vpsrlq xmm8, xmm5, 63
+ vpsllq xmm7, xmm5, 1
+ vpslldq xmm8, xmm8, 8
+ vpor xmm7, xmm7, xmm8
+ ; Broadcast the top dword and sign-extend it: xmm5 becomes all-ones when
+ ; bit 127 of the original value was set, otherwise zero. The mask selects
+ ; whether the L_vaes_aes_gcm_mod2_128 constant is XORed into the shifted
+ ; value.
+ vpshufd xmm5, xmm5, 255
+ vpsrad xmm5, xmm5, 31
+ vpand xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpxor xmm5, xmm5, xmm7
+ ; Build the length block {low = r10d * 8, high = r11d * 8} and XOR it
+ ; into xmm4. The 32-bit moves zero-extend, so the shifts cannot lose bits.
+ mov edx, r10d
+ mov ecx, r11d
+ shl rdx, 3
+ shl rcx, 3
+ vmovq xmm0, rdx
+ vmovq xmm1, rcx
+ vpunpcklqdq xmm0, xmm0, xmm1
+ vpxor xmm4, xmm4, xmm0
+ ; ghash_gfmul_red_avx
+ ; Four 64x64 carry-less products of xmm4 and xmm5:
+ ;   xmm7  = low(xmm4)  * low(xmm5)
+ ;   xmm8  = high(xmm4) * low(xmm5)
+ ;   xmm9  = low(xmm4)  * high(xmm5)
+ ;   xmm10 = high(xmm4) * high(xmm5)
+ vpclmulqdq xmm7, xmm4, xmm5, 0
+ vpclmulqdq xmm8, xmm4, xmm5, 1
+ vpclmulqdq xmm9, xmm4, xmm5, 16
+ vpclmulqdq xmm10, xmm4, xmm5, 17
+ ; Combine the two cross terms.
+ vpxor xmm8, xmm8, xmm9
+ ; First fold: multiply the low qword of xmm7 by the high qword of the
+ ; mod2_128 constant and add it, with the qword-swapped xmm7, to the
+ ; middle terms.
+ vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+ vpclmulqdq xmm11, xmm9, xmm7, 1
+ vpshufd xmm7, xmm7, 78
+ vpxor xmm8, xmm8, xmm11
+ vpxor xmm8, xmm8, xmm7
+ ; Second fold: same step applied to the middle terms, accumulated into
+ ; the high product. The reduced 128-bit result is left in xmm10.
+ vpclmulqdq xmm11, xmm9, xmm8, 1
+ vpshufd xmm8, xmm8, 78
+ vpxor xmm10, xmm10, xmm11
+ vpxor xmm10, xmm10, xmm8
+ vmovdqa xmm4, xmm10
+ ; Byte-shuffle with L_vaes_aes_gcm_bswap_mask (table defined elsewhere in
+ ; the file; presumably a byte-order reversal - confirm), then XOR with
+ ; the block loaded from r14 to form the value to store in xmm0.
+ vpshufb xmm4, xmm4, OWORD PTR L_vaes_aes_gcm_bswap_mask
+ vpxor xmm0, xmm4, xmm6
+ ; Fast path: exactly 16 bytes requested -> one unaligned store.
+ cmp r8d, 16
+ je L_AES_GCM_encrypt_final_vaes_store_tag_16
+ ; Otherwise spill the value to the stack scratch slot and copy r8d bytes
+ ; one at a time through r13.
+ ; NOTE(review): the loop is bottom-tested (copy, increment, compare), so
+ ; r8d == 0 is not handled here; presumably the caller guarantees a
+ ; non-zero length of at most 16 - confirm.
+ xor rcx, rcx
+ vmovdqu OWORD PTR [rsp], xmm0
+L_AES_GCM_encrypt_final_vaes_store_tag_loop:
+ movzx r13d, BYTE PTR [rsp+rcx]
+ mov BYTE PTR [r9+rcx], r13b
+ inc ecx
+ cmp ecx, r8d
+ jne L_AES_GCM_encrypt_final_vaes_store_tag_loop
+ jmp L_AES_GCM_encrypt_final_vaes_store_tag_done
+L_AES_GCM_encrypt_final_vaes_store_tag_16:
+ vmovdqu OWORD PTR [r9], xmm0
+L_AES_GCM_encrypt_final_vaes_store_tag_done:
+ ; Epilogue: restore xmm6-xmm13 saved in the prologue, release the frame
+ ; and pop the integer registers in reverse push order.
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp+16]
+ vmovdqu xmm7, OWORD PTR [rsp+32]
+ vmovdqu xmm8, OWORD PTR [rsp+48]
+ vmovdqu xmm9, OWORD PTR [rsp+64]
+ vmovdqu xmm10, OWORD PTR [rsp+80]
+ vmovdqu xmm11, OWORD PTR [rsp+96]
+ vmovdqu xmm12, OWORD PTR [rsp+112]
+ vmovdqu xmm13, OWORD PTR [rsp+128]
+ add rsp, 144
+ pop r14
+ pop r12
+ pop r13
+ ret
+AES_GCM_encrypt_final_vaes ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA +AES_GCM_decrypt_update_vaes PROC + push r13 + push r12 + push r14 + push r15 + push rdi + push rsi + push rbx + mov rax, rcx + mov r10, r8 + mov r8d, edx + mov r11, r9 + mov r9d, DWORD PTR [rsp+96] + mov r12, QWORD PTR [rsp+104] + mov r14, QWORD PTR [rsp+112] + mov r15, QWORD PTR [rsp+120] + sub rsp, 688 + vmovdqu OWORD PTR [rsp+528], xmm6 + vmovdqu OWORD PTR [rsp+544], xmm7 + vmovdqu OWORD PTR [rsp+560], xmm8 + vmovdqu OWORD PTR [rsp+576], xmm9 + vmovdqu OWORD PTR [rsp+592], xmm10 + vmovdqu OWORD PTR [rsp+608], xmm11 + vmovdqu OWORD PTR [rsp+624], xmm12 + vmovdqu OWORD PTR [rsp+640], xmm13 + vmovdqu OWORD PTR [rsp+656], xmm14 + vmovdqu OWORD PTR [rsp+672], xmm15 + vmovdqa xmm15, OWORD PTR [r12] + vmovdqa xmm6,
OWORD PTR [r14] + vpsrlq xmm8, xmm6, 63 + vpsllq xmm7, xmm6, 1 + vpslldq xmm8, xmm8, 8 + vpor xmm7, xmm7, xmm8 + vpshufd xmm6, xmm6, 255 + vpsrad xmm6, xmm6, 31 + vpand xmm6, xmm6, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpxor xmm6, xmm6, xmm7 + xor edi, edi + cmp r9d, 128 + jl L_AES_GCM_decrypt_update_vaes_done_128 + vmovdqa xmm2, xmm15 + ; H ^ 1 + vmovdqu OWORD PTR [rsp], xmm6 + ; H ^ 2 + vpclmulqdq xmm7, xmm6, xmm6, 0 + vpclmulqdq xmm10, xmm6, xmm6, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm0, xmm10 + vmovdqu OWORD PTR [rsp+16], xmm0 + ; H ^ 3 + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm0, xmm6, 0 + vpclmulqdq xmm8, xmm0, xmm6, 1 + vpclmulqdq xmm9, xmm0, xmm6, 16 + vpclmulqdq xmm10, xmm0, xmm6, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm1, xmm10 + vmovdqu OWORD PTR [rsp+32], xmm1 + ; H ^ 4 + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm3, xmm10 + vmovdqu OWORD PTR [rsp+48], xmm3 + ; H ^ 5 + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR 
L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+64], xmm4 + ; H ^ 6 + vpclmulqdq xmm7, xmm1, xmm1, 0 + vpclmulqdq xmm10, xmm1, xmm1, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+80], xmm4 + ; H ^ 7 + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm3, xmm1, 0 + vpclmulqdq xmm8, xmm3, xmm1, 1 + vpclmulqdq xmm9, xmm3, xmm1, 16 + vpclmulqdq xmm10, xmm3, xmm1, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+96], xmm4 + ; H ^ 8 + vpclmulqdq xmm7, xmm3, xmm3, 0 + vpclmulqdq xmm10, xmm3, xmm3, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+112], xmm4 + cmp r9d, 256 + jl L_AES_GCM_decrypt_update_vaes_no_ext + ; H ^ 9 + vmovdqu xmm0, OWORD PTR [rsp+48] + vmovdqu xmm1, OWORD PTR [rsp+64] + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + 
vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+128], xmm4 + ; H ^ 10 + vmovdqu xmm0, OWORD PTR [rsp+64] + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+144], xmm4 + ; H ^ 11 + vmovdqu xmm0, OWORD PTR [rsp+64] + vmovdqu xmm1, OWORD PTR [rsp+80] + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+160], xmm4 + ; H ^ 12 + vmovdqu xmm0, OWORD PTR [rsp+80] + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+176], xmm4 + ; H ^ 13 + vmovdqu xmm0, OWORD PTR [rsp+80] + vmovdqu xmm1, OWORD PTR [rsp+96] + ; ghash_gfmul_red_avx 
+ vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+192], xmm4 + ; H ^ 14 + vmovdqu xmm0, OWORD PTR [rsp+96] + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+208], xmm4 + ; H ^ 15 + vmovdqu xmm0, OWORD PTR [rsp+96] + vmovdqu xmm1, OWORD PTR [rsp+112] + ; ghash_gfmul_red_avx + vpclmulqdq xmm7, xmm1, xmm0, 0 + vpclmulqdq xmm8, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 17 + vpxor xmm8, xmm8, xmm9 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + vmovdqu OWORD PTR [rsp+224], xmm4 + ; H ^ 16 + vmovdqu xmm0, OWORD PTR [rsp+112] + vpclmulqdq xmm7, xmm0, xmm0, 0 + vpclmulqdq xmm10, xmm0, xmm0, 17 + vpxor xmm8, xmm8, xmm8 + vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpxor xmm8, xmm8, xmm11 + vpxor xmm8, xmm8, xmm7 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm10, xmm10, xmm8 + vmovdqa xmm4, xmm10 + 
vmovdqu OWORD PTR [rsp+240], xmm4 + vmovdqu ymm7, YMMWORD PTR [rsp+224] + vpermq ymm7, ymm7, 78 + vmovdqu ymm8, YMMWORD PTR [rsp+192] + vpermq ymm8, ymm8, 78 + vmovdqu ymm9, YMMWORD PTR [rsp+160] + vpermq ymm9, ymm9, 78 + vmovdqu ymm10, YMMWORD PTR [rsp+128] + vpermq ymm10, ymm10, 78 + vmovdqu YMMWORD PTR [rsp+256], ymm7 + vmovdqu YMMWORD PTR [rsp+288], ymm8 + vmovdqu YMMWORD PTR [rsp+320], ymm9 + vmovdqu YMMWORD PTR [rsp+352], ymm10 + vmovdqu ymm7, YMMWORD PTR [rsp+96] + vpermq ymm7, ymm7, 78 + vmovdqu ymm8, YMMWORD PTR [rsp+64] + vpermq ymm8, ymm8, 78 + vmovdqu ymm9, YMMWORD PTR [rsp+32] + vpermq ymm9, ymm9, 78 + vmovdqu ymm10, YMMWORD PTR [rsp] + vpermq ymm10, ymm10, 78 + vmovdqu YMMWORD PTR [rsp+384], ymm7 + vmovdqu YMMWORD PTR [rsp+416], ymm8 + vmovdqu YMMWORD PTR [rsp+448], ymm9 + vmovdqu YMMWORD PTR [rsp+480], ymm10 +L_AES_GCM_decrypt_update_vaes_no_ext: + vbroadcasti128 ymm14, ptr_L_vaes_aes_gcm_mod2_128 + cmp r9d, 256 + jl L_AES_GCM_decrypt_update_vaes_after_256 + mov r13d, r9d + and r13d, 4294967040 +L_AES_GCM_decrypt_update_vaes_loop_256: + ; 256 bytes of input + lea rbx, QWORD PTR [r11+rdi] + vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_mask + vpxor ymm4, ymm4, ymm4 + vinserti128 ymm4, ymm4, xmm15, 0 + vmovdqu ymm7, YMMWORD PTR [rsp+256] + vmovdqu ymm8, YMMWORD PTR [rsp+288] + vmovdqu ymm9, YMMWORD PTR [rsp+320] + vmovdqu ymm10, YMMWORD PTR [rsp+352] + vmovdqu ymm5, YMMWORD PTR [rbx] + vpshufb ymm5, ymm5, ymm6 + vpxor ymm5, ymm5, ymm4 + vpclmulqdq ymm0, ymm5, ymm7, 0 + vpclmulqdq ymm1, ymm5, ymm7, 1 + vpclmulqdq ymm2, ymm5, ymm7, 16 + vpclmulqdq ymm3, ymm5, ymm7, 17 + vmovdqa ymm11, ymm0 + vpxor ymm12, ymm2, ymm1 + vmovdqa ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rbx+32] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm8, 0 + vpclmulqdq ymm1, ymm5, ymm8, 1 + vpclmulqdq ymm2, ymm5, ymm8, 16 + vpclmulqdq ymm3, ymm5, ymm8, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, 
YMMWORD PTR [rbx+64] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm9, 0 + vpclmulqdq ymm1, ymm5, ymm9, 1 + vpclmulqdq ymm2, ymm5, ymm9, 16 + vpclmulqdq ymm3, ymm5, ymm9, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rbx+96] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm10, 0 + vpclmulqdq ymm1, ymm5, ymm10, 1 + vpclmulqdq ymm2, ymm5, ymm10, 16 + vpclmulqdq ymm3, ymm5, ymm10, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm7, YMMWORD PTR [rsp+384] + vmovdqu ymm8, YMMWORD PTR [rsp+416] + vmovdqu ymm9, YMMWORD PTR [rsp+448] + vmovdqu ymm10, YMMWORD PTR [rsp+480] + vmovdqu ymm5, YMMWORD PTR [rbx+128] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm7, 0 + vpclmulqdq ymm1, ymm5, ymm7, 1 + vpclmulqdq ymm2, ymm5, ymm7, 16 + vpclmulqdq ymm3, ymm5, ymm7, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rbx+160] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm8, 0 + vpclmulqdq ymm1, ymm5, ymm8, 1 + vpclmulqdq ymm2, ymm5, ymm8, 16 + vpclmulqdq ymm3, ymm5, ymm8, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rbx+192] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm9, 0 + vpclmulqdq ymm1, ymm5, ymm9, 1 + vpclmulqdq ymm2, ymm5, ymm9, 16 + vpclmulqdq ymm3, ymm5, ymm9, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rbx+224] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm10, 0 + vpclmulqdq ymm1, ymm5, ymm10, 1 + vpclmulqdq ymm2, ymm5, ymm10, 16 + vpclmulqdq ymm3, ymm5, ymm10, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vpclmulqdq 
ymm5, ymm14, ymm11, 1 + vpshufd ymm11, ymm11, 78 + vpxor ymm12, ymm12, ymm5 + vpxor ymm12, ymm12, ymm11 + vpclmulqdq ymm5, ymm14, ymm12, 1 + vpshufd ymm12, ymm12, 78 + vpxor ymm13, ymm13, ymm5 + vpxor ymm13, ymm13, ymm12 + vextracti128 xmm0, ymm13, 1 + vpxor xmm15, xmm13, xmm0 + vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64 + vbroadcasti128 ymm4, [r15] + vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0 + vpshufb ymm0, ymm0, ymm6 + vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1 + vpshufb ymm1, ymm1, ymm6 + vpaddd ymm2, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y2 + vpshufb ymm2, ymm2, ymm6 + vpaddd ymm3, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y3 + vpshufb ymm3, ymm3, ymm6 + vmovdqu xmm7, OWORD PTR [r15] + vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight + vmovdqu OWORD PTR [r15], xmm7 + vbroadcasti128 ymm4, [rax] + vpxor ymm0, ymm0, ymm4 + vpxor ymm1, ymm1, ymm4 + vpxor ymm2, ymm2, ymm4 + vpxor ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+16] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+32] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+48] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+64] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+80] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+96] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+112] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+128] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, 
ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+144] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r8d, 11 + vbroadcasti128 ymm4, [rax+160] + jl L_AES_GCM_decrypt_update_vaes_l1_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+176] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r8d, 13 + vbroadcasti128 ymm4, [rax+192] + jl L_AES_GCM_decrypt_update_vaes_l1_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+208] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+224] +L_AES_GCM_decrypt_update_vaes_l1_vaes_ctr8_last: + vaesenclast ymm0, ymm0, ymm4 + vaesenclast ymm1, ymm1, ymm4 + vaesenclast ymm2, ymm2, ymm4 + vaesenclast ymm3, ymm3, ymm4 + lea rcx, QWORD PTR [r11+rdi] + lea rdx, QWORD PTR [r10+rdi] + vmovdqu ymm5, YMMWORD PTR [rcx] + vpxor ymm0, ymm0, ymm5 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu ymm5, YMMWORD PTR [rcx+32] + vpxor ymm1, ymm1, ymm5 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vmovdqu ymm5, YMMWORD PTR [rcx+64] + vpxor ymm2, ymm2, ymm5 + vmovdqu YMMWORD PTR [rdx+64], ymm2 + vmovdqu ymm5, YMMWORD PTR [rcx+96] + vpxor ymm3, ymm3, ymm5 + vmovdqu YMMWORD PTR [rdx+96], ymm3 + add edi, 128 + vbroadcasti128 ymm4, [r15] + vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0 + vpshufb ymm0, ymm0, ymm6 + vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1 + vpshufb ymm1, ymm1, ymm6 + vpaddd ymm2, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y2 + vpshufb ymm2, ymm2, ymm6 + vpaddd ymm3, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y3 + vpshufb ymm3, ymm3, ymm6 + vmovdqu xmm7, OWORD PTR [r15] + vpaddd xmm7, xmm7, OWORD PTR 
L_vaes_aes_gcm_eight + vmovdqu OWORD PTR [r15], xmm7 + vbroadcasti128 ymm4, [rax] + vpxor ymm0, ymm0, ymm4 + vpxor ymm1, ymm1, ymm4 + vpxor ymm2, ymm2, ymm4 + vpxor ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+16] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+32] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+48] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+64] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+80] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+96] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+112] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+128] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+144] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r8d, 11 + vbroadcasti128 ymm4, [rax+160] + jl L_AES_GCM_decrypt_update_vaes_l2_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+176] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r8d, 13 + vbroadcasti128 ymm4, [rax+192] + jl L_AES_GCM_decrypt_update_vaes_l2_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, 
ymm4 + vbroadcasti128 ymm4, [rax+208] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+224] +L_AES_GCM_decrypt_update_vaes_l2_vaes_ctr8_last: + vaesenclast ymm0, ymm0, ymm4 + vaesenclast ymm1, ymm1, ymm4 + vaesenclast ymm2, ymm2, ymm4 + vaesenclast ymm3, ymm3, ymm4 + lea rcx, QWORD PTR [r11+rdi] + lea rdx, QWORD PTR [r10+rdi] + vmovdqu ymm5, YMMWORD PTR [rcx] + vpxor ymm0, ymm0, ymm5 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu ymm5, YMMWORD PTR [rcx+32] + vpxor ymm1, ymm1, ymm5 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vmovdqu ymm5, YMMWORD PTR [rcx+64] + vpxor ymm2, ymm2, ymm5 + vmovdqu YMMWORD PTR [rdx+64], ymm2 + vmovdqu ymm5, YMMWORD PTR [rcx+96] + vpxor ymm3, ymm3, ymm5 + vmovdqu YMMWORD PTR [rdx+96], ymm3 + add edi, 128 + cmp edi, r13d + jl L_AES_GCM_decrypt_update_vaes_loop_256 +L_AES_GCM_decrypt_update_vaes_after_256: + vmovdqu ymm7, YMMWORD PTR [rsp+96] + vpermq ymm7, ymm7, 78 + vmovdqu ymm8, YMMWORD PTR [rsp+64] + vpermq ymm8, ymm8, 78 + vmovdqu ymm9, YMMWORD PTR [rsp+32] + vpermq ymm9, ymm9, 78 + vmovdqu ymm10, YMMWORD PTR [rsp] + vpermq ymm10, ymm10, 78 + mov r13d, r9d + and r13d, 4294967168 + cmp edi, r13d + jge L_AES_GCM_decrypt_update_vaes_after_128 + ; 128 bytes of input + lea rbx, QWORD PTR [r11+rdi] + vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_mask + vpxor ymm4, ymm4, ymm4 + vinserti128 ymm4, ymm4, xmm15, 0 + vmovdqu ymm5, YMMWORD PTR [rbx] + vpshufb ymm5, ymm5, ymm6 + vpxor ymm5, ymm5, ymm4 + vpclmulqdq ymm0, ymm5, ymm7, 0 + vpclmulqdq ymm1, ymm5, ymm7, 1 + vpclmulqdq ymm2, ymm5, ymm7, 16 + vpclmulqdq ymm3, ymm5, ymm7, 17 + vmovdqa ymm11, ymm0 + vpxor ymm12, ymm2, ymm1 + vmovdqa ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rbx+32] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm8, 0 + vpclmulqdq ymm1, ymm5, ymm8, 1 + vpclmulqdq ymm2, ymm5, ymm8, 16 + vpclmulqdq ymm3, ymm5, ymm8, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, 
ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rbx+64] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm9, 0 + vpclmulqdq ymm1, ymm5, ymm9, 1 + vpclmulqdq ymm2, ymm5, ymm9, 16 + vpclmulqdq ymm3, ymm5, ymm9, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vmovdqu ymm5, YMMWORD PTR [rbx+96] + vpshufb ymm5, ymm5, ymm6 + vpclmulqdq ymm0, ymm5, ymm10, 0 + vpclmulqdq ymm1, ymm5, ymm10, 1 + vpclmulqdq ymm2, ymm5, ymm10, 16 + vpclmulqdq ymm3, ymm5, ymm10, 17 + vpxor ymm11, ymm11, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vpclmulqdq ymm5, ymm14, ymm11, 1 + vpshufd ymm11, ymm11, 78 + vpxor ymm12, ymm12, ymm5 + vpxor ymm12, ymm12, ymm11 + vpclmulqdq ymm5, ymm14, ymm12, 1 + vpshufd ymm12, ymm12, 78 + vpxor ymm13, ymm13, ymm5 + vpxor ymm13, ymm13, ymm12 + vextracti128 xmm0, ymm13, 1 + vpxor xmm15, xmm13, xmm0 + vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64 + vbroadcasti128 ymm4, [r15] + vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0 + vpshufb ymm0, ymm0, ymm6 + vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1 + vpshufb ymm1, ymm1, ymm6 + vpaddd ymm2, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y2 + vpshufb ymm2, ymm2, ymm6 + vpaddd ymm3, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y3 + vpshufb ymm3, ymm3, ymm6 + vmovdqu xmm7, OWORD PTR [r15] + vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight + vmovdqu OWORD PTR [r15], xmm7 + vbroadcasti128 ymm4, [rax] + vpxor ymm0, ymm0, ymm4 + vpxor ymm1, ymm1, ymm4 + vpxor ymm2, ymm2, ymm4 + vpxor ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+16] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+32] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+48] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc 
ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+64] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+80] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+96] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+112] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+128] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+144] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r8d, 11 + vbroadcasti128 ymm4, [rax+160] + jl L_AES_GCM_decrypt_update_vaes_t_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+176] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + cmp r8d, 13 + vbroadcasti128 ymm4, [rax+192] + jl L_AES_GCM_decrypt_update_vaes_t_vaes_ctr8_last + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+208] + vaesenc ymm0, ymm0, ymm4 + vaesenc ymm1, ymm1, ymm4 + vaesenc ymm2, ymm2, ymm4 + vaesenc ymm3, ymm3, ymm4 + vbroadcasti128 ymm4, [rax+224] +L_AES_GCM_decrypt_update_vaes_t_vaes_ctr8_last: + vaesenclast ymm0, ymm0, ymm4 + vaesenclast ymm1, ymm1, ymm4 + vaesenclast ymm2, ymm2, ymm4 + vaesenclast ymm3, ymm3, ymm4 + lea rcx, QWORD PTR [r11+rdi] + lea rdx, QWORD PTR [r10+rdi] + vmovdqu ymm5, YMMWORD PTR [rcx] + vpxor ymm0, ymm0, ymm5 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu ymm5, YMMWORD PTR [rcx+32] + vpxor ymm1, ymm1, ymm5 + vmovdqu YMMWORD 
PTR [rdx+32], ymm1 + vmovdqu ymm5, YMMWORD PTR [rcx+64] + vpxor ymm2, ymm2, ymm5 + vmovdqu YMMWORD PTR [rdx+64], ymm2 + vmovdqu ymm5, YMMWORD PTR [rcx+96] + vpxor ymm3, ymm3, ymm5 + vmovdqu YMMWORD PTR [rdx+96], ymm3 + add edi, 128 +L_AES_GCM_decrypt_update_vaes_after_128: + vmovdqu xmm6, OWORD PTR [rsp] +L_AES_GCM_decrypt_update_vaes_done_128: + mov edx, r9d + cmp edi, edx + jge L_AES_GCM_decrypt_update_vaes_done_dec + mov r13d, r9d + and r13d, 4294967280 + cmp edi, r13d + jge L_AES_GCM_decrypt_update_vaes_last_block_done +L_AES_GCM_decrypt_update_vaes_last_block_start: + vmovdqu xmm12, OWORD PTR [r11+rdi] + vmovdqa xmm0, xmm6 + vpshufb xmm1, xmm12, OWORD PTR L_vaes_aes_gcm_bswap_mask + vpxor xmm1, xmm1, xmm15 + vmovdqu xmm8, OWORD PTR [r15] + vpshufb xmm7, xmm8, OWORD PTR L_vaes_aes_gcm_bswap_epi64 + vpaddd xmm8, xmm8, OWORD PTR L_vaes_aes_gcm_one + vmovdqu OWORD PTR [r15], xmm8 + vpxor xmm7, xmm7, [rax] + vpclmulqdq xmm9, xmm1, xmm0, 16 + vaesenc xmm7, xmm7, [rax+16] + vaesenc xmm7, xmm7, [rax+32] + vpclmulqdq xmm10, xmm1, xmm0, 1 + vaesenc xmm7, xmm7, [rax+48] + vaesenc xmm7, xmm7, [rax+64] + vpclmulqdq xmm11, xmm1, xmm0, 0 + vaesenc xmm7, xmm7, [rax+80] + vpclmulqdq xmm1, xmm1, xmm0, 17 + vaesenc xmm7, xmm7, [rax+96] + vpxor xmm9, xmm9, xmm10 + vpslldq xmm2, xmm9, 8 + vpsrldq xmm9, xmm9, 8 + vaesenc xmm7, xmm7, [rax+112] + vpxor xmm2, xmm2, xmm11 + vpxor xmm3, xmm1, xmm9 + vmovdqa xmm0, OWORD PTR L_vaes_aes_gcm_mod2_128 + vpclmulqdq xmm10, xmm2, xmm0, 16 + vaesenc xmm7, xmm7, [rax+128] + vpshufd xmm9, xmm2, 78 + vpxor xmm9, xmm9, xmm10 + vpclmulqdq xmm10, xmm9, xmm0, 16 + vaesenc xmm7, xmm7, [rax+144] + vpshufd xmm9, xmm9, 78 + vpxor xmm9, xmm9, xmm10 + vpxor xmm15, xmm9, xmm3 + cmp r8d, 11 + vmovdqa xmm8, OWORD PTR [rax+160] + jl L_AES_GCM_decrypt_update_vaes_aesenc_gfmul_last + vaesenc xmm7, xmm7, xmm8 + vaesenc xmm7, xmm7, [rax+176] + cmp r8d, 13 + vmovdqa xmm8, OWORD PTR [rax+192] + jl L_AES_GCM_decrypt_update_vaes_aesenc_gfmul_last + vaesenc xmm7, xmm7, 
xmm8
+        vaesenc xmm7, xmm7, [rax+208]
+        vmovdqa xmm8, OWORD PTR [rax+224]
+L_AES_GCM_decrypt_update_vaes_aesenc_gfmul_last:
+        vaesenclast xmm7, xmm7, xmm8
+        vmovdqa xmm0, xmm12
+        vpxor xmm7, xmm7, xmm0
+        vmovdqu OWORD PTR [r10+rdi], xmm7
+        add edi, 16
+        cmp edi, r13d
+        jl L_AES_GCM_decrypt_update_vaes_last_block_start
+L_AES_GCM_decrypt_update_vaes_last_block_done:
+L_AES_GCM_decrypt_update_vaes_done_dec:
+        ; Write xmm15 back through r12, then restore xmm6-xmm15 and the
+        ; pushed general-purpose registers before returning.
+        vmovdqa OWORD PTR [r12], xmm15
+        vzeroupper
+        vmovdqu xmm6, OWORD PTR [rsp+528]
+        vmovdqu xmm7, OWORD PTR [rsp+544]
+        vmovdqu xmm8, OWORD PTR [rsp+560]
+        vmovdqu xmm9, OWORD PTR [rsp+576]
+        vmovdqu xmm10, OWORD PTR [rsp+592]
+        vmovdqu xmm11, OWORD PTR [rsp+608]
+        vmovdqu xmm12, OWORD PTR [rsp+624]
+        vmovdqu xmm13, OWORD PTR [rsp+640]
+        vmovdqu xmm14, OWORD PTR [rsp+656]
+        vmovdqu xmm15, OWORD PTR [rsp+672]
+        add rsp, 688
+        pop rbx
+        pop rsi
+        pop rdi
+        pop r15
+        pop r14
+        pop r12
+        pop r13
+        ret
+AES_GCM_decrypt_update_vaes ENDP
+_TEXT ENDS
+; ---------------------------------------------------------------------------
+; AES_GCM_decrypt_final_vaes
+;
+; Folds two bit lengths into a 16-byte state, performs one carry-less
+; multiply-and-reduce, XORs in a 16-byte mask to form a tag, and compares
+; that tag with caller-supplied bytes.  Only a match flag is written out.
+;
+; Inputs as read by the code (stack offsets are those used after the five
+; pushes below):
+;   rcx        pointer to a 16-byte value loaded into xmm6
+;              (vmovdqa: must be 16-byte aligned)
+;   rdx        pointer to the tag bytes to compare against
+;   r8d        number of tag bytes to compare
+;   r9d        first byte count  (multiplied by 8, placed in the low qword)
+;   [rsp+80]   second byte count (multiplied by 8, placed in the high qword)
+;   [rsp+88]   pointer to a 16-byte value loaded into xmm5 (vmovdqa)
+;   [rsp+96]   pointer to a 16-byte value XORed into the result (vmovdqa)
+;   [rsp+104]  pointer to a DWORD that receives 1 on match, 0 otherwise
+;
+; NOTE(review): presumably these are the running GHASH state, the ciphertext
+; and AAD lengths, the hash key H and the encrypted initial counter - confirm
+; against the C prototype.  Argument registers and the xmm6+ saves appear to
+; follow the Microsoft x64 calling convention.
+; ---------------------------------------------------------------------------
+_TEXT SEGMENT READONLY PARA
+AES_GCM_decrypt_final_vaes PROC
+        push r13
+        push r12
+        push r14
+        push rbp
+        push r15
+        ; Move the register arguments out of rcx/r9/rdx and fetch the four
+        ; stack arguments.
+        mov rax, rcx
+        mov r10d, r9d
+        mov r9, rdx
+        mov r11d, DWORD PTR [rsp+80]
+        mov r12, QWORD PTR [rsp+88]
+        mov r14, QWORD PTR [rsp+96]
+        mov rbp, QWORD PTR [rsp+104]
+        ; 160-byte frame: xmm6-xmm13 and xmm15 are spilled at [rsp+16..159].
+        sub rsp, 160
+        vmovdqu OWORD PTR [rsp+16], xmm6
+        vmovdqu OWORD PTR [rsp+32], xmm7
+        vmovdqu OWORD PTR [rsp+48], xmm8
+        vmovdqu OWORD PTR [rsp+64], xmm9
+        vmovdqu OWORD PTR [rsp+80], xmm10
+        vmovdqu OWORD PTR [rsp+96], xmm11
+        vmovdqu OWORD PTR [rsp+112], xmm12
+        vmovdqu OWORD PTR [rsp+128], xmm13
+        vmovdqu OWORD PTR [rsp+144], xmm15
+        vmovdqa xmm6, OWORD PTR [rax]
+        vmovdqa xmm5, OWORD PTR [r12]
+        vmovdqa xmm15, OWORD PTR [r14]
+        ; xmm7 = xmm5 shifted left by one bit as a 128-bit value (the bit
+        ; leaving the low qword is carried into the high qword).
+        vpsrlq xmm8, xmm5, 63
+        vpsllq xmm7, xmm5, 1
+        vpslldq xmm8, xmm8, 8
+        vpor xmm7, xmm7, xmm8
+        ; Broadcast the top dword and sign-extend it: xmm5 becomes an
+        ; all-ones mask when bit 127 of the original value was set.  The
+        ; mask selects L_vaes_aes_gcm_mod2_128, which is XORed into the
+        ; shifted value.
+        vpshufd xmm5, xmm5, 255
+        vpsrad xmm5, xmm5, 31
+        vpand xmm5, xmm5, OWORD PTR L_vaes_aes_gcm_mod2_128
+        vpxor xmm5, xmm5, xmm7
+        ; Convert both byte counts to bit counts (x8) and pack them as
+        ; { low qword: first count, high qword: second count }.
+        mov edx, r10d
+        mov ecx, r11d
+        shl rdx, 3
+        shl rcx, 3
+        vmovq xmm0, rdx
+        vmovq xmm1, rcx
+        vpunpcklqdq xmm0, xmm0, xmm1
+        ; XOR the packed bit lengths into the state.
+        vpxor xmm6, xmm6, xmm0
+        ; ghash_gfmul_red_avx
+        ; All four 64x64 carry-less products of xmm6 and xmm5, followed by
+        ; two folding steps with the mod2_128 constant; result in xmm10.
+        vpclmulqdq xmm7, xmm6, xmm5, 0
+        vpclmulqdq xmm8, xmm6, xmm5, 1
+        vpclmulqdq xmm9, xmm6, xmm5, 16
+        vpclmulqdq xmm10, xmm6, xmm5, 17
+        vpxor xmm8, xmm8, xmm9
+        vmovdqa xmm9, OWORD PTR L_vaes_aes_gcm_mod2_128
+        vpclmulqdq xmm11, xmm9, xmm7, 1
+        vpshufd xmm7, xmm7, 78
+        vpxor xmm8, xmm8, xmm11
+        vpxor xmm8, xmm8, xmm7
+        vpclmulqdq xmm11, xmm9, xmm8, 1
+        vpshufd xmm8, xmm8, 78
+        vpxor xmm10, xmm10, xmm11
+        vpxor xmm10, xmm10, xmm8
+        vmovdqa xmm6, xmm10
+        ; Shuffle bytes with L_vaes_aes_gcm_bswap_mask and XOR with the value
+        ; loaded from [r14]; xmm0 now holds the computed tag.
+        vpshufb xmm6, xmm6, OWORD PTR L_vaes_aes_gcm_bswap_mask
+        vpxor xmm0, xmm6, xmm15
+        ; A 16-byte tag takes the vector compare; any other size uses the
+        ; byte loop.
+        cmp r8d, 16
+        je L_AES_GCM_decrypt_final_vaes_cmp_tag_16
+        ; Spill the computed tag to a 16-byte scratch slot and compare r8d
+        ; bytes.  Differences are OR-accumulated in r15b; the loop has no
+        ; early exit.
+        ; NOTE(review): the count is tested only after the first iteration,
+        ; so r8d is assumed to be non-zero and no larger than the 16-byte
+        ; scratch slot - confirm callers enforce this.
+        sub rsp, 16
+        xor rcx, rcx
+        xor r15, r15
+        vmovdqu OWORD PTR [rsp], xmm0
+L_AES_GCM_decrypt_final_vaes_cmp_tag_loop:
+        movzx r13d, BYTE PTR [rsp+rcx]
+        xor r13b, BYTE PTR [r9+rcx]
+        or r15b, r13b
+        inc ecx
+        cmp ecx, r8d
+        jne L_AES_GCM_decrypt_final_vaes_cmp_tag_loop
+        ; r15 = 1 when no byte differed, otherwise 0.
+        cmp r15b, 0
+        sete r15b
+        add rsp, 16
+        xor rcx, rcx
+        jmp L_AES_GCM_decrypt_final_vaes_cmp_tag_done
+L_AES_GCM_decrypt_final_vaes_cmp_tag_16:
+        ; vpcmpeqb produces 0xFF for every equal byte; vpmovmskb collects
+        ; the top bit of each byte into a 16-bit mask.
+        vmovdqu xmm1, OWORD PTR [r9]
+        vpcmpeqb xmm0, xmm0, xmm1
+        vpmovmskb rdx, xmm0
+        ; edx == 0xFFFF (all 16 bytes equal) => result 1, otherwise result 0
+        xor r15d, r15d
+        cmp edx, 65535
+        sete r15b
+L_AES_GCM_decrypt_final_vaes_cmp_tag_done:
+        ; Store the match flag through the pointer taken from [rsp+104].
+        mov DWORD PTR [rbp], r15d
+        vzeroupper
+        vmovdqu xmm6, OWORD PTR [rsp+16]
+        vmovdqu xmm7, OWORD PTR [rsp+32]
+        vmovdqu xmm8, OWORD PTR [rsp+48]
+        vmovdqu xmm9, OWORD PTR [rsp+64]
+        vmovdqu xmm10, OWORD PTR [rsp+80]
+        vmovdqu xmm11, OWORD PTR [rsp+96]
+        vmovdqu xmm12, OWORD PTR [rsp+112]
+        vmovdqu xmm13, OWORD PTR [rsp+128]
+        vmovdqu xmm15, OWORD PTR [rsp+144]
+        add rsp, 160
+        pop r15
+        pop rbp
+        pop r14
+        pop r12
+        pop r13
+        ret
+AES_GCM_decrypt_final_vaes ENDP
+_TEXT ENDS
+ENDIF
+; The following section is assembled only when HAVE_INTEL_AVX512 is defined.
+IFDEF HAVE_INTEL_AVX512
+; Four 128-bit lanes; the second (higher-addressed) qword of lane i holds i
+; (0, 1, 2, 3) and the first qword of every lane is zero.
+_DATA SEGMENT
+ALIGN 16
+L_avx512_aes_gcm_inc_z0 QWORD \
+        0000000000000000h, 0000000000000000h,
+        0000000000000000h, 0000000000000001h,
+        0000000000000000h,
0000000000000002h,
+        0000000000000000h, 0000000000000003h
+ptr_L_avx512_aes_gcm_inc_z0 QWORD L_avx512_aes_gcm_inc_z0
+_DATA ENDS
+; Same layout as L_avx512_aes_gcm_inc_z0: four 128-bit lanes whose second
+; qword holds 4, 5, 6, 7.
+; NOTE(review): these 64-byte tables are only ALIGN 16 - presumably they are
+; always used as unaligned memory operands; confirm no aligned 64-byte load
+; touches them.
+_DATA SEGMENT
+ALIGN 16
+L_avx512_aes_gcm_inc_z1 QWORD \
+        0000000000000000h, 0000000000000004h,
+        0000000000000000h, 0000000000000005h,
+        0000000000000000h, 0000000000000006h,
+        0000000000000000h, 0000000000000007h
+ptr_L_avx512_aes_gcm_inc_z1 QWORD L_avx512_aes_gcm_inc_z1
+_DATA ENDS
+; Lanes hold 8, 9, 10, 11 in their second qword.
+_DATA SEGMENT
+ALIGN 16
+L_avx512_aes_gcm_inc_z2 QWORD \
+        0000000000000000h, 0000000000000008h,
+        0000000000000000h, 0000000000000009h,
+        0000000000000000h, 000000000000000ah,
+        0000000000000000h, 000000000000000bh
+ptr_L_avx512_aes_gcm_inc_z2 QWORD L_avx512_aes_gcm_inc_z2
+_DATA ENDS
+; Lanes hold 12, 13, 14, 15 in their second qword.
+_DATA SEGMENT
+ALIGN 16
+L_avx512_aes_gcm_inc_z3 QWORD \
+        0000000000000000h, 000000000000000ch,
+        0000000000000000h, 000000000000000dh,
+        0000000000000000h, 000000000000000eh,
+        0000000000000000h, 000000000000000fh
+ptr_L_avx512_aes_gcm_inc_z3 QWORD L_avx512_aes_gcm_inc_z3
+_DATA ENDS
+; Single 128-bit value with 16 in its second qword.
+_DATA SEGMENT
+ALIGN 16
+L_avx512_aes_gcm_sixteen QWORD \
+        0000000000000000h, 0000000000000010h
+ptr_L_avx512_aes_gcm_sixteen QWORD L_avx512_aes_gcm_sixteen
+_DATA ENDS
+; vpshufb control used by GCM_generate_m0_avx512: byte k of the result is
+; byte 15-k of the source, i.e. a full 16-byte reversal.
+_DATA SEGMENT
+ALIGN 16
+L_GCM_generate_m0_avx512_rev8 QWORD \
+        08090a0b0c0d0e0fh, 0001020304050607h
+ptr_L_GCM_generate_m0_avx512_rev8 QWORD L_GCM_generate_m0_avx512_rev8
+_DATA ENDS
+; Constant XORed in by GCM_generate_m0_avx512 whenever a 1 bit is shifted
+; out: low qword 0, high qword 0e100000000000000h.
+_DATA SEGMENT
+ALIGN 16
+L_GCM_generate_m0_avx512_mod2_128 QWORD \
+        0000000000000000h, 0e100000000000000h
+ptr_L_GCM_generate_m0_avx512_mod2_128 QWORD L_GCM_generate_m0_avx512_mod2_128
+_DATA ENDS
+; ---------------------------------------------------------------------------
+; GCM_generate_m0_avx512
+;
+; Builds a 512-byte table of thirty-two 16-byte entries from one 16-byte
+; input.
+;
+;   rcx  pointer to the 16-byte input (read with vmovdqu)
+;   rdx  pointer to the 512-byte output table (written with vmovdqu)
+;
+; Entries 0..15  ([rdx+0]   .. [rdx+240]): entry 0 is zero; entry i is the
+;   XOR of four basis values selected by the bits of i.  Bit 3 selects the
+;   input itself; bits 2, 1 and 0 select the input after one, two and three
+;   "shift right one bit, XOR mod2_128 if a 1 was shifted out" steps
+;   (performed on the byte-reversed value).
+; Entries 16..31 ([rdx+256] .. [rdx+496]): entry 16+i is entry i shifted
+;   right by 4 bits as a byte-reversed 128-bit value; the 4 bits shifted out
+;   are discarded.
+;
+; NOTE(review): presumably the input is the GHASH key H and the output is a
+; 4-bit multiplication table - confirm with the caller.
+; Clobbers xmm0-xmm5; xmm6-xmm10 are saved and restored.
+; ---------------------------------------------------------------------------
+_TEXT SEGMENT READONLY PARA
+GCM_generate_m0_avx512 PROC
+        sub rsp, 80
+        vmovdqu OWORD PTR [rsp], xmm6
+        vmovdqu OWORD PTR [rsp+16], xmm7
+        vmovdqu OWORD PTR [rsp+32], xmm8
+        vmovdqu OWORD PTR [rsp+48], xmm9
+        vmovdqu OWORD PTR [rsp+64], xmm10
+        ; xmm9 = byte-reversal mask, xmm10 = reduction constant.
+        vmovdqu xmm9, OWORD PTR L_GCM_generate_m0_avx512_rev8
+        vmovdqu xmm10, OWORD PTR L_GCM_generate_m0_avx512_mod2_128
+        ; Entry 0 is all zeros.
+        vpxor xmm8, xmm8, xmm8
+        vmovdqu xmm0, OWORD PTR [rcx]
+        vmovdqu OWORD PTR [rdx], xmm8
+        ; NOTE(review): this copy in xmm8 is overwritten by the vpxor that
+        ; builds entry 3 before it is ever read.
+        vmovdqu xmm8, xmm0
+        vpshufb xmm0, xmm0, xmm9
+        ; Step 1: xmm1 = (xmm0 >> 1) XOR (xmm10 if bit 0 of xmm0 was set).
+        ; xmm4 receives the 128-bit shifted value; xmm1 is first used to
+        ; build the all-ones/all-zeros mask from the shifted-out bit.
+        vpsllq xmm5, xmm0, 63
+        vpsrlq xmm4, xmm0, 1
+        vpslldq xmm1, xmm5, 8
+        vpsrldq xmm5, xmm5, 8
+        vpshufd xmm1, xmm1, 255
+        vpor xmm4, xmm4, xmm5
+        vpsrad xmm1, xmm1, 31
+        vpand xmm1, xmm1, xmm10
+        vpxor xmm1, xmm1, xmm4
+        ; Step 2: xmm2 = the same operation applied to xmm1.
+        vpsllq xmm5, xmm1, 63
+        vpsrlq xmm4, xmm1, 1
+        vpslldq xmm2, xmm5, 8
+        vpsrldq xmm5, xmm5, 8
+        vpshufd xmm2, xmm2, 255
+        vpor xmm4, xmm4, xmm5
+        vpsrad xmm2, xmm2, 31
+        vpand xmm2, xmm2, xmm10
+        vpxor xmm2, xmm2, xmm4
+        ; Step 3: xmm3 = the same operation applied to xmm2.
+        vpsllq xmm5, xmm2, 63
+        vpsrlq xmm4, xmm2, 1
+        vpslldq xmm3, xmm5, 8
+        vpsrldq xmm5, xmm5, 8
+        vpshufd xmm3, xmm3, 255
+        vpor xmm4, xmm4, xmm5
+        vpsrad xmm3, xmm3, 31
+        vpand xmm3, xmm3, xmm10
+        vpxor xmm3, xmm3, xmm4
+        ; Return all four basis values to the original byte order.
+        vpshufb xmm3, xmm3, xmm9
+        vpshufb xmm2, xmm2, xmm9
+        vpshufb xmm1, xmm1, xmm9
+        vpshufb xmm0, xmm0, xmm9
+        ; Entries 1..3: xmm3, xmm2, xmm3^xmm2 (kept in xmm8).
+        vpxor xmm8, xmm3, xmm2
+        vmovdqu OWORD PTR [rdx+16], xmm3
+        vmovdqu OWORD PTR [rdx+32], xmm2
+        vmovdqu OWORD PTR [rdx+48], xmm8
+        ; Entries 4..7: xmm1 XORed with entries 0..3.
+        vmovdqu OWORD PTR [rdx+64], xmm1
+        vpxor xmm4, xmm3, xmm1
+        vpxor xmm5, xmm2, xmm1
+        vpxor xmm6, xmm8, xmm1
+        vmovdqu OWORD PTR [rdx+80], xmm4
+        vmovdqu OWORD PTR [rdx+96], xmm5
+        vmovdqu OWORD PTR [rdx+112], xmm6
+        ; Entries 8..11: xmm0 XORed with entries 0..3.  xmm1 is updated to
+        ; xmm1^xmm0 for the last group.
+        vmovdqu OWORD PTR [rdx+128], xmm0
+        vpxor xmm1, xmm1, xmm0
+        vpxor xmm4, xmm3, xmm0
+        vpxor xmm6, xmm2, xmm0
+        vmovdqu OWORD PTR [rdx+144], xmm4
+        vmovdqu OWORD PTR [rdx+160], xmm6
+        vpxor xmm6, xmm3, xmm6
+        vmovdqu OWORD PTR [rdx+176], xmm6
+        ; Entries 12..15: (xmm1^xmm0) XORed with entries 0..3.
+        vmovdqu OWORD PTR [rdx+192], xmm1
+        vpxor xmm4, xmm3, xmm1
+        vpxor xmm5, xmm2, xmm1
+        vpxor xmm6, xmm8, xmm1
+        vmovdqu OWORD PTR [rdx+208], xmm4
+        vmovdqu OWORD PTR [rdx+224], xmm5
+        vmovdqu OWORD PTR [rdx+240], xmm6
+        ; Entries 16..19 = entries 0..3 shifted right by 4 bits: reverse the
+        ; bytes, shift each qword right by 4, carry the low 4 bits of the
+        ; high qword into the top of the low qword, reverse the bytes back.
+        vmovdqu xmm0, OWORD PTR [rdx]
+        vmovdqu xmm1, OWORD PTR [rdx+16]
+        vmovdqu xmm2, OWORD PTR [rdx+32]
+        vmovdqu xmm3, OWORD PTR [rdx+48]
+        vpshufb xmm0, xmm0, xmm9
+        vpshufb xmm1, xmm1, xmm9
+        vpshufb xmm2, xmm2, xmm9
+        vpshufb xmm3, xmm3, xmm9
+        vpsllq xmm4, xmm0, 60
+        vpsllq xmm5, xmm1, 60
+        vpsllq xmm6, xmm2, 60
+        vpsllq xmm7, xmm3, 60
+        vpsrlq xmm0, xmm0, 4
+        vpsrlq xmm1, xmm1, 4
+        vpsrlq xmm2, xmm2, 4
+        vpsrlq xmm3, xmm3, 4
+        vpsrldq xmm4, xmm4, 8
+        vpsrldq xmm5, xmm5, 8
+        vpsrldq xmm6, xmm6, 8
+        vpsrldq xmm7, xmm7, 8
+        vpor xmm0, xmm0, xmm4
+        vpor xmm1, xmm1, xmm5
+        vpor xmm2, xmm2, xmm6
+        vpor xmm3, xmm3, xmm7
+        vpshufb xmm0, xmm0, xmm9
+        vpshufb xmm1, xmm1, xmm9
+        vpshufb xmm2, xmm2, xmm9
+        vpshufb xmm3, xmm3, xmm9
+        vmovdqu OWORD PTR [rdx+256], xmm0
+        vmovdqu OWORD PTR [rdx+272], xmm1
+        vmovdqu OWORD PTR [rdx+288], xmm2
+        vmovdqu OWORD PTR [rdx+304], xmm3
+        ; Entries 20..23 = entries 4..7 shifted right by 4 bits.
+        vmovdqu xmm0, OWORD PTR [rdx+64]
+        vmovdqu xmm1, OWORD PTR [rdx+80]
+        vmovdqu xmm2, OWORD PTR [rdx+96]
+        vmovdqu xmm3, OWORD PTR [rdx+112]
+        vpshufb xmm0, xmm0, xmm9
+        vpshufb xmm1, xmm1, xmm9
+        vpshufb xmm2, xmm2, xmm9
+        vpshufb xmm3, xmm3, xmm9
+        vpsllq xmm4, xmm0, 60
+        vpsllq xmm5, xmm1, 60
+        vpsllq xmm6, xmm2, 60
+        vpsllq xmm7, xmm3, 60
+        vpsrlq xmm0, xmm0, 4
+        vpsrlq xmm1, xmm1, 4
+        vpsrlq xmm2, xmm2, 4
+        vpsrlq xmm3, xmm3, 4
+        vpsrldq xmm4, xmm4, 8
+        vpsrldq xmm5, xmm5, 8
+        vpsrldq xmm6, xmm6, 8
+        vpsrldq xmm7, xmm7, 8
+        vpor xmm0, xmm0, xmm4
+        vpor xmm1, xmm1, xmm5
+        vpor xmm2, xmm2, xmm6
+        vpor xmm3, xmm3, xmm7
+        vpshufb xmm0, xmm0, xmm9
+        vpshufb xmm1, xmm1, xmm9
+        vpshufb xmm2, xmm2, xmm9
+        vpshufb xmm3, xmm3, xmm9
+        vmovdqu OWORD PTR [rdx+320], xmm0
+        vmovdqu OWORD PTR [rdx+336], xmm1
+        vmovdqu OWORD PTR [rdx+352], xmm2
+        vmovdqu OWORD PTR [rdx+368], xmm3
+        ; Entries 24..27 = entries 8..11 shifted right by 4 bits.
+        vmovdqu xmm0, OWORD PTR [rdx+128]
+        vmovdqu xmm1, OWORD PTR [rdx+144]
+        vmovdqu xmm2, OWORD PTR [rdx+160]
+        vmovdqu xmm3, OWORD PTR [rdx+176]
+        vpshufb xmm0, xmm0, xmm9
+        vpshufb xmm1, xmm1, xmm9
+        vpshufb xmm2, xmm2, xmm9
+        vpshufb xmm3, xmm3, xmm9
+        vpsllq xmm4, xmm0, 60
+        vpsllq xmm5, xmm1, 60
+        vpsllq xmm6, xmm2, 60
+        vpsllq xmm7, xmm3, 60
+        vpsrlq xmm0, xmm0, 4
+        vpsrlq xmm1, xmm1, 4
+        vpsrlq xmm2, xmm2, 4
+        vpsrlq xmm3, xmm3, 4
+        vpsrldq xmm4, xmm4, 8
+        vpsrldq xmm5, xmm5, 8
+        vpsrldq xmm6, xmm6, 8
+        vpsrldq xmm7, xmm7, 8
+        vpor xmm0, xmm0, xmm4
+        vpor xmm1, xmm1, xmm5
+        vpor xmm2, xmm2, xmm6
+        vpor xmm3, xmm3, xmm7
+        vpshufb xmm0, xmm0, xmm9
+        vpshufb xmm1, xmm1, xmm9
+        vpshufb xmm2, xmm2, xmm9
+        vpshufb xmm3, xmm3, xmm9
+        vmovdqu OWORD PTR [rdx+384], xmm0
+        vmovdqu OWORD PTR [rdx+400], xmm1
+        vmovdqu OWORD PTR [rdx+416], xmm2
+        vmovdqu OWORD PTR [rdx+432], xmm3
+        ; Entries 28..31 = entries 12..15 shifted right by 4 bits.
+        vmovdqu xmm0, OWORD PTR [rdx+192]
+        vmovdqu xmm1, OWORD PTR [rdx+208]
+        vmovdqu xmm2, OWORD PTR [rdx+224]
+        vmovdqu xmm3, OWORD PTR [rdx+240]
+        vpshufb xmm0, xmm0, xmm9
+        vpshufb xmm1, xmm1, xmm9
+        vpshufb xmm2, xmm2, xmm9
+        vpshufb xmm3, xmm3, xmm9
+        vpsllq xmm4, xmm0, 60
+        vpsllq xmm5, xmm1, 60
+        vpsllq xmm6, xmm2, 60
+        vpsllq xmm7, xmm3, 60
+        vpsrlq xmm0, xmm0, 4
+        vpsrlq xmm1, xmm1, 4
+        vpsrlq xmm2, xmm2, 4
+        vpsrlq xmm3, xmm3, 4
+        vpsrldq xmm4, xmm4, 8
+        vpsrldq xmm5, xmm5, 8
+        vpsrldq xmm6, xmm6, 8
+        vpsrldq xmm7, xmm7, 8
+        vpor xmm0, xmm0, xmm4
+        vpor xmm1, xmm1, xmm5
+        vpor xmm2, xmm2, xmm6
+        vpor xmm3, xmm3, xmm7
+        vpshufb xmm0, xmm0, xmm9
+        vpshufb xmm1, xmm1, xmm9
+        vpshufb xmm2, xmm2, xmm9
+        vpshufb xmm3, xmm3, xmm9
+        vmovdqu OWORD PTR [rdx+448], xmm0
+        vmovdqu OWORD PTR [rdx+464], xmm1
+        vmovdqu OWORD PTR [rdx+480], xmm2
+        vmovdqu OWORD PTR [rdx+496], xmm3
+        ; Restore the saved xmm registers and release the 80-byte frame.
+        vmovdqu xmm6, OWORD PTR [rsp]
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        vmovdqu xmm8, OWORD PTR [rsp+32]
+        vmovdqu xmm9, OWORD PTR [rsp+48]
+        vmovdqu xmm10, OWORD PTR [rsp+64]
+        add rsp, 80
+        ret
+GCM_generate_m0_avx512 ENDP
+_TEXT ENDS
+; 128-bit constants with a small integer in the second (higher-addressed)
+; qword and zero in the first.  Each is followed by a ptr_* QWORD holding
+; the address of the constant.
+_DATA SEGMENT
+ALIGN 16
+L_avx512_aes_gcm_one QWORD \
+        0000000000000000h, 0000000000000001h
+ptr_L_avx512_aes_gcm_one QWORD L_avx512_aes_gcm_one
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_avx512_aes_gcm_two QWORD \
+        0000000000000000h, 0000000000000002h
+ptr_L_avx512_aes_gcm_two QWORD L_avx512_aes_gcm_two
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_avx512_aes_gcm_three QWORD \
+        0000000000000000h, 0000000000000003h
+ptr_L_avx512_aes_gcm_three QWORD L_avx512_aes_gcm_three
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_avx512_aes_gcm_four QWORD \
+        0000000000000000h, 0000000000000004h
+ptr_L_avx512_aes_gcm_four QWORD
L_avx512_aes_gcm_four
+_DATA ENDS
+; 128-bit constants with 5, 6, 7 and 8 in the second (higher-addressed)
+; qword and zero in the first.  Each is followed by a ptr_* QWORD holding
+; the address of the constant.
+_DATA SEGMENT
+ALIGN 16
+L_avx512_aes_gcm_five QWORD \
+        0000000000000000h, 0000000000000005h
+ptr_L_avx512_aes_gcm_five QWORD L_avx512_aes_gcm_five
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_avx512_aes_gcm_six QWORD \
+        0000000000000000h, 0000000000000006h
+ptr_L_avx512_aes_gcm_six QWORD L_avx512_aes_gcm_six
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_avx512_aes_gcm_seven QWORD \
+        0000000000000000h, 0000000000000007h
+ptr_L_avx512_aes_gcm_seven QWORD L_avx512_aes_gcm_seven
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_avx512_aes_gcm_eight QWORD \
+        0000000000000000h, 0000000000000008h
+ptr_L_avx512_aes_gcm_eight QWORD L_avx512_aes_gcm_eight
+_DATA ENDS
+; vpshufb control: reverses the byte order inside each 64-bit half while
+; leaving the two halves in place.
+_DATA SEGMENT
+ALIGN 16
+L_avx512_aes_gcm_bswap_epi64 QWORD \
+        0001020304050607h, 08090a0b0c0d0e0fh
+ptr_L_avx512_aes_gcm_bswap_epi64 QWORD L_avx512_aes_gcm_bswap_epi64
+_DATA ENDS
+; vpshufb control: byte k of the result is byte 15-k of the source, i.e. a
+; full 16-byte reversal.
+_DATA SEGMENT
+ALIGN 16
+L_avx512_aes_gcm_bswap_mask QWORD \
+        08090a0b0c0d0e0fh, 0001020304050607h
+ptr_L_avx512_aes_gcm_bswap_mask QWORD L_avx512_aes_gcm_bswap_mask
+_DATA ENDS
+; Constant used with vpand and vpclmulqdq by the AVX512 GCM code: low qword
+; 1, high qword 0c200000000000000h.
+_DATA SEGMENT
+ALIGN 16
+L_avx512_aes_gcm_mod2_128 QWORD \
+        0000000000000001h, 0c200000000000000h
+ptr_L_avx512_aes_gcm_mod2_128 QWORD L_avx512_aes_gcm_mod2_128
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_encrypt_avx512 PROC
+        ; Prologue: push seven general-purpose registers, copy the four
+        ; register arguments (rcx, rdx, r8, r9), load seven stack arguments
+        ; from [rsp+96]..[rsp+144], then reserve a 1248-byte frame and spill
+        ; the xmm registers starting at [rsp+1088].
+        push r13
+        push rdi
+        push rsi
+        push r12
+        push rbx
+        push r14
+        push r15
+        mov rdi, rcx
+        mov rsi, rdx
+        mov r12, r8
+        mov rax, r9
+        mov r8, QWORD PTR [rsp+96]
+        mov r9d, DWORD PTR [rsp+104]
+        mov r11d, DWORD PTR [rsp+112]
+        mov ebx, DWORD PTR [rsp+120]
+        mov r14d, DWORD PTR [rsp+128]
+        mov r15, QWORD PTR [rsp+136]
+        mov r10d, DWORD PTR [rsp+144]
+        sub rsp, 1248
+        vmovdqu OWORD PTR [rsp+1088], xmm6
+        vmovdqu OWORD PTR [rsp+1104], xmm7
+        vmovdqu OWORD PTR [rsp+1120], xmm8
+        vmovdqu OWORD PTR [rsp+1136], xmm9
+        vmovdqu OWORD PTR [rsp+1152], xmm10
+        vmovdqu OWORD PTR [rsp+1168], xmm11
+        vmovdqu OWORD PTR [rsp+1184], xmm12
+        vmovdqu OWORD PTR [rsp+1200], xmm13
+        vmovdqu OWORD PTR [rsp+1216], xmm14
+        vmovdqu
OWORD PTR [rsp+1232], xmm15 + vpxor xmm4, xmm4, xmm4 + vpxor xmm6, xmm6, xmm6 + mov edx, ebx + cmp edx, 12 + jne L_AES_GCM_encrypt_avx512_iv_not_12 + ; # Calculate values when IV is 12 bytes + ; Set counter based on IV + mov ecx, 16777216 + vmovq xmm4, QWORD PTR [rax] + vpinsrd xmm4, xmm4, DWORD PTR [rax+8], 2 + vpinsrd xmm4, xmm4, ecx, 3 + ; H = Encrypt X(=0) and T = Encrypt counter + vmovdqa xmm5, OWORD PTR [r15] + vpxor xmm1, xmm4, xmm5 + vmovdqa xmm7, OWORD PTR [r15+16] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+32] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+48] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+64] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+80] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+96] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+112] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+128] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+144] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + cmp r10d, 11 + vmovdqa xmm7, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_avx512_calc_iv_12_last + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+176] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + cmp r10d, 13 + vmovdqa xmm7, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_avx512_calc_iv_12_last + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+208] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+224] +L_AES_GCM_encrypt_avx512_calc_iv_12_last: + vaesenclast xmm5, xmm5, xmm7 + vaesenclast xmm1, xmm1, xmm7 + vpshufb xmm5, xmm5, OWORD PTR L_avx512_aes_gcm_bswap_mask + vmovdqu OWORD PTR [rsp+1040], xmm1 + jmp 
L_AES_GCM_encrypt_avx512_iv_done +L_AES_GCM_encrypt_avx512_iv_not_12: + ; Calculate values when IV is not 12 bytes + ; H = Encrypt X(=0) + vmovdqa xmm5, OWORD PTR [r15] + vaesenc xmm5, xmm5, [r15+16] + vaesenc xmm5, xmm5, [r15+32] + vaesenc xmm5, xmm5, [r15+48] + vaesenc xmm5, xmm5, [r15+64] + vaesenc xmm5, xmm5, [r15+80] + vaesenc xmm5, xmm5, [r15+96] + vaesenc xmm5, xmm5, [r15+112] + vaesenc xmm5, xmm5, [r15+128] + vaesenc xmm5, xmm5, [r15+144] + cmp r10d, 11 + vmovdqa xmm9, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_avx512_calc_iv_1_aesenc_avx_last + vaesenc xmm5, xmm5, xmm9 + vaesenc xmm5, xmm5, [r15+176] + cmp r10d, 13 + vmovdqa xmm9, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_avx512_calc_iv_1_aesenc_avx_last + vaesenc xmm5, xmm5, xmm9 + vaesenc xmm5, xmm5, [r15+208] + vmovdqa xmm9, OWORD PTR [r15+224] +L_AES_GCM_encrypt_avx512_calc_iv_1_aesenc_avx_last: + vaesenclast xmm5, xmm5, xmm9 + vpshufb xmm5, xmm5, OWORD PTR L_avx512_aes_gcm_bswap_mask + ; Calc counter + ; Initialization vector + cmp edx, 0 + mov rcx, 0 + je L_AES_GCM_encrypt_avx512_calc_iv_done + cmp edx, 16 + jl L_AES_GCM_encrypt_avx512_calc_iv_lt16 + and edx, 4294967280 +L_AES_GCM_encrypt_avx512_calc_iv_16_loop: + vmovdqu xmm8, OWORD PTR [rax+rcx] + vpshufb xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_bswap_mask + vpxor xmm4, xmm4, xmm8 + ; ghash_gfmul_avx + vpshufd xmm1, xmm4, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpxor xmm1, xmm1, xmm4 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm7, xmm0 + vmovdqa xmm4, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm7, xmm7, xmm2 + vpxor xmm4, xmm4, xmm1 + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm4, xmm4, xmm1 + vpslld xmm0, xmm7, 31 + vpslld 
xmm1, xmm7, 30 + vpslld xmm2, xmm7, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm7, xmm7, xmm0 + vpsrld xmm2, xmm7, 1 + vpsrld xmm3, xmm7, 2 + vpsrld xmm0, xmm7, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm7 + vpxor xmm4, xmm4, xmm2 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_encrypt_avx512_calc_iv_16_loop + mov edx, ebx + cmp ecx, edx + je L_AES_GCM_encrypt_avx512_calc_iv_done +L_AES_GCM_encrypt_avx512_calc_iv_lt16: + sub rsp, 16 + vpxor xmm8, xmm8, xmm8 + xor ebx, ebx + vmovdqu OWORD PTR [rsp], xmm8 +L_AES_GCM_encrypt_avx512_calc_iv_loop: + movzx r13d, BYTE PTR [rax+rcx] + mov BYTE PTR [rsp+rbx], r13b + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_encrypt_avx512_calc_iv_loop + vmovdqu xmm8, OWORD PTR [rsp] + add rsp, 16 + vpshufb xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_bswap_mask + vpxor xmm4, xmm4, xmm8 + ; ghash_gfmul_avx + vpshufd xmm1, xmm4, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpxor xmm1, xmm1, xmm4 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm7, xmm0 + vmovdqa xmm4, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm7, xmm7, xmm2 + vpxor xmm4, xmm4, xmm1 + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm4, xmm4, xmm1 + vpslld xmm0, xmm7, 31 + vpslld xmm1, xmm7, 30 + vpslld xmm2, xmm7, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm7, xmm7, xmm0 + vpsrld xmm2, xmm7, 1 + vpsrld xmm3, xmm7, 2 + vpsrld xmm0, xmm7, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm7 
+ vpxor xmm4, xmm4, xmm2 +L_AES_GCM_encrypt_avx512_calc_iv_done: + ; T = Encrypt counter + vpxor xmm0, xmm0, xmm0 + shl edx, 3 + vmovq xmm0, rdx + vpxor xmm4, xmm4, xmm0 + ; ghash_gfmul_avx + vpshufd xmm1, xmm4, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpxor xmm1, xmm1, xmm4 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm7, xmm0 + vmovdqa xmm4, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm7, xmm7, xmm2 + vpxor xmm4, xmm4, xmm1 + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm4, xmm4, xmm1 + vpslld xmm0, xmm7, 31 + vpslld xmm1, xmm7, 30 + vpslld xmm2, xmm7, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm7, xmm7, xmm0 + vpsrld xmm2, xmm7, 1 + vpsrld xmm3, xmm7, 2 + vpsrld xmm0, xmm7, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm7 + vpxor xmm4, xmm4, xmm2 + vpshufb xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_bswap_mask + ; Encrypt counter + vmovdqa xmm8, OWORD PTR [r15] + vpxor xmm8, xmm8, xmm4 + vaesenc xmm8, xmm8, [r15+16] + vaesenc xmm8, xmm8, [r15+32] + vaesenc xmm8, xmm8, [r15+48] + vaesenc xmm8, xmm8, [r15+64] + vaesenc xmm8, xmm8, [r15+80] + vaesenc xmm8, xmm8, [r15+96] + vaesenc xmm8, xmm8, [r15+112] + vaesenc xmm8, xmm8, [r15+128] + vaesenc xmm8, xmm8, [r15+144] + cmp r10d, 11 + vmovdqa xmm9, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_avx512_calc_iv_2_aesenc_avx_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [r15+176] + cmp r10d, 13 + vmovdqa xmm9, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_avx512_calc_iv_2_aesenc_avx_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [r15+208] + vmovdqa xmm9, 
OWORD PTR [r15+224] +L_AES_GCM_encrypt_avx512_calc_iv_2_aesenc_avx_last: + vaesenclast xmm8, xmm8, xmm9 + vmovdqu OWORD PTR [rsp+1040], xmm8 +L_AES_GCM_encrypt_avx512_iv_done: + ; Additional authentication data + mov edx, r11d + cmp edx, 0 + je L_AES_GCM_encrypt_avx512_calc_aad_done + xor ecx, ecx + cmp edx, 16 + jl L_AES_GCM_encrypt_avx512_calc_aad_lt16 + and edx, 4294967280 +L_AES_GCM_encrypt_avx512_calc_aad_16_loop: + vmovdqu xmm8, OWORD PTR [r12+rcx] + vpshufb xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_bswap_mask + vpxor xmm6, xmm6, xmm8 + ; ghash_gfmul_avx + vpshufd xmm1, xmm6, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm6, 17 + vpclmulqdq xmm0, xmm5, xmm6, 0 + vpxor xmm1, xmm1, xmm6 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm7, xmm0 + vmovdqa xmm6, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm7, xmm7, xmm2 + vpxor xmm6, xmm6, xmm1 + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm6, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm6, xmm6, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm6, xmm6, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm6, xmm6, xmm1 + vpslld xmm0, xmm7, 31 + vpslld xmm1, xmm7, 30 + vpslld xmm2, xmm7, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm7, xmm7, xmm0 + vpsrld xmm2, xmm7, 1 + vpsrld xmm3, xmm7, 2 + vpsrld xmm0, xmm7, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm7 + vpxor xmm6, xmm6, xmm2 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_encrypt_avx512_calc_aad_16_loop + mov edx, r11d + cmp ecx, edx + je L_AES_GCM_encrypt_avx512_calc_aad_done +L_AES_GCM_encrypt_avx512_calc_aad_lt16: + sub rsp, 16 + vpxor xmm8, xmm8, xmm8 + xor ebx, ebx + vmovdqu OWORD PTR [rsp], xmm8 +L_AES_GCM_encrypt_avx512_calc_aad_loop: + movzx r13d, BYTE PTR [r12+rcx] + mov BYTE PTR [rsp+rbx], r13b + inc ecx + 
inc ebx + cmp ecx, edx + jl L_AES_GCM_encrypt_avx512_calc_aad_loop + vmovdqu xmm8, OWORD PTR [rsp] + add rsp, 16 + vpshufb xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_bswap_mask + vpxor xmm6, xmm6, xmm8 + ; ghash_gfmul_avx + vpshufd xmm1, xmm6, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm6, 17 + vpclmulqdq xmm0, xmm5, xmm6, 0 + vpxor xmm1, xmm1, xmm6 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm7, xmm0 + vmovdqa xmm6, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm7, xmm7, xmm2 + vpxor xmm6, xmm6, xmm1 + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm6, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm6, xmm6, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm6, xmm6, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm6, xmm6, xmm1 + vpslld xmm0, xmm7, 31 + vpslld xmm1, xmm7, 30 + vpslld xmm2, xmm7, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm7, xmm7, xmm0 + vpsrld xmm2, xmm7, 1 + vpsrld xmm3, xmm7, 2 + vpsrld xmm0, xmm7, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm7 + vpxor xmm6, xmm6, xmm2 +L_AES_GCM_encrypt_avx512_calc_aad_done: + ; Calculate counter and H + vpsrlq xmm9, xmm5, 63 + vpsllq xmm8, xmm5, 1 + vpslldq xmm9, xmm9, 8 + vpor xmm8, xmm8, xmm9 + vpshufd xmm5, xmm5, 255 + vpsrad xmm5, xmm5, 31 + vpshufb xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_bswap_epi64 + vpand xmm5, xmm5, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpaddd xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_one + vpxor xmm5, xmm5, xmm8 + vmovdqu OWORD PTR [rsp+1024], xmm4 + xor ebx, ebx + cmp r9d, 256 + jl L_AES_GCM_encrypt_avx512_done_128 + vmovdqa xmm2, xmm6 + ; H ^ 1 + vmovdqu OWORD PTR [rsp], xmm5 + ; H ^ 2 + vpclmulqdq xmm8, xmm5, xmm5, 0 + vpclmulqdq xmm11, xmm5, xmm5, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + 
vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm0, xmm11 + vmovdqu OWORD PTR [rsp+16], xmm0 + ; H ^ 3 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm5, 78 + vpxor xmm9, xmm9, xmm5 + vpshufd xmm10, xmm0, 78 + vpxor xmm10, xmm10, xmm0 + vpclmulqdq xmm8, xmm0, xmm5, 0 + vpclmulqdq xmm11, xmm0, xmm5, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm1, xmm11 + vmovdqu OWORD PTR [rsp+32], xmm1 + ; H ^ 4 + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm3, xmm11 + vmovdqu OWORD PTR [rsp+48], xmm3 + ; H ^ 5 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+64], xmm7 + ; H ^ 6 + vpclmulqdq xmm8, xmm1, xmm1, 0 + vpclmulqdq xmm11, xmm1, xmm1, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 
1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+80], xmm7 + ; H ^ 7 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm1, 78 + vpxor xmm9, xmm9, xmm1 + vpshufd xmm10, xmm3, 78 + vpxor xmm10, xmm10, xmm3 + vpclmulqdq xmm8, xmm3, xmm1, 0 + vpclmulqdq xmm11, xmm3, xmm1, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+96], xmm7 + ; H ^ 8 + vpclmulqdq xmm8, xmm3, xmm3, 0 + vpclmulqdq xmm11, xmm3, xmm3, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+112], xmm7 + ; H ^ 9 + vmovdqu xmm0, OWORD PTR [rsp+48] + vmovdqu xmm1, OWORD PTR [rsp+64] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+128], xmm7 + ; H ^ 10 + vmovdqu xmm0, OWORD PTR [rsp+64] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa 
xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+144], xmm7 + ; H ^ 11 + vmovdqu xmm0, OWORD PTR [rsp+64] + vmovdqu xmm1, OWORD PTR [rsp+80] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+160], xmm7 + ; H ^ 12 + vmovdqu xmm0, OWORD PTR [rsp+80] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+176], xmm7 + ; H ^ 13 + vmovdqu xmm0, OWORD PTR [rsp+80] + vmovdqu xmm1, OWORD PTR [rsp+96] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, 
xmm11 + vmovdqu OWORD PTR [rsp+192], xmm7 + ; H ^ 14 + vmovdqu xmm0, OWORD PTR [rsp+96] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+208], xmm7 + ; H ^ 15 + vmovdqu xmm0, OWORD PTR [rsp+96] + vmovdqu xmm1, OWORD PTR [rsp+112] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+224], xmm7 + ; H ^ 16 + vmovdqu xmm0, OWORD PTR [rsp+112] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+240], xmm7 + cmp r9d, 512 + jl L_AES_GCM_encrypt_avx512_no_ext + ; H ^ 17 + vmovdqu xmm0, OWORD PTR [rsp+112] + vmovdqu xmm1, OWORD PTR [rsp+128] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, 
OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+256], xmm7 + ; H ^ 18 + vmovdqu xmm0, OWORD PTR [rsp+128] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+272], xmm7 + ; H ^ 19 + vmovdqu xmm0, OWORD PTR [rsp+128] + vmovdqu xmm1, OWORD PTR [rsp+144] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+288], xmm7 + ; H ^ 20 + vmovdqu xmm0, OWORD PTR [rsp+144] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+304], xmm7 + ; H ^ 21 + vmovdqu xmm0, OWORD PTR [rsp+144] + vmovdqu xmm1, OWORD PTR [rsp+160] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd 
xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+320], xmm7 + ; H ^ 22 + vmovdqu xmm0, OWORD PTR [rsp+160] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+336], xmm7 + ; H ^ 23 + vmovdqu xmm0, OWORD PTR [rsp+160] + vmovdqu xmm1, OWORD PTR [rsp+176] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+352], xmm7 + ; H ^ 24 + vmovdqu xmm0, OWORD PTR [rsp+176] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu 
OWORD PTR [rsp+368], xmm7 + ; H ^ 25 + vmovdqu xmm0, OWORD PTR [rsp+176] + vmovdqu xmm1, OWORD PTR [rsp+192] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+384], xmm7 + ; H ^ 26 + vmovdqu xmm0, OWORD PTR [rsp+192] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+400], xmm7 + ; H ^ 27 + vmovdqu xmm0, OWORD PTR [rsp+192] + vmovdqu xmm1, OWORD PTR [rsp+208] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+416], xmm7 + ; H ^ 28 + vmovdqu xmm0, OWORD PTR [rsp+208] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 
1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+432], xmm7 + ; H ^ 29 + vmovdqu xmm0, OWORD PTR [rsp+208] + vmovdqu xmm1, OWORD PTR [rsp+224] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+448], xmm7 + ; H ^ 30 + vmovdqu xmm0, OWORD PTR [rsp+224] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+464], xmm7 + ; H ^ 31 + vmovdqu xmm0, OWORD PTR [rsp+224] + vmovdqu xmm1, OWORD PTR [rsp+240] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+480], xmm7 + ; H ^ 32 + vmovdqu xmm0, OWORD 
PTR [rsp+240] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+496], xmm7 +L_AES_GCM_encrypt_avx512_no_ext: + vbroadcasti32x4 zmm22, ptr_L_avx512_aes_gcm_bswap_epi64 + vbroadcasti32x4 zmm30, ptr_L_avx512_aes_gcm_bswap_mask + vbroadcasti32x4 zmm31, ptr_L_avx512_aes_gcm_mod2_128 + vbroadcasti32x4 zmm9, [r15] + vbroadcasti32x4 zmm10, [r15+16] + vbroadcasti32x4 zmm11, [r15+32] + vbroadcasti32x4 zmm12, [r15+48] + vbroadcasti32x4 zmm13, [r15+64] + vbroadcasti32x4 zmm14, [r15+80] + vbroadcasti32x4 zmm15, [r15+96] + vbroadcasti32x4 zmm1, [r15+112] + vbroadcasti32x4 zmm2, [r15+128] + vbroadcasti32x4 zmm3, [r15+144] + cmp r9d, 512 + jl L_AES_GCM_encrypt_avx512_no_windows + mov r13d, r9d + and r13d, 4294966784 + vmovdqu64 zmm23, [rsp+448] + vshufi64x2 zmm23, zmm23, zmm23, 27 + vmovdqu64 zmm24, [rsp+384] + vshufi64x2 zmm24, zmm24, zmm24, 27 + vmovdqu64 zmm25, [rsp+320] + vshufi64x2 zmm25, zmm25, zmm25, 27 + vmovdqu64 zmm26, [rsp+256] + vshufi64x2 zmm26, zmm26, zmm26, 27 + vmovdqu64 [rsp+512], zmm23 + vmovdqu64 [rsp+576], zmm24 + vmovdqu64 [rsp+640], zmm25 + vmovdqu64 [rsp+704], zmm26 + vmovdqu64 zmm23, [rsp+192] + vshufi64x2 zmm23, zmm23, zmm23, 27 + vmovdqu64 zmm24, [rsp+128] + vshufi64x2 zmm24, zmm24, zmm24, 27 + vmovdqu64 zmm25, [rsp+64] + vshufi64x2 zmm25, zmm25, zmm25, 27 + vmovdqu64 zmm26, [rsp] + vshufi64x2 zmm26, zmm26, zmm26, 27 + vmovdqu64 [rsp+768], zmm23 + vmovdqu64 [rsp+832], zmm24 + vmovdqu64 [rsp+896], zmm25 + vmovdqu64 [rsp+960], zmm26 + ; 512 bytes of input + lea rcx, QWORD PTR [rsi+rbx] + mov QWORD PTR [rsp+1056], rcx + vbroadcasti32x4 zmm20, [rsp+1024] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, 
zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [rsp+1024] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [rsp+1024], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r10d, 11 + vbroadcasti32x4 zmm20, [r15+160] + jl L_AES_GCM_encrypt_avx512_p1_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r10d, 13 + vbroadcasti32x4 zmm20, [r15+192] + jl 
L_AES_GCM_encrypt_avx512_p1_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+224] +L_AES_GCM_encrypt_avx512_p1_avx512_ctr16_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [rdi+rbx] + lea rdx, QWORD PTR [rsi+rbx] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + add ebx, 256 + vbroadcasti32x4 zmm20, [rsp+1024] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [rsp+1024] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [rsp+1024], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, 
zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r10d, 11 + vbroadcasti32x4 zmm20, [r15+160] + jl L_AES_GCM_encrypt_avx512_p2_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r10d, 13 + vbroadcasti32x4 zmm20, [r15+192] + jl L_AES_GCM_encrypt_avx512_p2_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+224] +L_AES_GCM_encrypt_avx512_p2_avx512_ctr16_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [rdi+rbx] + lea rdx, QWORD PTR [rsi+rbx] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], 
zmm19 + add ebx, 256 + cmp ebx, r13d + jge L_AES_GCM_encrypt_avx512_last_win +L_AES_GCM_encrypt_avx512_win_loop: + lea rcx, QWORD PTR [rsi+rbx] + mov QWORD PTR [rsp+1072], rcx + mov r12, QWORD PTR [rsp+1056] + vpxorq zmm21, zmm21, zmm21 + vinserti32x4 zmm21, zmm21, xmm6, 0 + vbroadcasti32x4 zmm20, [rsp+1024] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [rsp+1024] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [rsp+1024], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vmovdqu64 zmm31, [r12] + vpshufb zmm31, zmm31, zmm30 + vpxorq zmm31, zmm31, zmm21 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vpclmulqdq zmm23, zmm31, [rsp+512], 0 + vpclmulqdq zmm24, zmm31, [rsp+512], 1 + vpclmulqdq zmm25, zmm31, [rsp+512], 16 + vpclmulqdq zmm26, zmm31, [rsp+512], 17 + vmovdqa64 zmm27, zmm23 + vpxorq zmm28, zmm25, zmm24 + vmovdqa64 zmm29, zmm26 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vmovdqu64 zmm31, [r12+64] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vpclmulqdq zmm23, zmm31, [rsp+576], 0 + vpclmulqdq zmm24, zmm31, [rsp+576], 1 + vpclmulqdq zmm25, zmm31, [rsp+576], 16 + vpclmulqdq zmm26, zmm31, [rsp+576], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + 
vmovdqu64 zmm31, [r12+128] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vpclmulqdq zmm23, zmm31, [rsp+640], 0 + vpclmulqdq zmm24, zmm31, [rsp+640], 1 + vpclmulqdq zmm25, zmm31, [rsp+640], 16 + vpclmulqdq zmm26, zmm31, [rsp+640], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vmovdqu64 zmm31, [r12+192] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vpclmulqdq zmm23, zmm31, [rsp+704], 0 + vpclmulqdq zmm24, zmm31, [rsp+704], 1 + vpclmulqdq zmm25, zmm31, [rsp+704], 16 + vpclmulqdq zmm26, zmm31, [rsp+704], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r10d, 11 + vbroadcasti32x4 zmm20, [r15+160] + jl L_AES_GCM_encrypt_avx512_a_il_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r10d, 13 + vbroadcasti32x4 zmm20, [r15+192] + jl L_AES_GCM_encrypt_avx512_a_il_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+224] 
+L_AES_GCM_encrypt_avx512_a_il_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [rdi+rbx] + lea rdx, QWORD PTR [rsi+rbx] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + add ebx, 256 + vbroadcasti32x4 zmm20, [rsp+1024] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [rsp+1024] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [rsp+1024], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vmovdqu64 zmm31, [r12+256] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vpclmulqdq zmm23, zmm31, [rsp+768], 0 + vpclmulqdq zmm24, zmm31, [rsp+768], 1 + vpclmulqdq zmm25, zmm31, [rsp+768], 16 + vpclmulqdq zmm26, zmm31, [rsp+768], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vmovdqu64 zmm31, [r12+320] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vpclmulqdq zmm23, zmm31, [rsp+832], 0 + vpclmulqdq zmm24, zmm31, 
[rsp+832], 1 + vpclmulqdq zmm25, zmm31, [rsp+832], 16 + vpclmulqdq zmm26, zmm31, [rsp+832], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vmovdqu64 zmm31, [r12+384] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vpclmulqdq zmm23, zmm31, [rsp+896], 0 + vpclmulqdq zmm24, zmm31, [rsp+896], 1 + vpclmulqdq zmm25, zmm31, [rsp+896], 16 + vpclmulqdq zmm26, zmm31, [rsp+896], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vmovdqu64 zmm31, [r12+448] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vpclmulqdq zmm23, zmm31, [rsp+960], 0 + vpclmulqdq zmm24, zmm31, [rsp+960], 1 + vpclmulqdq zmm25, zmm31, [rsp+960], 16 + vpclmulqdq zmm26, zmm31, [rsp+960], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r10d, 11 + vbroadcasti32x4 zmm20, [r15+160] + jl L_AES_GCM_encrypt_avx512_b_il_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r10d, 13 + vbroadcasti32x4 zmm20, [r15+192] + jl L_AES_GCM_encrypt_avx512_b_il_last + vaesenc 
zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+224] +L_AES_GCM_encrypt_avx512_b_il_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [rdi+rbx] + lea rdx, QWORD PTR [rsi+rbx] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + add ebx, 256 + vbroadcasti32x4 zmm31, ptr_L_avx512_aes_gcm_mod2_128 + vpclmulqdq zmm23, zmm31, zmm27, 1 + vpshufd zmm27, zmm27, 78 + vpternlogq zmm28, zmm27, zmm23, 150 + vpclmulqdq zmm23, zmm31, zmm28, 1 + vpshufd zmm28, zmm28, 78 + vpternlogq zmm29, zmm28, zmm23, 150 + vextracti32x4 xmm0, zmm29, 1 + vextracti32x4 xmm4, zmm29, 2 + vextracti32x4 xmm5, zmm29, 3 + vpxorq xmm6, xmm29, xmm0 + vpternlogq xmm6, xmm5, xmm4, 150 + mov rcx, QWORD PTR [rsp+1072] + mov QWORD PTR [rsp+1056], rcx + cmp ebx, r13d + jl L_AES_GCM_encrypt_avx512_win_loop +L_AES_GCM_encrypt_avx512_last_win: + mov rcx, QWORD PTR [rsp+1056] + vpxorq zmm20, zmm20, zmm20 + vinserti32x4 zmm20, zmm20, xmm6, 0 + vmovdqu64 zmm23, [rsp+512] + vmovdqu64 zmm24, [rsp+576] + vmovdqu64 zmm25, [rsp+640] + vmovdqu64 zmm26, [rsp+704] + vmovdqu64 zmm21, [rcx] + vpshufb zmm21, zmm21, zmm30 + vpxorq zmm21, zmm21, zmm20 + vpclmulqdq zmm16, zmm21, zmm23, 0 + vpclmulqdq zmm17, zmm21, zmm23, 1 + vpclmulqdq zmm18, zmm21, zmm23, 16 + vpclmulqdq zmm19, zmm21, zmm23, 17 + vmovdqa64 zmm27, zmm16 + vpxorq zmm28, zmm18, zmm17 + vmovdqa64 zmm29, zmm19 + vmovdqu64 zmm21, 
[rcx+64] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm24, 0 + vpclmulqdq zmm17, zmm21, zmm24, 1 + vpclmulqdq zmm18, zmm21, zmm24, 16 + vpclmulqdq zmm19, zmm21, zmm24, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rcx+128] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm25, 0 + vpclmulqdq zmm17, zmm21, zmm25, 1 + vpclmulqdq zmm18, zmm21, zmm25, 16 + vpclmulqdq zmm19, zmm21, zmm25, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rcx+192] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm26, 0 + vpclmulqdq zmm17, zmm21, zmm26, 1 + vpclmulqdq zmm18, zmm21, zmm26, 16 + vpclmulqdq zmm19, zmm21, zmm26, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm23, [rsp+768] + vmovdqu64 zmm24, [rsp+832] + vmovdqu64 zmm25, [rsp+896] + vmovdqu64 zmm26, [rsp+960] + vmovdqu64 zmm21, [rcx+256] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm23, 0 + vpclmulqdq zmm17, zmm21, zmm23, 1 + vpclmulqdq zmm18, zmm21, zmm23, 16 + vpclmulqdq zmm19, zmm21, zmm23, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rcx+320] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm24, 0 + vpclmulqdq zmm17, zmm21, zmm24, 1 + vpclmulqdq zmm18, zmm21, zmm24, 16 + vpclmulqdq zmm19, zmm21, zmm24, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rcx+384] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm25, 0 + vpclmulqdq zmm17, zmm21, zmm25, 1 + vpclmulqdq zmm18, zmm21, zmm25, 16 + vpclmulqdq zmm19, zmm21, zmm25, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rcx+448] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm26, 0 + 
vpclmulqdq zmm17, zmm21, zmm26, 1 + vpclmulqdq zmm18, zmm21, zmm26, 16 + vpclmulqdq zmm19, zmm21, zmm26, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vpclmulqdq zmm21, zmm31, zmm27, 1 + vpshufd zmm27, zmm27, 78 + vpternlogq zmm28, zmm27, zmm21, 150 + vpclmulqdq zmm21, zmm31, zmm28, 1 + vpshufd zmm28, zmm28, 78 + vpternlogq zmm29, zmm28, zmm21, 150 + vextracti32x4 xmm0, zmm29, 1 + vextracti32x4 xmm4, zmm29, 2 + vextracti32x4 xmm5, zmm29, 3 + vpxorq xmm6, xmm29, xmm0 + vpternlogq xmm6, xmm5, xmm4, 150 +L_AES_GCM_encrypt_avx512_no_windows: + vmovdqu64 zmm23, [rsp+192] + vshufi64x2 zmm23, zmm23, zmm23, 27 + vmovdqu64 zmm24, [rsp+128] + vshufi64x2 zmm24, zmm24, zmm24, 27 + vmovdqu64 zmm25, [rsp+64] + vshufi64x2 zmm25, zmm25, zmm25, 27 + vmovdqu64 zmm26, [rsp] + vshufi64x2 zmm26, zmm26, zmm26, 27 + mov r13d, r9d + and r13d, 4294967040 + cmp ebx, r13d + jge L_AES_GCM_encrypt_avx512_after_256 + ; 256 bytes of input + vbroadcasti32x4 zmm20, [rsp+1024] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [rsp+1024] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [rsp+1024], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 
+ vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r10d, 11 + vbroadcasti32x4 zmm20, [r15+160] + jl L_AES_GCM_encrypt_avx512_pro_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r10d, 13 + vbroadcasti32x4 zmm20, [r15+192] + jl L_AES_GCM_encrypt_avx512_pro_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+224] +L_AES_GCM_encrypt_avx512_pro_avx512_ctr16_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [rdi+rbx] + lea rdx, QWORD PTR [rsi+rbx] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + 
vmovdqu64 [rdx+192], zmm19 + mov QWORD PTR [rsp+1056], rdx + add ebx, 256 + cmp ebx, r13d + jge L_AES_GCM_encrypt_avx512_last_ghash +L_AES_GCM_encrypt_avx512_ghash_128: + vbroadcasti32x4 zmm20, [rsp+1024] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [rsp+1024] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [rsp+1024], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r10d, 11 + vbroadcasti32x4 zmm20, [r15+160] + jl L_AES_GCM_encrypt_avx512_pip_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, 
zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r10d, 13 + vbroadcasti32x4 zmm20, [r15+192] + jl L_AES_GCM_encrypt_avx512_pip_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+224] +L_AES_GCM_encrypt_avx512_pip_avx512_ctr16_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [rdi+rbx] + lea rdx, QWORD PTR [rsi+rbx] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + mov rcx, QWORD PTR [rsp+1056] + vpxorq zmm20, zmm20, zmm20 + vinserti32x4 zmm20, zmm20, xmm6, 0 + vmovdqu64 zmm21, [rcx] + vpshufb zmm21, zmm21, zmm30 + vpxorq zmm21, zmm21, zmm20 + vpclmulqdq zmm16, zmm21, zmm23, 0 + vpclmulqdq zmm17, zmm21, zmm23, 1 + vpclmulqdq zmm18, zmm21, zmm23, 16 + vpclmulqdq zmm19, zmm21, zmm23, 17 + vmovdqa64 zmm27, zmm16 + vpxorq zmm28, zmm18, zmm17 + vmovdqa64 zmm29, zmm19 + vmovdqu64 zmm21, [rcx+64] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm24, 0 + vpclmulqdq zmm17, zmm21, zmm24, 1 + vpclmulqdq zmm18, zmm21, zmm24, 16 + vpclmulqdq zmm19, zmm21, zmm24, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rcx+128] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq 
zmm16, zmm21, zmm25, 0 + vpclmulqdq zmm17, zmm21, zmm25, 1 + vpclmulqdq zmm18, zmm21, zmm25, 16 + vpclmulqdq zmm19, zmm21, zmm25, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rcx+192] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm26, 0 + vpclmulqdq zmm17, zmm21, zmm26, 1 + vpclmulqdq zmm18, zmm21, zmm26, 16 + vpclmulqdq zmm19, zmm21, zmm26, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vpclmulqdq zmm21, zmm31, zmm27, 1 + vpshufd zmm27, zmm27, 78 + vpternlogq zmm28, zmm27, zmm21, 150 + vpclmulqdq zmm21, zmm31, zmm28, 1 + vpshufd zmm28, zmm28, 78 + vpternlogq zmm29, zmm28, zmm21, 150 + vextracti32x4 xmm0, zmm29, 1 + vextracti32x4 xmm4, zmm29, 2 + vextracti32x4 xmm5, zmm29, 3 + vpxorq xmm6, xmm29, xmm0 + vpternlogq xmm6, xmm5, xmm4, 150 + mov QWORD PTR [rsp+1056], rdx + add ebx, 256 + cmp ebx, r13d + jl L_AES_GCM_encrypt_avx512_ghash_128 +L_AES_GCM_encrypt_avx512_last_ghash: + mov rcx, QWORD PTR [rsp+1056] + vpxorq zmm20, zmm20, zmm20 + vinserti32x4 zmm20, zmm20, xmm6, 0 + vmovdqu64 zmm21, [rcx] + vpshufb zmm21, zmm21, zmm30 + vpxorq zmm21, zmm21, zmm20 + vpclmulqdq zmm16, zmm21, zmm23, 0 + vpclmulqdq zmm17, zmm21, zmm23, 1 + vpclmulqdq zmm18, zmm21, zmm23, 16 + vpclmulqdq zmm19, zmm21, zmm23, 17 + vmovdqa64 zmm27, zmm16 + vpxorq zmm28, zmm18, zmm17 + vmovdqa64 zmm29, zmm19 + vmovdqu64 zmm21, [rcx+64] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm24, 0 + vpclmulqdq zmm17, zmm21, zmm24, 1 + vpclmulqdq zmm18, zmm21, zmm24, 16 + vpclmulqdq zmm19, zmm21, zmm24, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rcx+128] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm25, 0 + vpclmulqdq zmm17, zmm21, zmm25, 1 + vpclmulqdq zmm18, zmm21, zmm25, 16 + vpclmulqdq zmm19, zmm21, zmm25, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, 
zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rcx+192] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm26, 0 + vpclmulqdq zmm17, zmm21, zmm26, 1 + vpclmulqdq zmm18, zmm21, zmm26, 16 + vpclmulqdq zmm19, zmm21, zmm26, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vpclmulqdq zmm21, zmm31, zmm27, 1 + vpshufd zmm27, zmm27, 78 + vpternlogq zmm28, zmm27, zmm21, 150 + vpclmulqdq zmm21, zmm31, zmm28, 1 + vpshufd zmm28, zmm28, 78 + vpternlogq zmm29, zmm28, zmm21, 150 + vextracti32x4 xmm0, zmm29, 1 + vextracti32x4 xmm4, zmm29, 2 + vextracti32x4 xmm5, zmm29, 3 + vpxorq xmm6, xmm29, xmm0 + vpternlogq xmm6, xmm5, xmm4, 150 +L_AES_GCM_encrypt_avx512_after_256: + vmovdqu xmm5, OWORD PTR [rsp] +L_AES_GCM_encrypt_avx512_done_128: + mov edx, r9d + cmp ebx, edx + jge L_AES_GCM_encrypt_avx512_done_enc + mov r13d, r9d + and r13d, 4294967280 + cmp ebx, r13d + jge L_AES_GCM_encrypt_avx512_last_block_done + vmovdqu xmm9, OWORD PTR [rsp+1024] + vpshufb xmm8, xmm9, OWORD PTR L_avx512_aes_gcm_bswap_epi64 + vpaddd xmm9, xmm9, OWORD PTR L_avx512_aes_gcm_one + vmovdqu OWORD PTR [rsp+1024], xmm9 + vpxor xmm8, xmm8, [r15] + vaesenc xmm8, xmm8, [r15+16] + vaesenc xmm8, xmm8, [r15+32] + vaesenc xmm8, xmm8, [r15+48] + vaesenc xmm8, xmm8, [r15+64] + vaesenc xmm8, xmm8, [r15+80] + vaesenc xmm8, xmm8, [r15+96] + vaesenc xmm8, xmm8, [r15+112] + vaesenc xmm8, xmm8, [r15+128] + vaesenc xmm8, xmm8, [r15+144] + cmp r10d, 11 + vmovdqa xmm9, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_avx512_aesenc_block_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [r15+176] + cmp r10d, 13 + vmovdqa xmm9, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_avx512_aesenc_block_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [r15+208] + vmovdqa xmm9, OWORD PTR [r15+224] +L_AES_GCM_encrypt_avx512_aesenc_block_last: + vaesenclast xmm8, xmm8, xmm9 + vmovdqu xmm9, OWORD PTR [rdi+rbx] + vpxor xmm8, xmm8, xmm9 + vmovdqu OWORD PTR [rsi+rbx], xmm8 + 
vpshufb xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_bswap_mask + vpxor xmm6, xmm6, xmm8 + add ebx, 16 + cmp ebx, r13d + jge L_AES_GCM_encrypt_avx512_last_block_ghash +L_AES_GCM_encrypt_avx512_last_block_start: + vmovdqu xmm13, OWORD PTR [rdi+rbx] + vmovdqu xmm9, OWORD PTR [rsp+1024] + vpshufb xmm8, xmm9, OWORD PTR L_avx512_aes_gcm_bswap_epi64 + vpaddd xmm9, xmm9, OWORD PTR L_avx512_aes_gcm_one + vmovdqu OWORD PTR [rsp+1024], xmm9 + vpxor xmm8, xmm8, [r15] + vpclmulqdq xmm10, xmm6, xmm5, 16 + vaesenc xmm8, xmm8, [r15+16] + vaesenc xmm8, xmm8, [r15+32] + vpclmulqdq xmm11, xmm6, xmm5, 1 + vaesenc xmm8, xmm8, [r15+48] + vaesenc xmm8, xmm8, [r15+64] + vpclmulqdq xmm12, xmm6, xmm5, 0 + vaesenc xmm8, xmm8, [r15+80] + vpclmulqdq xmm1, xmm6, xmm5, 17 + vaesenc xmm8, xmm8, [r15+96] + vpxor xmm10, xmm10, xmm11 + vpslldq xmm2, xmm10, 8 + vpsrldq xmm10, xmm10, 8 + vaesenc xmm8, xmm8, [r15+112] + vpxor xmm2, xmm2, xmm12 + vpxor xmm3, xmm1, xmm10 + vmovdqa xmm0, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm2, xmm0, 16 + vaesenc xmm8, xmm8, [r15+128] + vpshufd xmm10, xmm2, 78 + vpxor xmm10, xmm10, xmm11 + vpclmulqdq xmm11, xmm10, xmm0, 16 + vaesenc xmm8, xmm8, [r15+144] + vpshufd xmm10, xmm10, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm6, xmm10, xmm3 + cmp r10d, 11 + vmovdqa xmm9, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_avx512_aesenc_gfmul_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [r15+176] + cmp r10d, 13 + vmovdqa xmm9, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_avx512_aesenc_gfmul_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [r15+208] + vmovdqa xmm9, OWORD PTR [r15+224] +L_AES_GCM_encrypt_avx512_aesenc_gfmul_last: + vaesenclast xmm8, xmm8, xmm9 + vmovdqa xmm0, xmm13 + vpxor xmm8, xmm8, xmm0 + vmovdqu OWORD PTR [rsi+rbx], xmm8 + vpshufb xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_bswap_mask + add ebx, 16 + vpxor xmm6, xmm6, xmm8 + cmp ebx, r13d + jl L_AES_GCM_encrypt_avx512_last_block_start +L_AES_GCM_encrypt_avx512_last_block_ghash: + ; 
ghash_gfmul_red_avx + vpshufd xmm9, xmm5, 78 + vpxor xmm9, xmm9, xmm5 + vpshufd xmm10, xmm6, 78 + vpxor xmm10, xmm10, xmm6 + vpclmulqdq xmm8, xmm6, xmm5, 0 + vpclmulqdq xmm11, xmm6, xmm5, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm6, xmm11 +L_AES_GCM_encrypt_avx512_last_block_done: + mov ecx, r9d + mov edx, ecx + and ecx, 15 + jz L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_done + vmovdqu xmm4, OWORD PTR [rsp+1024] + vpshufb xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_bswap_epi64 + vpxor xmm4, xmm4, [r15] + vaesenc xmm4, xmm4, [r15+16] + vaesenc xmm4, xmm4, [r15+32] + vaesenc xmm4, xmm4, [r15+48] + vaesenc xmm4, xmm4, [r15+64] + vaesenc xmm4, xmm4, [r15+80] + vaesenc xmm4, xmm4, [r15+96] + vaesenc xmm4, xmm4, [r15+112] + vaesenc xmm4, xmm4, [r15+128] + vaesenc xmm4, xmm4, [r15+144] + cmp r10d, 11 + vmovdqa xmm9, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_aesenc_avx_last + vaesenc xmm4, xmm4, xmm9 + vaesenc xmm4, xmm4, [r15+176] + cmp r10d, 13 + vmovdqa xmm9, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_aesenc_avx_last + vaesenc xmm4, xmm4, xmm9 + vaesenc xmm4, xmm4, [r15+208] + vmovdqa xmm9, OWORD PTR [r15+224] +L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_aesenc_avx_last: + vaesenclast xmm4, xmm4, xmm9 + sub rsp, 16 + xor ecx, ecx + vmovdqu OWORD PTR [rsp], xmm4 +L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_loop: + movzx r13d, BYTE PTR [rdi+rbx] + xor r13b, BYTE PTR [rsp+rcx] + mov BYTE PTR [rsi+rbx], r13b + mov BYTE PTR [rsp+rcx], r13b + inc ebx + inc ecx + cmp ebx, edx + jl L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_loop + xor r13, r13 + cmp ecx, 16 + je L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_finish_enc 
+L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_byte_loop: + mov BYTE PTR [rsp+rcx], r13b + inc ecx + cmp ecx, 16 + jl L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_byte_loop +L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_finish_enc: + vmovdqu xmm4, OWORD PTR [rsp] + add rsp, 16 + vpshufb xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_bswap_mask + vpxor xmm6, xmm6, xmm4 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm5, 78 + vpxor xmm9, xmm9, xmm5 + vpshufd xmm10, xmm6, 78 + vpxor xmm10, xmm10, xmm6 + vpclmulqdq xmm8, xmm6, xmm5, 0 + vpclmulqdq xmm11, xmm6, xmm5, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm6, xmm11 +L_AES_GCM_encrypt_avx512_aesenc_last15_enc_avx_done: +L_AES_GCM_encrypt_avx512_done_enc: + mov edx, r9d + mov ecx, r11d + shl rdx, 3 + shl rcx, 3 + vmovq xmm0, rdx + vmovq xmm1, rcx + vpunpcklqdq xmm0, xmm0, xmm1 + vpxor xmm6, xmm6, xmm0 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm5, 78 + vpxor xmm9, xmm9, xmm5 + vpshufd xmm10, xmm6, 78 + vpxor xmm10, xmm10, xmm6 + vpclmulqdq xmm8, xmm6, xmm5, 0 + vpclmulqdq xmm11, xmm6, xmm5, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm6, xmm11 + vpshufb xmm6, xmm6, OWORD PTR L_avx512_aes_gcm_bswap_mask + vmovdqu xmm0, OWORD PTR [rsp+1040] + vpxor xmm0, xmm0, xmm6 + cmp r14d, 16 + je L_AES_GCM_encrypt_avx512_store_tag_16 + xor rcx, rcx + vmovdqu OWORD PTR [rsp], xmm0 +L_AES_GCM_encrypt_avx512_store_tag_loop: + movzx r13d, BYTE PTR [rsp+rcx] + mov BYTE PTR [r8+rcx], 
r13b + inc ecx + cmp ecx, r14d + jne L_AES_GCM_encrypt_avx512_store_tag_loop + jmp L_AES_GCM_encrypt_avx512_store_tag_done +L_AES_GCM_encrypt_avx512_store_tag_16: + vmovdqu OWORD PTR [r8], xmm0 +L_AES_GCM_encrypt_avx512_store_tag_done: + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp+1088] + vmovdqu xmm7, OWORD PTR [rsp+1104] + vmovdqu xmm8, OWORD PTR [rsp+1120] + vmovdqu xmm9, OWORD PTR [rsp+1136] + vmovdqu xmm10, OWORD PTR [rsp+1152] + vmovdqu xmm11, OWORD PTR [rsp+1168] + vmovdqu xmm12, OWORD PTR [rsp+1184] + vmovdqu xmm13, OWORD PTR [rsp+1200] + vmovdqu xmm14, OWORD PTR [rsp+1216] + vmovdqu xmm15, OWORD PTR [rsp+1232] + add rsp, 1248 + pop r15 + pop r14 + pop rbx + pop r12 + pop rsi + pop rdi + pop r13 + ret +AES_GCM_encrypt_avx512 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +AES_GCM_decrypt_avx512 PROC + push r13 + push rdi + push rsi + push r12 + push rbx + push r14 + push r15 + push rbp + mov rdi, rcx + mov rsi, rdx + mov r12, r8 + mov rax, r9 + mov r8, QWORD PTR [rsp+104] + mov r9d, DWORD PTR [rsp+112] + mov r11d, DWORD PTR [rsp+120] + mov ebx, DWORD PTR [rsp+128] + mov r14d, DWORD PTR [rsp+136] + mov r15, QWORD PTR [rsp+144] + mov r10d, DWORD PTR [rsp+152] + mov rbp, QWORD PTR [rsp+160] + sub rsp, 1216 + vmovdqu OWORD PTR [rsp+1056], xmm6 + vmovdqu OWORD PTR [rsp+1072], xmm7 + vmovdqu OWORD PTR [rsp+1088], xmm8 + vmovdqu OWORD PTR [rsp+1104], xmm9 + vmovdqu OWORD PTR [rsp+1120], xmm10 + vmovdqu OWORD PTR [rsp+1136], xmm11 + vmovdqu OWORD PTR [rsp+1152], xmm12 + vmovdqu OWORD PTR [rsp+1168], xmm13 + vmovdqu OWORD PTR [rsp+1184], xmm14 + vmovdqu OWORD PTR [rsp+1200], xmm15 + vpxor xmm4, xmm4, xmm4 + vpxor xmm6, xmm6, xmm6 + cmp ebx, 12 + mov edx, ebx + jne L_AES_GCM_decrypt_avx512_iv_not_12 + ; # Calculate values when IV is 12 bytes + ; Set counter based on IV + mov ecx, 16777216 + vmovq xmm4, QWORD PTR [rax] + vpinsrd xmm4, xmm4, DWORD PTR [rax+8], 2 + vpinsrd xmm4, xmm4, ecx, 3 + ; H = Encrypt X(=0) and T = Encrypt counter + vmovdqa xmm5, OWORD PTR [r15] + 
vpxor xmm1, xmm4, xmm5 + vmovdqa xmm7, OWORD PTR [r15+16] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+32] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+48] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+64] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+80] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+96] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+112] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+128] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+144] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + cmp r10d, 11 + vmovdqa xmm7, OWORD PTR [r15+160] + jl L_AES_GCM_decrypt_avx512_calc_iv_12_last + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+176] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + cmp r10d, 13 + vmovdqa xmm7, OWORD PTR [r15+192] + jl L_AES_GCM_decrypt_avx512_calc_iv_12_last + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+208] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+224] +L_AES_GCM_decrypt_avx512_calc_iv_12_last: + vaesenclast xmm5, xmm5, xmm7 + vaesenclast xmm1, xmm1, xmm7 + vpshufb xmm5, xmm5, OWORD PTR L_avx512_aes_gcm_bswap_mask + vmovdqu OWORD PTR [rsp+1040], xmm1 + jmp L_AES_GCM_decrypt_avx512_iv_done +L_AES_GCM_decrypt_avx512_iv_not_12: + ; Calculate values when IV is not 12 bytes + ; H = Encrypt X(=0) + vmovdqa xmm5, OWORD PTR [r15] + vaesenc xmm5, xmm5, [r15+16] + vaesenc xmm5, xmm5, [r15+32] + vaesenc xmm5, xmm5, [r15+48] + vaesenc xmm5, xmm5, [r15+64] + vaesenc xmm5, xmm5, [r15+80] + vaesenc xmm5, xmm5, [r15+96] + vaesenc xmm5, xmm5, [r15+112] + vaesenc xmm5, xmm5, [r15+128] + vaesenc xmm5, xmm5, 
[r15+144] + cmp r10d, 11 + vmovdqa xmm9, OWORD PTR [r15+160] + jl L_AES_GCM_decrypt_avx512_calc_iv_1_aesenc_avx_last + vaesenc xmm5, xmm5, xmm9 + vaesenc xmm5, xmm5, [r15+176] + cmp r10d, 13 + vmovdqa xmm9, OWORD PTR [r15+192] + jl L_AES_GCM_decrypt_avx512_calc_iv_1_aesenc_avx_last + vaesenc xmm5, xmm5, xmm9 + vaesenc xmm5, xmm5, [r15+208] + vmovdqa xmm9, OWORD PTR [r15+224] +L_AES_GCM_decrypt_avx512_calc_iv_1_aesenc_avx_last: + vaesenclast xmm5, xmm5, xmm9 + vpshufb xmm5, xmm5, OWORD PTR L_avx512_aes_gcm_bswap_mask + ; Calc counter + ; Initialization vector + cmp edx, 0 + mov rcx, 0 + je L_AES_GCM_decrypt_avx512_calc_iv_done + cmp edx, 16 + jl L_AES_GCM_decrypt_avx512_calc_iv_lt16 + and edx, 4294967280 +L_AES_GCM_decrypt_avx512_calc_iv_16_loop: + vmovdqu xmm8, OWORD PTR [rax+rcx] + vpshufb xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_bswap_mask + vpxor xmm4, xmm4, xmm8 + ; ghash_gfmul_avx + vpshufd xmm1, xmm4, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpxor xmm1, xmm1, xmm4 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm7, xmm0 + vmovdqa xmm4, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm7, xmm7, xmm2 + vpxor xmm4, xmm4, xmm1 + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm4, xmm4, xmm1 + vpslld xmm0, xmm7, 31 + vpslld xmm1, xmm7, 30 + vpslld xmm2, xmm7, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm7, xmm7, xmm0 + vpsrld xmm2, xmm7, 1 + vpsrld xmm3, xmm7, 2 + vpsrld xmm0, xmm7, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm7 + vpxor xmm4, xmm4, xmm2 + add ecx, 16 + cmp ecx, edx + jl 
L_AES_GCM_decrypt_avx512_calc_iv_16_loop + mov edx, ebx + cmp ecx, edx + je L_AES_GCM_decrypt_avx512_calc_iv_done +L_AES_GCM_decrypt_avx512_calc_iv_lt16: + sub rsp, 16 + vpxor xmm8, xmm8, xmm8 + xor ebx, ebx + vmovdqu OWORD PTR [rsp], xmm8 +L_AES_GCM_decrypt_avx512_calc_iv_loop: + movzx r13d, BYTE PTR [rax+rcx] + mov BYTE PTR [rsp+rbx], r13b + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_decrypt_avx512_calc_iv_loop + vmovdqu xmm8, OWORD PTR [rsp] + add rsp, 16 + vpshufb xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_bswap_mask + vpxor xmm4, xmm4, xmm8 + ; ghash_gfmul_avx + vpshufd xmm1, xmm4, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpxor xmm1, xmm1, xmm4 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm7, xmm0 + vmovdqa xmm4, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm7, xmm7, xmm2 + vpxor xmm4, xmm4, xmm1 + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm4, xmm4, xmm1 + vpslld xmm0, xmm7, 31 + vpslld xmm1, xmm7, 30 + vpslld xmm2, xmm7, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm7, xmm7, xmm0 + vpsrld xmm2, xmm7, 1 + vpsrld xmm3, xmm7, 2 + vpsrld xmm0, xmm7, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm7 + vpxor xmm4, xmm4, xmm2 +L_AES_GCM_decrypt_avx512_calc_iv_done: + ; T = Encrypt counter + vpxor xmm0, xmm0, xmm0 + shl edx, 3 + vmovq xmm0, rdx + vpxor xmm4, xmm4, xmm0 + ; ghash_gfmul_avx + vpshufd xmm1, xmm4, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpxor xmm1, xmm1, xmm4 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, 
xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm7, xmm0 + vmovdqa xmm4, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm7, xmm7, xmm2 + vpxor xmm4, xmm4, xmm1 + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm4, xmm4, xmm1 + vpslld xmm0, xmm7, 31 + vpslld xmm1, xmm7, 30 + vpslld xmm2, xmm7, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm7, xmm7, xmm0 + vpsrld xmm2, xmm7, 1 + vpsrld xmm3, xmm7, 2 + vpsrld xmm0, xmm7, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm7 + vpxor xmm4, xmm4, xmm2 + vpshufb xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_bswap_mask + ; Encrypt counter + vmovdqa xmm8, OWORD PTR [r15] + vpxor xmm8, xmm8, xmm4 + vaesenc xmm8, xmm8, [r15+16] + vaesenc xmm8, xmm8, [r15+32] + vaesenc xmm8, xmm8, [r15+48] + vaesenc xmm8, xmm8, [r15+64] + vaesenc xmm8, xmm8, [r15+80] + vaesenc xmm8, xmm8, [r15+96] + vaesenc xmm8, xmm8, [r15+112] + vaesenc xmm8, xmm8, [r15+128] + vaesenc xmm8, xmm8, [r15+144] + cmp r10d, 11 + vmovdqa xmm9, OWORD PTR [r15+160] + jl L_AES_GCM_decrypt_avx512_calc_iv_2_aesenc_avx_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [r15+176] + cmp r10d, 13 + vmovdqa xmm9, OWORD PTR [r15+192] + jl L_AES_GCM_decrypt_avx512_calc_iv_2_aesenc_avx_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [r15+208] + vmovdqa xmm9, OWORD PTR [r15+224] +L_AES_GCM_decrypt_avx512_calc_iv_2_aesenc_avx_last: + vaesenclast xmm8, xmm8, xmm9 + vmovdqu OWORD PTR [rsp+1040], xmm8 +L_AES_GCM_decrypt_avx512_iv_done: + ; Additional authentication data + mov edx, r11d + cmp edx, 0 + je L_AES_GCM_decrypt_avx512_calc_aad_done + xor ecx, ecx + cmp edx, 16 + jl L_AES_GCM_decrypt_avx512_calc_aad_lt16 + and edx, 4294967280 
+L_AES_GCM_decrypt_avx512_calc_aad_16_loop: + vmovdqu xmm8, OWORD PTR [r12+rcx] + vpshufb xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_bswap_mask + vpxor xmm6, xmm6, xmm8 + ; ghash_gfmul_avx + vpshufd xmm1, xmm6, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm6, 17 + vpclmulqdq xmm0, xmm5, xmm6, 0 + vpxor xmm1, xmm1, xmm6 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm7, xmm0 + vmovdqa xmm6, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm7, xmm7, xmm2 + vpxor xmm6, xmm6, xmm1 + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm6, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm6, xmm6, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm6, xmm6, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm6, xmm6, xmm1 + vpslld xmm0, xmm7, 31 + vpslld xmm1, xmm7, 30 + vpslld xmm2, xmm7, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm7, xmm7, xmm0 + vpsrld xmm2, xmm7, 1 + vpsrld xmm3, xmm7, 2 + vpsrld xmm0, xmm7, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm7 + vpxor xmm6, xmm6, xmm2 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_decrypt_avx512_calc_aad_16_loop + mov edx, r11d + cmp ecx, edx + je L_AES_GCM_decrypt_avx512_calc_aad_done +L_AES_GCM_decrypt_avx512_calc_aad_lt16: + sub rsp, 16 + vpxor xmm8, xmm8, xmm8 + xor ebx, ebx + vmovdqu OWORD PTR [rsp], xmm8 +L_AES_GCM_decrypt_avx512_calc_aad_loop: + movzx r13d, BYTE PTR [r12+rcx] + mov BYTE PTR [rsp+rbx], r13b + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_decrypt_avx512_calc_aad_loop + vmovdqu xmm8, OWORD PTR [rsp] + add rsp, 16 + vpshufb xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_bswap_mask + vpxor xmm6, xmm6, xmm8 + ; ghash_gfmul_avx + vpshufd xmm1, xmm6, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm6, 17 + vpclmulqdq xmm0, xmm5, xmm6, 0 + vpxor xmm1, xmm1, xmm6 + vpxor xmm2, xmm2, 
xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm7, xmm0 + vmovdqa xmm6, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm7, xmm7, xmm2 + vpxor xmm6, xmm6, xmm1 + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm6, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm6, xmm6, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm6, xmm6, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm6, xmm6, xmm1 + vpslld xmm0, xmm7, 31 + vpslld xmm1, xmm7, 30 + vpslld xmm2, xmm7, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm7, xmm7, xmm0 + vpsrld xmm2, xmm7, 1 + vpsrld xmm3, xmm7, 2 + vpsrld xmm0, xmm7, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm7 + vpxor xmm6, xmm6, xmm2 +L_AES_GCM_decrypt_avx512_calc_aad_done: + ; Calculate counter and H + vpsrlq xmm9, xmm5, 63 + vpsllq xmm8, xmm5, 1 + vpslldq xmm9, xmm9, 8 + vpor xmm8, xmm8, xmm9 + vpshufd xmm5, xmm5, 255 + vpsrad xmm5, xmm5, 31 + vpshufb xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_bswap_epi64 + vpand xmm5, xmm5, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpaddd xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_one + vpxor xmm5, xmm5, xmm8 + vmovdqu OWORD PTR [rsp+1024], xmm4 + xor ebx, ebx + cmp r9d, 256 + jl L_AES_GCM_decrypt_avx512_done_128 + vmovdqa xmm2, xmm6 + ; H ^ 1 + vmovdqu OWORD PTR [rsp], xmm5 + ; H ^ 2 + vpclmulqdq xmm8, xmm5, xmm5, 0 + vpclmulqdq xmm11, xmm5, xmm5, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm0, xmm11 + vmovdqu OWORD PTR [rsp+16], xmm0 + ; H ^ 3 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm5, 78 + vpxor xmm9, xmm9, xmm5 + vpshufd xmm10, xmm0, 78 + vpxor xmm10, xmm10, xmm0 + 
vpclmulqdq xmm8, xmm0, xmm5, 0 + vpclmulqdq xmm11, xmm0, xmm5, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm1, xmm11 + vmovdqu OWORD PTR [rsp+32], xmm1 + ; H ^ 4 + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm3, xmm11 + vmovdqu OWORD PTR [rsp+48], xmm3 + ; H ^ 5 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+64], xmm7 + ; H ^ 6 + vpclmulqdq xmm8, xmm1, xmm1, 0 + vpclmulqdq xmm11, xmm1, xmm1, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+80], xmm7 + ; H ^ 7 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm1, 78 + vpxor xmm9, xmm9, xmm1 + vpshufd xmm10, xmm3, 78 + vpxor xmm10, xmm10, xmm3 + vpclmulqdq xmm8, xmm3, xmm1, 0 
+ vpclmulqdq xmm11, xmm3, xmm1, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+96], xmm7 + ; H ^ 8 + vpclmulqdq xmm8, xmm3, xmm3, 0 + vpclmulqdq xmm11, xmm3, xmm3, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+112], xmm7 + ; H ^ 9 + vmovdqu xmm0, OWORD PTR [rsp+48] + vmovdqu xmm1, OWORD PTR [rsp+64] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+128], xmm7 + ; H ^ 10 + vmovdqu xmm0, OWORD PTR [rsp+64] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+144], xmm7 + ; H ^ 11 + vmovdqu xmm0, OWORD PTR [rsp+64] + vmovdqu xmm1, OWORD PTR [rsp+80] + ; 
ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+160], xmm7 + ; H ^ 12 + vmovdqu xmm0, OWORD PTR [rsp+80] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+176], xmm7 + ; H ^ 13 + vmovdqu xmm0, OWORD PTR [rsp+80] + vmovdqu xmm1, OWORD PTR [rsp+96] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+192], xmm7 + ; H ^ 14 + vmovdqu xmm0, OWORD PTR [rsp+96] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, 
xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+208], xmm7 + ; H ^ 15 + vmovdqu xmm0, OWORD PTR [rsp+96] + vmovdqu xmm1, OWORD PTR [rsp+112] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+224], xmm7 + ; H ^ 16 + vmovdqu xmm0, OWORD PTR [rsp+112] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+240], xmm7 + cmp r9d, 512 + jl L_AES_GCM_decrypt_avx512_no_ext + ; H ^ 17 + vmovdqu xmm0, OWORD PTR [rsp+112] + vmovdqu xmm1, OWORD PTR [rsp+128] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+256], xmm7 + ; H ^ 18 + vmovdqu xmm0, OWORD PTR [rsp+128] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq 
xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+272], xmm7 + ; H ^ 19 + vmovdqu xmm0, OWORD PTR [rsp+128] + vmovdqu xmm1, OWORD PTR [rsp+144] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+288], xmm7 + ; H ^ 20 + vmovdqu xmm0, OWORD PTR [rsp+144] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+304], xmm7 + ; H ^ 21 + vmovdqu xmm0, OWORD PTR [rsp+144] + vmovdqu xmm1, OWORD PTR [rsp+160] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, 
xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+320], xmm7 + ; H ^ 22 + vmovdqu xmm0, OWORD PTR [rsp+160] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+336], xmm7 + ; H ^ 23 + vmovdqu xmm0, OWORD PTR [rsp+160] + vmovdqu xmm1, OWORD PTR [rsp+176] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+352], xmm7 + ; H ^ 24 + vmovdqu xmm0, OWORD PTR [rsp+176] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+368], xmm7 + ; H ^ 25 + vmovdqu xmm0, OWORD PTR [rsp+176] + vmovdqu xmm1, OWORD PTR [rsp+192] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + 
vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+384], xmm7 + ; H ^ 26 + vmovdqu xmm0, OWORD PTR [rsp+192] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+400], xmm7 + ; H ^ 27 + vmovdqu xmm0, OWORD PTR [rsp+192] + vmovdqu xmm1, OWORD PTR [rsp+208] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+416], xmm7 + ; H ^ 28 + vmovdqu xmm0, OWORD PTR [rsp+208] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+432], xmm7 + ; H ^ 29 + vmovdqu xmm0, OWORD PTR [rsp+208] + vmovdqu xmm1, OWORD PTR [rsp+224] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 
+ vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+448], xmm7 + ; H ^ 30 + vmovdqu xmm0, OWORD PTR [rsp+224] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+464], xmm7 + ; H ^ 31 + vmovdqu xmm0, OWORD PTR [rsp+224] + vmovdqu xmm1, OWORD PTR [rsp+240] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+480], xmm7 + ; H ^ 32 + vmovdqu xmm0, OWORD PTR [rsp+240] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 
+ vmovdqu OWORD PTR [rsp+496], xmm7 +L_AES_GCM_decrypt_avx512_no_ext: + vbroadcasti32x4 zmm22, ptr_L_avx512_aes_gcm_bswap_epi64 + vbroadcasti32x4 zmm30, ptr_L_avx512_aes_gcm_bswap_mask + vbroadcasti32x4 zmm31, ptr_L_avx512_aes_gcm_mod2_128 + vbroadcasti32x4 zmm9, [r15] + vbroadcasti32x4 zmm10, [r15+16] + vbroadcasti32x4 zmm11, [r15+32] + vbroadcasti32x4 zmm12, [r15+48] + vbroadcasti32x4 zmm13, [r15+64] + vbroadcasti32x4 zmm14, [r15+80] + vbroadcasti32x4 zmm15, [r15+96] + vbroadcasti32x4 zmm1, [r15+112] + vbroadcasti32x4 zmm2, [r15+128] + vbroadcasti32x4 zmm3, [r15+144] + cmp r9d, 512 + jl L_AES_GCM_decrypt_avx512_no_windows + mov r13d, r9d + and r13d, 4294966784 + vmovdqu64 zmm23, [rsp+448] + vshufi64x2 zmm23, zmm23, zmm23, 27 + vmovdqu64 zmm24, [rsp+384] + vshufi64x2 zmm24, zmm24, zmm24, 27 + vmovdqu64 zmm25, [rsp+320] + vshufi64x2 zmm25, zmm25, zmm25, 27 + vmovdqu64 zmm26, [rsp+256] + vshufi64x2 zmm26, zmm26, zmm26, 27 + vmovdqu64 [rsp+512], zmm23 + vmovdqu64 [rsp+576], zmm24 + vmovdqu64 [rsp+640], zmm25 + vmovdqu64 [rsp+704], zmm26 + vmovdqu64 zmm23, [rsp+192] + vshufi64x2 zmm23, zmm23, zmm23, 27 + vmovdqu64 zmm24, [rsp+128] + vshufi64x2 zmm24, zmm24, zmm24, 27 + vmovdqu64 zmm25, [rsp+64] + vshufi64x2 zmm25, zmm25, zmm25, 27 + vmovdqu64 zmm26, [rsp] + vshufi64x2 zmm26, zmm26, zmm26, 27 + vmovdqu64 [rsp+768], zmm23 + vmovdqu64 [rsp+832], zmm24 + vmovdqu64 [rsp+896], zmm25 + vmovdqu64 [rsp+960], zmm26 + ; 512 bytes of input + xor r12d, r12d + lea rax, QWORD PTR [rdi+rbx] + vpxorq zmm20, zmm20, zmm20 + vinserti32x4 zmm20, zmm20, xmm6, 0 + vmovdqu64 zmm23, [rsp+512] + vmovdqu64 zmm24, [rsp+576] + vmovdqu64 zmm25, [rsp+640] + vmovdqu64 zmm26, [rsp+704] + vmovdqu64 zmm21, [rax] + vpshufb zmm21, zmm21, zmm30 + vpxorq zmm21, zmm21, zmm20 + vpclmulqdq zmm16, zmm21, zmm23, 0 + vpclmulqdq zmm17, zmm21, zmm23, 1 + vpclmulqdq zmm18, zmm21, zmm23, 16 + vpclmulqdq zmm19, zmm21, zmm23, 17 + vmovdqa64 zmm27, zmm16 + vpxorq zmm28, zmm18, zmm17 + vmovdqa64 zmm29, zmm19 + vmovdqu64 
zmm21, [rax+64] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm24, 0 + vpclmulqdq zmm17, zmm21, zmm24, 1 + vpclmulqdq zmm18, zmm21, zmm24, 16 + vpclmulqdq zmm19, zmm21, zmm24, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rax+128] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm25, 0 + vpclmulqdq zmm17, zmm21, zmm25, 1 + vpclmulqdq zmm18, zmm21, zmm25, 16 + vpclmulqdq zmm19, zmm21, zmm25, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rax+192] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm26, 0 + vpclmulqdq zmm17, zmm21, zmm26, 1 + vpclmulqdq zmm18, zmm21, zmm26, 16 + vpclmulqdq zmm19, zmm21, zmm26, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm23, [rsp+768] + vmovdqu64 zmm24, [rsp+832] + vmovdqu64 zmm25, [rsp+896] + vmovdqu64 zmm26, [rsp+960] + vmovdqu64 zmm21, [rax+256] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm23, 0 + vpclmulqdq zmm17, zmm21, zmm23, 1 + vpclmulqdq zmm18, zmm21, zmm23, 16 + vpclmulqdq zmm19, zmm21, zmm23, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rax+320] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm24, 0 + vpclmulqdq zmm17, zmm21, zmm24, 1 + vpclmulqdq zmm18, zmm21, zmm24, 16 + vpclmulqdq zmm19, zmm21, zmm24, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rax+384] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm25, 0 + vpclmulqdq zmm17, zmm21, zmm25, 1 + vpclmulqdq zmm18, zmm21, zmm25, 16 + vpclmulqdq zmm19, zmm21, zmm25, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rax+448] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, 
zmm26, 0 + vpclmulqdq zmm17, zmm21, zmm26, 1 + vpclmulqdq zmm18, zmm21, zmm26, 16 + vpclmulqdq zmm19, zmm21, zmm26, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vpclmulqdq zmm21, zmm31, zmm27, 1 + vpshufd zmm27, zmm27, 78 + vpternlogq zmm28, zmm27, zmm21, 150 + vpclmulqdq zmm21, zmm31, zmm28, 1 + vpshufd zmm28, zmm28, 78 + vpternlogq zmm29, zmm28, zmm21, 150 + vextracti32x4 xmm0, zmm29, 1 + vextracti32x4 xmm4, zmm29, 2 + vextracti32x4 xmm5, zmm29, 3 + vpxorq xmm6, xmm29, xmm0 + vpternlogq xmm6, xmm5, xmm4, 150 + add ebx, 512 + cmp ebx, r13d + jge L_AES_GCM_decrypt_avx512_last_aes +L_AES_GCM_decrypt_avx512_win_loop: + lea rax, QWORD PTR [rdi+rbx] + vpxorq zmm21, zmm21, zmm21 + vinserti32x4 zmm21, zmm21, xmm6, 0 + vbroadcasti32x4 zmm20, [rsp+1024] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [rsp+1024] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [rsp+1024], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vmovdqu64 zmm31, [rax] + vpshufb zmm31, zmm31, zmm30 + vpxorq zmm31, zmm31, zmm21 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vpclmulqdq zmm23, zmm31, [rsp+512], 0 + vpclmulqdq zmm24, zmm31, [rsp+512], 1 + vpclmulqdq zmm25, zmm31, [rsp+512], 16 + vpclmulqdq zmm26, zmm31, [rsp+512], 17 + vmovdqa64 zmm27, zmm23 + vpxorq zmm28, zmm25, zmm24 + vmovdqa64 zmm29, zmm26 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vmovdqu64 zmm31, [rax+64] + vpshufb zmm31, zmm31, 
zmm30 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vpclmulqdq zmm23, zmm31, [rsp+576], 0 + vpclmulqdq zmm24, zmm31, [rsp+576], 1 + vpclmulqdq zmm25, zmm31, [rsp+576], 16 + vpclmulqdq zmm26, zmm31, [rsp+576], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vmovdqu64 zmm31, [rax+128] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vpclmulqdq zmm23, zmm31, [rsp+640], 0 + vpclmulqdq zmm24, zmm31, [rsp+640], 1 + vpclmulqdq zmm25, zmm31, [rsp+640], 16 + vpclmulqdq zmm26, zmm31, [rsp+640], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vmovdqu64 zmm31, [rax+192] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vpclmulqdq zmm23, zmm31, [rsp+704], 0 + vpclmulqdq zmm24, zmm31, [rsp+704], 1 + vpclmulqdq zmm25, zmm31, [rsp+704], 16 + vpclmulqdq zmm26, zmm31, [rsp+704], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r10d, 11 + vbroadcasti32x4 zmm20, [r15+160] + jl L_AES_GCM_decrypt_avx512_a_il_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+176] + vaesenc zmm16, zmm16, 
zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r10d, 13 + vbroadcasti32x4 zmm20, [r15+192] + jl L_AES_GCM_decrypt_avx512_a_il_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+224] +L_AES_GCM_decrypt_avx512_a_il_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [rdi+r12] + lea rdx, QWORD PTR [rsi+r12] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + add r12d, 256 + vbroadcasti32x4 zmm20, [rsp+1024] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [rsp+1024] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [rsp+1024], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vmovdqu64 zmm31, [rax+256] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vpclmulqdq zmm23, zmm31, [rsp+768], 0 + vpclmulqdq zmm24, zmm31, [rsp+768], 1 + vpclmulqdq zmm25, zmm31, [rsp+768], 16 + vpclmulqdq 
zmm26, zmm31, [rsp+768], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vmovdqu64 zmm31, [rax+320] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vpclmulqdq zmm23, zmm31, [rsp+832], 0 + vpclmulqdq zmm24, zmm31, [rsp+832], 1 + vpclmulqdq zmm25, zmm31, [rsp+832], 16 + vpclmulqdq zmm26, zmm31, [rsp+832], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vmovdqu64 zmm31, [rax+384] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vpclmulqdq zmm23, zmm31, [rsp+896], 0 + vpclmulqdq zmm24, zmm31, [rsp+896], 1 + vpclmulqdq zmm25, zmm31, [rsp+896], 16 + vpclmulqdq zmm26, zmm31, [rsp+896], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vmovdqu64 zmm31, [rax+448] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vpclmulqdq zmm23, zmm31, [rsp+960], 0 + vpclmulqdq zmm24, zmm31, [rsp+960], 1 + vpclmulqdq zmm25, zmm31, [rsp+960], 16 + vpclmulqdq zmm26, zmm31, [rsp+960], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + 
vaesenc zmm19, zmm19, zmm3 + cmp r10d, 11 + vbroadcasti32x4 zmm20, [r15+160] + jl L_AES_GCM_decrypt_avx512_b_il_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r10d, 13 + vbroadcasti32x4 zmm20, [r15+192] + jl L_AES_GCM_decrypt_avx512_b_il_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+224] +L_AES_GCM_decrypt_avx512_b_il_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [rdi+r12] + lea rdx, QWORD PTR [rsi+r12] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + add r12d, 256 + vbroadcasti32x4 zmm31, ptr_L_avx512_aes_gcm_mod2_128 + vpclmulqdq zmm23, zmm31, zmm27, 1 + vpshufd zmm27, zmm27, 78 + vpternlogq zmm28, zmm27, zmm23, 150 + vpclmulqdq zmm23, zmm31, zmm28, 1 + vpshufd zmm28, zmm28, 78 + vpternlogq zmm29, zmm28, zmm23, 150 + vextracti32x4 xmm0, zmm29, 1 + vextracti32x4 xmm4, zmm29, 2 + vextracti32x4 xmm5, zmm29, 3 + vpxorq xmm6, xmm29, xmm0 + vpternlogq xmm6, xmm5, xmm4, 150 + add ebx, 512 + cmp ebx, r13d + jl L_AES_GCM_decrypt_avx512_win_loop +L_AES_GCM_decrypt_avx512_last_aes: + vbroadcasti32x4 zmm20, [rsp+1024] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb 
zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [rsp+1024] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [rsp+1024], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r10d, 11 + vbroadcasti32x4 zmm20, [r15+160] + jl L_AES_GCM_decrypt_avx512_l1_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r10d, 13 + vbroadcasti32x4 zmm20, [r15+192] 
+ jl L_AES_GCM_decrypt_avx512_l1_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+224] +L_AES_GCM_decrypt_avx512_l1_avx512_ctr16_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [rdi+r12] + lea rdx, QWORD PTR [rsi+r12] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + add r12d, 256 + vbroadcasti32x4 zmm20, [rsp+1024] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [rsp+1024] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [rsp+1024], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, 
zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r10d, 11 + vbroadcasti32x4 zmm20, [r15+160] + jl L_AES_GCM_decrypt_avx512_l2_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r10d, 13 + vbroadcasti32x4 zmm20, [r15+192] + jl L_AES_GCM_decrypt_avx512_l2_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+224] +L_AES_GCM_decrypt_avx512_l2_avx512_ctr16_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [rdi+r12] + lea rdx, QWORD PTR [rsi+r12] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], 
zmm19 + add r12d, 256 +L_AES_GCM_decrypt_avx512_no_windows: + vmovdqu64 zmm23, [rsp+192] + vshufi64x2 zmm23, zmm23, zmm23, 27 + vmovdqu64 zmm24, [rsp+128] + vshufi64x2 zmm24, zmm24, zmm24, 27 + vmovdqu64 zmm25, [rsp+64] + vshufi64x2 zmm25, zmm25, zmm25, 27 + vmovdqu64 zmm26, [rsp] + vshufi64x2 zmm26, zmm26, zmm26, 27 + mov r13d, r9d + and r13d, 4294967040 + cmp ebx, r13d + jge L_AES_GCM_decrypt_avx512_after_256 + ; 256 bytes of input + lea rax, QWORD PTR [rdi+rbx] + vpxorq zmm20, zmm20, zmm20 + vinserti32x4 zmm20, zmm20, xmm6, 0 + vmovdqu64 zmm21, [rax] + vpshufb zmm21, zmm21, zmm30 + vpxorq zmm21, zmm21, zmm20 + vpclmulqdq zmm16, zmm21, zmm23, 0 + vpclmulqdq zmm17, zmm21, zmm23, 1 + vpclmulqdq zmm18, zmm21, zmm23, 16 + vpclmulqdq zmm19, zmm21, zmm23, 17 + vmovdqa64 zmm27, zmm16 + vpxorq zmm28, zmm18, zmm17 + vmovdqa64 zmm29, zmm19 + vmovdqu64 zmm21, [rax+64] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm24, 0 + vpclmulqdq zmm17, zmm21, zmm24, 1 + vpclmulqdq zmm18, zmm21, zmm24, 16 + vpclmulqdq zmm19, zmm21, zmm24, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rax+128] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm25, 0 + vpclmulqdq zmm17, zmm21, zmm25, 1 + vpclmulqdq zmm18, zmm21, zmm25, 16 + vpclmulqdq zmm19, zmm21, zmm25, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rax+192] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm26, 0 + vpclmulqdq zmm17, zmm21, zmm26, 1 + vpclmulqdq zmm18, zmm21, zmm26, 16 + vpclmulqdq zmm19, zmm21, zmm26, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vpclmulqdq zmm21, zmm31, zmm27, 1 + vpshufd zmm27, zmm27, 78 + vpternlogq zmm28, zmm27, zmm21, 150 + vpclmulqdq zmm21, zmm31, zmm28, 1 + vpshufd zmm28, zmm28, 78 + vpternlogq zmm29, zmm28, zmm21, 150 + vextracti32x4 xmm0, zmm29, 1 + vextracti32x4 
xmm4, zmm29, 2 + vextracti32x4 xmm5, zmm29, 3 + vpxorq xmm6, xmm29, xmm0 + vpternlogq xmm6, xmm5, xmm4, 150 + vbroadcasti32x4 zmm20, [rsp+1024] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [rsp+1024] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [rsp+1024], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r10d, 11 + vbroadcasti32x4 zmm20, [r15+160] + jl L_AES_GCM_decrypt_avx512_t_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + 
vbroadcasti32x4 zmm20, [r15+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r10d, 13 + vbroadcasti32x4 zmm20, [r15+192] + jl L_AES_GCM_decrypt_avx512_t_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [r15+224] +L_AES_GCM_decrypt_avx512_t_avx512_ctr16_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [rdi+rbx] + lea rdx, QWORD PTR [rsi+rbx] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + add ebx, 256 +L_AES_GCM_decrypt_avx512_after_256: + vmovdqu xmm5, OWORD PTR [rsp] +L_AES_GCM_decrypt_avx512_done_128: + mov edx, r9d + cmp ebx, edx + jge L_AES_GCM_decrypt_avx512_done_dec + mov r13d, r9d + and r13d, 4294967280 + cmp ebx, r13d + jge L_AES_GCM_decrypt_avx512_last_block_done +L_AES_GCM_decrypt_avx512_last_block_start: + vmovdqu xmm13, OWORD PTR [rdi+rbx] + vmovdqa xmm0, xmm5 + vpshufb xmm1, xmm13, OWORD PTR L_avx512_aes_gcm_bswap_mask + vpxor xmm1, xmm1, xmm6 + vmovdqu xmm9, OWORD PTR [rsp+1024] + vpshufb xmm8, xmm9, OWORD PTR L_avx512_aes_gcm_bswap_epi64 + vpaddd xmm9, xmm9, OWORD PTR L_avx512_aes_gcm_one + vmovdqu OWORD PTR [rsp+1024], xmm9 + vpxor xmm8, xmm8, [r15] + vpclmulqdq xmm10, xmm1, xmm0, 16 + vaesenc xmm8, xmm8, [r15+16] + vaesenc xmm8, xmm8, [r15+32] + vpclmulqdq xmm11, xmm1, xmm0, 1 + vaesenc xmm8, xmm8, 
[r15+48] + vaesenc xmm8, xmm8, [r15+64] + vpclmulqdq xmm12, xmm1, xmm0, 0 + vaesenc xmm8, xmm8, [r15+80] + vpclmulqdq xmm1, xmm1, xmm0, 17 + vaesenc xmm8, xmm8, [r15+96] + vpxor xmm10, xmm10, xmm11 + vpslldq xmm2, xmm10, 8 + vpsrldq xmm10, xmm10, 8 + vaesenc xmm8, xmm8, [r15+112] + vpxor xmm2, xmm2, xmm12 + vpxor xmm3, xmm1, xmm10 + vmovdqa xmm0, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm2, xmm0, 16 + vaesenc xmm8, xmm8, [r15+128] + vpshufd xmm10, xmm2, 78 + vpxor xmm10, xmm10, xmm11 + vpclmulqdq xmm11, xmm10, xmm0, 16 + vaesenc xmm8, xmm8, [r15+144] + vpshufd xmm10, xmm10, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm6, xmm10, xmm3 + cmp r10d, 11 + vmovdqa xmm9, OWORD PTR [r15+160] + jl L_AES_GCM_decrypt_avx512_aesenc_gfmul_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [r15+176] + cmp r10d, 13 + vmovdqa xmm9, OWORD PTR [r15+192] + jl L_AES_GCM_decrypt_avx512_aesenc_gfmul_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [r15+208] + vmovdqa xmm9, OWORD PTR [r15+224] +L_AES_GCM_decrypt_avx512_aesenc_gfmul_last: + vaesenclast xmm8, xmm8, xmm9 + vmovdqa xmm0, xmm13 + vpxor xmm8, xmm8, xmm0 + vmovdqu OWORD PTR [rsi+rbx], xmm8 + add ebx, 16 + cmp ebx, r13d + jl L_AES_GCM_decrypt_avx512_last_block_start +L_AES_GCM_decrypt_avx512_last_block_done: + mov ecx, r9d + mov edx, ecx + and ecx, 15 + jz L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_done + vmovdqu xmm4, OWORD PTR [rsp+1024] + vpshufb xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_bswap_epi64 + vpxor xmm4, xmm4, [r15] + vaesenc xmm4, xmm4, [r15+16] + vaesenc xmm4, xmm4, [r15+32] + vaesenc xmm4, xmm4, [r15+48] + vaesenc xmm4, xmm4, [r15+64] + vaesenc xmm4, xmm4, [r15+80] + vaesenc xmm4, xmm4, [r15+96] + vaesenc xmm4, xmm4, [r15+112] + vaesenc xmm4, xmm4, [r15+128] + vaesenc xmm4, xmm4, [r15+144] + cmp r10d, 11 + vmovdqa xmm9, OWORD PTR [r15+160] + jl L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_aesenc_avx_last + vaesenc xmm4, xmm4, xmm9 + vaesenc xmm4, xmm4, [r15+176] + cmp r10d, 13 + vmovdqa 
xmm9, OWORD PTR [r15+192] + jl L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_aesenc_avx_last + vaesenc xmm4, xmm4, xmm9 + vaesenc xmm4, xmm4, [r15+208] + vmovdqa xmm9, OWORD PTR [r15+224] +L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_aesenc_avx_last: + vaesenclast xmm4, xmm4, xmm9 + sub rsp, 32 + xor ecx, ecx + vmovdqu OWORD PTR [rsp], xmm4 + vpxor xmm0, xmm0, xmm0 + vmovdqu OWORD PTR [rsp+16], xmm0 +L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_loop: + movzx r13d, BYTE PTR [rdi+rbx] + mov BYTE PTR [rsp+rcx+16], r13b + xor r13b, BYTE PTR [rsp+rcx] + mov BYTE PTR [rsi+rbx], r13b + inc ebx + inc ecx + cmp ebx, edx + jl L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_loop + vmovdqu xmm4, OWORD PTR [rsp+16] + add rsp, 32 + vpshufb xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_bswap_mask + vpxor xmm6, xmm6, xmm4 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm5, 78 + vpxor xmm9, xmm9, xmm5 + vpshufd xmm10, xmm6, 78 + vpxor xmm10, xmm10, xmm6 + vpclmulqdq xmm8, xmm6, xmm5, 0 + vpclmulqdq xmm11, xmm6, xmm5, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm6, xmm11 +L_AES_GCM_decrypt_avx512_aesenc_last15_dec_avx_done: +L_AES_GCM_decrypt_avx512_done_dec: + mov edx, r9d + mov ecx, r11d + shl rdx, 3 + shl rcx, 3 + vmovq xmm0, rdx + vmovq xmm1, rcx + vpunpcklqdq xmm0, xmm0, xmm1 + vpxor xmm6, xmm6, xmm0 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm5, 78 + vpxor xmm9, xmm9, xmm5 + vpshufd xmm10, xmm6, 78 + vpxor xmm10, xmm10, xmm6 + vpclmulqdq xmm8, xmm6, xmm5, 0 + vpclmulqdq xmm11, xmm6, xmm5, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 
150
+ vpclmulqdq xmm12, xmm10, xmm9, 1
+ vpshufd xmm9, xmm9, 78
+ vpternlogq xmm11, xmm9, xmm12, 150
+ vmovdqa xmm6, xmm11
+ vpshufb xmm6, xmm6, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ vmovdqu xmm0, OWORD PTR [rsp+1040]
+ vpxor xmm0, xmm0, xmm6
+ cmp r14d, 16
+ je L_AES_GCM_decrypt_avx512_cmp_tag_16
+ sub rsp, 16
+ xor rcx, rcx
+ xor rbx, rbx
+ vmovdqu OWORD PTR [rsp], xmm0
+L_AES_GCM_decrypt_avx512_cmp_tag_loop:
+ movzx r13d, BYTE PTR [rsp+rcx]
+ xor r13b, BYTE PTR [r8+rcx]
+ or bl, r13b
+ inc ecx
+ cmp ecx, r14d
+ jne L_AES_GCM_decrypt_avx512_cmp_tag_loop
+ cmp bl, 0
+ sete bl
+ add rsp, 16
+ xor rcx, rcx
+ jmp L_AES_GCM_decrypt_avx512_cmp_tag_done
+L_AES_GCM_decrypt_avx512_cmp_tag_16:
+ vmovdqu xmm1, OWORD PTR [r8]
+ vpcmpeqb xmm0, xmm0, xmm1
+ vpmovmskb rdx, xmm0
+ ; %%edx == 0xFFFF then return 1 else => return 0
+ xor ebx, ebx
+ cmp edx, 65535
+ sete bl
+L_AES_GCM_decrypt_avx512_cmp_tag_done:
+ mov DWORD PTR [rbp], ebx
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp+1056]
+ vmovdqu xmm7, OWORD PTR [rsp+1072]
+ vmovdqu xmm8, OWORD PTR [rsp+1088]
+ vmovdqu xmm9, OWORD PTR [rsp+1104]
+ vmovdqu xmm10, OWORD PTR [rsp+1120]
+ vmovdqu xmm11, OWORD PTR [rsp+1136]
+ vmovdqu xmm12, OWORD PTR [rsp+1152]
+ vmovdqu xmm13, OWORD PTR [rsp+1168]
+ vmovdqu xmm14, OWORD PTR [rsp+1184]
+ vmovdqu xmm15, OWORD PTR [rsp+1200]
+ add rsp, 1216
+ pop rbp
+ pop r15
+ pop r14
+ pop rbx
+ pop r12
+ pop rsi
+ pop rdi
+ pop r13
+ ret
+AES_GCM_decrypt_avx512 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+; AES_GCM_init_avx512
+; Derives the per-message GCM values from the AES round keys and the IV.
+;
+; Arguments as read by the prologue (the register/stack layout is consistent
+; with the Microsoft x64 calling convention):
+;   rcx       -> round keys; read at offsets 0..224, partly with vmovdqa, so
+;                the key schedule must be 16-byte aligned
+;   edx       round count; compared against 11 and 13 to decide whether the
+;                final round key is the one at offset 160, 192 or 224
+;   r8        -> IV bytes
+;   r9d       IV length in bytes
+;   [rsp+40]  -> 16-byte output that receives H (read at [rsp+72] after the
+;                four pushes)
+;   [rsp+48]  -> 16-byte output that receives the next counter value
+;   [rsp+56]  -> 16-byte output that receives the encrypted initial counter
+; The three outputs are written with vmovdqa and must be 16-byte aligned.
+; xmm6, xmm7, xmm8 and xmm15 are saved on entry and restored before ret;
+; rdi, rsi, r12 and r13 are pushed and popped.
+AES_GCM_init_avx512 PROC
+ push rdi
+ push rsi
+ push r12
+ push r13
+ ; Move the register arguments out of the way: rdi = keys, rsi = rounds,
+ ; r10 = IV, r11d = IV length; rax/r8/r9 = the three output pointers.
+ mov rdi, rcx
+ mov rsi, rdx
+ mov r10, r8
+ mov r11d, r9d
+ mov rax, QWORD PTR [rsp+72]
+ mov r8, QWORD PTR [rsp+80]
+ mov r9, QWORD PTR [rsp+88]
+ sub rsp, 80
+ vmovdqu OWORD PTR [rsp+16], xmm6
+ vmovdqu OWORD PTR [rsp+32], xmm7
+ vmovdqu OWORD PTR [rsp+48], xmm8
+ vmovdqu OWORD PTR [rsp+64], xmm15
+ ; xmm4 is the running hash / counter block; it starts at zero.
+ vpxor xmm4, xmm4, xmm4
+ mov edx, r11d
+ cmp edx, 12
+ jne L_AES_GCM_init_avx512_iv_not_12
+ ; # Calculate values when IV is 12 bytes
+ ; Set counter based on IV
+ ; 16777216 = 0x01000000; as dword 3 it is the bytes 00 00 00 01 in memory order.
+ mov ecx, 16777216
+ vmovq xmm4, QWORD PTR [r10]
+ vpinsrd xmm4, xmm4, DWORD PTR [r10+8], 2
+ vpinsrd xmm4, xmm4, ecx, 3
+ ; H = Encrypt X(=0) and T = Encrypt counter
+ ; xmm5 starts as round key 0 (zero block xor key) and xmm1 as counter xor
+ ; round key 0; both blocks then go through the same rounds side by side.
+ vmovdqa xmm5, OWORD PTR [rdi]
+ vpxor xmm1, xmm4, xmm5
+ vmovdqa xmm6, OWORD PTR [rdi+16]
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ vmovdqa xmm6, OWORD PTR [rdi+32]
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ vmovdqa xmm6, OWORD PTR [rdi+48]
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ vmovdqa xmm6, OWORD PTR [rdi+64]
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ vmovdqa xmm6, OWORD PTR [rdi+80]
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ vmovdqa xmm6, OWORD PTR [rdi+96]
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ vmovdqa xmm6, OWORD PTR [rdi+112]
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ vmovdqa xmm6, OWORD PTR [rdi+128]
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ vmovdqa xmm6, OWORD PTR [rdi+144]
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ ; The next key is loaded before the branch so that xmm6 always holds the
+ ; final round key when control reaches the vaesenclast pair.
+ cmp esi, 11
+ vmovdqa xmm6, OWORD PTR [rdi+160]
+ jl L_AES_GCM_init_avx512_calc_iv_12_last
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ vmovdqa xmm6, OWORD PTR [rdi+176]
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ cmp esi, 13
+ vmovdqa xmm6, OWORD PTR [rdi+192]
+ jl L_AES_GCM_init_avx512_calc_iv_12_last
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ vmovdqa xmm6, OWORD PTR [rdi+208]
+ vaesenc xmm5, xmm5, xmm6
+ vaesenc xmm1, xmm1, xmm6
+ vmovdqa xmm6, OWORD PTR [rdi+224]
+L_AES_GCM_init_avx512_calc_iv_12_last:
+ vaesenclast xmm5, xmm5, xmm6
+ vaesenclast xmm1, xmm1, xmm6
+ vpshufb xmm5, xmm5, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ ; xmm15 carries the encrypted initial counter to the common exit.
+ vmovdqu xmm15, xmm1
+ jmp L_AES_GCM_init_avx512_iv_done
+L_AES_GCM_init_avx512_iv_not_12:
+ ; Calculate values when IV is not 12 bytes
+ ; H = Encrypt X(=0)
+ vmovdqa xmm5, OWORD PTR [rdi]
+ vaesenc xmm5, xmm5, [rdi+16]
+ vaesenc xmm5, xmm5, [rdi+32]
+ vaesenc xmm5, xmm5, [rdi+48]
+ vaesenc xmm5, xmm5, [rdi+64]
+ vaesenc xmm5, xmm5, [rdi+80]
+ vaesenc xmm5, xmm5, [rdi+96]
+ vaesenc xmm5, xmm5, [rdi+112]
+ vaesenc xmm5, xmm5, [rdi+128]
+ vaesenc xmm5, xmm5, [rdi+144]
+ cmp esi, 11
+ vmovdqa xmm8, OWORD PTR [rdi+160]
+ jl L_AES_GCM_init_avx512_calc_iv_1_aesenc_avx_last
+ vaesenc xmm5, xmm5, xmm8
+ vaesenc xmm5, xmm5, [rdi+176]
+ cmp esi, 13
+ vmovdqa xmm8, OWORD PTR [rdi+192]
+ jl L_AES_GCM_init_avx512_calc_iv_1_aesenc_avx_last
+ vaesenc xmm5, xmm5, xmm8
+ vaesenc xmm5, xmm5, [rdi+208]
+ vmovdqa xmm8, OWORD PTR [rdi+224]
+L_AES_GCM_init_avx512_calc_iv_1_aesenc_avx_last:
+ vaesenclast xmm5, xmm5, xmm8
+ vpshufb xmm5, xmm5, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ ; Calc counter
+ ; Initialization vector
+ ; rcx = byte offset into the IV (mov does not disturb the flags from cmp).
+ cmp edx, 0
+ mov rcx, 0
+ je L_AES_GCM_init_avx512_calc_iv_done
+ cmp edx, 16
+ jl L_AES_GCM_init_avx512_calc_iv_lt16
+ ; Round the length down to a multiple of 16 for the full-block loop.
+ and edx, 4294967280
+L_AES_GCM_init_avx512_calc_iv_16_loop:
+ vmovdqu xmm7, OWORD PTR [r10+rcx]
+ vpshufb xmm7, xmm7, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ vpxor xmm4, xmm4, xmm7
+ ; ghash_gfmul_avx
+ ; xmm4 = xmm4 * xmm5: three carry-less multiplies (high halves, low halves,
+ ; and the xor-ed halves), a one-bit left shift of the 256-bit product, then
+ ; the low half is folded into the high half with shifts by 31/30/25 and
+ ; 1/2/7 (presumably the GCM polynomial reduction).
+ vpshufd xmm1, xmm4, 78
+ vpshufd xmm2, xmm5, 78
+ vpclmulqdq xmm3, xmm5, xmm4, 17
+ vpclmulqdq xmm0, xmm5, xmm4, 0
+ vpxor xmm1, xmm1, xmm4
+ vpxor xmm2, xmm2, xmm5
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vmovdqa xmm6, xmm0
+ vmovdqa xmm4, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm6, xmm6, xmm2
+ vpxor xmm4, xmm4, xmm1
+ vpsrld xmm0, xmm6, 31
+ vpsrld xmm1, xmm4, 31
+ vpslld xmm6, xmm6, 1
+ vpslld xmm4, xmm4, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm4, xmm4, xmm2
+ vpor xmm6, xmm6, xmm0
+ vpor xmm4, xmm4, xmm1
+ vpslld xmm0, xmm6, 31
+ vpslld xmm1, xmm6, 30
+ vpslld xmm2, xmm6, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm6, xmm6, xmm0
+ vpsrld xmm2, xmm6, 1
+ vpsrld xmm3, xmm6, 2
+ vpsrld xmm0, xmm6, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm6
+ vpxor xmm4, xmm4, xmm2
+ add ecx, 16
+ cmp ecx, edx
+ jl L_AES_GCM_init_avx512_calc_iv_16_loop
+ ; Restore the full length; finished if no partial block remains.
+ mov edx, r11d
+ cmp ecx, edx
+ je L_AES_GCM_init_avx512_calc_iv_done
+L_AES_GCM_init_avx512_calc_iv_lt16:
+ ; Copy the remaining IV bytes into a zeroed 16-byte stack buffer so the
+ ; final partial block is zero padded; r13d indexes the buffer.
+ sub rsp, 16
+ vpxor xmm7, xmm7, xmm7
+ xor r13d, r13d
+ vmovdqu OWORD PTR [rsp], xmm7
+L_AES_GCM_init_avx512_calc_iv_loop:
+ movzx r12d, BYTE PTR [r10+rcx]
+ mov BYTE PTR [rsp+r13], r12b
+ inc ecx
+ inc r13d
+ cmp ecx, edx
+ jl L_AES_GCM_init_avx512_calc_iv_loop
+ vmovdqu xmm7, OWORD PTR [rsp]
+ add rsp, 16
+ vpshufb xmm7, xmm7, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ vpxor xmm4, xmm4, xmm7
+ ; ghash_gfmul_avx
+ vpshufd xmm1, xmm4, 78
+ vpshufd xmm2, xmm5, 78
+ vpclmulqdq xmm3, xmm5, xmm4, 17
+ vpclmulqdq xmm0, xmm5, xmm4, 0
+ vpxor xmm1, xmm1, xmm4
+ vpxor xmm2, xmm2, xmm5
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vmovdqa xmm6, xmm0
+ vmovdqa xmm4, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm6, xmm6, xmm2
+ vpxor xmm4, xmm4, xmm1
+ vpsrld xmm0, xmm6, 31
+ vpsrld xmm1, xmm4, 31
+ vpslld xmm6, xmm6, 1
+ vpslld xmm4, xmm4, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm4, xmm4, xmm2
+ vpor xmm6, xmm6, xmm0
+ vpor xmm4, xmm4, xmm1
+ vpslld xmm0, xmm6, 31
+ vpslld xmm1, xmm6, 30
+ vpslld xmm2, xmm6, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm6, xmm6, xmm0
+ vpsrld xmm2, xmm6, 1
+ vpsrld xmm3, xmm6, 2
+ vpsrld xmm0, xmm6, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm6
+ vpxor xmm4, xmm4, xmm2
+L_AES_GCM_init_avx512_calc_iv_done:
+ ; T = Encrypt counter
+ ; Xor the IV length in bits (edx * 8) into the low quadword of the hash,
+ ; then multiply by xmm5 once more.
+ vpxor xmm0, xmm0, xmm0
+ shl edx, 3
+ vmovq xmm0, rdx
+ vpxor xmm4, xmm4, xmm0
+ ; ghash_gfmul_avx
+ vpshufd xmm1, xmm4, 78
+ vpshufd xmm2, xmm5, 78
+ vpclmulqdq xmm3, xmm5, xmm4, 17
+ vpclmulqdq xmm0, xmm5, xmm4, 0
+ vpxor xmm1, xmm1, xmm4
+ vpxor xmm2, xmm2, xmm5
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vmovdqa xmm6, xmm0
+ vmovdqa xmm4, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm6, xmm6, xmm2
+ vpxor xmm4, xmm4, xmm1
+ vpsrld xmm0, xmm6, 31
+ vpsrld xmm1, xmm4, 31
+ vpslld xmm6, xmm6, 1
+ vpslld xmm4, xmm4, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm4, xmm4, xmm2
+ vpor xmm6, xmm6, xmm0
+ vpor xmm4, xmm4, xmm1
+ vpslld xmm0, xmm6, 31
+ vpslld xmm1, xmm6, 30
+ vpslld xmm2, xmm6, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm6, xmm6, xmm0
+ vpsrld xmm2, xmm6, 1
+ vpsrld xmm3, xmm6, 2
+ vpsrld xmm0, xmm6, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm6
+ vpxor xmm4, xmm4, xmm2
+ ; Undo the byte shuffle; xmm4 is now the counter block that gets encrypted.
+ vpshufb xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ ; Encrypt counter
+ vmovdqa xmm7, OWORD PTR [rdi]
+ vpxor xmm7, xmm7, xmm4
+ vaesenc xmm7, xmm7, [rdi+16]
+ vaesenc xmm7, xmm7, [rdi+32]
+ vaesenc xmm7, xmm7, [rdi+48]
+ vaesenc xmm7, xmm7, [rdi+64]
+ vaesenc xmm7, xmm7, [rdi+80]
+ vaesenc xmm7, xmm7, [rdi+96]
+ vaesenc xmm7, xmm7, [rdi+112]
+ vaesenc xmm7, xmm7, [rdi+128]
+ vaesenc xmm7, xmm7, [rdi+144]
+ cmp esi, 11
+ vmovdqa xmm8, OWORD PTR [rdi+160]
+ jl L_AES_GCM_init_avx512_calc_iv_2_aesenc_avx_last
+ vaesenc xmm7, xmm7, xmm8
+ vaesenc xmm7, xmm7, [rdi+176]
+ cmp esi, 13
+ vmovdqa xmm8, OWORD PTR [rdi+192]
+ jl L_AES_GCM_init_avx512_calc_iv_2_aesenc_avx_last
+ vaesenc xmm7, xmm7, xmm8
+ vaesenc xmm7, xmm7, [rdi+208]
+ vmovdqa xmm8, OWORD PTR [rdi+224]
+L_AES_GCM_init_avx512_calc_iv_2_aesenc_avx_last:
+ vaesenclast xmm7, xmm7, xmm8
+ vmovdqu xmm15, xmm7
+L_AES_GCM_init_avx512_iv_done:
+ ; Common exit for both IV paths: [r9] = encrypted initial counter,
+ ; [rax] = H, [r8] = counter block shuffled with bswap_epi64 plus
+ ; L_avx512_aes_gcm_one.
+ vmovdqa OWORD PTR [r9], xmm15
+ vpshufb xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_bswap_epi64
+ vpaddd xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_one
+ vmovdqa OWORD PTR [rax], xmm5
+ vmovdqa OWORD PTR [r8], xmm4
+ vmovdqu xmm6, OWORD PTR [rsp+16]
+ vmovdqu xmm7, OWORD PTR [rsp+32]
+ vmovdqu xmm8, OWORD PTR [rsp+48]
+ vmovdqu xmm15, OWORD PTR [rsp+64]
+ add rsp, 80
+ pop r13
+ pop r12
+ pop rsi
+ pop rdi
+ ret
+AES_GCM_init_avx512 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+; AES_GCM_aad_update_avx512
+; Folds a run of 16-byte blocks into a 16-byte hash state.
+;   rcx  -> input data, read 16 bytes at a time with unaligned loads
+;   edx  byte count; the offset advances by 16 and the loop repeats while
+;        the offset is less than edx (signed compare)
+;   r8   -> 16-byte state; loaded and stored with vmovdqa (16-byte aligned)
+;   r9   -> 16-byte multiplier, presumably the hash key H; loaded with
+;        vmovdqa (16-byte aligned)
+; The loop is bottom-tested, so one block is always read even if edx <= 0.
+; NOTE(review): presumably callers pass a non-zero multiple of 16 - confirm.
+; xmm6 and xmm7 are saved and restored; xmm0-xmm5 are overwritten.
+AES_GCM_aad_update_avx512 PROC
+ ; rcx is reused as the byte offset, so the data pointer moves to rax.
+ mov rax, rcx
+ sub rsp, 32
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqa xmm5, OWORD PTR [r8]
+ vmovdqa xmm6, OWORD PTR [r9]
+ xor ecx, ecx
+L_AES_GCM_aad_update_avx512_16_loop:
+ vmovdqu xmm7, OWORD PTR [rax+rcx]
+ vpshufb xmm7, xmm7, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ vpxor xmm5, xmm5, xmm7
+ ; ghash_gfmul_avx
+ ; xmm5 = xmm5 * xmm6: three carry-less multiplies (high halves, low halves,
+ ; and the xor-ed halves), a one-bit left shift of the 256-bit product, then
+ ; the low half (xmm4) is folded into the high half (xmm5).
+ vpshufd xmm1, xmm5, 78
+ vpshufd xmm2, xmm6, 78
+ vpclmulqdq xmm3, xmm6, xmm5, 17
+ vpclmulqdq xmm0, xmm6, xmm5, 0
+ vpxor xmm1, xmm1, xmm5
+ vpxor xmm2, xmm2, xmm6
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vmovdqa xmm4, xmm0
+ vmovdqa xmm5, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm4, xmm4, xmm2
+ vpxor xmm5, xmm5, xmm1
+ vpsrld xmm0, xmm4, 31
+ vpsrld xmm1, xmm5, 31
+ vpslld xmm4, xmm4, 1
+ vpslld xmm5, xmm5, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm5, xmm5, xmm2
+ vpor xmm4, xmm4, xmm0
+ vpor xmm5, xmm5, xmm1
+ vpslld xmm0, xmm4, 31
+ vpslld xmm1, xmm4, 30
+ vpslld xmm2, xmm4, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm4, xmm4, xmm0
+ vpsrld xmm2, xmm4, 1
+ vpsrld xmm3, xmm4, 2
+ vpsrld xmm0, xmm4, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm4
+ vpxor xmm5, xmm5, xmm2
+ add ecx, 16
+ cmp ecx, edx
+ jl L_AES_GCM_aad_update_avx512_16_loop
+ ; Write the updated state back in place.
+ vmovdqa OWORD PTR [r8], xmm5
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ add rsp, 32
+ ret
+AES_GCM_aad_update_avx512 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+; AES_GCM_encrypt_block_avx512
+; Encrypts one counter block and xors it with 16 bytes of input.
+;   rcx       -> round keys; read at offsets 0..224, partly with vmovdqa, so
+;                the key schedule must be 16-byte aligned
+;   edx       round count; compared against 11 and 13 to decide whether the
+;                final round key is the one at offset 160, 192 or 224
+;   r8        -> 16-byte output (unaligned store)
+;   r9        -> 16-byte input (unaligned load)
+;   [rsp+40]  -> 16-byte counter; read, incremented with vpaddd by
+;                L_avx512_aes_gcm_one, and written back
+; On return xmm0 holds the output block shuffled with
+; L_avx512_aes_gcm_bswap_mask; whether callers rely on that is not visible
+; here. Only xmm0 and xmm1 are used, so no vector registers are saved.
+AES_GCM_encrypt_block_avx512 PROC
+ mov r10, r8
+ mov r11, r9
+ mov rax, QWORD PTR [rsp+40]
+ ; xmm0 = counter shuffled for encryption; the stored counter is advanced
+ ; before the block is encrypted.
+ vmovdqu xmm1, OWORD PTR [rax]
+ vpshufb xmm0, xmm1, OWORD PTR L_avx512_aes_gcm_bswap_epi64
+ vpaddd xmm1, xmm1, OWORD PTR L_avx512_aes_gcm_one
+ vmovdqu OWORD PTR [rax], xmm1
+ vpxor xmm0, xmm0, [rcx]
+ vaesenc xmm0, xmm0, [rcx+16]
+ vaesenc xmm0, xmm0, [rcx+32]
+ vaesenc xmm0, xmm0, [rcx+48]
+ vaesenc xmm0, xmm0, [rcx+64]
+ vaesenc xmm0, xmm0, [rcx+80]
+ vaesenc xmm0, xmm0, [rcx+96]
+ vaesenc xmm0, xmm0, [rcx+112]
+ vaesenc xmm0, xmm0, [rcx+128]
+ vaesenc xmm0, xmm0, [rcx+144]
+ ; The next key is loaded before each branch so that xmm1 always holds the
+ ; final round key when control reaches vaesenclast.
+ cmp edx, 11
+ vmovdqa xmm1, OWORD PTR [rcx+160]
+ jl L_AES_GCM_encrypt_block_avx512_aesenc_block_last
+ vaesenc xmm0, xmm0, xmm1
+ vaesenc xmm0, xmm0, [rcx+176]
+ cmp edx, 13
+ vmovdqa xmm1, OWORD PTR [rcx+192]
+ jl L_AES_GCM_encrypt_block_avx512_aesenc_block_last
+ vaesenc xmm0, xmm0, xmm1
+ vaesenc xmm0, xmm0, [rcx+208]
+ vmovdqa xmm1, OWORD PTR [rcx+224]
+L_AES_GCM_encrypt_block_avx512_aesenc_block_last:
+ vaesenclast xmm0, xmm0, xmm1
+ ; output = encrypted counter xor input
+ vmovdqu xmm1, OWORD PTR [r11]
+ vpxor xmm0, xmm0, xmm1
+ vmovdqu OWORD PTR [r10], xmm0
+ vpshufb xmm0, xmm0, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ vzeroupper
+ ret
+AES_GCM_encrypt_block_avx512 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+; AES_GCM_ghash_block_avx512
+; Folds a single 16-byte block into a 16-byte hash state.
+;   rcx  -> 16-byte data block (unaligned load)
+;   rdx  -> 16-byte state; loaded and stored with vmovdqa (16-byte aligned)
+;   r8   -> 16-byte multiplier, presumably the hash key H; loaded with
+;        vmovdqa (16-byte aligned)
+; xmm6 and xmm7 are saved and restored; xmm0-xmm5 are overwritten.
+AES_GCM_ghash_block_avx512 PROC
+ sub rsp, 32
+ vmovdqu OWORD PTR [rsp], xmm6
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqa xmm4, OWORD PTR [rdx]
+ vmovdqa xmm5, OWORD PTR [r8]
+ vmovdqu xmm7, OWORD PTR [rcx]
+ vpshufb xmm7, xmm7, OWORD PTR L_avx512_aes_gcm_bswap_mask
+ vpxor xmm4, xmm4, xmm7
+ ; ghash_gfmul_avx
+ ; xmm4 = xmm4 * xmm5: three carry-less multiplies (high halves, low halves,
+ ; and the xor-ed halves), a one-bit left shift of the 256-bit product, then
+ ; the low half (xmm6) is folded into the high half (xmm4).
+ vpshufd xmm1, xmm4, 78
+ vpshufd xmm2, xmm5, 78
+ vpclmulqdq xmm3, xmm5, xmm4, 17
+ vpclmulqdq xmm0, xmm5, xmm4, 0
+ vpxor xmm1, xmm1, xmm4
+ vpxor xmm2, xmm2, xmm5
+ vpclmulqdq xmm1, xmm1, xmm2, 0
+ vpxor xmm1, xmm1, xmm0
+ vpxor xmm1, xmm1, xmm3
+ vmovdqa xmm6, xmm0
+ vmovdqa xmm4, xmm3
+ vpslldq xmm2, xmm1, 8
+ vpsrldq xmm1, xmm1, 8
+ vpxor xmm6, xmm6, xmm2
+ vpxor xmm4, xmm4, xmm1
+ vpsrld xmm0, xmm6, 31
+ vpsrld xmm1, xmm4, 31
+ vpslld xmm6, xmm6, 1
+ vpslld xmm4, xmm4, 1
+ vpsrldq xmm2, xmm0, 12
+ vpslldq xmm0, xmm0, 4
+ vpslldq xmm1, xmm1, 4
+ vpor xmm4, xmm4, xmm2
+ vpor xmm6, xmm6, xmm0
+ vpor xmm4, xmm4, xmm1
+ vpslld xmm0, xmm6, 31
+ vpslld xmm1, xmm6, 30
+ vpslld xmm2, xmm6, 25
+ vpxor xmm0, xmm0, xmm1
+ vpxor xmm0, xmm0, xmm2
+ vmovdqa xmm1, xmm0
+ vpsrldq xmm1, xmm1, 4
+ vpslldq xmm0, xmm0, 12
+ vpxor xmm6, xmm6, xmm0
+ vpsrld xmm2, xmm6, 1
+ vpsrld xmm3, xmm6, 2
+ vpsrld xmm0, xmm6, 7
+ vpxor xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm0
+ vpxor xmm2, xmm2, xmm1
+ vpxor xmm2, xmm2, xmm6
+ vpxor xmm4, xmm4, xmm2
+ ; Write the updated state back in place.
+ vmovdqa OWORD PTR [rdx], xmm4
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ add rsp, 32
+ ret
+AES_GCM_ghash_block_avx512 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_encrypt_update_avx512 PROC
+ push r13
+ push r12
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbx
+ push rbp
+ mov rax, rcx
+ mov r10, r8
+ mov r8d, edx
+ mov r11, r9
+ mov r9d, DWORD PTR [rsp+104]
+ mov r12, QWORD PTR [rsp+112]
+ mov r14, QWORD PTR [rsp+120]
+ mov r15, QWORD PTR [rsp+128]
+ sub rsp, 1200
+ vmovdqu OWORD PTR [rsp+1040], xmm6
+ vmovdqu OWORD PTR [rsp+1056], xmm7
+ vmovdqu OWORD PTR [rsp+1072], xmm8
+ vmovdqu OWORD PTR [rsp+1088], xmm9
+ vmovdqu OWORD PTR [rsp+1104], xmm10
+ vmovdqu OWORD PTR [rsp+1120], xmm11
+ vmovdqu OWORD PTR [rsp+1136], xmm12
+ vmovdqu OWORD PTR [rsp+1152], xmm13
+ vmovdqu OWORD PTR [rsp+1168], xmm14
+ vmovdqu OWORD PTR [rsp+1184], xmm15
+ vmovdqa xmm6, OWORD PTR [r12]
+ vmovdqa xmm5, OWORD PTR [r14]
+ vpsrlq xmm9, xmm5, 63
+ vpsllq xmm8, xmm5, 1
+ vpslldq xmm9, xmm9, 8
+ vpor xmm8, xmm8, xmm9
+ vpshufd xmm5, xmm5, 255
+ vpsrad xmm5, xmm5, 31
+ vpand xmm5, xmm5, OWORD PTR L_avx512_aes_gcm_mod2_128
+ vpxor xmm5, xmm5, xmm8
+ xor edi, edi
+ cmp r9d, 256
+ jl L_AES_GCM_encrypt_update_avx512_done_128
+ vmovdqa xmm2, xmm6
+ ; H ^ 1
+ vmovdqu OWORD PTR [rsp], xmm5
+ ; H
^ 2 + vpclmulqdq xmm8, xmm5, xmm5, 0 + vpclmulqdq xmm11, xmm5, xmm5, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm0, xmm11 + vmovdqu OWORD PTR [rsp+16], xmm0 + ; H ^ 3 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm5, 78 + vpxor xmm9, xmm9, xmm5 + vpshufd xmm10, xmm0, 78 + vpxor xmm10, xmm10, xmm0 + vpclmulqdq xmm8, xmm0, xmm5, 0 + vpclmulqdq xmm11, xmm0, xmm5, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm1, xmm11 + vmovdqu OWORD PTR [rsp+32], xmm1 + ; H ^ 4 + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm3, xmm11 + vmovdqu OWORD PTR [rsp+48], xmm3 + ; H ^ 5 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+64], xmm7 + ; H ^ 6 + vpclmulqdq xmm8, xmm1, 
xmm1, 0 + vpclmulqdq xmm11, xmm1, xmm1, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+80], xmm7 + ; H ^ 7 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm1, 78 + vpxor xmm9, xmm9, xmm1 + vpshufd xmm10, xmm3, 78 + vpxor xmm10, xmm10, xmm3 + vpclmulqdq xmm8, xmm3, xmm1, 0 + vpclmulqdq xmm11, xmm3, xmm1, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+96], xmm7 + ; H ^ 8 + vpclmulqdq xmm8, xmm3, xmm3, 0 + vpclmulqdq xmm11, xmm3, xmm3, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+112], xmm7 + ; H ^ 9 + vmovdqu xmm0, OWORD PTR [rsp+48] + vmovdqu xmm1, OWORD PTR [rsp+64] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+128], 
xmm7 + ; H ^ 10 + vmovdqu xmm0, OWORD PTR [rsp+64] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+144], xmm7 + ; H ^ 11 + vmovdqu xmm0, OWORD PTR [rsp+64] + vmovdqu xmm1, OWORD PTR [rsp+80] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+160], xmm7 + ; H ^ 12 + vmovdqu xmm0, OWORD PTR [rsp+80] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+176], xmm7 + ; H ^ 13 + vmovdqu xmm0, OWORD PTR [rsp+80] + vmovdqu xmm1, OWORD PTR [rsp+96] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 
+ vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+192], xmm7 + ; H ^ 14 + vmovdqu xmm0, OWORD PTR [rsp+96] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+208], xmm7 + ; H ^ 15 + vmovdqu xmm0, OWORD PTR [rsp+96] + vmovdqu xmm1, OWORD PTR [rsp+112] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+224], xmm7 + ; H ^ 16 + vmovdqu xmm0, OWORD PTR [rsp+112] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+240], xmm7 + cmp r9d, 512 + jl L_AES_GCM_encrypt_update_avx512_no_ext + ; H ^ 17 + vmovdqu xmm0, OWORD PTR [rsp+112] + vmovdqu xmm1, OWORD PTR [rsp+128] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + 
vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+256], xmm7 + ; H ^ 18 + vmovdqu xmm0, OWORD PTR [rsp+128] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+272], xmm7 + ; H ^ 19 + vmovdqu xmm0, OWORD PTR [rsp+128] + vmovdqu xmm1, OWORD PTR [rsp+144] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+288], xmm7 + ; H ^ 20 + vmovdqu xmm0, OWORD PTR [rsp+144] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+304], xmm7 + ; H ^ 21 + 
vmovdqu xmm0, OWORD PTR [rsp+144] + vmovdqu xmm1, OWORD PTR [rsp+160] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+320], xmm7 + ; H ^ 22 + vmovdqu xmm0, OWORD PTR [rsp+160] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+336], xmm7 + ; H ^ 23 + vmovdqu xmm0, OWORD PTR [rsp+160] + vmovdqu xmm1, OWORD PTR [rsp+176] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+352], xmm7 + ; H ^ 24 + vmovdqu xmm0, OWORD PTR [rsp+176] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + 
vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+368], xmm7 + ; H ^ 25 + vmovdqu xmm0, OWORD PTR [rsp+176] + vmovdqu xmm1, OWORD PTR [rsp+192] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+384], xmm7 + ; H ^ 26 + vmovdqu xmm0, OWORD PTR [rsp+192] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+400], xmm7 + ; H ^ 27 + vmovdqu xmm0, OWORD PTR [rsp+192] + vmovdqu xmm1, OWORD PTR [rsp+208] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+416], xmm7 + ; H ^ 28 + vmovdqu xmm0, OWORD PTR [rsp+208] + vpclmulqdq 
xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+432], xmm7 + ; H ^ 29 + vmovdqu xmm0, OWORD PTR [rsp+208] + vmovdqu xmm1, OWORD PTR [rsp+224] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+448], xmm7 + ; H ^ 30 + vmovdqu xmm0, OWORD PTR [rsp+224] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+464], xmm7 + ; H ^ 31 + vmovdqu xmm0, OWORD PTR [rsp+224] + vmovdqu xmm1, OWORD PTR [rsp+240] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, 
xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+480], xmm7 + ; H ^ 32 + vmovdqu xmm0, OWORD PTR [rsp+240] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+496], xmm7 +L_AES_GCM_encrypt_update_avx512_no_ext: + vbroadcasti32x4 zmm22, ptr_L_avx512_aes_gcm_bswap_epi64 + vbroadcasti32x4 zmm30, ptr_L_avx512_aes_gcm_bswap_mask + vbroadcasti32x4 zmm31, ptr_L_avx512_aes_gcm_mod2_128 + vbroadcasti32x4 zmm9, [rax] + vbroadcasti32x4 zmm10, [rax+16] + vbroadcasti32x4 zmm11, [rax+32] + vbroadcasti32x4 zmm12, [rax+48] + vbroadcasti32x4 zmm13, [rax+64] + vbroadcasti32x4 zmm14, [rax+80] + vbroadcasti32x4 zmm15, [rax+96] + vbroadcasti32x4 zmm1, [rax+112] + vbroadcasti32x4 zmm2, [rax+128] + vbroadcasti32x4 zmm3, [rax+144] + cmp r9d, 512 + jl L_AES_GCM_encrypt_update_avx512_no_windows + mov ebp, r9d + and ebp, 4294966784 + vmovdqu64 zmm23, [rsp+448] + vshufi64x2 zmm23, zmm23, zmm23, 27 + vmovdqu64 zmm24, [rsp+384] + vshufi64x2 zmm24, zmm24, zmm24, 27 + vmovdqu64 zmm25, [rsp+320] + vshufi64x2 zmm25, zmm25, zmm25, 27 + vmovdqu64 zmm26, [rsp+256] + vshufi64x2 zmm26, zmm26, zmm26, 27 + vmovdqu64 [rsp+512], zmm23 + vmovdqu64 [rsp+576], zmm24 + vmovdqu64 [rsp+640], zmm25 + vmovdqu64 [rsp+704], zmm26 + vmovdqu64 zmm23, [rsp+192] + vshufi64x2 zmm23, zmm23, zmm23, 27 + vmovdqu64 zmm24, [rsp+128] + vshufi64x2 zmm24, zmm24, zmm24, 27 + vmovdqu64 zmm25, [rsp+64] + vshufi64x2 zmm25, zmm25, zmm25, 27 + vmovdqu64 zmm26, [rsp] + vshufi64x2 zmm26, zmm26, zmm26, 27 + vmovdqu64 [rsp+768], zmm23 + vmovdqu64 [rsp+832], zmm24 + vmovdqu64 [rsp+896], zmm25 + vmovdqu64 [rsp+960], zmm26 + ; 512 bytes of 
input + lea rsi, QWORD PTR [r10+rdi] + vbroadcasti32x4 zmm20, [r15] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [r15] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [r15], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r8d, 11 + vbroadcasti32x4 zmm20, [rax+160] + jl L_AES_GCM_encrypt_update_avx512_p1_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc 
zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r8d, 13 + vbroadcasti32x4 zmm20, [rax+192] + jl L_AES_GCM_encrypt_update_avx512_p1_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+224] +L_AES_GCM_encrypt_update_avx512_p1_avx512_ctr16_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [r11+rdi] + lea rdx, QWORD PTR [r10+rdi] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + add edi, 256 + vbroadcasti32x4 zmm20, [r15] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [r15] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [r15], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc 
zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r8d, 11 + vbroadcasti32x4 zmm20, [rax+160] + jl L_AES_GCM_encrypt_update_avx512_p2_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r8d, 13 + vbroadcasti32x4 zmm20, [rax+192] + jl L_AES_GCM_encrypt_update_avx512_p2_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+224] +L_AES_GCM_encrypt_update_avx512_p2_avx512_ctr16_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [r11+rdi] + lea rdx, QWORD PTR [r10+rdi] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, 
[rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + add edi, 256 + cmp edi, ebp + jge L_AES_GCM_encrypt_update_avx512_last_win +L_AES_GCM_encrypt_update_avx512_win_loop: + lea rbx, QWORD PTR [r10+rdi] + vpxorq zmm21, zmm21, zmm21 + vinserti32x4 zmm21, zmm21, xmm6, 0 + vbroadcasti32x4 zmm20, [r15] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [r15] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [r15], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vmovdqu64 zmm31, [rsi] + vpshufb zmm31, zmm31, zmm30 + vpxorq zmm31, zmm31, zmm21 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vpclmulqdq zmm23, zmm31, [rsp+512], 0 + vpclmulqdq zmm24, zmm31, [rsp+512], 1 + vpclmulqdq zmm25, zmm31, [rsp+512], 16 + vpclmulqdq zmm26, zmm31, [rsp+512], 17 + vmovdqa64 zmm27, zmm23 + vpxorq zmm28, zmm25, zmm24 + vmovdqa64 zmm29, zmm26 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vmovdqu64 zmm31, [rsi+64] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vpclmulqdq zmm23, zmm31, [rsp+576], 0 + vpclmulqdq zmm24, zmm31, [rsp+576], 1 + vpclmulqdq zmm25, zmm31, [rsp+576], 16 + vpclmulqdq zmm26, zmm31, [rsp+576], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm13 + vaesenc 
zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vmovdqu64 zmm31, [rsi+128] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vpclmulqdq zmm23, zmm31, [rsp+640], 0 + vpclmulqdq zmm24, zmm31, [rsp+640], 1 + vpclmulqdq zmm25, zmm31, [rsp+640], 16 + vpclmulqdq zmm26, zmm31, [rsp+640], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vmovdqu64 zmm31, [rsi+192] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vpclmulqdq zmm23, zmm31, [rsp+704], 0 + vpclmulqdq zmm24, zmm31, [rsp+704], 1 + vpclmulqdq zmm25, zmm31, [rsp+704], 16 + vpclmulqdq zmm26, zmm31, [rsp+704], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r8d, 11 + vbroadcasti32x4 zmm20, [rax+160] + jl L_AES_GCM_encrypt_update_avx512_a_il_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r8d, 13 + vbroadcasti32x4 zmm20, [rax+192] + jl L_AES_GCM_encrypt_update_avx512_a_il_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc 
zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+224] +L_AES_GCM_encrypt_update_avx512_a_il_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [r11+rdi] + lea rdx, QWORD PTR [r10+rdi] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + add edi, 256 + vbroadcasti32x4 zmm20, [r15] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [r15] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [r15], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vmovdqu64 zmm31, [rsi+256] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vpclmulqdq zmm23, zmm31, [rsp+768], 0 + vpclmulqdq zmm24, zmm31, [rsp+768], 1 + vpclmulqdq zmm25, zmm31, [rsp+768], 16 + vpclmulqdq zmm26, zmm31, [rsp+768], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vmovdqu64 zmm31, [rsi+320] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, 
zmm12 + vpclmulqdq zmm23, zmm31, [rsp+832], 0 + vpclmulqdq zmm24, zmm31, [rsp+832], 1 + vpclmulqdq zmm25, zmm31, [rsp+832], 16 + vpclmulqdq zmm26, zmm31, [rsp+832], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vmovdqu64 zmm31, [rsi+384] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vpclmulqdq zmm23, zmm31, [rsp+896], 0 + vpclmulqdq zmm24, zmm31, [rsp+896], 1 + vpclmulqdq zmm25, zmm31, [rsp+896], 16 + vpclmulqdq zmm26, zmm31, [rsp+896], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vmovdqu64 zmm31, [rsi+448] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vpclmulqdq zmm23, zmm31, [rsp+960], 0 + vpclmulqdq zmm24, zmm31, [rsp+960], 1 + vpclmulqdq zmm25, zmm31, [rsp+960], 16 + vpclmulqdq zmm26, zmm31, [rsp+960], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r8d, 11 + vbroadcasti32x4 zmm20, [rax+160] + jl L_AES_GCM_encrypt_update_avx512_b_il_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r8d, 13 + 
vbroadcasti32x4 zmm20, [rax+192] + jl L_AES_GCM_encrypt_update_avx512_b_il_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+224] +L_AES_GCM_encrypt_update_avx512_b_il_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [r11+rdi] + lea rdx, QWORD PTR [r10+rdi] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + add edi, 256 + vbroadcasti32x4 zmm31, ptr_L_avx512_aes_gcm_mod2_128 + vpclmulqdq zmm23, zmm31, zmm27, 1 + vpshufd zmm27, zmm27, 78 + vpternlogq zmm28, zmm27, zmm23, 150 + vpclmulqdq zmm23, zmm31, zmm28, 1 + vpshufd zmm28, zmm28, 78 + vpternlogq zmm29, zmm28, zmm23, 150 + vextracti32x4 xmm0, zmm29, 1 + vextracti32x4 xmm4, zmm29, 2 + vextracti32x4 xmm5, zmm29, 3 + vpxorq xmm6, xmm29, xmm0 + vpternlogq xmm6, xmm5, xmm4, 150 + mov rsi, rbx + cmp edi, ebp + jl L_AES_GCM_encrypt_update_avx512_win_loop +L_AES_GCM_encrypt_update_avx512_last_win: + vpxorq zmm20, zmm20, zmm20 + vinserti32x4 zmm20, zmm20, xmm6, 0 + vmovdqu64 zmm23, [rsp+512] + vmovdqu64 zmm24, [rsp+576] + vmovdqu64 zmm25, [rsp+640] + vmovdqu64 zmm26, [rsp+704] + vmovdqu64 zmm21, [rsi] + vpshufb zmm21, zmm21, zmm30 + vpxorq zmm21, zmm21, zmm20 + vpclmulqdq zmm16, zmm21, zmm23, 0 + vpclmulqdq zmm17, zmm21, zmm23, 1 + vpclmulqdq zmm18, zmm21, zmm23, 16 + vpclmulqdq zmm19, zmm21, zmm23, 17 + vmovdqa64 zmm27, zmm16 + vpxorq zmm28, zmm18, zmm17 + vmovdqa64 
zmm29, zmm19 + vmovdqu64 zmm21, [rsi+64] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm24, 0 + vpclmulqdq zmm17, zmm21, zmm24, 1 + vpclmulqdq zmm18, zmm21, zmm24, 16 + vpclmulqdq zmm19, zmm21, zmm24, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rsi+128] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm25, 0 + vpclmulqdq zmm17, zmm21, zmm25, 1 + vpclmulqdq zmm18, zmm21, zmm25, 16 + vpclmulqdq zmm19, zmm21, zmm25, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rsi+192] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm26, 0 + vpclmulqdq zmm17, zmm21, zmm26, 1 + vpclmulqdq zmm18, zmm21, zmm26, 16 + vpclmulqdq zmm19, zmm21, zmm26, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm23, [rsp+768] + vmovdqu64 zmm24, [rsp+832] + vmovdqu64 zmm25, [rsp+896] + vmovdqu64 zmm26, [rsp+960] + vmovdqu64 zmm21, [rsi+256] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm23, 0 + vpclmulqdq zmm17, zmm21, zmm23, 1 + vpclmulqdq zmm18, zmm21, zmm23, 16 + vpclmulqdq zmm19, zmm21, zmm23, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rsi+320] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm24, 0 + vpclmulqdq zmm17, zmm21, zmm24, 1 + vpclmulqdq zmm18, zmm21, zmm24, 16 + vpclmulqdq zmm19, zmm21, zmm24, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rsi+384] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm25, 0 + vpclmulqdq zmm17, zmm21, zmm25, 1 + vpclmulqdq zmm18, zmm21, zmm25, 16 + vpclmulqdq zmm19, zmm21, zmm25, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rsi+448] + vpshufb zmm21, zmm21, zmm30 + 
vpclmulqdq zmm16, zmm21, zmm26, 0 + vpclmulqdq zmm17, zmm21, zmm26, 1 + vpclmulqdq zmm18, zmm21, zmm26, 16 + vpclmulqdq zmm19, zmm21, zmm26, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vpclmulqdq zmm21, zmm31, zmm27, 1 + vpshufd zmm27, zmm27, 78 + vpternlogq zmm28, zmm27, zmm21, 150 + vpclmulqdq zmm21, zmm31, zmm28, 1 + vpshufd zmm28, zmm28, 78 + vpternlogq zmm29, zmm28, zmm21, 150 + vextracti32x4 xmm0, zmm29, 1 + vextracti32x4 xmm4, zmm29, 2 + vextracti32x4 xmm5, zmm29, 3 + vpxorq xmm6, xmm29, xmm0 + vpternlogq xmm6, xmm5, xmm4, 150 +L_AES_GCM_encrypt_update_avx512_no_windows: + vmovdqu64 zmm23, [rsp+192] + vshufi64x2 zmm23, zmm23, zmm23, 27 + vmovdqu64 zmm24, [rsp+128] + vshufi64x2 zmm24, zmm24, zmm24, 27 + vmovdqu64 zmm25, [rsp+64] + vshufi64x2 zmm25, zmm25, zmm25, 27 + vmovdqu64 zmm26, [rsp] + vshufi64x2 zmm26, zmm26, zmm26, 27 + mov r13d, r9d + and r13d, 4294967040 + cmp edi, r13d + jge L_AES_GCM_encrypt_update_avx512_after_256 + ; 256 bytes of input + vbroadcasti32x4 zmm20, [r15] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [r15] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [r15], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vaesenc zmm16, zmm16, 
zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r8d, 11 + vbroadcasti32x4 zmm20, [rax+160] + jl L_AES_GCM_encrypt_update_avx512_pro_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r8d, 13 + vbroadcasti32x4 zmm20, [rax+192] + jl L_AES_GCM_encrypt_update_avx512_pro_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+224] +L_AES_GCM_encrypt_update_avx512_pro_avx512_ctr16_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [r11+rdi] + lea rdx, QWORD PTR [r10+rdi] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + 
vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + mov rsi, rdx + add edi, 256 + cmp edi, r13d + jge L_AES_GCM_encrypt_update_avx512_last_ghash +L_AES_GCM_encrypt_update_avx512_ghash_128: + vbroadcasti32x4 zmm20, [r15] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [r15] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [r15], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r8d, 11 + vbroadcasti32x4 zmm20, [rax+160] + jl L_AES_GCM_encrypt_update_avx512_pip_avx512_ctr16_last + vaesenc 
zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r8d, 13 + vbroadcasti32x4 zmm20, [rax+192] + jl L_AES_GCM_encrypt_update_avx512_pip_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+224] +L_AES_GCM_encrypt_update_avx512_pip_avx512_ctr16_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [r11+rdi] + lea rdx, QWORD PTR [r10+rdi] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + vpxorq zmm20, zmm20, zmm20 + vinserti32x4 zmm20, zmm20, xmm6, 0 + vmovdqu64 zmm21, [rsi] + vpshufb zmm21, zmm21, zmm30 + vpxorq zmm21, zmm21, zmm20 + vpclmulqdq zmm16, zmm21, zmm23, 0 + vpclmulqdq zmm17, zmm21, zmm23, 1 + vpclmulqdq zmm18, zmm21, zmm23, 16 + vpclmulqdq zmm19, zmm21, zmm23, 17 + vmovdqa64 zmm27, zmm16 + vpxorq zmm28, zmm18, zmm17 + vmovdqa64 zmm29, zmm19 + vmovdqu64 zmm21, [rsi+64] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm24, 0 + vpclmulqdq zmm17, zmm21, zmm24, 1 + vpclmulqdq zmm18, zmm21, zmm24, 16 + vpclmulqdq zmm19, zmm21, zmm24, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rsi+128] + vpshufb zmm21, 
zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm25, 0 + vpclmulqdq zmm17, zmm21, zmm25, 1 + vpclmulqdq zmm18, zmm21, zmm25, 16 + vpclmulqdq zmm19, zmm21, zmm25, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rsi+192] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm26, 0 + vpclmulqdq zmm17, zmm21, zmm26, 1 + vpclmulqdq zmm18, zmm21, zmm26, 16 + vpclmulqdq zmm19, zmm21, zmm26, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vpclmulqdq zmm21, zmm31, zmm27, 1 + vpshufd zmm27, zmm27, 78 + vpternlogq zmm28, zmm27, zmm21, 150 + vpclmulqdq zmm21, zmm31, zmm28, 1 + vpshufd zmm28, zmm28, 78 + vpternlogq zmm29, zmm28, zmm21, 150 + vextracti32x4 xmm0, zmm29, 1 + vextracti32x4 xmm4, zmm29, 2 + vextracti32x4 xmm5, zmm29, 3 + vpxorq xmm6, xmm29, xmm0 + vpternlogq xmm6, xmm5, xmm4, 150 + mov rsi, rdx + add edi, 256 + cmp edi, r13d + jl L_AES_GCM_encrypt_update_avx512_ghash_128 +L_AES_GCM_encrypt_update_avx512_last_ghash: + vpxorq zmm20, zmm20, zmm20 + vinserti32x4 zmm20, zmm20, xmm6, 0 + vmovdqu64 zmm21, [rsi] + vpshufb zmm21, zmm21, zmm30 + vpxorq zmm21, zmm21, zmm20 + vpclmulqdq zmm16, zmm21, zmm23, 0 + vpclmulqdq zmm17, zmm21, zmm23, 1 + vpclmulqdq zmm18, zmm21, zmm23, 16 + vpclmulqdq zmm19, zmm21, zmm23, 17 + vmovdqa64 zmm27, zmm16 + vpxorq zmm28, zmm18, zmm17 + vmovdqa64 zmm29, zmm19 + vmovdqu64 zmm21, [rsi+64] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm24, 0 + vpclmulqdq zmm17, zmm21, zmm24, 1 + vpclmulqdq zmm18, zmm21, zmm24, 16 + vpclmulqdq zmm19, zmm21, zmm24, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rsi+128] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm25, 0 + vpclmulqdq zmm17, zmm21, zmm25, 1 + vpclmulqdq zmm18, zmm21, zmm25, 16 + vpclmulqdq zmm19, zmm21, zmm25, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + 
vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rsi+192] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm26, 0 + vpclmulqdq zmm17, zmm21, zmm26, 1 + vpclmulqdq zmm18, zmm21, zmm26, 16 + vpclmulqdq zmm19, zmm21, zmm26, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vpclmulqdq zmm21, zmm31, zmm27, 1 + vpshufd zmm27, zmm27, 78 + vpternlogq zmm28, zmm27, zmm21, 150 + vpclmulqdq zmm21, zmm31, zmm28, 1 + vpshufd zmm28, zmm28, 78 + vpternlogq zmm29, zmm28, zmm21, 150 + vextracti32x4 xmm0, zmm29, 1 + vextracti32x4 xmm4, zmm29, 2 + vextracti32x4 xmm5, zmm29, 3 + vpxorq xmm6, xmm29, xmm0 + vpternlogq xmm6, xmm5, xmm4, 150 +L_AES_GCM_encrypt_update_avx512_after_256: + vmovdqu xmm5, OWORD PTR [rsp] +L_AES_GCM_encrypt_update_avx512_done_128: + mov edx, r9d + cmp edi, edx + jge L_AES_GCM_encrypt_update_avx512_done_enc + mov r13d, r9d + and r13d, 4294967280 + cmp edi, r13d + jge L_AES_GCM_encrypt_update_avx512_last_block_done + vmovdqu xmm9, OWORD PTR [r15] + vpshufb xmm8, xmm9, OWORD PTR L_avx512_aes_gcm_bswap_epi64 + vpaddd xmm9, xmm9, OWORD PTR L_avx512_aes_gcm_one + vmovdqu OWORD PTR [r15], xmm9 + vpxor xmm8, xmm8, [rax] + vaesenc xmm8, xmm8, [rax+16] + vaesenc xmm8, xmm8, [rax+32] + vaesenc xmm8, xmm8, [rax+48] + vaesenc xmm8, xmm8, [rax+64] + vaesenc xmm8, xmm8, [rax+80] + vaesenc xmm8, xmm8, [rax+96] + vaesenc xmm8, xmm8, [rax+112] + vaesenc xmm8, xmm8, [rax+128] + vaesenc xmm8, xmm8, [rax+144] + cmp r8d, 11 + vmovdqa xmm9, OWORD PTR [rax+160] + jl L_AES_GCM_encrypt_update_avx512_aesenc_block_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [rax+176] + cmp r8d, 13 + vmovdqa xmm9, OWORD PTR [rax+192] + jl L_AES_GCM_encrypt_update_avx512_aesenc_block_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [rax+208] + vmovdqa xmm9, OWORD PTR [rax+224] +L_AES_GCM_encrypt_update_avx512_aesenc_block_last: + vaesenclast xmm8, xmm8, xmm9 + vmovdqu xmm9, OWORD PTR [r11+rdi] + vpxor xmm8, xmm8, xmm9 + vmovdqu OWORD 
PTR [r10+rdi], xmm8 + vpshufb xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_bswap_mask + vpxor xmm6, xmm6, xmm8 + add edi, 16 + cmp edi, r13d + jge L_AES_GCM_encrypt_update_avx512_last_block_ghash +L_AES_GCM_encrypt_update_avx512_last_block_start: + vmovdqu xmm13, OWORD PTR [r11+rdi] + vmovdqu xmm9, OWORD PTR [r15] + vpshufb xmm8, xmm9, OWORD PTR L_avx512_aes_gcm_bswap_epi64 + vpaddd xmm9, xmm9, OWORD PTR L_avx512_aes_gcm_one + vmovdqu OWORD PTR [r15], xmm9 + vpxor xmm8, xmm8, [rax] + vpclmulqdq xmm10, xmm6, xmm5, 16 + vaesenc xmm8, xmm8, [rax+16] + vaesenc xmm8, xmm8, [rax+32] + vpclmulqdq xmm11, xmm6, xmm5, 1 + vaesenc xmm8, xmm8, [rax+48] + vaesenc xmm8, xmm8, [rax+64] + vpclmulqdq xmm12, xmm6, xmm5, 0 + vaesenc xmm8, xmm8, [rax+80] + vpclmulqdq xmm1, xmm6, xmm5, 17 + vaesenc xmm8, xmm8, [rax+96] + vpxor xmm10, xmm10, xmm11 + vpslldq xmm2, xmm10, 8 + vpsrldq xmm10, xmm10, 8 + vaesenc xmm8, xmm8, [rax+112] + vpxor xmm2, xmm2, xmm12 + vpxor xmm3, xmm1, xmm10 + vmovdqa xmm0, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm2, xmm0, 16 + vaesenc xmm8, xmm8, [rax+128] + vpshufd xmm10, xmm2, 78 + vpxor xmm10, xmm10, xmm11 + vpclmulqdq xmm11, xmm10, xmm0, 16 + vaesenc xmm8, xmm8, [rax+144] + vpshufd xmm10, xmm10, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm6, xmm10, xmm3 + cmp r8d, 11 + vmovdqa xmm9, OWORD PTR [rax+160] + jl L_AES_GCM_encrypt_update_avx512_aesenc_gfmul_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [rax+176] + cmp r8d, 13 + vmovdqa xmm9, OWORD PTR [rax+192] + jl L_AES_GCM_encrypt_update_avx512_aesenc_gfmul_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [rax+208] + vmovdqa xmm9, OWORD PTR [rax+224] +L_AES_GCM_encrypt_update_avx512_aesenc_gfmul_last: + vaesenclast xmm8, xmm8, xmm9 + vmovdqa xmm0, xmm13 + vpxor xmm8, xmm8, xmm0 + vmovdqu OWORD PTR [r10+rdi], xmm8 + vpshufb xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_bswap_mask + add edi, 16 + vpxor xmm6, xmm6, xmm8 + cmp edi, r13d + jl L_AES_GCM_encrypt_update_avx512_last_block_start 
+L_AES_GCM_encrypt_update_avx512_last_block_ghash:
+        ; ghash_gfmul_red_avx
+        vpshufd	xmm9, xmm5, 78
+        vpxor	xmm9, xmm9, xmm5
+        vpshufd	xmm10, xmm6, 78
+        vpxor	xmm10, xmm10, xmm6
+        vpclmulqdq	xmm8, xmm6, xmm5, 0
+        vpclmulqdq	xmm11, xmm6, xmm5, 17
+        vpclmulqdq	xmm9, xmm9, xmm10, 0
+        vpternlogq	xmm9, xmm11, xmm8, 150
+        vmovdqa	xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128
+        vpclmulqdq	xmm12, xmm10, xmm8, 1
+        vpshufd	xmm8, xmm8, 78
+        vpternlogq	xmm9, xmm8, xmm12, 150
+        vpclmulqdq	xmm12, xmm10, xmm9, 1
+        vpshufd	xmm9, xmm9, 78
+        vpternlogq	xmm11, xmm9, xmm12, 150
+        vmovdqa	xmm6, xmm11
+L_AES_GCM_encrypt_update_avx512_last_block_done:
+L_AES_GCM_encrypt_update_avx512_done_enc:
+        vmovdqa	OWORD PTR [r12], xmm6
+        vzeroupper
+        vmovdqu	xmm6, OWORD PTR [rsp+1040]
+        vmovdqu	xmm7, OWORD PTR [rsp+1056]
+        vmovdqu	xmm8, OWORD PTR [rsp+1072]
+        vmovdqu	xmm9, OWORD PTR [rsp+1088]
+        vmovdqu	xmm10, OWORD PTR [rsp+1104]
+        vmovdqu	xmm11, OWORD PTR [rsp+1120]
+        vmovdqu	xmm12, OWORD PTR [rsp+1136]
+        vmovdqu	xmm13, OWORD PTR [rsp+1152]
+        vmovdqu	xmm14, OWORD PTR [rsp+1168]
+        vmovdqu	xmm15, OWORD PTR [rsp+1184]
+        add	rsp, 1200
+        pop	rbp
+        pop	rbx
+        pop	rsi
+        pop	rdi
+        pop	r15
+        pop	r14
+        pop	r12
+        pop	r13
+        ret
+AES_GCM_encrypt_update_avx512 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+        ; AES_GCM_encrypt_final_avx512
+        ;
+        ; Folds a 128-bit length block into the running hash value, performs
+        ; one final carry-less multiply/reduce, XORs in a caller-supplied
+        ; 16-byte block and stores r8d bytes of the result.
+        ;
+        ; Inputs as consumed below (positions correspond to the Microsoft x64
+        ; convention - presumably the ABI this MASM routine is built for):
+        ;   rcx          -> 16-byte running hash value, read with vmovdqa
+        ;                   (aligned load)
+        ;   rdx          -> output buffer for the result bytes
+        ;   r8d           = number of result bytes to store
+        ;   r9d           = 32-bit byte count, placed in the low qword of the
+        ;                   length block
+        ;   1st stack arg = 32-bit byte count, placed in the high qword of
+        ;                   the length block
+        ;   2nd stack arg -> 16-byte multiplier block, read with vmovdqa
+        ;                   (presumably the GHASH key H - confirm with caller)
+        ;   3rd stack arg -> 16-byte block XORed into the final value, read
+        ;                   with vmovdqa (presumably the encrypted initial
+        ;                   counter - confirm with caller)
+        ;
+        ; Preserves r12, r13, r14 and xmm6-xmm13 via the stack.
+        ; Clobbers rax, rcx, rdx, r9, r10, r11 and xmm0-xmm5.
+AES_GCM_encrypt_final_avx512 PROC
+        push	r13
+        push	r12
+        push	r14
+        ; Move register arguments out of rcx/rdx/r9, which are reused as
+        ; scratch registers further down.
+        mov	rax, rcx
+        mov	r10d, r9d
+        mov	r9, rdx
+        ; Stack arguments: offsets include the 24 bytes pushed above.
+        mov	r11d, DWORD PTR [rsp+64]
+        mov	r12, QWORD PTR [rsp+72]
+        mov	r14, QWORD PTR [rsp+80]
+        ; Frame: [rsp] is a 16-byte scratch slot for the result,
+        ; [rsp+16..rsp+143] holds the saved xmm6-xmm13.
+        sub	rsp, 144
+        vmovdqu	OWORD PTR [rsp+16], xmm6
+        vmovdqu	OWORD PTR [rsp+32], xmm7
+        vmovdqu	OWORD PTR [rsp+48], xmm8
+        vmovdqu	OWORD PTR [rsp+64], xmm9
+        vmovdqu	OWORD PTR [rsp+80], xmm10
+        vmovdqu	OWORD PTR [rsp+96], xmm11
+        vmovdqu	OWORD PTR [rsp+112], xmm12
+        vmovdqu	OWORD PTR [rsp+128], xmm13
+        ; xmm4 = running hash, xmm5 = multiplier, xmm6 = block XORed in last.
+        vmovdqa	xmm4, OWORD PTR [rax]
+        vmovdqa	xmm5, OWORD PTR [r12]
+        vmovdqa	xmm6, OWORD PTR [r14]
+        ; Shift the 128-bit multiplier left by one bit: shift each qword,
+        ; then carry bit 63 of the low qword into bit 0 of the high qword.
+        vpsrlq	xmm8, xmm5, 63
+        vpsllq	xmm7, xmm5, 1
+        vpslldq	xmm8, xmm8, 8
+        vpor	xmm7, xmm7, xmm8
+        ; Broadcast the top dword and arithmetic-shift it to build an
+        ; all-ones mask when the original top bit was set; the mask selects
+        ; the L_avx512_aes_gcm_mod2_128 constant, which is XORed into the
+        ; shifted value.
+        vpshufd	xmm5, xmm5, 255
+        vpsrad	xmm5, xmm5, 31
+        vpand	xmm5, xmm5, OWORD PTR L_avx512_aes_gcm_mod2_128
+        vpxor	xmm5, xmm5, xmm7
+        ; Build the length block: both byte counts are zero-extended,
+        ; multiplied by 8 (bytes -> bits) and packed as low/high qwords.
+        mov	edx, r10d
+        mov	ecx, r11d
+        shl	rdx, 3
+        shl	rcx, 3
+        vmovq	xmm0, rdx
+        vmovq	xmm1, rcx
+        vpunpcklqdq	xmm0, xmm0, xmm1
+        ; Fold the length block into the running hash before multiplying.
+        vpxor	xmm4, xmm4, xmm0
+        ; ghash_gfmul_red_avx
+        ; Three carry-less multiplies (low*low, high*high and the product of
+        ; the XORed halves) are combined with vpternlogq imm8 150 (0x96, a
+        ; three-input XOR), followed by two folding steps against
+        ; L_avx512_aes_gcm_mod2_128. The reduced product ends up in xmm10.
+        vpshufd	xmm8, xmm5, 78
+        vpxor	xmm8, xmm8, xmm5
+        vpshufd	xmm9, xmm4, 78
+        vpxor	xmm9, xmm9, xmm4
+        vpclmulqdq	xmm7, xmm4, xmm5, 0
+        vpclmulqdq	xmm10, xmm4, xmm5, 17
+        vpclmulqdq	xmm8, xmm8, xmm9, 0
+        vpternlogq	xmm8, xmm10, xmm7, 150
+        vmovdqa	xmm9, OWORD PTR L_avx512_aes_gcm_mod2_128
+        vpclmulqdq	xmm11, xmm9, xmm7, 1
+        vpshufd	xmm7, xmm7, 78
+        vpternlogq	xmm8, xmm7, xmm11, 150
+        vpclmulqdq	xmm11, xmm9, xmm8, 1
+        vpshufd	xmm8, xmm8, 78
+        vpternlogq	xmm10, xmm8, xmm11, 150
+        vmovdqa	xmm4, xmm10
+        ; Byte-shuffle the product with the bswap mask constant, then XOR in
+        ; the block loaded from the 3rd stack argument; xmm0 is the result.
+        vpshufb	xmm4, xmm4, OWORD PTR L_avx512_aes_gcm_bswap_mask
+        vpxor	xmm0, xmm4, xmm6
+        ; A full 16-byte result is written with a single unaligned store.
+        cmp	r8d, 16
+        je	L_AES_GCM_encrypt_final_avx512_store_tag_16
+        ; Otherwise spill the result to the scratch slot and copy r8d bytes
+        ; one at a time.
+        ; NOTE(review): the count is tested only after the first byte has
+        ; been copied and the spill slot is 16 bytes, so this path presumably
+        ; relies on the caller passing 1..15 - confirm against callers.
+        xor	rcx, rcx
+        vmovdqu	OWORD PTR [rsp], xmm0
+L_AES_GCM_encrypt_final_avx512_store_tag_loop:
+        movzx	r13d, BYTE PTR [rsp+rcx]
+        mov	BYTE PTR [r9+rcx], r13b
+        inc	ecx
+        cmp	ecx, r8d
+        jne	L_AES_GCM_encrypt_final_avx512_store_tag_loop
+        jmp	L_AES_GCM_encrypt_final_avx512_store_tag_done
+L_AES_GCM_encrypt_final_avx512_store_tag_16:
+        vmovdqu	OWORD PTR [r9], xmm0
+L_AES_GCM_encrypt_final_avx512_store_tag_done:
+        ; Epilogue: restore xmm6-xmm13, release the frame and pop the
+        ; general-purpose registers in reverse push order.
+        vzeroupper
+        vmovdqu	xmm6, OWORD PTR [rsp+16]
+        vmovdqu	xmm7, OWORD PTR [rsp+32]
+        vmovdqu	xmm8, OWORD PTR [rsp+48]
+        vmovdqu	xmm9, OWORD PTR [rsp+64]
+        vmovdqu	xmm10, OWORD PTR [rsp+80]
+        vmovdqu	xmm11, OWORD PTR [rsp+96]
+        vmovdqu	xmm12, OWORD PTR [rsp+112]
+        vmovdqu	xmm13, OWORD PTR [rsp+128]
+        add	rsp, 144
+        pop	r14
+        pop	r12
+        pop	r13
+        ret
+AES_GCM_encrypt_final_avx512 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_decrypt_update_avx512 PROC
+        push	r13
+        push	r12
+        push	r14
+        push	r15
+        push	rdi
+        push	rsi
+        push	rbx
+        mov	rax, rcx
+        mov	r10, r8
+        mov	r8d, edx
+        mov	r11, r9
+        mov	r9d, DWORD PTR [rsp+96]
+        mov	r12, QWORD PTR [rsp+104]
+ mov r14, QWORD PTR [rsp+112] + mov r15, QWORD PTR [rsp+120] + sub rsp, 1200 + vmovdqu OWORD PTR [rsp+1040], xmm6 + vmovdqu OWORD PTR [rsp+1056], xmm7 + vmovdqu OWORD PTR [rsp+1072], xmm8 + vmovdqu OWORD PTR [rsp+1088], xmm9 + vmovdqu OWORD PTR [rsp+1104], xmm10 + vmovdqu OWORD PTR [rsp+1120], xmm11 + vmovdqu OWORD PTR [rsp+1136], xmm12 + vmovdqu OWORD PTR [rsp+1152], xmm13 + vmovdqu OWORD PTR [rsp+1168], xmm14 + vmovdqu OWORD PTR [rsp+1184], xmm15 + vmovdqa xmm6, OWORD PTR [r12] + vmovdqa xmm5, OWORD PTR [r14] + vpsrlq xmm9, xmm5, 63 + vpsllq xmm8, xmm5, 1 + vpslldq xmm9, xmm9, 8 + vpor xmm8, xmm8, xmm9 + vpshufd xmm5, xmm5, 255 + vpsrad xmm5, xmm5, 31 + vpand xmm5, xmm5, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpxor xmm5, xmm5, xmm8 + xor edi, edi + cmp r9d, 256 + jl L_AES_GCM_decrypt_update_avx512_done_128 + vmovdqa xmm2, xmm6 + ; H ^ 1 + vmovdqu OWORD PTR [rsp], xmm5 + ; H ^ 2 + vpclmulqdq xmm8, xmm5, xmm5, 0 + vpclmulqdq xmm11, xmm5, xmm5, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm0, xmm11 + vmovdqu OWORD PTR [rsp+16], xmm0 + ; H ^ 3 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm5, 78 + vpxor xmm9, xmm9, xmm5 + vpshufd xmm10, xmm0, 78 + vpxor xmm10, xmm10, xmm0 + vpclmulqdq xmm8, xmm0, xmm5, 0 + vpclmulqdq xmm11, xmm0, xmm5, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm1, xmm11 + vmovdqu OWORD PTR [rsp+32], xmm1 + ; H ^ 4 + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR 
L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm3, xmm11 + vmovdqu OWORD PTR [rsp+48], xmm3 + ; H ^ 5 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+64], xmm7 + ; H ^ 6 + vpclmulqdq xmm8, xmm1, xmm1, 0 + vpclmulqdq xmm11, xmm1, xmm1, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+80], xmm7 + ; H ^ 7 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm1, 78 + vpxor xmm9, xmm9, xmm1 + vpshufd xmm10, xmm3, 78 + vpxor xmm10, xmm10, xmm3 + vpclmulqdq xmm8, xmm3, xmm1, 0 + vpclmulqdq xmm11, xmm3, xmm1, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+96], xmm7 + ; H ^ 8 + vpclmulqdq xmm8, xmm3, xmm3, 0 + vpclmulqdq xmm11, xmm3, xmm3, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + 
vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+112], xmm7 + ; H ^ 9 + vmovdqu xmm0, OWORD PTR [rsp+48] + vmovdqu xmm1, OWORD PTR [rsp+64] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+128], xmm7 + ; H ^ 10 + vmovdqu xmm0, OWORD PTR [rsp+64] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+144], xmm7 + ; H ^ 11 + vmovdqu xmm0, OWORD PTR [rsp+64] + vmovdqu xmm1, OWORD PTR [rsp+80] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+160], xmm7 + ; H ^ 
12 + vmovdqu xmm0, OWORD PTR [rsp+80] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+176], xmm7 + ; H ^ 13 + vmovdqu xmm0, OWORD PTR [rsp+80] + vmovdqu xmm1, OWORD PTR [rsp+96] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+192], xmm7 + ; H ^ 14 + vmovdqu xmm0, OWORD PTR [rsp+96] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+208], xmm7 + ; H ^ 15 + vmovdqu xmm0, OWORD PTR [rsp+96] + vmovdqu xmm1, OWORD PTR [rsp+112] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + 
vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+224], xmm7 + ; H ^ 16 + vmovdqu xmm0, OWORD PTR [rsp+112] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+240], xmm7 + cmp r9d, 512 + jl L_AES_GCM_decrypt_update_avx512_no_ext + ; H ^ 17 + vmovdqu xmm0, OWORD PTR [rsp+112] + vmovdqu xmm1, OWORD PTR [rsp+128] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+256], xmm7 + ; H ^ 18 + vmovdqu xmm0, OWORD PTR [rsp+128] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+272], xmm7 + ; H ^ 19 + vmovdqu xmm0, OWORD PTR [rsp+128] + vmovdqu xmm1, OWORD PTR [rsp+144] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + 
vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+288], xmm7 + ; H ^ 20 + vmovdqu xmm0, OWORD PTR [rsp+144] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+304], xmm7 + ; H ^ 21 + vmovdqu xmm0, OWORD PTR [rsp+144] + vmovdqu xmm1, OWORD PTR [rsp+160] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+320], xmm7 + ; H ^ 22 + vmovdqu xmm0, OWORD PTR [rsp+160] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+336], xmm7 + ; H ^ 23 + 
vmovdqu xmm0, OWORD PTR [rsp+160] + vmovdqu xmm1, OWORD PTR [rsp+176] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+352], xmm7 + ; H ^ 24 + vmovdqu xmm0, OWORD PTR [rsp+176] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+368], xmm7 + ; H ^ 25 + vmovdqu xmm0, OWORD PTR [rsp+176] + vmovdqu xmm1, OWORD PTR [rsp+192] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+384], xmm7 + ; H ^ 26 + vmovdqu xmm0, OWORD PTR [rsp+192] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + 
vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+400], xmm7 + ; H ^ 27 + vmovdqu xmm0, OWORD PTR [rsp+192] + vmovdqu xmm1, OWORD PTR [rsp+208] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+416], xmm7 + ; H ^ 28 + vmovdqu xmm0, OWORD PTR [rsp+208] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+432], xmm7 + ; H ^ 29 + vmovdqu xmm0, OWORD PTR [rsp+208] + vmovdqu xmm1, OWORD PTR [rsp+224] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+448], xmm7 + ; H ^ 30 + vmovdqu xmm0, OWORD PTR [rsp+224] + vpclmulqdq 
xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+464], xmm7 + ; H ^ 31 + vmovdqu xmm0, OWORD PTR [rsp+224] + vmovdqu xmm1, OWORD PTR [rsp+240] + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpxor xmm9, xmm9, xmm0 + vpshufd xmm10, xmm1, 78 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpternlogq xmm9, xmm11, xmm8, 150 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+480], xmm7 + ; H ^ 32 + vmovdqu xmm0, OWORD PTR [rsp+240] + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm11, xmm0, xmm0, 17 + vpxor xmm9, xmm9, xmm9 + vmovdqa xmm10, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm12, xmm10, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm9, xmm8, xmm12, 150 + vpclmulqdq xmm12, xmm10, xmm9, 1 + vpshufd xmm9, xmm9, 78 + vpternlogq xmm11, xmm9, xmm12, 150 + vmovdqa xmm7, xmm11 + vmovdqu OWORD PTR [rsp+496], xmm7 +L_AES_GCM_decrypt_update_avx512_no_ext: + vbroadcasti32x4 zmm22, ptr_L_avx512_aes_gcm_bswap_epi64 + vbroadcasti32x4 zmm30, ptr_L_avx512_aes_gcm_bswap_mask + vbroadcasti32x4 zmm31, ptr_L_avx512_aes_gcm_mod2_128 + vbroadcasti32x4 zmm9, [rax] + vbroadcasti32x4 zmm10, [rax+16] + vbroadcasti32x4 zmm11, [rax+32] + vbroadcasti32x4 zmm12, [rax+48] + vbroadcasti32x4 zmm13, [rax+64] + vbroadcasti32x4 zmm14, [rax+80] + vbroadcasti32x4 zmm15, [rax+96] + vbroadcasti32x4 zmm1, [rax+112] + vbroadcasti32x4 zmm2, [rax+128] + 
vbroadcasti32x4 zmm3, [rax+144] + cmp r9d, 512 + jl L_AES_GCM_decrypt_update_avx512_no_windows + mov r13d, r9d + and r13d, 4294966784 + vmovdqu64 zmm23, [rsp+448] + vshufi64x2 zmm23, zmm23, zmm23, 27 + vmovdqu64 zmm24, [rsp+384] + vshufi64x2 zmm24, zmm24, zmm24, 27 + vmovdqu64 zmm25, [rsp+320] + vshufi64x2 zmm25, zmm25, zmm25, 27 + vmovdqu64 zmm26, [rsp+256] + vshufi64x2 zmm26, zmm26, zmm26, 27 + vmovdqu64 [rsp+512], zmm23 + vmovdqu64 [rsp+576], zmm24 + vmovdqu64 [rsp+640], zmm25 + vmovdqu64 [rsp+704], zmm26 + vmovdqu64 zmm23, [rsp+192] + vshufi64x2 zmm23, zmm23, zmm23, 27 + vmovdqu64 zmm24, [rsp+128] + vshufi64x2 zmm24, zmm24, zmm24, 27 + vmovdqu64 zmm25, [rsp+64] + vshufi64x2 zmm25, zmm25, zmm25, 27 + vmovdqu64 zmm26, [rsp] + vshufi64x2 zmm26, zmm26, zmm26, 27 + vmovdqu64 [rsp+768], zmm23 + vmovdqu64 [rsp+832], zmm24 + vmovdqu64 [rsp+896], zmm25 + vmovdqu64 [rsp+960], zmm26 + ; 512 bytes of input + xor esi, esi + lea rbx, QWORD PTR [r11+rdi] + vpxorq zmm20, zmm20, zmm20 + vinserti32x4 zmm20, zmm20, xmm6, 0 + vmovdqu64 zmm23, [rsp+512] + vmovdqu64 zmm24, [rsp+576] + vmovdqu64 zmm25, [rsp+640] + vmovdqu64 zmm26, [rsp+704] + vmovdqu64 zmm21, [rbx] + vpshufb zmm21, zmm21, zmm30 + vpxorq zmm21, zmm21, zmm20 + vpclmulqdq zmm16, zmm21, zmm23, 0 + vpclmulqdq zmm17, zmm21, zmm23, 1 + vpclmulqdq zmm18, zmm21, zmm23, 16 + vpclmulqdq zmm19, zmm21, zmm23, 17 + vmovdqa64 zmm27, zmm16 + vpxorq zmm28, zmm18, zmm17 + vmovdqa64 zmm29, zmm19 + vmovdqu64 zmm21, [rbx+64] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm24, 0 + vpclmulqdq zmm17, zmm21, zmm24, 1 + vpclmulqdq zmm18, zmm21, zmm24, 16 + vpclmulqdq zmm19, zmm21, zmm24, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rbx+128] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm25, 0 + vpclmulqdq zmm17, zmm21, zmm25, 1 + vpclmulqdq zmm18, zmm21, zmm25, 16 + vpclmulqdq zmm19, zmm21, zmm25, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq 
zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rbx+192] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm26, 0 + vpclmulqdq zmm17, zmm21, zmm26, 1 + vpclmulqdq zmm18, zmm21, zmm26, 16 + vpclmulqdq zmm19, zmm21, zmm26, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm23, [rsp+768] + vmovdqu64 zmm24, [rsp+832] + vmovdqu64 zmm25, [rsp+896] + vmovdqu64 zmm26, [rsp+960] + vmovdqu64 zmm21, [rbx+256] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm23, 0 + vpclmulqdq zmm17, zmm21, zmm23, 1 + vpclmulqdq zmm18, zmm21, zmm23, 16 + vpclmulqdq zmm19, zmm21, zmm23, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rbx+320] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm24, 0 + vpclmulqdq zmm17, zmm21, zmm24, 1 + vpclmulqdq zmm18, zmm21, zmm24, 16 + vpclmulqdq zmm19, zmm21, zmm24, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rbx+384] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm25, 0 + vpclmulqdq zmm17, zmm21, zmm25, 1 + vpclmulqdq zmm18, zmm21, zmm25, 16 + vpclmulqdq zmm19, zmm21, zmm25, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rbx+448] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm26, 0 + vpclmulqdq zmm17, zmm21, zmm26, 1 + vpclmulqdq zmm18, zmm21, zmm26, 16 + vpclmulqdq zmm19, zmm21, zmm26, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vpclmulqdq zmm21, zmm31, zmm27, 1 + vpshufd zmm27, zmm27, 78 + vpternlogq zmm28, zmm27, zmm21, 150 + vpclmulqdq zmm21, zmm31, zmm28, 1 + vpshufd zmm28, zmm28, 78 + vpternlogq zmm29, zmm28, zmm21, 150 + vextracti32x4 xmm0, zmm29, 1 + vextracti32x4 xmm4, zmm29, 2 + vextracti32x4 xmm5, zmm29, 3 + vpxorq xmm6, xmm29, xmm0 + 
vpternlogq xmm6, xmm5, xmm4, 150 + add edi, 512 + cmp edi, r13d + jge L_AES_GCM_decrypt_update_avx512_last_aes +L_AES_GCM_decrypt_update_avx512_win_loop: + lea rbx, QWORD PTR [r11+rdi] + vpxorq zmm21, zmm21, zmm21 + vinserti32x4 zmm21, zmm21, xmm6, 0 + vbroadcasti32x4 zmm20, [r15] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [r15] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [r15], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vmovdqu64 zmm31, [rbx] + vpshufb zmm31, zmm31, zmm30 + vpxorq zmm31, zmm31, zmm21 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vpclmulqdq zmm23, zmm31, [rsp+512], 0 + vpclmulqdq zmm24, zmm31, [rsp+512], 1 + vpclmulqdq zmm25, zmm31, [rsp+512], 16 + vpclmulqdq zmm26, zmm31, [rsp+512], 17 + vmovdqa64 zmm27, zmm23 + vpxorq zmm28, zmm25, zmm24 + vmovdqa64 zmm29, zmm26 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vmovdqu64 zmm31, [rbx+64] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vpclmulqdq zmm23, zmm31, [rsp+576], 0 + vpclmulqdq zmm24, zmm31, [rsp+576], 1 + vpclmulqdq zmm25, zmm31, [rsp+576], 16 + vpclmulqdq zmm26, zmm31, [rsp+576], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vmovdqu64 zmm31, [rbx+128] + vpshufb 
zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vpclmulqdq zmm23, zmm31, [rsp+640], 0 + vpclmulqdq zmm24, zmm31, [rsp+640], 1 + vpclmulqdq zmm25, zmm31, [rsp+640], 16 + vpclmulqdq zmm26, zmm31, [rsp+640], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vmovdqu64 zmm31, [rbx+192] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vpclmulqdq zmm23, zmm31, [rsp+704], 0 + vpclmulqdq zmm24, zmm31, [rsp+704], 1 + vpclmulqdq zmm25, zmm31, [rsp+704], 16 + vpclmulqdq zmm26, zmm31, [rsp+704], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r8d, 11 + vbroadcasti32x4 zmm20, [rax+160] + jl L_AES_GCM_decrypt_update_avx512_a_il_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r8d, 13 + vbroadcasti32x4 zmm20, [rax+192] + jl L_AES_GCM_decrypt_update_avx512_a_il_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+224] 
+L_AES_GCM_decrypt_update_avx512_a_il_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [r11+rsi] + lea rdx, QWORD PTR [r10+rsi] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + add esi, 256 + vbroadcasti32x4 zmm20, [r15] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [r15] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [r15], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vmovdqu64 zmm31, [rbx+256] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vpclmulqdq zmm23, zmm31, [rsp+768], 0 + vpclmulqdq zmm24, zmm31, [rsp+768], 1 + vpclmulqdq zmm25, zmm31, [rsp+768], 16 + vpclmulqdq zmm26, zmm31, [rsp+768], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vmovdqu64 zmm31, [rbx+320] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vpclmulqdq zmm23, zmm31, [rsp+832], 0 + vpclmulqdq zmm24, zmm31, [rsp+832], 1 + 
vpclmulqdq zmm25, zmm31, [rsp+832], 16 + vpclmulqdq zmm26, zmm31, [rsp+832], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vmovdqu64 zmm31, [rbx+384] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vpclmulqdq zmm23, zmm31, [rsp+896], 0 + vpclmulqdq zmm24, zmm31, [rsp+896], 1 + vpclmulqdq zmm25, zmm31, [rsp+896], 16 + vpclmulqdq zmm26, zmm31, [rsp+896], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vmovdqu64 zmm31, [rbx+448] + vpshufb zmm31, zmm31, zmm30 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vpclmulqdq zmm23, zmm31, [rsp+960], 0 + vpclmulqdq zmm24, zmm31, [rsp+960], 1 + vpclmulqdq zmm25, zmm31, [rsp+960], 16 + vpclmulqdq zmm26, zmm31, [rsp+960], 17 + vpxorq zmm27, zmm27, zmm23 + vpternlogq zmm28, zmm25, zmm24, 150 + vpxorq zmm29, zmm29, zmm26 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r8d, 11 + vbroadcasti32x4 zmm20, [rax+160] + jl L_AES_GCM_decrypt_update_avx512_b_il_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r8d, 13 + vbroadcasti32x4 zmm20, [rax+192] + jl L_AES_GCM_decrypt_update_avx512_b_il_last + vaesenc 
zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+224] +L_AES_GCM_decrypt_update_avx512_b_il_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [r11+rsi] + lea rdx, QWORD PTR [r10+rsi] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + add esi, 256 + vbroadcasti32x4 zmm31, ptr_L_avx512_aes_gcm_mod2_128 + vpclmulqdq zmm23, zmm31, zmm27, 1 + vpshufd zmm27, zmm27, 78 + vpternlogq zmm28, zmm27, zmm23, 150 + vpclmulqdq zmm23, zmm31, zmm28, 1 + vpshufd zmm28, zmm28, 78 + vpternlogq zmm29, zmm28, zmm23, 150 + vextracti32x4 xmm0, zmm29, 1 + vextracti32x4 xmm4, zmm29, 2 + vextracti32x4 xmm5, zmm29, 3 + vpxorq xmm6, xmm29, xmm0 + vpternlogq xmm6, xmm5, xmm4, 150 + add edi, 512 + cmp edi, r13d + jl L_AES_GCM_decrypt_update_avx512_win_loop +L_AES_GCM_decrypt_update_avx512_last_aes: + vbroadcasti32x4 zmm20, [r15] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [r15] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [r15], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq 
zmm19, zmm19, zmm9 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r8d, 11 + vbroadcasti32x4 zmm20, [rax+160] + jl L_AES_GCM_decrypt_update_avx512_l1_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r8d, 13 + vbroadcasti32x4 zmm20, [rax+192] + jl L_AES_GCM_decrypt_update_avx512_l1_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+224] +L_AES_GCM_decrypt_update_avx512_l1_avx512_ctr16_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, 
zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [r11+rsi] + lea rdx, QWORD PTR [r10+rsi] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + add esi, 256 + vbroadcasti32x4 zmm20, [r15] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [r15] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [r15], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, 
zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r8d, 11 + vbroadcasti32x4 zmm20, [rax+160] + jl L_AES_GCM_decrypt_update_avx512_l2_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r8d, 13 + vbroadcasti32x4 zmm20, [rax+192] + jl L_AES_GCM_decrypt_update_avx512_l2_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+208] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+224] +L_AES_GCM_decrypt_update_avx512_l2_avx512_ctr16_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [r11+rsi] + lea rdx, QWORD PTR [r10+rsi] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + add esi, 256 +L_AES_GCM_decrypt_update_avx512_no_windows: + vmovdqu64 zmm23, [rsp+192] + vshufi64x2 zmm23, zmm23, zmm23, 27 + vmovdqu64 zmm24, [rsp+128] + vshufi64x2 zmm24, zmm24, zmm24, 27 + vmovdqu64 zmm25, [rsp+64] + vshufi64x2 zmm25, zmm25, zmm25, 27 + vmovdqu64 zmm26, [rsp] + vshufi64x2 zmm26, zmm26, zmm26, 27 + mov r13d, r9d + and r13d, 4294967040 + cmp edi, r13d + jge L_AES_GCM_decrypt_update_avx512_after_256 + ; 256 bytes of input + lea 
rbx, QWORD PTR [r11+rdi] + vpxorq zmm20, zmm20, zmm20 + vinserti32x4 zmm20, zmm20, xmm6, 0 + vmovdqu64 zmm21, [rbx] + vpshufb zmm21, zmm21, zmm30 + vpxorq zmm21, zmm21, zmm20 + vpclmulqdq zmm16, zmm21, zmm23, 0 + vpclmulqdq zmm17, zmm21, zmm23, 1 + vpclmulqdq zmm18, zmm21, zmm23, 16 + vpclmulqdq zmm19, zmm21, zmm23, 17 + vmovdqa64 zmm27, zmm16 + vpxorq zmm28, zmm18, zmm17 + vmovdqa64 zmm29, zmm19 + vmovdqu64 zmm21, [rbx+64] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm24, 0 + vpclmulqdq zmm17, zmm21, zmm24, 1 + vpclmulqdq zmm18, zmm21, zmm24, 16 + vpclmulqdq zmm19, zmm21, zmm24, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rbx+128] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm25, 0 + vpclmulqdq zmm17, zmm21, zmm25, 1 + vpclmulqdq zmm18, zmm21, zmm25, 16 + vpclmulqdq zmm19, zmm21, zmm25, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vmovdqu64 zmm21, [rbx+192] + vpshufb zmm21, zmm21, zmm30 + vpclmulqdq zmm16, zmm21, zmm26, 0 + vpclmulqdq zmm17, zmm21, zmm26, 1 + vpclmulqdq zmm18, zmm21, zmm26, 16 + vpclmulqdq zmm19, zmm21, zmm26, 17 + vpxorq zmm27, zmm27, zmm16 + vpternlogq zmm28, zmm18, zmm17, 150 + vpxorq zmm29, zmm29, zmm19 + vpclmulqdq zmm21, zmm31, zmm27, 1 + vpshufd zmm27, zmm27, 78 + vpternlogq zmm28, zmm27, zmm21, 150 + vpclmulqdq zmm21, zmm31, zmm28, 1 + vpshufd zmm28, zmm28, 78 + vpternlogq zmm29, zmm28, zmm21, 150 + vextracti32x4 xmm0, zmm29, 1 + vextracti32x4 xmm4, zmm29, 2 + vextracti32x4 xmm5, zmm29, 3 + vpxorq xmm6, xmm29, xmm0 + vpternlogq xmm6, xmm5, xmm4, 150 + vbroadcasti32x4 zmm20, [r15] + vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 + vpshufb zmm16, zmm16, zmm22 + vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 + vpshufb zmm17, zmm17, zmm22 + vpaddd zmm18, zmm20, ptr_L_avx512_aes_gcm_inc_z2 + vpshufb zmm18, zmm18, zmm22 + vpaddd zmm19, zmm20, ptr_L_avx512_aes_gcm_inc_z3 + vpshufb 
zmm19, zmm19, zmm22 + vmovdqu xmm8, OWORD PTR [r15] + vpaddd xmm8, xmm8, OWORD PTR L_avx512_aes_gcm_sixteen + vmovdqu OWORD PTR [r15], xmm8 + vpxorq zmm16, zmm16, zmm9 + vpxorq zmm17, zmm17, zmm9 + vpxorq zmm18, zmm18, zmm9 + vpxorq zmm19, zmm19, zmm9 + vaesenc zmm16, zmm16, zmm10 + vaesenc zmm17, zmm17, zmm10 + vaesenc zmm18, zmm18, zmm10 + vaesenc zmm19, zmm19, zmm10 + vaesenc zmm16, zmm16, zmm11 + vaesenc zmm17, zmm17, zmm11 + vaesenc zmm18, zmm18, zmm11 + vaesenc zmm19, zmm19, zmm11 + vaesenc zmm16, zmm16, zmm12 + vaesenc zmm17, zmm17, zmm12 + vaesenc zmm18, zmm18, zmm12 + vaesenc zmm19, zmm19, zmm12 + vaesenc zmm16, zmm16, zmm13 + vaesenc zmm17, zmm17, zmm13 + vaesenc zmm18, zmm18, zmm13 + vaesenc zmm19, zmm19, zmm13 + vaesenc zmm16, zmm16, zmm14 + vaesenc zmm17, zmm17, zmm14 + vaesenc zmm18, zmm18, zmm14 + vaesenc zmm19, zmm19, zmm14 + vaesenc zmm16, zmm16, zmm15 + vaesenc zmm17, zmm17, zmm15 + vaesenc zmm18, zmm18, zmm15 + vaesenc zmm19, zmm19, zmm15 + vaesenc zmm16, zmm16, zmm1 + vaesenc zmm17, zmm17, zmm1 + vaesenc zmm18, zmm18, zmm1 + vaesenc zmm19, zmm19, zmm1 + vaesenc zmm16, zmm16, zmm2 + vaesenc zmm17, zmm17, zmm2 + vaesenc zmm18, zmm18, zmm2 + vaesenc zmm19, zmm19, zmm2 + vaesenc zmm16, zmm16, zmm3 + vaesenc zmm17, zmm17, zmm3 + vaesenc zmm18, zmm18, zmm3 + vaesenc zmm19, zmm19, zmm3 + cmp r8d, 11 + vbroadcasti32x4 zmm20, [rax+160] + jl L_AES_GCM_decrypt_update_avx512_t_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+176] + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + cmp r8d, 13 + vbroadcasti32x4 zmm20, [rax+192] + jl L_AES_GCM_decrypt_update_avx512_t_avx512_ctr16_last + vaesenc zmm16, zmm16, zmm20 + vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+208] + vaesenc zmm16, zmm16, zmm20 + 
vaesenc zmm17, zmm17, zmm20 + vaesenc zmm18, zmm18, zmm20 + vaesenc zmm19, zmm19, zmm20 + vbroadcasti32x4 zmm20, [rax+224] +L_AES_GCM_decrypt_update_avx512_t_avx512_ctr16_last: + vaesenclast zmm16, zmm16, zmm20 + vaesenclast zmm17, zmm17, zmm20 + vaesenclast zmm18, zmm18, zmm20 + vaesenclast zmm19, zmm19, zmm20 + lea rcx, QWORD PTR [r11+rdi] + lea rdx, QWORD PTR [r10+rdi] + vmovdqu64 zmm21, [rcx] + vpxorq zmm16, zmm16, zmm21 + vmovdqu64 [rdx], zmm16 + vmovdqu64 zmm21, [rcx+64] + vpxorq zmm17, zmm17, zmm21 + vmovdqu64 [rdx+64], zmm17 + vmovdqu64 zmm21, [rcx+128] + vpxorq zmm18, zmm18, zmm21 + vmovdqu64 [rdx+128], zmm18 + vmovdqu64 zmm21, [rcx+192] + vpxorq zmm19, zmm19, zmm21 + vmovdqu64 [rdx+192], zmm19 + add edi, 256 +L_AES_GCM_decrypt_update_avx512_after_256: + vmovdqu xmm5, OWORD PTR [rsp] +L_AES_GCM_decrypt_update_avx512_done_128: + mov edx, r9d + cmp edi, edx + jge L_AES_GCM_decrypt_update_avx512_done_dec + mov r13d, r9d + and r13d, 4294967280 + cmp edi, r13d + jge L_AES_GCM_decrypt_update_avx512_last_block_done +L_AES_GCM_decrypt_update_avx512_last_block_start: + vmovdqu xmm13, OWORD PTR [r11+rdi] + vmovdqa xmm0, xmm5 + vpshufb xmm1, xmm13, OWORD PTR L_avx512_aes_gcm_bswap_mask + vpxor xmm1, xmm1, xmm6 + vmovdqu xmm9, OWORD PTR [r15] + vpshufb xmm8, xmm9, OWORD PTR L_avx512_aes_gcm_bswap_epi64 + vpaddd xmm9, xmm9, OWORD PTR L_avx512_aes_gcm_one + vmovdqu OWORD PTR [r15], xmm9 + vpxor xmm8, xmm8, [rax] + vpclmulqdq xmm10, xmm1, xmm0, 16 + vaesenc xmm8, xmm8, [rax+16] + vaesenc xmm8, xmm8, [rax+32] + vpclmulqdq xmm11, xmm1, xmm0, 1 + vaesenc xmm8, xmm8, [rax+48] + vaesenc xmm8, xmm8, [rax+64] + vpclmulqdq xmm12, xmm1, xmm0, 0 + vaesenc xmm8, xmm8, [rax+80] + vpclmulqdq xmm1, xmm1, xmm0, 17 + vaesenc xmm8, xmm8, [rax+96] + vpxor xmm10, xmm10, xmm11 + vpslldq xmm2, xmm10, 8 + vpsrldq xmm10, xmm10, 8 + vaesenc xmm8, xmm8, [rax+112] + vpxor xmm2, xmm2, xmm12 + vpxor xmm3, xmm1, xmm10 + vmovdqa xmm0, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm2, 
xmm0, 16 + vaesenc xmm8, xmm8, [rax+128] + vpshufd xmm10, xmm2, 78 + vpxor xmm10, xmm10, xmm11 + vpclmulqdq xmm11, xmm10, xmm0, 16 + vaesenc xmm8, xmm8, [rax+144] + vpshufd xmm10, xmm10, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm6, xmm10, xmm3 + cmp r8d, 11 + vmovdqa xmm9, OWORD PTR [rax+160] + jl L_AES_GCM_decrypt_update_avx512_aesenc_gfmul_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [rax+176] + cmp r8d, 13 + vmovdqa xmm9, OWORD PTR [rax+192] + jl L_AES_GCM_decrypt_update_avx512_aesenc_gfmul_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [rax+208] + vmovdqa xmm9, OWORD PTR [rax+224] +L_AES_GCM_decrypt_update_avx512_aesenc_gfmul_last: + vaesenclast xmm8, xmm8, xmm9 + vmovdqa xmm0, xmm13 + vpxor xmm8, xmm8, xmm0 + vmovdqu OWORD PTR [r10+rdi], xmm8 + add edi, 16 + cmp edi, r13d + jl L_AES_GCM_decrypt_update_avx512_last_block_start +L_AES_GCM_decrypt_update_avx512_last_block_done: +L_AES_GCM_decrypt_update_avx512_done_dec: + vmovdqa OWORD PTR [r12], xmm6 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp+1040] + vmovdqu xmm7, OWORD PTR [rsp+1056] + vmovdqu xmm8, OWORD PTR [rsp+1072] + vmovdqu xmm9, OWORD PTR [rsp+1088] + vmovdqu xmm10, OWORD PTR [rsp+1104] + vmovdqu xmm11, OWORD PTR [rsp+1120] + vmovdqu xmm12, OWORD PTR [rsp+1136] + vmovdqu xmm13, OWORD PTR [rsp+1152] + vmovdqu xmm14, OWORD PTR [rsp+1168] + vmovdqu xmm15, OWORD PTR [rsp+1184] + add rsp, 1200 + pop rbx + pop rsi + pop rdi + pop r15 + pop r14 + pop r12 + pop r13 + ret +AES_GCM_decrypt_update_avx512 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +AES_GCM_decrypt_final_avx512 PROC ; Final decrypt step: folds two scaled counts into the 16-byte state read via rcx, multiplies and reduces it, XORs in a third 16-byte value, compares the result with the bytes at rdx and stores 1 (equal) or 0 through the last stack argument. NOTE(review): argument roles are read off register use here, confirm against the C prototype + push r13 + push r12 + push r14 + push rbp + push r15 + mov rax, rcx ; rax = pointer passed in rcx, 16-byte state loaded into xmm6 below + mov r10d, r9d ; r10d = first 32-bit count, passed in r9d + mov r9, rdx ; r9 = pointer passed in rdx, bytes the computed value is compared against + mov r11d, DWORD PTR [rsp+80] ; r11d = second 32-bit count, read from the caller's stack + mov r12, QWORD PTR [rsp+88] ; r12 = pointer to the 16-byte value loaded into xmm5 below + mov r14, QWORD PTR [rsp+96] ; r14 = pointer to the 16-byte value loaded into xmm15 below + mov rbp, QWORD PTR [rsp+104] ; rbp = pointer that receives the 32-bit result + sub rsp, 160 ; scratch area used to preserve xmm6-xmm13 and xmm15 + vmovdqu OWORD PTR [rsp+16], xmm6 + vmovdqu OWORD PTR [rsp+32], xmm7 + vmovdqu OWORD PTR [rsp+48], xmm8 + vmovdqu OWORD PTR [rsp+64], xmm9 + vmovdqu OWORD PTR [rsp+80], xmm10 + vmovdqu 
OWORD PTR [rsp+96], xmm11 + vmovdqu OWORD PTR [rsp+112], xmm12 + vmovdqu OWORD PTR [rsp+128], xmm13 + vmovdqu OWORD PTR [rsp+144], xmm15 + vmovdqa xmm6, OWORD PTR [rax] + vmovdqa xmm5, OWORD PTR [r12] + vmovdqa xmm15, OWORD PTR [r14] + vpsrlq xmm8, xmm5, 63 ; this and the next 7 instructions: shift xmm5 left by one bit across all 128 bits, XORing in mod2_128 when the top bit was set + vpsllq xmm7, xmm5, 1 + vpslldq xmm8, xmm8, 8 + vpor xmm7, xmm7, xmm8 + vpshufd xmm5, xmm5, 255 + vpsrad xmm5, xmm5, 31 + vpand xmm5, xmm5, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpxor xmm5, xmm5, xmm7 + mov edx, r10d + mov ecx, r11d + shl rdx, 3 ; first count times 8 (presumably a byte length converted to bits, confirm) + shl rcx, 3 ; second count times 8 + vmovq xmm0, rdx + vmovq xmm1, rcx + vpunpcklqdq xmm0, xmm0, xmm1 ; xmm0 = scaled first count in the low qword, scaled second count in the high qword + vpxor xmm6, xmm6, xmm0 ; fold both scaled counts into the state + ; ghash_gfmul_red_avx + vpshufd xmm8, xmm5, 78 + vpxor xmm8, xmm8, xmm5 + vpshufd xmm9, xmm6, 78 + vpxor xmm9, xmm9, xmm6 + vpclmulqdq xmm7, xmm6, xmm5, 0 + vpclmulqdq xmm10, xmm6, xmm5, 17 + vpclmulqdq xmm8, xmm8, xmm9, 0 + vpternlogq xmm8, xmm10, xmm7, 150 + vmovdqa xmm9, OWORD PTR L_avx512_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm9, xmm7, 1 + vpshufd xmm7, xmm7, 78 + vpternlogq xmm8, xmm7, xmm11, 150 + vpclmulqdq xmm11, xmm9, xmm8, 1 + vpshufd xmm8, xmm8, 78 + vpternlogq xmm10, xmm8, xmm11, 150 + vmovdqa xmm6, xmm10 ; xmm6 = reduced product + vpshufb xmm6, xmm6, OWORD PTR L_avx512_aes_gcm_bswap_mask ; reorder the bytes with the bswap mask + vpxor xmm0, xmm6, xmm15 ; xmm0 = value that is compared with the bytes at r9 + cmp r8d, 16 ; r8d = number of bytes to compare, exactly 16 takes the vector path + je L_AES_GCM_decrypt_final_avx512_cmp_tag_16 + sub rsp, 16 ; temporary 16-byte slot so xmm0 can be read byte by byte + xor rcx, rcx + xor r15, r15 + vmovdqu OWORD PTR [rsp], xmm0 +L_AES_GCM_decrypt_final_avx512_cmp_tag_loop: + movzx r13d, BYTE PTR [rsp+rcx] + xor r13b, BYTE PTR [r9+rcx] + or r15b, r13b ; accumulate differences, there is no early exit so all r8d bytes are always read + inc ecx + cmp ecx, r8d + jne L_AES_GCM_decrypt_final_avx512_cmp_tag_loop ; NOTE(review): the count is tested after the increment, so r8d = 0 would not end the loop on the first pass; callers presumably guarantee a non-zero count, confirm + cmp r15b, 0 + sete r15b ; r15 = 1 when no byte differed, else 0 + add rsp, 16 + xor rcx, rcx + jmp L_AES_GCM_decrypt_final_avx512_cmp_tag_done +L_AES_GCM_decrypt_final_avx512_cmp_tag_16: + vmovdqu xmm1, OWORD PTR [r9] + vpcmpeqb xmm0, xmm0, xmm1 + vpmovmskb rdx, xmm0 + ; %%edx == 0xFFFF then return 1 else => return 0 + xor r15d, r15d + cmp edx, 65535 + sete r15b +L_AES_GCM_decrypt_final_avx512_cmp_tag_done: + mov DWORD PTR [rbp], r15d ; store the 1 or 0 result for the caller + vzeroupper + vmovdqu xmm6, OWORD PTR 
[rsp+16] + vmovdqu xmm7, OWORD PTR [rsp+32] + vmovdqu xmm8, OWORD PTR [rsp+48] + vmovdqu xmm9, OWORD PTR [rsp+64] + vmovdqu xmm10, OWORD PTR [rsp+80] + vmovdqu xmm11, OWORD PTR [rsp+96] + vmovdqu xmm12, OWORD PTR [rsp+112] + vmovdqu xmm13, OWORD PTR [rsp+128] + vmovdqu xmm15, OWORD PTR [rsp+144] + add rsp, 160 + pop r15 + pop rbp + pop r14 + pop r12 + pop r13 + ret +AES_GCM_decrypt_final_avx512 ENDP +_TEXT ENDS +ENDIF END diff --git a/wolfcrypt/src/aes_x86_64_asm.S b/wolfcrypt/src/aes_x86_64_asm.S new file mode 100644 index 00000000000..9eb85b49c73 --- /dev/null +++ b/wolfcrypt/src/aes_x86_64_asm.S @@ -0,0 +1,4375 @@ +/* aes_x86_64_asm.S */ +/* + * Copyright (C) 2006-2026 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +#ifdef WOLFSSL_USER_SETTINGS +#ifdef WOLFSSL_USER_SETTINGS_ASM +/* + * user_settings_asm.h is a file generated by the script user_settings_asm.sh. + * The script takes in a user_settings.h and produces user_settings_asm.h, which + * is a stripped down version of user_settings.h containing only preprocessor + * directives. This makes the header safe to include in assembly (.S) files. + */ +#include "user_settings_asm.h" +#else +/* + * Note: if user_settings.h contains any C code (e.g. 
a typedef or function + * prototype), including it here in an assembly (.S) file will cause an + * assembler failure. See user_settings_asm.h above. + */ +#include "user_settings.h" +#endif /* WOLFSSL_USER_SETTINGS_ASM */ +#endif /* WOLFSSL_USER_SETTINGS */ + +#ifndef HAVE_INTEL_AVX1 +#define HAVE_INTEL_AVX1 +#endif /* HAVE_INTEL_AVX1 */ +#ifndef NO_AVX2_SUPPORT +#ifndef HAVE_INTEL_AVX2 +#define HAVE_INTEL_AVX2 +#endif /* HAVE_INTEL_AVX2 */ +#endif /* NO_AVX2_SUPPORT */ +#ifndef NO_VAES_SUPPORT +#ifndef HAVE_INTEL_VAES +#define HAVE_INTEL_VAES +#endif /* HAVE_INTEL_VAES */ +#endif /* NO_VAES_SUPPORT */ +#ifndef NO_AVX512_SUPPORT +#ifndef HAVE_INTEL_AVX512 +#define HAVE_INTEL_AVX512 +#endif /* HAVE_INTEL_AVX512 */ +#endif /* NO_AVX512_SUPPORT */ + +#ifdef WOLFSSL_X86_64_BUILD +#ifndef __APPLE__ +.text +.globl AES_128_Key_Expansion_AESNI +.type AES_128_Key_Expansion_AESNI,@function +.align 16 +AES_128_Key_Expansion_AESNI: /* Reads 16 bytes at (%rdi) and writes eleven 16-byte blocks to (%rsi) at offsets 0..160: the input itself, then ten blocks each derived from the previous one with aeskeygenassist constants 0x01, 2, 4, 8, 16, 32, 0x40, 0x80, 27, 54. Uses xmm0-xmm2 only. */ +#else +.section __TEXT,__text +.globl _AES_128_Key_Expansion_AESNI +.p2align 4 +_AES_128_Key_Expansion_AESNI: +#endif /* __APPLE__ */ + movdqu (%rdi), %xmm0 /* xmm0 = the 16 input bytes */ + movdqu %xmm0, (%rsi) /* block 0 is the input unchanged */ + aeskeygenassist $0x01, %xmm0, %xmm1 /* assist value for the first derived block, constant 0x01 */ + pshufd $0xff, %xmm1, %xmm1 /* broadcast the top dword of the assist result to all four lanes */ + movdqa %xmm0, %xmm2 /* with the next six instructions: xmm0 ^= its own copies shifted left by 4, 8 and 12 bytes */ + pslldq $4, %xmm2 + pxor %xmm2, %xmm0 + pslldq $4, %xmm2 + pxor %xmm2, %xmm0 + pslldq $4, %xmm2 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 /* mix in the broadcast assist word */ + movdqu %xmm0, 16(%rsi) /* store block 1; the nine sequences that follow repeat this pattern with the next constant and the next 16-byte offset */ + aeskeygenassist $2, %xmm0, %xmm1 + pshufd $0xff, %xmm1, %xmm1 + movdqa %xmm0, %xmm2 + pslldq $4, %xmm2 + pxor %xmm2, %xmm0 + pslldq $4, %xmm2 + pxor %xmm2, %xmm0 + pslldq $4, %xmm2 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqu %xmm0, 32(%rsi) + aeskeygenassist $4, %xmm0, %xmm1 + pshufd $0xff, %xmm1, %xmm1 + movdqa %xmm0, %xmm2 + pslldq $4, %xmm2 + pxor %xmm2, %xmm0 + pslldq $4, %xmm2 + pxor %xmm2, %xmm0 + pslldq $4, %xmm2 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqu %xmm0, 48(%rsi) + aeskeygenassist $8, %xmm0, %xmm1 + pshufd $0xff, %xmm1, %xmm1 + movdqa %xmm0, %xmm2 + pslldq $4, %xmm2 + pxor %xmm2, %xmm0 + pslldq $4, 
%xmm2 + pxor %xmm2, %xmm0 + pslldq $4, %xmm2 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqu %xmm0, 64(%rsi) + aeskeygenassist $16, %xmm0, %xmm1 + pshufd $0xff, %xmm1, %xmm1 + movdqa %xmm0, %xmm2 + pslldq $4, %xmm2 + pxor %xmm2, %xmm0 + pslldq $4, %xmm2 + pxor %xmm2, %xmm0 + pslldq $4, %xmm2 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqu %xmm0, 80(%rsi) + aeskeygenassist $32, %xmm0, %xmm1 + pshufd $0xff, %xmm1, %xmm1 + movdqa %xmm0, %xmm2 + pslldq $4, %xmm2 + pxor %xmm2, %xmm0 + pslldq $4, %xmm2 + pxor %xmm2, %xmm0 + pslldq $4, %xmm2 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqu %xmm0, 96(%rsi) + aeskeygenassist $0x40, %xmm0, %xmm1 + pshufd $0xff, %xmm1, %xmm1 + movdqa %xmm0, %xmm2 + pslldq $4, %xmm2 + pxor %xmm2, %xmm0 + pslldq $4, %xmm2 + pxor %xmm2, %xmm0 + pslldq $4, %xmm2 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqu %xmm0, 112(%rsi) + aeskeygenassist $0x80, %xmm0, %xmm1 + pshufd $0xff, %xmm1, %xmm1 + movdqa %xmm0, %xmm2 + pslldq $4, %xmm2 + pxor %xmm2, %xmm0 + pslldq $4, %xmm2 + pxor %xmm2, %xmm0 + pslldq $4, %xmm2 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqu %xmm0, 128(%rsi) + aeskeygenassist $27, %xmm0, %xmm1 + pshufd $0xff, %xmm1, %xmm1 + movdqa %xmm0, %xmm2 + pslldq $4, %xmm2 + pxor %xmm2, %xmm0 + pslldq $4, %xmm2 + pxor %xmm2, %xmm0 + pslldq $4, %xmm2 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqu %xmm0, 144(%rsi) + aeskeygenassist $54, %xmm0, %xmm1 + pshufd $0xff, %xmm1, %xmm1 + movdqa %xmm0, %xmm2 + pslldq $4, %xmm2 + pxor %xmm2, %xmm0 + pslldq $4, %xmm2 + pxor %xmm2, %xmm0 + pslldq $4, %xmm2 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqu %xmm0, 160(%rsi) /* eleventh and last block */ + repz retq +#ifndef __APPLE__ +.size AES_128_Key_Expansion_AESNI,.-AES_128_Key_Expansion_AESNI +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_192_Key_Expansion_AESNI +.type AES_192_Key_Expansion_AESNI,@function +.align 16 +AES_192_Key_Expansion_AESNI: +#else +.section __TEXT,__text +.globl _AES_192_Key_Expansion_AESNI +.p2align 4 +_AES_192_Key_Expansion_AESNI: +#endif 
/* __APPLE__ */ + movdqu (%rdi), %xmm0 + pxor %xmm1, %xmm1 + pinsrq $0x00, 16(%rdi), %xmm1 + movdqu %xmm0, (%rsi) + movdqa %xmm1, %xmm4 + aeskeygenassist $0x01, %xmm1, %xmm2 + pshufd $0x55, %xmm2, %xmm2 + movdqa %xmm0, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pxor %xmm2, %xmm0 + pshufd $0xff, %xmm0, %xmm2 + movdqa %xmm1, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + pxor %xmm2, %xmm1 + shufpd $0x00, %xmm0, %xmm4 + movdqu %xmm4, 16(%rsi) + movdqa %xmm0, %xmm5 + shufpd $0x01, %xmm1, %xmm5 + movdqu %xmm5, 32(%rsi) + aeskeygenassist $2, %xmm1, %xmm2 + pshufd $0x55, %xmm2, %xmm2 + movdqa %xmm0, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pxor %xmm2, %xmm0 + pshufd $0xff, %xmm0, %xmm2 + movdqa %xmm1, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + pxor %xmm2, %xmm1 + movdqu %xmm0, 48(%rsi) + movdqa %xmm1, %xmm4 + aeskeygenassist $4, %xmm1, %xmm2 + pshufd $0x55, %xmm2, %xmm2 + movdqa %xmm0, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pxor %xmm2, %xmm0 + pshufd $0xff, %xmm0, %xmm2 + movdqa %xmm1, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + pxor %xmm2, %xmm1 + shufpd $0x00, %xmm0, %xmm4 + movdqu %xmm4, 64(%rsi) + movdqa %xmm0, %xmm5 + shufpd $0x01, %xmm1, %xmm5 + movdqu %xmm5, 80(%rsi) + aeskeygenassist $8, %xmm1, %xmm2 + pshufd $0x55, %xmm2, %xmm2 + movdqa %xmm0, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pxor %xmm2, %xmm0 + pshufd $0xff, %xmm0, %xmm2 + movdqa %xmm1, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + pxor %xmm2, %xmm1 + movdqu %xmm0, 96(%rsi) + movdqa %xmm1, %xmm4 + aeskeygenassist $16, %xmm1, %xmm2 + pshufd $0x55, %xmm2, %xmm2 + movdqa %xmm0, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq 
$4, %xmm3 + pxor %xmm3, %xmm0 + pxor %xmm2, %xmm0 + pshufd $0xff, %xmm0, %xmm2 + movdqa %xmm1, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + pxor %xmm2, %xmm1 + shufpd $0x00, %xmm0, %xmm4 + movdqu %xmm4, 112(%rsi) + movdqa %xmm0, %xmm5 + shufpd $0x01, %xmm1, %xmm5 + movdqu %xmm5, 128(%rsi) + aeskeygenassist $32, %xmm1, %xmm2 + pshufd $0x55, %xmm2, %xmm2 + movdqa %xmm0, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pxor %xmm2, %xmm0 + pshufd $0xff, %xmm0, %xmm2 + movdqa %xmm1, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + pxor %xmm2, %xmm1 + movdqu %xmm0, 144(%rsi) + movdqa %xmm1, %xmm4 + aeskeygenassist $0x40, %xmm1, %xmm2 + pshufd $0x55, %xmm2, %xmm2 + movdqa %xmm0, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pxor %xmm2, %xmm0 + pshufd $0xff, %xmm0, %xmm2 + movdqa %xmm1, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + pxor %xmm2, %xmm1 + shufpd $0x00, %xmm0, %xmm4 + movdqu %xmm4, 160(%rsi) + movdqa %xmm0, %xmm5 + shufpd $0x01, %xmm1, %xmm5 + movdqu %xmm5, 176(%rsi) + aeskeygenassist $0x80, %xmm1, %xmm2 + pshufd $0x55, %xmm2, %xmm2 + movdqa %xmm0, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pxor %xmm2, %xmm0 + pshufd $0xff, %xmm0, %xmm2 + movdqa %xmm1, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + pxor %xmm2, %xmm1 + movdqu %xmm0, 192(%rsi) + movdqu %xmm1, 208(%rsi) + repz retq +#ifndef __APPLE__ +.size AES_192_Key_Expansion_AESNI,.-AES_192_Key_Expansion_AESNI +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_256_Key_Expansion_AESNI +.type AES_256_Key_Expansion_AESNI,@function +.align 16 +AES_256_Key_Expansion_AESNI: +#else +.section __TEXT,__text +.globl _AES_256_Key_Expansion_AESNI +.p2align 4 +_AES_256_Key_Expansion_AESNI: +#endif /* __APPLE__ */ /* AES_256_Key_Expansion_AESNI: reads a 32-byte key from (%rdi) and writes fifteen 16-byte round keys to (%rsi) at offsets 0 through 224. Steps that update %xmm0 use aeskeygenassist with round constants 0x01 through 0x40 and broadcast dword 3 (pshufd $0xff); steps that update %xmm1 use constant 0x00 and broadcast dword 2 (pshufd $0xaa). */ + movdqu (%rdi), %xmm0 + movdqu 16(%rdi), %xmm1 + movdqu %xmm0,
(%rsi) + movdqu %xmm1, 16(%rsi) + aeskeygenassist $0x01, %xmm1, %xmm2 + pshufd $0xff, %xmm2, %xmm2 + movdqa %xmm0, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pxor %xmm2, %xmm0 + movdqu %xmm0, 32(%rsi) + aeskeygenassist $0x00, %xmm0, %xmm2 + pshufd $0xaa, %xmm2, %xmm2 + movdqa %xmm1, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + pxor %xmm2, %xmm1 + movdqu %xmm1, 48(%rsi) + aeskeygenassist $2, %xmm1, %xmm2 + pshufd $0xff, %xmm2, %xmm2 + movdqa %xmm0, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pxor %xmm2, %xmm0 + movdqu %xmm0, 64(%rsi) + aeskeygenassist $0x00, %xmm0, %xmm2 + pshufd $0xaa, %xmm2, %xmm2 + movdqa %xmm1, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + pxor %xmm2, %xmm1 + movdqu %xmm1, 80(%rsi) + aeskeygenassist $4, %xmm1, %xmm2 + pshufd $0xff, %xmm2, %xmm2 + movdqa %xmm0, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pxor %xmm2, %xmm0 + movdqu %xmm0, 96(%rsi) + aeskeygenassist $0x00, %xmm0, %xmm2 + pshufd $0xaa, %xmm2, %xmm2 + movdqa %xmm1, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + pxor %xmm2, %xmm1 + movdqu %xmm1, 112(%rsi) + aeskeygenassist $8, %xmm1, %xmm2 + pshufd $0xff, %xmm2, %xmm2 + movdqa %xmm0, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pxor %xmm2, %xmm0 + movdqu %xmm0, 128(%rsi) + aeskeygenassist $0x00, %xmm0, %xmm2 + pshufd $0xaa, %xmm2, %xmm2 + movdqa %xmm1, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + pxor
%xmm2, %xmm1 + movdqu %xmm1, 144(%rsi) + aeskeygenassist $16, %xmm1, %xmm2 + pshufd $0xff, %xmm2, %xmm2 + movdqa %xmm0, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pxor %xmm2, %xmm0 + movdqu %xmm0, 160(%rsi) + aeskeygenassist $0x00, %xmm0, %xmm2 + pshufd $0xaa, %xmm2, %xmm2 + movdqa %xmm1, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + pxor %xmm2, %xmm1 + movdqu %xmm1, 176(%rsi) + aeskeygenassist $32, %xmm1, %xmm2 + pshufd $0xff, %xmm2, %xmm2 + movdqa %xmm0, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pxor %xmm2, %xmm0 + movdqu %xmm0, 192(%rsi) + aeskeygenassist $0x00, %xmm0, %xmm2 + pshufd $0xaa, %xmm2, %xmm2 + movdqa %xmm1, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + pxor %xmm2, %xmm1 + movdqu %xmm1, 208(%rsi) + aeskeygenassist $0x40, %xmm1, %xmm2 + pshufd $0xff, %xmm2, %xmm2 + movdqa %xmm0, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pslldq $4, %xmm3 + pxor %xmm3, %xmm0 + pxor %xmm2, %xmm0 + movdqu %xmm0, 224(%rsi) + repz retq +#ifndef __APPLE__ +.size AES_256_Key_Expansion_AESNI,.-AES_256_Key_Expansion_AESNI +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_ECB_encrypt_AESNI +.type AES_ECB_encrypt_AESNI,@function +.align 16 +AES_ECB_encrypt_AESNI: +#else +.section __TEXT,__text +.globl _AES_ECB_encrypt_AESNI +.p2align 4 +_AES_ECB_encrypt_AESNI: +#endif /* __APPLE__ */ /* AES_ECB_encrypt_AESNI: as used below, %rdi = input, %rsi = output, %edx = byte count, %rcx = round keys, %r8d = value compared against 11 and 13 to choose the final round key at offset 160, 192 or 224 (presumably the round count). Encrypts 64 bytes per pass while at least 64 remain, then 16 bytes per pass. NOTE(review): the 16-byte loop always runs once when entered, so %edx is presumably a multiple of 16; confirm against callers. */ + xorl %eax, %eax + cmpl $0x40, %edx + movl %edx, %r9d + jl L_AES_ECB_encrypt_AESNI_done_64 + andl $0xffffffc0, %r9d +L_AES_ECB_encrypt_AESNI_enc_64: + # 64 bytes of input + # aes_ecb_enc_64 + leaq (%rdi,%rax,1), %r10 + leaq (%rsi,%rax,1), %r11 + movdqu (%r10), %xmm0 + movdqu 16(%r10), %xmm1 + movdqu 32(%r10), %xmm2 + movdqu
48(%r10), %xmm3 + # aes_enc_block + movdqu (%rcx), %xmm4 + pxor %xmm4, %xmm0 + pxor %xmm4, %xmm1 + pxor %xmm4, %xmm2 + pxor %xmm4, %xmm3 + movdqu 16(%rcx), %xmm4 + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + movdqu 32(%rcx), %xmm4 + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + movdqu 48(%rcx), %xmm4 + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + movdqu 64(%rcx), %xmm4 + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + movdqu 80(%rcx), %xmm4 + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + movdqu 96(%rcx), %xmm4 + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + movdqu 112(%rcx), %xmm4 + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + movdqu 128(%rcx), %xmm4 + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + movdqu 144(%rcx), %xmm4 + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + cmpl $11, %r8d + movdqu 160(%rcx), %xmm4 + jl L_AES_ECB_encrypt_AESNI_64_aes_enc_block_last + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + movdqu 176(%rcx), %xmm4 + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + cmpl $13, %r8d + movdqu 192(%rcx), %xmm4 + jl L_AES_ECB_encrypt_AESNI_64_aes_enc_block_last + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + movdqu 208(%rcx), %xmm4 + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + movdqu 224(%rcx), %xmm4 +L_AES_ECB_encrypt_AESNI_64_aes_enc_block_last: + aesenclast %xmm4, %xmm0 + aesenclast %xmm4, %xmm1 + aesenclast %xmm4, %xmm2 + aesenclast %xmm4, %xmm3 + movdqu %xmm0, (%r11) + movdqu %xmm1, 16(%r11) + movdqu %xmm2, 32(%r11)
+ movdqu %xmm3, 48(%r11) + addl $0x40, %eax + cmpl %r9d, %eax + jl L_AES_ECB_encrypt_AESNI_enc_64 +L_AES_ECB_encrypt_AESNI_done_64: + cmpl %edx, %eax + movl %edx, %r9d + je L_AES_ECB_encrypt_AESNI_done_enc + andl $0xfffffff0, %r9d +L_AES_ECB_encrypt_AESNI_enc_16: + # 16 bytes of input + leaq (%rdi,%rax,1), %r10 + movdqu (%r10), %xmm0 + # aes_enc_block + pxor (%rcx), %xmm0 + movdqu 16(%rcx), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 32(%rcx), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 48(%rcx), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 64(%rcx), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 80(%rcx), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 96(%rcx), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 112(%rcx), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 128(%rcx), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 144(%rcx), %xmm5 + aesenc %xmm5, %xmm0 + cmpl $11, %r8d + movdqu 160(%rcx), %xmm5 + jl L_AES_ECB_encrypt_AESNI_16_aes_enc_block_last + aesenc %xmm5, %xmm0 + movdqu 176(%rcx), %xmm6 + aesenc %xmm6, %xmm0 + cmpl $13, %r8d + movdqu 192(%rcx), %xmm5 + jl L_AES_ECB_encrypt_AESNI_16_aes_enc_block_last + aesenc %xmm5, %xmm0 + movdqu 208(%rcx), %xmm6 + aesenc %xmm6, %xmm0 + movdqu 224(%rcx), %xmm5 +L_AES_ECB_encrypt_AESNI_16_aes_enc_block_last: + aesenclast %xmm5, %xmm0 + leaq (%rsi,%rax,1), %r10 + movdqu %xmm0, (%r10) + addl $16, %eax + cmpl %r9d, %eax + jl L_AES_ECB_encrypt_AESNI_enc_16 +L_AES_ECB_encrypt_AESNI_done_enc: + repz retq +#ifndef __APPLE__ +.size AES_ECB_encrypt_AESNI,.-AES_ECB_encrypt_AESNI +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_ECB_decrypt_AESNI +.type AES_ECB_decrypt_AESNI,@function +.align 16 +AES_ECB_decrypt_AESNI: +#else +.section __TEXT,__text +.globl _AES_ECB_decrypt_AESNI +.p2align 4 +_AES_ECB_decrypt_AESNI: +#endif /* __APPLE__ */ /* AES_ECB_decrypt_AESNI: as used below, %rdi = input, %rsi = output, %edx = byte count, %rcx = round keys, %r8d = value compared against 11 and 13 to choose the final round key at offset 160, 192 or 224 (presumably the round count). Applies aesdec/aesdeclast with the keys at (%rcx) in ascending offset order (presumably a decryption key schedule prepared by the caller; confirm). Decrypts 64 bytes per pass while at least 64 remain, then 16 bytes per pass. NOTE(review): the 16-byte loop always runs once when entered, so %edx is presumably a multiple of 16; confirm against callers. */ + xorl %eax, %eax + cmpl $0x40, %edx + movl %edx, %r9d + jl L_AES_ECB_decrypt_AESNI_done_64 + andl $0xffffffc0, %r9d +L_AES_ECB_decrypt_AESNI_dec_64: + # 64 bytes of input + # aes_ecb_dec_64 + leaq (%rdi,%rax,1), %r10 + leaq (%rsi,%rax,1), %r11 + movdqu
(%r10), %xmm0 + movdqu 16(%r10), %xmm1 + movdqu 32(%r10), %xmm2 + movdqu 48(%r10), %xmm3 + # aes_dec_block + movdqu (%rcx), %xmm4 + pxor %xmm4, %xmm0 + pxor %xmm4, %xmm1 + pxor %xmm4, %xmm2 + pxor %xmm4, %xmm3 + movdqu 16(%rcx), %xmm4 + aesdec %xmm4, %xmm0 + aesdec %xmm4, %xmm1 + aesdec %xmm4, %xmm2 + aesdec %xmm4, %xmm3 + movdqu 32(%rcx), %xmm4 + aesdec %xmm4, %xmm0 + aesdec %xmm4, %xmm1 + aesdec %xmm4, %xmm2 + aesdec %xmm4, %xmm3 + movdqu 48(%rcx), %xmm4 + aesdec %xmm4, %xmm0 + aesdec %xmm4, %xmm1 + aesdec %xmm4, %xmm2 + aesdec %xmm4, %xmm3 + movdqu 64(%rcx), %xmm4 + aesdec %xmm4, %xmm0 + aesdec %xmm4, %xmm1 + aesdec %xmm4, %xmm2 + aesdec %xmm4, %xmm3 + movdqu 80(%rcx), %xmm4 + aesdec %xmm4, %xmm0 + aesdec %xmm4, %xmm1 + aesdec %xmm4, %xmm2 + aesdec %xmm4, %xmm3 + movdqu 96(%rcx), %xmm4 + aesdec %xmm4, %xmm0 + aesdec %xmm4, %xmm1 + aesdec %xmm4, %xmm2 + aesdec %xmm4, %xmm3 + movdqu 112(%rcx), %xmm4 + aesdec %xmm4, %xmm0 + aesdec %xmm4, %xmm1 + aesdec %xmm4, %xmm2 + aesdec %xmm4, %xmm3 + movdqu 128(%rcx), %xmm4 + aesdec %xmm4, %xmm0 + aesdec %xmm4, %xmm1 + aesdec %xmm4, %xmm2 + aesdec %xmm4, %xmm3 + movdqu 144(%rcx), %xmm4 + aesdec %xmm4, %xmm0 + aesdec %xmm4, %xmm1 + aesdec %xmm4, %xmm2 + aesdec %xmm4, %xmm3 + cmpl $11, %r8d + movdqu 160(%rcx), %xmm4 + jl L_AES_ECB_decrypt_AESNI_64_aes_dec_block_last + aesdec %xmm4, %xmm0 + aesdec %xmm4, %xmm1 + aesdec %xmm4, %xmm2 + aesdec %xmm4, %xmm3 + movdqu 176(%rcx), %xmm4 + aesdec %xmm4, %xmm0 + aesdec %xmm4, %xmm1 + aesdec %xmm4, %xmm2 + aesdec %xmm4, %xmm3 + cmpl $13, %r8d + movdqu 192(%rcx), %xmm4 + jl L_AES_ECB_decrypt_AESNI_64_aes_dec_block_last + aesdec %xmm4, %xmm0 + aesdec %xmm4, %xmm1 + aesdec %xmm4, %xmm2 + aesdec %xmm4, %xmm3 + movdqu 208(%rcx), %xmm4 + aesdec %xmm4, %xmm0 + aesdec %xmm4, %xmm1 + aesdec %xmm4, %xmm2 + aesdec %xmm4, %xmm3 + movdqu 224(%rcx), %xmm4 +L_AES_ECB_decrypt_AESNI_64_aes_dec_block_last: + aesdeclast %xmm4, %xmm0 + aesdeclast %xmm4, %xmm1 + aesdeclast %xmm4, %xmm2 + aesdeclast %xmm4, %xmm3
+ movdqu %xmm0, (%r11) + movdqu %xmm1, 16(%r11) + movdqu %xmm2, 32(%r11) + movdqu %xmm3, 48(%r11) + addl $0x40, %eax + cmpl %r9d, %eax + jl L_AES_ECB_decrypt_AESNI_dec_64 +L_AES_ECB_decrypt_AESNI_done_64: + cmpl %edx, %eax + movl %edx, %r9d + je L_AES_ECB_decrypt_AESNI_done_dec + andl $0xfffffff0, %r9d +L_AES_ECB_decrypt_AESNI_dec_16: + # 16 bytes of input + leaq (%rdi,%rax,1), %r10 + movdqu (%r10), %xmm0 + # aes_dec_block + pxor (%rcx), %xmm0 + movdqu 16(%rcx), %xmm5 + aesdec %xmm5, %xmm0 + movdqu 32(%rcx), %xmm5 + aesdec %xmm5, %xmm0 + movdqu 48(%rcx), %xmm5 + aesdec %xmm5, %xmm0 + movdqu 64(%rcx), %xmm5 + aesdec %xmm5, %xmm0 + movdqu 80(%rcx), %xmm5 + aesdec %xmm5, %xmm0 + movdqu 96(%rcx), %xmm5 + aesdec %xmm5, %xmm0 + movdqu 112(%rcx), %xmm5 + aesdec %xmm5, %xmm0 + movdqu 128(%rcx), %xmm5 + aesdec %xmm5, %xmm0 + movdqu 144(%rcx), %xmm5 + aesdec %xmm5, %xmm0 + cmpl $11, %r8d + movdqu 160(%rcx), %xmm5 + jl L_AES_ECB_decrypt_AESNI_16_aes_dec_block_last + aesdec %xmm5, %xmm0 + movdqu 176(%rcx), %xmm6 + aesdec %xmm6, %xmm0 + cmpl $13, %r8d + movdqu 192(%rcx), %xmm5 + jl L_AES_ECB_decrypt_AESNI_16_aes_dec_block_last + aesdec %xmm5, %xmm0 + movdqu 208(%rcx), %xmm6 + aesdec %xmm6, %xmm0 + movdqu 224(%rcx), %xmm5 +L_AES_ECB_decrypt_AESNI_16_aes_dec_block_last: + aesdeclast %xmm5, %xmm0 + leaq (%rsi,%rax,1), %r10 + movdqu %xmm0, (%r10) + addl $16, %eax + cmpl %r9d, %eax + jl L_AES_ECB_decrypt_AESNI_dec_16 +L_AES_ECB_decrypt_AESNI_done_dec: + repz retq +#ifndef __APPLE__ +.size AES_ECB_decrypt_AESNI,.-AES_ECB_decrypt_AESNI +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_CBC_encrypt_AESNI +.type AES_CBC_encrypt_AESNI,@function +.align 16 +AES_CBC_encrypt_AESNI: +#else +.section __TEXT,__text +.globl _AES_CBC_encrypt_AESNI +.p2align 4 +_AES_CBC_encrypt_AESNI: +#endif /* __APPLE__ */ /* AES_CBC_encrypt_AESNI: as used below, %rdi = input, %rsi = output, %rdx = 16-byte chaining value (read on entry, rewritten on exit with the last output block), %ecx = byte count, %r8 = round keys, %r9d = value compared against 11 and 13 to choose the final round key at offset 160, 192 or 224 (presumably the round count). Each 16-byte input block is XORed with the previous output block before encryption; nothing is processed when %ecx is zero. */ + movdqu (%rdx), %xmm0 + xorl %eax, %eax + cmpl %ecx, %eax + je L_AES_CBC_encrypt_AESNI_done +L_AES_CBC_encrypt_AESNI_loop: + # 16 bytes of input + leaq (%rdi,%rax,1), %r10 +
movdqu (%r10), %xmm1 + pxor %xmm0, %xmm1 + # aes_enc_block + pxor (%r8), %xmm1 + movdqu 16(%r8), %xmm3 + aesenc %xmm3, %xmm1 + movdqu 32(%r8), %xmm3 + aesenc %xmm3, %xmm1 + movdqu 48(%r8), %xmm3 + aesenc %xmm3, %xmm1 + movdqu 64(%r8), %xmm3 + aesenc %xmm3, %xmm1 + movdqu 80(%r8), %xmm3 + aesenc %xmm3, %xmm1 + movdqu 96(%r8), %xmm3 + aesenc %xmm3, %xmm1 + movdqu 112(%r8), %xmm3 + aesenc %xmm3, %xmm1 + movdqu 128(%r8), %xmm3 + aesenc %xmm3, %xmm1 + movdqu 144(%r8), %xmm3 + aesenc %xmm3, %xmm1 + cmpl $11, %r9d + movdqu 160(%r8), %xmm3 + jl L_AES_CBC_encrypt_AESNI_aes_enc_block_last + aesenc %xmm3, %xmm1 + movdqu 176(%r8), %xmm4 + aesenc %xmm4, %xmm1 + cmpl $13, %r9d + movdqu 192(%r8), %xmm3 + jl L_AES_CBC_encrypt_AESNI_aes_enc_block_last + aesenc %xmm3, %xmm1 + movdqu 208(%r8), %xmm4 + aesenc %xmm4, %xmm1 + movdqu 224(%r8), %xmm3 +L_AES_CBC_encrypt_AESNI_aes_enc_block_last: + aesenclast %xmm3, %xmm1 + leaq (%rsi,%rax,1), %r11 + movdqu %xmm1, (%r11) + movdqa %xmm1, %xmm0 + addl $16, %eax + cmpl %ecx, %eax + jl L_AES_CBC_encrypt_AESNI_loop +L_AES_CBC_encrypt_AESNI_done: + movdqu %xmm0, (%rdx) + repz retq +#ifndef __APPLE__ +.size AES_CBC_encrypt_AESNI,.-AES_CBC_encrypt_AESNI +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_CBC_decrypt_AESNI +.type AES_CBC_decrypt_AESNI,@function +.align 16 +AES_CBC_decrypt_AESNI: +#else +.section __TEXT,__text +.globl _AES_CBC_decrypt_AESNI +.p2align 4 +_AES_CBC_decrypt_AESNI: +#endif /* __APPLE__ */ /* AES_CBC_decrypt_AESNI: as used below, %rdi = input, %rsi = output, %rdx = 16-byte chaining value (read on entry; on exit it holds the last input block processed, or the original value if none was), %ecx = byte count, %r8 = round keys, %r9d = value compared against 11 and 13 to choose the final round key at offset 160, 192 or 224 (presumably the round count). Decrypts 64 bytes per pass while at least 64 remain, then 16 bytes per pass, XORing each decrypted block with the preceding input block. %r12 is saved and restored. NOTE(review): the 16-byte loop always runs once when entered, so %ecx is presumably a multiple of 16; confirm against callers. */ + pushq %r12 + movdqu (%rdx), %xmm4 + xorl %eax, %eax + cmpl $0x40, %ecx + movl %ecx, %r10d + jl L_AES_CBC_decrypt_AESNI_done_64 + andl $0xffffffc0, %r10d +L_AES_CBC_decrypt_AESNI_dec_64: + # 64 bytes of input + # aes_cbc_dec_64 + leaq (%rdi,%rax,1), %r11 + leaq (%rsi,%rax,1), %r12 + movdqu (%r11), %xmm0 + movdqu 16(%r11), %xmm1 + movdqu 32(%r11), %xmm2 + movdqu 48(%r11), %xmm3 + # aes_dec_block + movdqu (%r8), %xmm5 + pxor %xmm5, %xmm0 + pxor %xmm5, %xmm1 + pxor %xmm5, %xmm2 + pxor %xmm5, %xmm3 + movdqu 16(%r8), %xmm5 + aesdec
%xmm5, %xmm0 + aesdec %xmm5, %xmm1 + aesdec %xmm5, %xmm2 + aesdec %xmm5, %xmm3 + movdqu 32(%r8), %xmm5 + aesdec %xmm5, %xmm0 + aesdec %xmm5, %xmm1 + aesdec %xmm5, %xmm2 + aesdec %xmm5, %xmm3 + movdqu 48(%r8), %xmm5 + aesdec %xmm5, %xmm0 + aesdec %xmm5, %xmm1 + aesdec %xmm5, %xmm2 + aesdec %xmm5, %xmm3 + movdqu 64(%r8), %xmm5 + aesdec %xmm5, %xmm0 + aesdec %xmm5, %xmm1 + aesdec %xmm5, %xmm2 + aesdec %xmm5, %xmm3 + movdqu 80(%r8), %xmm5 + aesdec %xmm5, %xmm0 + aesdec %xmm5, %xmm1 + aesdec %xmm5, %xmm2 + aesdec %xmm5, %xmm3 + movdqu 96(%r8), %xmm5 + aesdec %xmm5, %xmm0 + aesdec %xmm5, %xmm1 + aesdec %xmm5, %xmm2 + aesdec %xmm5, %xmm3 + movdqu 112(%r8), %xmm5 + aesdec %xmm5, %xmm0 + aesdec %xmm5, %xmm1 + aesdec %xmm5, %xmm2 + aesdec %xmm5, %xmm3 + movdqu 128(%r8), %xmm5 + aesdec %xmm5, %xmm0 + aesdec %xmm5, %xmm1 + aesdec %xmm5, %xmm2 + aesdec %xmm5, %xmm3 + movdqu 144(%r8), %xmm5 + aesdec %xmm5, %xmm0 + aesdec %xmm5, %xmm1 + aesdec %xmm5, %xmm2 + aesdec %xmm5, %xmm3 + cmpl $11, %r9d + movdqu 160(%r8), %xmm5 + jl L_AES_CBC_decrypt_AESNI_64_aes_dec_block_last + aesdec %xmm5, %xmm0 + aesdec %xmm5, %xmm1 + aesdec %xmm5, %xmm2 + aesdec %xmm5, %xmm3 + movdqu 176(%r8), %xmm5 + aesdec %xmm5, %xmm0 + aesdec %xmm5, %xmm1 + aesdec %xmm5, %xmm2 + aesdec %xmm5, %xmm3 + cmpl $13, %r9d + movdqu 192(%r8), %xmm5 + jl L_AES_CBC_decrypt_AESNI_64_aes_dec_block_last + aesdec %xmm5, %xmm0 + aesdec %xmm5, %xmm1 + aesdec %xmm5, %xmm2 + aesdec %xmm5, %xmm3 + movdqu 208(%r8), %xmm5 + aesdec %xmm5, %xmm0 + aesdec %xmm5, %xmm1 + aesdec %xmm5, %xmm2 + aesdec %xmm5, %xmm3 + movdqu 224(%r8), %xmm5 +L_AES_CBC_decrypt_AESNI_64_aes_dec_block_last: + aesdeclast %xmm5, %xmm0 + aesdeclast %xmm5, %xmm1 + aesdeclast %xmm5, %xmm2 + aesdeclast %xmm5, %xmm3 + pxor %xmm4, %xmm0 + movdqu (%r11), %xmm5 + pxor %xmm5, %xmm1 + movdqu 16(%r11), %xmm5 + pxor %xmm5, %xmm2 + movdqu 32(%r11), %xmm5 + pxor %xmm5, %xmm3 + movdqu 48(%r11), %xmm4 + movdqu %xmm0, (%r12) + movdqu %xmm1, 16(%r12) + movdqu %xmm2, 32(%r12) +
movdqu %xmm3, 48(%r12) + addl $0x40, %eax + cmpl %r10d, %eax + jl L_AES_CBC_decrypt_AESNI_dec_64 +L_AES_CBC_decrypt_AESNI_done_64: + cmpl %ecx, %eax + movl %ecx, %r10d + je L_AES_CBC_decrypt_AESNI_done_dec + andl $0xfffffff0, %r10d +L_AES_CBC_decrypt_AESNI_dec_16: + # 16 bytes of input + leaq (%rdi,%rax,1), %r11 + movdqu (%r11), %xmm0 + movdqa %xmm0, %xmm8 + # aes_dec_block + pxor (%r8), %xmm0 + movdqu 16(%r8), %xmm6 + aesdec %xmm6, %xmm0 + movdqu 32(%r8), %xmm6 + aesdec %xmm6, %xmm0 + movdqu 48(%r8), %xmm6 + aesdec %xmm6, %xmm0 + movdqu 64(%r8), %xmm6 + aesdec %xmm6, %xmm0 + movdqu 80(%r8), %xmm6 + aesdec %xmm6, %xmm0 + movdqu 96(%r8), %xmm6 + aesdec %xmm6, %xmm0 + movdqu 112(%r8), %xmm6 + aesdec %xmm6, %xmm0 + movdqu 128(%r8), %xmm6 + aesdec %xmm6, %xmm0 + movdqu 144(%r8), %xmm6 + aesdec %xmm6, %xmm0 + cmpl $11, %r9d + movdqu 160(%r8), %xmm6 + jl L_AES_CBC_decrypt_AESNI_16_aes_dec_block_last + aesdec %xmm6, %xmm0 + movdqu 176(%r8), %xmm7 + aesdec %xmm7, %xmm0 + cmpl $13, %r9d + movdqu 192(%r8), %xmm6 + jl L_AES_CBC_decrypt_AESNI_16_aes_dec_block_last + aesdec %xmm6, %xmm0 + movdqu 208(%r8), %xmm7 + aesdec %xmm7, %xmm0 + movdqu 224(%r8), %xmm6 +L_AES_CBC_decrypt_AESNI_16_aes_dec_block_last: + aesdeclast %xmm6, %xmm0 + pxor %xmm4, %xmm0 + movdqa %xmm8, %xmm4 + leaq (%rsi,%rax,1), %r11 + movdqu %xmm0, (%r11) + addl $16, %eax + cmpl %r10d, %eax + jl L_AES_CBC_decrypt_AESNI_dec_16 +L_AES_CBC_decrypt_AESNI_done_dec: + movdqu %xmm4, (%rdx) + popq %r12 + repz retq +#ifndef __APPLE__ +.size AES_CBC_decrypt_AESNI,.-AES_CBC_decrypt_AESNI +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ /* pshufb mask that reverses the order of all 16 bytes of a register. */ +L_aes_ctr_aesni_bswap: +.quad 0x08090a0b0c0d0e0f,0x0001020304050607 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ /* 128-bit constant with the value 1 in the low quadword and 0 in the high quadword. */ +L_aes_ctr_aesni_one: +.quad
0x0000000000000001,0x0000000000000000 +#ifndef __APPLE__ +.text +.globl AES_CTR_encrypt_AESNI +.type AES_CTR_encrypt_AESNI,@function +.align 16 +AES_CTR_encrypt_AESNI: +#else +.section __TEXT,__text +.globl _AES_CTR_encrypt_AESNI +.p2align 4 +_AES_CTR_encrypt_AESNI: +#endif /* __APPLE__ */ /* AES_CTR_encrypt_AESNI: as used below, %rdi = input, %rsi = output, %edx = byte count, %rcx = round keys, %r8d = value compared against 11 and 13 to choose the final round key at offset 160, 192 or 224 (presumably the round count), %r9 = 16-byte counter block (read on entry, written back on exit). The counter is kept byte-reversed in %xmm7 and advanced as a 128-bit integer: paddq adds 1 to the low quadword, and 1 is added to the high quadword when the low quadword has become zero. Each counter value is byte-reversed back, encrypted and XORed with 16 bytes of input. Handles 64 bytes per pass while at least 64 remain, then 16 bytes per pass. %rbx is saved and restored. NOTE(review): the 16-byte loop always runs once when entered, so %edx is presumably a multiple of 16; confirm against callers. */ + pushq %rbx + movdqu L_aes_ctr_aesni_bswap(%rip), %xmm8 + movdqu L_aes_ctr_aesni_one(%rip), %xmm9 + pxor %xmm10, %xmm10 + movdqu (%r9), %xmm7 + pshufb %xmm8, %xmm7 + xorl %eax, %eax + cmpl $0x40, %edx + movl %edx, %r10d + jl L_AES_CTR_encrypt_AESNI_done_64 + andl $0xffffffc0, %r10d +L_AES_CTR_encrypt_AESNI_enc_64: + # 64 bytes of input + # aes_ctr_enc_64 + leaq (%rdi,%rax,1), %r11 + leaq (%rsi,%rax,1), %rbx + movdqa %xmm7, %xmm0 + pshufb %xmm8, %xmm0 + paddq %xmm9, %xmm7 + movdqa %xmm7, %xmm11 + pcmpeqq %xmm10, %xmm11 + pslldq $8, %xmm11 + psrlq $63, %xmm11 + paddq %xmm11, %xmm7 + movdqa %xmm7, %xmm1 + pshufb %xmm8, %xmm1 + paddq %xmm9, %xmm7 + movdqa %xmm7, %xmm11 + pcmpeqq %xmm10, %xmm11 + pslldq $8, %xmm11 + psrlq $63, %xmm11 + paddq %xmm11, %xmm7 + movdqa %xmm7, %xmm2 + pshufb %xmm8, %xmm2 + paddq %xmm9, %xmm7 + movdqa %xmm7, %xmm11 + pcmpeqq %xmm10, %xmm11 + pslldq $8, %xmm11 + psrlq $63, %xmm11 + paddq %xmm11, %xmm7 + movdqa %xmm7, %xmm3 + pshufb %xmm8, %xmm3 + paddq %xmm9, %xmm7 + movdqa %xmm7, %xmm11 + pcmpeqq %xmm10, %xmm11 + pslldq $8, %xmm11 + psrlq $63, %xmm11 + paddq %xmm11, %xmm7 + # aes_enc_block + movdqu (%rcx), %xmm4 + pxor %xmm4, %xmm0 + pxor %xmm4, %xmm1 + pxor %xmm4, %xmm2 + pxor %xmm4, %xmm3 + movdqu 16(%rcx), %xmm4 + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + movdqu 32(%rcx), %xmm4 + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + movdqu 48(%rcx), %xmm4 + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + movdqu 64(%rcx), %xmm4 + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + movdqu 80(%rcx),
%xmm4 + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + movdqu 96(%rcx), %xmm4 + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + movdqu 112(%rcx), %xmm4 + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + movdqu 128(%rcx), %xmm4 + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + movdqu 144(%rcx), %xmm4 + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + cmpl $11, %r8d + movdqu 160(%rcx), %xmm4 + jl L_AES_CTR_encrypt_AESNI_64_aes_enc_block_last + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + movdqu 176(%rcx), %xmm4 + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + cmpl $13, %r8d + movdqu 192(%rcx), %xmm4 + jl L_AES_CTR_encrypt_AESNI_64_aes_enc_block_last + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + movdqu 208(%rcx), %xmm4 + aesenc %xmm4, %xmm0 + aesenc %xmm4, %xmm1 + aesenc %xmm4, %xmm2 + aesenc %xmm4, %xmm3 + movdqu 224(%rcx), %xmm4 +L_AES_CTR_encrypt_AESNI_64_aes_enc_block_last: + aesenclast %xmm4, %xmm0 + aesenclast %xmm4, %xmm1 + aesenclast %xmm4, %xmm2 + aesenclast %xmm4, %xmm3 + movdqu (%r11), %xmm4 + pxor %xmm4, %xmm0 + movdqu 16(%r11), %xmm4 + pxor %xmm4, %xmm1 + movdqu 32(%r11), %xmm4 + pxor %xmm4, %xmm2 + movdqu 48(%r11), %xmm4 + pxor %xmm4, %xmm3 + movdqu %xmm0, (%rbx) + movdqu %xmm1, 16(%rbx) + movdqu %xmm2, 32(%rbx) + movdqu %xmm3, 48(%rbx) + addl $0x40, %eax + cmpl %r10d, %eax + jl L_AES_CTR_encrypt_AESNI_enc_64 +L_AES_CTR_encrypt_AESNI_done_64: + cmpl %edx, %eax + movl %edx, %r10d + je L_AES_CTR_encrypt_AESNI_done_enc + andl $0xfffffff0, %r10d +L_AES_CTR_encrypt_AESNI_enc_16: + # 16 bytes of input + movdqa %xmm7, %xmm0 + pshufb %xmm8, %xmm0 + paddq %xmm9, %xmm7 + movdqa %xmm7, %xmm11 + pcmpeqq %xmm10, %xmm11 + pslldq $8, %xmm11 + psrlq
$63, %xmm11 + paddq %xmm11, %xmm7 + # aes_enc_block + pxor (%rcx), %xmm0 + movdqu 16(%rcx), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 32(%rcx), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 48(%rcx), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 64(%rcx), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 80(%rcx), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 96(%rcx), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 112(%rcx), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 128(%rcx), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 144(%rcx), %xmm5 + aesenc %xmm5, %xmm0 + cmpl $11, %r8d + movdqu 160(%rcx), %xmm5 + jl L_AES_CTR_encrypt_AESNI_16_aes_enc_block_last + aesenc %xmm5, %xmm0 + movdqu 176(%rcx), %xmm6 + aesenc %xmm6, %xmm0 + cmpl $13, %r8d + movdqu 192(%rcx), %xmm5 + jl L_AES_CTR_encrypt_AESNI_16_aes_enc_block_last + aesenc %xmm5, %xmm0 + movdqu 208(%rcx), %xmm6 + aesenc %xmm6, %xmm0 + movdqu 224(%rcx), %xmm5 +L_AES_CTR_encrypt_AESNI_16_aes_enc_block_last: + aesenclast %xmm5, %xmm0 + leaq (%rdi,%rax,1), %r11 + movdqu (%r11), %xmm4 + pxor %xmm4, %xmm0 + leaq (%rsi,%rax,1), %r11 + movdqu %xmm0, (%r11) + addl $16, %eax + cmpl %r10d, %eax + jl L_AES_CTR_encrypt_AESNI_enc_16 +L_AES_CTR_encrypt_AESNI_done_enc: + pshufb %xmm8, %xmm7 + movdqu %xmm7, (%r9) + popq %rbx + repz retq +#ifndef __APPLE__ +.size AES_CTR_encrypt_AESNI,.-AES_CTR_encrypt_AESNI +#endif /* __APPLE__ */ +#ifdef HAVE_INTEL_AVX1 +#ifndef __APPLE__ +.text +.globl AES_ECB_encrypt_avx1 +.type AES_ECB_encrypt_avx1,@function +.align 16 +AES_ECB_encrypt_avx1: +#else +.section __TEXT,__text +.globl _AES_ECB_encrypt_avx1 +.p2align 4 +_AES_ECB_encrypt_avx1: +#endif /* __APPLE__ */ /* AES_ECB_encrypt_avx1: built only when HAVE_INTEL_AVX1 is defined; uses VEX-encoded (v-prefixed) instructions on %xmm registers. As used below, %rdi = input, %rsi = output, %edx = byte count, %rcx = round keys, %r8d = value compared against 11 and 13 to choose the final round key at offset 160, 192 or 224 (presumably the round count). Encrypts 64 bytes per pass while at least 64 remain, then 16 bytes per pass. NOTE(review): the 16-byte loop always runs once when entered, so %edx is presumably a multiple of 16; confirm against callers. */ + xorl %eax, %eax + cmpl $0x40, %edx + movl %edx, %r9d + jl L_AES_ECB_encrypt_avx1_done_64 + andl $0xffffffc0, %r9d +L_AES_ECB_encrypt_avx1_enc_64: + # 64 bytes of input + # aes_ecb_enc_64 + leaq (%rdi,%rax,1), %r10 + leaq (%rsi,%rax,1), %r11 + vmovdqu (%r10), %xmm0 + vmovdqu 16(%r10), %xmm1 + vmovdqu 32(%r10), %xmm2 + vmovdqu 48(%r10), %xmm3 + # aes_enc_block + vmovdqu (%rcx), %xmm4 + vpxor %xmm4, %xmm0,
%xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm4, %xmm3, %xmm3 + vmovdqu 16(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 32(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 48(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 64(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 80(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 96(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 112(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 128(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 144(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + cmpl $11, %r8d + vmovdqu 160(%rcx), %xmm4 + jl L_AES_ECB_encrypt_avx1_64_aes_enc_block_last + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 176(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + cmpl $13, %r8d + vmovdqu 192(%rcx), %xmm4 + jl L_AES_ECB_encrypt_avx1_64_aes_enc_block_last + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu
208(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 224(%rcx), %xmm4 +L_AES_ECB_encrypt_avx1_64_aes_enc_block_last: + vaesenclast %xmm4, %xmm0, %xmm0 + vaesenclast %xmm4, %xmm1, %xmm1 + vaesenclast %xmm4, %xmm2, %xmm2 + vaesenclast %xmm4, %xmm3, %xmm3 + vmovdqu %xmm0, (%r11) + vmovdqu %xmm1, 16(%r11) + vmovdqu %xmm2, 32(%r11) + vmovdqu %xmm3, 48(%r11) + addl $0x40, %eax + cmpl %r9d, %eax + jl L_AES_ECB_encrypt_avx1_enc_64 +L_AES_ECB_encrypt_avx1_done_64: + cmpl %edx, %eax + movl %edx, %r9d + je L_AES_ECB_encrypt_avx1_done_enc + andl $0xfffffff0, %r9d +L_AES_ECB_encrypt_avx1_enc_16: + # 16 bytes of input + leaq (%rdi,%rax,1), %r10 + vmovdqu (%r10), %xmm0 + # aes_enc_block + vpxor (%rcx), %xmm0, %xmm0 + vmovdqu 16(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 32(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 48(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 64(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 80(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 96(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 112(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 128(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 144(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + cmpl $11, %r8d + vmovdqu 160(%rcx), %xmm5 + jl L_AES_ECB_encrypt_avx1_16_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 176(%rcx), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + cmpl $13, %r8d + vmovdqu 192(%rcx), %xmm5 + jl L_AES_ECB_encrypt_avx1_16_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 208(%rcx), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + vmovdqu 224(%rcx), %xmm5 +L_AES_ECB_encrypt_avx1_16_aes_enc_block_last: + vaesenclast %xmm5, %xmm0, %xmm0 + leaq (%rsi,%rax,1), %r10 + vmovdqu %xmm0, (%r10) + addl $16, %eax + cmpl %r9d, %eax + jl L_AES_ECB_encrypt_avx1_enc_16 +L_AES_ECB_encrypt_avx1_done_enc: + repz retq +#ifndef __APPLE__ +.size
AES_ECB_encrypt_avx1,.-AES_ECB_encrypt_avx1 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_ECB_decrypt_avx1 +.type AES_ECB_decrypt_avx1,@function +.align 16 +AES_ECB_decrypt_avx1: +#else +.section __TEXT,__text +.globl _AES_ECB_decrypt_avx1 +.p2align 4 +_AES_ECB_decrypt_avx1: +#endif /* __APPLE__ */ /* AES_ECB_decrypt_avx1: uses VEX-encoded (v-prefixed) instructions on %xmm registers. As used below, %rdi = input, %rsi = output, %edx = byte count, %rcx = round keys, %r8d = value compared against 11 and 13 to choose the final round key at offset 160, 192 or 224 (presumably the round count). Applies vaesdec/vaesdeclast with the keys at (%rcx) in ascending offset order (presumably a decryption key schedule prepared by the caller; confirm). Decrypts 64 bytes per pass while at least 64 remain, then 16 bytes per pass. NOTE(review): the 16-byte loop always runs once when entered, so %edx is presumably a multiple of 16; confirm against callers. */ + xorl %eax, %eax + cmpl $0x40, %edx + movl %edx, %r9d + jl L_AES_ECB_decrypt_avx1_done_64 + andl $0xffffffc0, %r9d +L_AES_ECB_decrypt_avx1_dec_64: + # 64 bytes of input + # aes_ecb_dec_64 + leaq (%rdi,%rax,1), %r10 + leaq (%rsi,%rax,1), %r11 + vmovdqu (%r10), %xmm0 + vmovdqu 16(%r10), %xmm1 + vmovdqu 32(%r10), %xmm2 + vmovdqu 48(%r10), %xmm3 + # aes_dec_block + vmovdqu (%rcx), %xmm4 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm4, %xmm3, %xmm3 + vmovdqu 16(%rcx), %xmm4 + vaesdec %xmm4, %xmm0, %xmm0 + vaesdec %xmm4, %xmm1, %xmm1 + vaesdec %xmm4, %xmm2, %xmm2 + vaesdec %xmm4, %xmm3, %xmm3 + vmovdqu 32(%rcx), %xmm4 + vaesdec %xmm4, %xmm0, %xmm0 + vaesdec %xmm4, %xmm1, %xmm1 + vaesdec %xmm4, %xmm2, %xmm2 + vaesdec %xmm4, %xmm3, %xmm3 + vmovdqu 48(%rcx), %xmm4 + vaesdec %xmm4, %xmm0, %xmm0 + vaesdec %xmm4, %xmm1, %xmm1 + vaesdec %xmm4, %xmm2, %xmm2 + vaesdec %xmm4, %xmm3, %xmm3 + vmovdqu 64(%rcx), %xmm4 + vaesdec %xmm4, %xmm0, %xmm0 + vaesdec %xmm4, %xmm1, %xmm1 + vaesdec %xmm4, %xmm2, %xmm2 + vaesdec %xmm4, %xmm3, %xmm3 + vmovdqu 80(%rcx), %xmm4 + vaesdec %xmm4, %xmm0, %xmm0 + vaesdec %xmm4, %xmm1, %xmm1 + vaesdec %xmm4, %xmm2, %xmm2 + vaesdec %xmm4, %xmm3, %xmm3 + vmovdqu 96(%rcx), %xmm4 + vaesdec %xmm4, %xmm0, %xmm0 + vaesdec %xmm4, %xmm1, %xmm1 + vaesdec %xmm4, %xmm2, %xmm2 + vaesdec %xmm4, %xmm3, %xmm3 + vmovdqu 112(%rcx), %xmm4 + vaesdec %xmm4, %xmm0, %xmm0 + vaesdec %xmm4, %xmm1, %xmm1 + vaesdec %xmm4, %xmm2, %xmm2 + vaesdec %xmm4, %xmm3, %xmm3 + vmovdqu 128(%rcx), %xmm4 + vaesdec %xmm4, %xmm0, %xmm0 + vaesdec %xmm4, %xmm1, %xmm1 + vaesdec %xmm4, %xmm2, %xmm2 + vaesdec %xmm4, %xmm3, %xmm3 + vmovdqu
144(%rcx), %xmm4 + vaesdec %xmm4, %xmm0, %xmm0 + vaesdec %xmm4, %xmm1, %xmm1 + vaesdec %xmm4, %xmm2, %xmm2 + vaesdec %xmm4, %xmm3, %xmm3 + cmpl $11, %r8d + vmovdqu 160(%rcx), %xmm4 + jl L_AES_ECB_decrypt_avx1_64_aes_dec_block_last + vaesdec %xmm4, %xmm0, %xmm0 + vaesdec %xmm4, %xmm1, %xmm1 + vaesdec %xmm4, %xmm2, %xmm2 + vaesdec %xmm4, %xmm3, %xmm3 + vmovdqu 176(%rcx), %xmm4 + vaesdec %xmm4, %xmm0, %xmm0 + vaesdec %xmm4, %xmm1, %xmm1 + vaesdec %xmm4, %xmm2, %xmm2 + vaesdec %xmm4, %xmm3, %xmm3 + cmpl $13, %r8d + vmovdqu 192(%rcx), %xmm4 + jl L_AES_ECB_decrypt_avx1_64_aes_dec_block_last + vaesdec %xmm4, %xmm0, %xmm0 + vaesdec %xmm4, %xmm1, %xmm1 + vaesdec %xmm4, %xmm2, %xmm2 + vaesdec %xmm4, %xmm3, %xmm3 + vmovdqu 208(%rcx), %xmm4 + vaesdec %xmm4, %xmm0, %xmm0 + vaesdec %xmm4, %xmm1, %xmm1 + vaesdec %xmm4, %xmm2, %xmm2 + vaesdec %xmm4, %xmm3, %xmm3 + vmovdqu 224(%rcx), %xmm4 +L_AES_ECB_decrypt_avx1_64_aes_dec_block_last: + vaesdeclast %xmm4, %xmm0, %xmm0 + vaesdeclast %xmm4, %xmm1, %xmm1 + vaesdeclast %xmm4, %xmm2, %xmm2 + vaesdeclast %xmm4, %xmm3, %xmm3 + vmovdqu %xmm0, (%r11) + vmovdqu %xmm1, 16(%r11) + vmovdqu %xmm2, 32(%r11) + vmovdqu %xmm3, 48(%r11) + addl $0x40, %eax + cmpl %r9d, %eax + jl L_AES_ECB_decrypt_avx1_dec_64 +L_AES_ECB_decrypt_avx1_done_64: + cmpl %edx, %eax + movl %edx, %r9d + je L_AES_ECB_decrypt_avx1_done_dec + andl $0xfffffff0, %r9d +L_AES_ECB_decrypt_avx1_dec_16: + # 16 bytes of input + leaq (%rdi,%rax,1), %r10 + vmovdqu (%r10), %xmm0 + # aes_dec_block + vpxor (%rcx), %xmm0, %xmm0 + vmovdqu 16(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 32(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 48(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 64(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 80(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 96(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 112(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 128(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 144(%rcx),
%xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + cmpl $11, %r8d + vmovdqu 160(%rcx), %xmm5 + jl L_AES_ECB_decrypt_avx1_16_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 176(%rcx), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + cmpl $13, %r8d + vmovdqu 192(%rcx), %xmm5 + jl L_AES_ECB_decrypt_avx1_16_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 208(%rcx), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 224(%rcx), %xmm5 +L_AES_ECB_decrypt_avx1_16_aes_dec_block_last: + vaesdeclast %xmm5, %xmm0, %xmm0 + leaq (%rsi,%rax,1), %r10 + vmovdqu %xmm0, (%r10) + addl $16, %eax + cmpl %r9d, %eax + jl L_AES_ECB_decrypt_avx1_dec_16 +L_AES_ECB_decrypt_avx1_done_dec: + repz retq +#ifndef __APPLE__ +.size AES_ECB_decrypt_avx1,.-AES_ECB_decrypt_avx1 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_CBC_encrypt_avx1 +.type AES_CBC_encrypt_avx1,@function +.align 16 +AES_CBC_encrypt_avx1: +#else +.section __TEXT,__text +.globl _AES_CBC_encrypt_avx1 +.p2align 4 +_AES_CBC_encrypt_avx1: +#endif /* __APPLE__ */ + vmovdqu (%rdx), %xmm0 + xorl %eax, %eax + cmpl %ecx, %eax + je L_AES_CBC_encrypt_avx1_done +L_AES_CBC_encrypt_avx1_loop: + # 16 bytes of input + leaq (%rdi,%rax,1), %r10 + vmovdqu (%r10), %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + # aes_enc_block + vpxor (%r8), %xmm1, %xmm1 + vmovdqu 16(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 32(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 48(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 64(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 80(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 96(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 112(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 128(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 144(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + cmpl $11, %r9d + vmovdqu 160(%r8), %xmm3 + jl L_AES_CBC_encrypt_avx1_aes_enc_block_last + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 176(%r8), %xmm4 + vaesenc %xmm4, %xmm1, %xmm1 + cmpl $13, %r9d + vmovdqu 192(%r8),
%xmm3 + jl L_AES_CBC_encrypt_avx1_aes_enc_block_last + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 208(%r8), %xmm4 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqu 224(%r8), %xmm3 +L_AES_CBC_encrypt_avx1_aes_enc_block_last: + vaesenclast %xmm3, %xmm1, %xmm1 + leaq (%rsi,%rax,1), %r11 + vmovdqu %xmm1, (%r11) + vmovdqa %xmm1, %xmm0 + addl $16, %eax + cmpl %ecx, %eax + jl L_AES_CBC_encrypt_avx1_loop +L_AES_CBC_encrypt_avx1_done: + vmovdqu %xmm0, (%rdx) + repz retq +#ifndef __APPLE__ +.size AES_CBC_encrypt_avx1,.-AES_CBC_encrypt_avx1 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_CBC_decrypt_avx1 +.type AES_CBC_decrypt_avx1,@function +.align 16 +AES_CBC_decrypt_avx1: +#else +.section __TEXT,__text +.globl _AES_CBC_decrypt_avx1 +.p2align 4 +_AES_CBC_decrypt_avx1: +#endif /* __APPLE__ */ + pushq %r12 + vmovdqu (%rdx), %xmm4 + xorl %eax, %eax + cmpl $0x40, %ecx + movl %ecx, %r10d + jl L_AES_CBC_decrypt_avx1_done_64 + andl $0xffffffc0, %r10d +L_AES_CBC_decrypt_avx1_dec_64: + # 64 bytes of input + # aes_cbc_dec_64 + leaq (%rdi,%rax,1), %r11 + leaq (%rsi,%rax,1), %r12 + vmovdqu (%r11), %xmm0 + vmovdqu 16(%r11), %xmm1 + vmovdqu 32(%r11), %xmm2 + vmovdqu 48(%r11), %xmm3 + # aes_dec_block + vmovdqu (%r8), %xmm5 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm5, %xmm3, %xmm3 + vmovdqu 16(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vaesdec %xmm5, %xmm1, %xmm1 + vaesdec %xmm5, %xmm2, %xmm2 + vaesdec %xmm5, %xmm3, %xmm3 + vmovdqu 32(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vaesdec %xmm5, %xmm1, %xmm1 + vaesdec %xmm5, %xmm2, %xmm2 + vaesdec %xmm5, %xmm3, %xmm3 + vmovdqu 48(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vaesdec %xmm5, %xmm1, %xmm1 + vaesdec %xmm5, %xmm2, %xmm2 + vaesdec %xmm5, %xmm3, %xmm3 + vmovdqu 64(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vaesdec %xmm5, %xmm1, %xmm1 + vaesdec %xmm5, %xmm2, %xmm2 + vaesdec %xmm5, %xmm3, %xmm3 + vmovdqu 80(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vaesdec %xmm5, %xmm1, %xmm1 + 
vaesdec %xmm5, %xmm2, %xmm2 + vaesdec %xmm5, %xmm3, %xmm3 + vmovdqu 96(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vaesdec %xmm5, %xmm1, %xmm1 + vaesdec %xmm5, %xmm2, %xmm2 + vaesdec %xmm5, %xmm3, %xmm3 + vmovdqu 112(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vaesdec %xmm5, %xmm1, %xmm1 + vaesdec %xmm5, %xmm2, %xmm2 + vaesdec %xmm5, %xmm3, %xmm3 + vmovdqu 128(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vaesdec %xmm5, %xmm1, %xmm1 + vaesdec %xmm5, %xmm2, %xmm2 + vaesdec %xmm5, %xmm3, %xmm3 + vmovdqu 144(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vaesdec %xmm5, %xmm1, %xmm1 + vaesdec %xmm5, %xmm2, %xmm2 + vaesdec %xmm5, %xmm3, %xmm3 + cmpl $11, %r9d + vmovdqu 160(%r8), %xmm5 + jl L_AES_CBC_decrypt_avx1_64_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vaesdec %xmm5, %xmm1, %xmm1 + vaesdec %xmm5, %xmm2, %xmm2 + vaesdec %xmm5, %xmm3, %xmm3 + vmovdqu 176(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vaesdec %xmm5, %xmm1, %xmm1 + vaesdec %xmm5, %xmm2, %xmm2 + vaesdec %xmm5, %xmm3, %xmm3 + cmpl $13, %r9d + vmovdqu 192(%r8), %xmm5 + jl L_AES_CBC_decrypt_avx1_64_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vaesdec %xmm5, %xmm1, %xmm1 + vaesdec %xmm5, %xmm2, %xmm2 + vaesdec %xmm5, %xmm3, %xmm3 + vmovdqu 208(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vaesdec %xmm5, %xmm1, %xmm1 + vaesdec %xmm5, %xmm2, %xmm2 + vaesdec %xmm5, %xmm3, %xmm3 + vmovdqu 224(%r8), %xmm5 +L_AES_CBC_decrypt_avx1_64_aes_dec_block_last: + vaesdeclast %xmm5, %xmm0, %xmm0 + vaesdeclast %xmm5, %xmm1, %xmm1 + vaesdeclast %xmm5, %xmm2, %xmm2 + vaesdeclast %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm0, %xmm0 + vpxor (%r11), %xmm1, %xmm1 + vpxor 16(%r11), %xmm2, %xmm2 + vpxor 32(%r11), %xmm3, %xmm3 + vmovdqu 48(%r11), %xmm4 + vmovdqu %xmm0, (%r12) + vmovdqu %xmm1, 16(%r12) + vmovdqu %xmm2, 32(%r12) + vmovdqu %xmm3, 48(%r12) + addl $0x40, %eax + cmpl %r10d, %eax + jl L_AES_CBC_decrypt_avx1_dec_64 +L_AES_CBC_decrypt_avx1_done_64: + cmpl %ecx, %eax + movl %ecx, %r10d + je L_AES_CBC_decrypt_avx1_done_dec + 
andl $0xfffffff0, %r10d +L_AES_CBC_decrypt_avx1_dec_16: + # 16 bytes of input + leaq (%rdi,%rax,1), %r11 + vmovdqu (%r11), %xmm0 + vmovdqa %xmm0, %xmm8 + # aes_dec_block + vpxor (%r8), %xmm0, %xmm0 + vmovdqu 16(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 32(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 48(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 64(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 80(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 96(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 112(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 128(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 144(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + cmpl $11, %r9d + vmovdqu 160(%r8), %xmm6 + jl L_AES_CBC_decrypt_avx1_16_aes_dec_block_last + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 176(%r8), %xmm7 + vaesdec %xmm7, %xmm0, %xmm0 + cmpl $13, %r9d + vmovdqu 192(%r8), %xmm6 + jl L_AES_CBC_decrypt_avx1_16_aes_dec_block_last + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 208(%r8), %xmm7 + vaesdec %xmm7, %xmm0, %xmm0 + vmovdqu 224(%r8), %xmm6 +L_AES_CBC_decrypt_avx1_16_aes_dec_block_last: + vaesdeclast %xmm6, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vmovdqa %xmm8, %xmm4 + leaq (%rsi,%rax,1), %r11 + vmovdqu %xmm0, (%r11) + addl $16, %eax + cmpl %r10d, %eax + jl L_AES_CBC_decrypt_avx1_dec_16 +L_AES_CBC_decrypt_avx1_done_dec: + vmovdqu %xmm4, (%rdx) + popq %r12 + repz retq +#ifndef __APPLE__ +.size AES_CBC_decrypt_avx1,.-AES_CBC_decrypt_avx1 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_aes_ctr_avx1_bswap: +.quad 0x08090a0b0c0d0e0f,0x0001020304050607 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_aes_ctr_avx1_one: +.quad 0x0000000000000001,0x0000000000000000 +#ifndef __APPLE__ +.text +.globl 
AES_CTR_encrypt_avx1 +.type AES_CTR_encrypt_avx1,@function +.align 16 +AES_CTR_encrypt_avx1: +#else +.section __TEXT,__text +.globl _AES_CTR_encrypt_avx1 +.p2align 4 +_AES_CTR_encrypt_avx1: +#endif /* __APPLE__ */ + pushq %rbx + vmovdqu L_aes_ctr_avx1_bswap(%rip), %xmm8 + vmovdqu L_aes_ctr_avx1_one(%rip), %xmm9 + vpxor %xmm10, %xmm10, %xmm10 + vmovdqu (%r9), %xmm7 + vpshufb %xmm8, %xmm7, %xmm7 + xorl %eax, %eax + cmpl $0x40, %edx + movl %edx, %r10d + jl L_AES_CTR_encrypt_avx1_done_64 + andl $0xffffffc0, %r10d +L_AES_CTR_encrypt_avx1_enc_64: + # 64 bytes of input + # aes_ctr_enc_64 + leaq (%rdi,%rax,1), %r11 + leaq (%rsi,%rax,1), %rbx + vpshufb %xmm8, %xmm7, %xmm0 + vpaddq %xmm9, %xmm7, %xmm7 + vpcmpeqq %xmm10, %xmm7, %xmm11 + vpslldq $8, %xmm11, %xmm11 + vpsrlq $63, %xmm11, %xmm11 + vpaddq %xmm11, %xmm7, %xmm7 + vpshufb %xmm8, %xmm7, %xmm1 + vpaddq %xmm9, %xmm7, %xmm7 + vpcmpeqq %xmm10, %xmm7, %xmm11 + vpslldq $8, %xmm11, %xmm11 + vpsrlq $63, %xmm11, %xmm11 + vpaddq %xmm11, %xmm7, %xmm7 + vpshufb %xmm8, %xmm7, %xmm2 + vpaddq %xmm9, %xmm7, %xmm7 + vpcmpeqq %xmm10, %xmm7, %xmm11 + vpslldq $8, %xmm11, %xmm11 + vpsrlq $63, %xmm11, %xmm11 + vpaddq %xmm11, %xmm7, %xmm7 + vpshufb %xmm8, %xmm7, %xmm3 + vpaddq %xmm9, %xmm7, %xmm7 + vpcmpeqq %xmm10, %xmm7, %xmm11 + vpslldq $8, %xmm11, %xmm11 + vpsrlq $63, %xmm11, %xmm11 + vpaddq %xmm11, %xmm7, %xmm7 + # aes_enc_block + vmovdqu (%rcx), %xmm4 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm4, %xmm3, %xmm3 + vmovdqu 16(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 32(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 48(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 64(%rcx), %xmm4 + vaesenc %xmm4, 
%xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 80(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 96(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 112(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 128(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 144(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + cmpl $11, %r8d + vmovdqu 160(%rcx), %xmm4 + jl L_AES_CTR_encrypt_avx1_64_aes_enc_block_last + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 176(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + cmpl $13, %r8d + vmovdqu 192(%rcx), %xmm4 + jl L_AES_CTR_encrypt_avx1_64_aes_enc_block_last + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 208(%rcx), %xmm4 + vaesenc %xmm4, %xmm0, %xmm0 + vaesenc %xmm4, %xmm1, %xmm1 + vaesenc %xmm4, %xmm2, %xmm2 + vaesenc %xmm4, %xmm3, %xmm3 + vmovdqu 224(%rcx), %xmm4 +L_AES_CTR_encrypt_avx1_64_aes_enc_block_last: + vaesenclast %xmm4, %xmm0, %xmm0 + vaesenclast %xmm4, %xmm1, %xmm1 + vaesenclast %xmm4, %xmm2, %xmm2 + vaesenclast %xmm4, %xmm3, %xmm3 + vpxor (%r11), %xmm0, %xmm0 + vpxor 16(%r11), %xmm1, %xmm1 + vpxor 32(%r11), %xmm2, %xmm2 + vpxor 48(%r11), %xmm3, %xmm3 + vmovdqu %xmm0, (%rbx) + vmovdqu %xmm1, 16(%rbx) + vmovdqu %xmm2, 32(%rbx) + vmovdqu %xmm3, 48(%rbx) + addl 
$0x40, %eax + cmpl %r10d, %eax + jl L_AES_CTR_encrypt_avx1_enc_64 +L_AES_CTR_encrypt_avx1_done_64: + cmpl %edx, %eax + movl %edx, %r10d + je L_AES_CTR_encrypt_avx1_done_enc + andl $0xfffffff0, %r10d +L_AES_CTR_encrypt_avx1_enc_16: + # 16 bytes of input + vpshufb %xmm8, %xmm7, %xmm0 + vpaddq %xmm9, %xmm7, %xmm7 + vpcmpeqq %xmm10, %xmm7, %xmm11 + vpslldq $8, %xmm11, %xmm11 + vpsrlq $63, %xmm11, %xmm11 + vpaddq %xmm11, %xmm7, %xmm7 + # aes_enc_block + vpxor (%rcx), %xmm0, %xmm0 + vmovdqu 16(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 32(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 48(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 64(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 80(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 96(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 112(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 128(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 144(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + cmpl $11, %r8d + vmovdqu 160(%rcx), %xmm5 + jl L_AES_CTR_encrypt_avx1_16_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 176(%rcx), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + cmpl $13, %r8d + vmovdqu 192(%rcx), %xmm5 + jl L_AES_CTR_encrypt_avx1_16_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 208(%rcx), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + vmovdqu 224(%rcx), %xmm5 +L_AES_CTR_encrypt_avx1_16_aes_enc_block_last: + vaesenclast %xmm5, %xmm0, %xmm0 + leaq (%rdi,%rax,1), %r11 + vpxor (%r11), %xmm0, %xmm0 + leaq (%rsi,%rax,1), %r11 + vmovdqu %xmm0, (%r11) + addl $16, %eax + cmpl %r10d, %eax + jl L_AES_CTR_encrypt_avx1_enc_16 +L_AES_CTR_encrypt_avx1_done_enc: + vpshufb %xmm8, %xmm7, %xmm7 + vmovdqu %xmm7, (%r9) + popq %rbx + repz retq +#ifndef __APPLE__ +.size AES_CTR_encrypt_avx1,.-AES_CTR_encrypt_avx1 +#endif /* __APPLE__ */ +#endif /* HAVE_INTEL_AVX1 */ +#ifdef HAVE_INTEL_VAES +#ifndef __APPLE__ +.text +.globl AES_ECB_encrypt_vaes +.type AES_ECB_encrypt_vaes,@function +.align 16 
+AES_ECB_encrypt_vaes: +#else +.section __TEXT,__text +.globl _AES_ECB_encrypt_vaes +.p2align 4 +_AES_ECB_encrypt_vaes: +#endif /* __APPLE__ */ + xorl %eax, %eax + cmpl $0x80, %edx + movl %edx, %r9d + jl L_AES_ECB_encrypt_vaes_done_128 + andl $0xffffff80, %r9d +L_AES_ECB_encrypt_vaes_enc_128: + # 128 bytes of input + # aes_ecb_enc_128 + leaq (%rdi,%rax,1), %r10 + leaq (%rsi,%rax,1), %r11 + vmovdqu (%r10), %ymm0 + vmovdqu 32(%r10), %ymm1 + vmovdqu 64(%r10), %ymm2 + vmovdqu 96(%r10), %ymm3 + # aes_enc_block + vbroadcasti128 (%rcx), %ymm7 + vpxor %ymm7, %ymm0, %ymm0 + vpxor %ymm7, %ymm1, %ymm1 + vpxor %ymm7, %ymm2, %ymm2 + vpxor %ymm7, %ymm3, %ymm3 + vbroadcasti128 16(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vaesenc %ymm7, %ymm1, %ymm1 + vaesenc %ymm7, %ymm2, %ymm2 + vaesenc %ymm7, %ymm3, %ymm3 + vbroadcasti128 32(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vaesenc %ymm7, %ymm1, %ymm1 + vaesenc %ymm7, %ymm2, %ymm2 + vaesenc %ymm7, %ymm3, %ymm3 + vbroadcasti128 48(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vaesenc %ymm7, %ymm1, %ymm1 + vaesenc %ymm7, %ymm2, %ymm2 + vaesenc %ymm7, %ymm3, %ymm3 + vbroadcasti128 64(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vaesenc %ymm7, %ymm1, %ymm1 + vaesenc %ymm7, %ymm2, %ymm2 + vaesenc %ymm7, %ymm3, %ymm3 + vbroadcasti128 80(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vaesenc %ymm7, %ymm1, %ymm1 + vaesenc %ymm7, %ymm2, %ymm2 + vaesenc %ymm7, %ymm3, %ymm3 + vbroadcasti128 96(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vaesenc %ymm7, %ymm1, %ymm1 + vaesenc %ymm7, %ymm2, %ymm2 + vaesenc %ymm7, %ymm3, %ymm3 + vbroadcasti128 112(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vaesenc %ymm7, %ymm1, %ymm1 + vaesenc %ymm7, %ymm2, %ymm2 + vaesenc %ymm7, %ymm3, %ymm3 + vbroadcasti128 128(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vaesenc %ymm7, %ymm1, %ymm1 + vaesenc %ymm7, %ymm2, %ymm2 + vaesenc %ymm7, %ymm3, %ymm3 + vbroadcasti128 144(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vaesenc %ymm7, %ymm1, %ymm1 + vaesenc %ymm7, %ymm2, 
%ymm2 + vaesenc %ymm7, %ymm3, %ymm3 + cmpl $11, %r8d + vbroadcasti128 160(%rcx), %ymm7 + jl L_AES_ECB_encrypt_vaes_128_aes_enc_block_last + vaesenc %ymm7, %ymm0, %ymm0 + vaesenc %ymm7, %ymm1, %ymm1 + vaesenc %ymm7, %ymm2, %ymm2 + vaesenc %ymm7, %ymm3, %ymm3 + vbroadcasti128 176(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vaesenc %ymm7, %ymm1, %ymm1 + vaesenc %ymm7, %ymm2, %ymm2 + vaesenc %ymm7, %ymm3, %ymm3 + cmpl $13, %r8d + vbroadcasti128 192(%rcx), %ymm7 + jl L_AES_ECB_encrypt_vaes_128_aes_enc_block_last + vaesenc %ymm7, %ymm0, %ymm0 + vaesenc %ymm7, %ymm1, %ymm1 + vaesenc %ymm7, %ymm2, %ymm2 + vaesenc %ymm7, %ymm3, %ymm3 + vbroadcasti128 208(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vaesenc %ymm7, %ymm1, %ymm1 + vaesenc %ymm7, %ymm2, %ymm2 + vaesenc %ymm7, %ymm3, %ymm3 + vbroadcasti128 224(%rcx), %ymm7 +L_AES_ECB_encrypt_vaes_128_aes_enc_block_last: + vaesenclast %ymm7, %ymm0, %ymm0 + vaesenclast %ymm7, %ymm1, %ymm1 + vaesenclast %ymm7, %ymm2, %ymm2 + vaesenclast %ymm7, %ymm3, %ymm3 + vmovdqu %ymm0, (%r11) + vmovdqu %ymm1, 32(%r11) + vmovdqu %ymm2, 64(%r11) + vmovdqu %ymm3, 96(%r11) + addl $0x80, %eax + cmpl %r9d, %eax + jl L_AES_ECB_encrypt_vaes_enc_128 +L_AES_ECB_encrypt_vaes_done_128: + movl %edx, %r9d + andl $0xffffffe0, %r9d + cmpl %r9d, %eax + je L_AES_ECB_encrypt_vaes_done_32 +L_AES_ECB_encrypt_vaes_enc_32: + # 32 bytes of input + # aes_ecb_enc_32 + leaq (%rdi,%rax,1), %r10 + leaq (%rsi,%rax,1), %r11 + vmovdqu (%r10), %ymm0 + # aes_enc_block + vbroadcasti128 (%rcx), %ymm7 + vpxor %ymm7, %ymm0, %ymm0 + vbroadcasti128 16(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vbroadcasti128 32(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vbroadcasti128 48(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vbroadcasti128 64(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vbroadcasti128 80(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vbroadcasti128 96(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vbroadcasti128 112(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vbroadcasti128 
128(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vbroadcasti128 144(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + cmpl $11, %r8d + vbroadcasti128 160(%rcx), %ymm7 + jl L_AES_ECB_encrypt_vaes_32_aes_enc_block_last + vaesenc %ymm7, %ymm0, %ymm0 + vbroadcasti128 176(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + cmpl $13, %r8d + vbroadcasti128 192(%rcx), %ymm7 + jl L_AES_ECB_encrypt_vaes_32_aes_enc_block_last + vaesenc %ymm7, %ymm0, %ymm0 + vbroadcasti128 208(%rcx), %ymm7 + vaesenc %ymm7, %ymm0, %ymm0 + vbroadcasti128 224(%rcx), %ymm7 +L_AES_ECB_encrypt_vaes_32_aes_enc_block_last: + vaesenclast %ymm7, %ymm0, %ymm0 + vmovdqu %ymm0, (%r11) + addl $32, %eax + cmpl %r9d, %eax + jl L_AES_ECB_encrypt_vaes_enc_32 +L_AES_ECB_encrypt_vaes_done_32: + cmpl %edx, %eax + movl %edx, %r9d + je L_AES_ECB_encrypt_vaes_done_enc + andl $0xfffffff0, %r9d +L_AES_ECB_encrypt_vaes_enc_16: + # 16 bytes of input + leaq (%rdi,%rax,1), %r10 + vmovdqu (%r10), %xmm0 + # aes_enc_block + vpxor (%rcx), %xmm0, %xmm0 + vmovdqu 16(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 32(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 48(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 64(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 80(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 96(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 112(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 128(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 144(%rcx), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + cmpl $11, %r8d + vmovdqu 160(%rcx), %xmm5 + jl L_AES_ECB_encrypt_vaes_16_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 176(%rcx), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + cmpl $13, %r8d + vmovdqu 192(%rcx), %xmm5 + jl L_AES_ECB_encrypt_vaes_16_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 208(%rcx), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + vmovdqu 224(%rcx), %xmm5 +L_AES_ECB_encrypt_vaes_16_aes_enc_block_last: + vaesenclast %xmm5, %xmm0, %xmm0 + leaq (%rsi,%rax,1), %r10 + 
vmovdqu %xmm0, (%r10) + addl $16, %eax + cmpl %r9d, %eax + jl L_AES_ECB_encrypt_vaes_enc_16 +L_AES_ECB_encrypt_vaes_done_enc: + repz retq +#ifndef __APPLE__ +.size AES_ECB_encrypt_vaes,.-AES_ECB_encrypt_vaes +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_ECB_decrypt_vaes +.type AES_ECB_decrypt_vaes,@function +.align 16 +AES_ECB_decrypt_vaes: +#else +.section __TEXT,__text +.globl _AES_ECB_decrypt_vaes +.p2align 4 +_AES_ECB_decrypt_vaes: +#endif /* __APPLE__ */ + xorl %eax, %eax + cmpl $0x80, %edx + movl %edx, %r9d + jl L_AES_ECB_decrypt_vaes_done_128 + andl $0xffffff80, %r9d +L_AES_ECB_decrypt_vaes_dec_128: + # 128 bytes of input + # aes_ecb_dec_128 + leaq (%rdi,%rax,1), %r10 + leaq (%rsi,%rax,1), %r11 + vmovdqu (%r10), %ymm0 + vmovdqu 32(%r10), %ymm1 + vmovdqu 64(%r10), %ymm2 + vmovdqu 96(%r10), %ymm3 + # aes_dec_block + vbroadcasti128 (%rcx), %ymm7 + vpxor %ymm7, %ymm0, %ymm0 + vpxor %ymm7, %ymm1, %ymm1 + vpxor %ymm7, %ymm2, %ymm2 + vpxor %ymm7, %ymm3, %ymm3 + vbroadcasti128 16(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vaesdec %ymm7, %ymm1, %ymm1 + vaesdec %ymm7, %ymm2, %ymm2 + vaesdec %ymm7, %ymm3, %ymm3 + vbroadcasti128 32(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vaesdec %ymm7, %ymm1, %ymm1 + vaesdec %ymm7, %ymm2, %ymm2 + vaesdec %ymm7, %ymm3, %ymm3 + vbroadcasti128 48(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vaesdec %ymm7, %ymm1, %ymm1 + vaesdec %ymm7, %ymm2, %ymm2 + vaesdec %ymm7, %ymm3, %ymm3 + vbroadcasti128 64(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vaesdec %ymm7, %ymm1, %ymm1 + vaesdec %ymm7, %ymm2, %ymm2 + vaesdec %ymm7, %ymm3, %ymm3 + vbroadcasti128 80(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vaesdec %ymm7, %ymm1, %ymm1 + vaesdec %ymm7, %ymm2, %ymm2 + vaesdec %ymm7, %ymm3, %ymm3 + vbroadcasti128 96(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vaesdec %ymm7, %ymm1, %ymm1 + vaesdec %ymm7, %ymm2, %ymm2 + vaesdec %ymm7, %ymm3, %ymm3 + vbroadcasti128 112(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vaesdec %ymm7, %ymm1, 
%ymm1 + vaesdec %ymm7, %ymm2, %ymm2 + vaesdec %ymm7, %ymm3, %ymm3 + vbroadcasti128 128(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vaesdec %ymm7, %ymm1, %ymm1 + vaesdec %ymm7, %ymm2, %ymm2 + vaesdec %ymm7, %ymm3, %ymm3 + vbroadcasti128 144(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vaesdec %ymm7, %ymm1, %ymm1 + vaesdec %ymm7, %ymm2, %ymm2 + vaesdec %ymm7, %ymm3, %ymm3 + cmpl $11, %r8d + vbroadcasti128 160(%rcx), %ymm7 + jl L_AES_ECB_decrypt_vaes_128_aes_dec_block_last + vaesdec %ymm7, %ymm0, %ymm0 + vaesdec %ymm7, %ymm1, %ymm1 + vaesdec %ymm7, %ymm2, %ymm2 + vaesdec %ymm7, %ymm3, %ymm3 + vbroadcasti128 176(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vaesdec %ymm7, %ymm1, %ymm1 + vaesdec %ymm7, %ymm2, %ymm2 + vaesdec %ymm7, %ymm3, %ymm3 + cmpl $13, %r8d + vbroadcasti128 192(%rcx), %ymm7 + jl L_AES_ECB_decrypt_vaes_128_aes_dec_block_last + vaesdec %ymm7, %ymm0, %ymm0 + vaesdec %ymm7, %ymm1, %ymm1 + vaesdec %ymm7, %ymm2, %ymm2 + vaesdec %ymm7, %ymm3, %ymm3 + vbroadcasti128 208(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vaesdec %ymm7, %ymm1, %ymm1 + vaesdec %ymm7, %ymm2, %ymm2 + vaesdec %ymm7, %ymm3, %ymm3 + vbroadcasti128 224(%rcx), %ymm7 +L_AES_ECB_decrypt_vaes_128_aes_dec_block_last: + vaesdeclast %ymm7, %ymm0, %ymm0 + vaesdeclast %ymm7, %ymm1, %ymm1 + vaesdeclast %ymm7, %ymm2, %ymm2 + vaesdeclast %ymm7, %ymm3, %ymm3 + vmovdqu %ymm0, (%r11) + vmovdqu %ymm1, 32(%r11) + vmovdqu %ymm2, 64(%r11) + vmovdqu %ymm3, 96(%r11) + addl $0x80, %eax + cmpl %r9d, %eax + jl L_AES_ECB_decrypt_vaes_dec_128 +L_AES_ECB_decrypt_vaes_done_128: + movl %edx, %r9d + andl $0xffffffe0, %r9d + cmpl %r9d, %eax + je L_AES_ECB_decrypt_vaes_done_32 +L_AES_ECB_decrypt_vaes_dec_32: + # 32 bytes of input + # aes_ecb_dec_32 + leaq (%rdi,%rax,1), %r10 + leaq (%rsi,%rax,1), %r11 + vmovdqu (%r10), %ymm0 + # aes_dec_block + vbroadcasti128 (%rcx), %ymm7 + vpxor %ymm7, %ymm0, %ymm0 + vbroadcasti128 16(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vbroadcasti128 32(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, 
%ymm0 + vbroadcasti128 48(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vbroadcasti128 64(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vbroadcasti128 80(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vbroadcasti128 96(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vbroadcasti128 112(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vbroadcasti128 128(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vbroadcasti128 144(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + cmpl $11, %r8d + vbroadcasti128 160(%rcx), %ymm7 + jl L_AES_ECB_decrypt_vaes_32_aes_dec_block_last + vaesdec %ymm7, %ymm0, %ymm0 + vbroadcasti128 176(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + cmpl $13, %r8d + vbroadcasti128 192(%rcx), %ymm7 + jl L_AES_ECB_decrypt_vaes_32_aes_dec_block_last + vaesdec %ymm7, %ymm0, %ymm0 + vbroadcasti128 208(%rcx), %ymm7 + vaesdec %ymm7, %ymm0, %ymm0 + vbroadcasti128 224(%rcx), %ymm7 +L_AES_ECB_decrypt_vaes_32_aes_dec_block_last: + vaesdeclast %ymm7, %ymm0, %ymm0 + vmovdqu %ymm0, (%r11) + addl $32, %eax + cmpl %r9d, %eax + jl L_AES_ECB_decrypt_vaes_dec_32 +L_AES_ECB_decrypt_vaes_done_32: + cmpl %edx, %eax + movl %edx, %r9d + je L_AES_ECB_decrypt_vaes_done_dec + andl $0xfffffff0, %r9d +L_AES_ECB_decrypt_vaes_dec_16: + # 16 bytes of input + leaq (%rdi,%rax,1), %r10 + vmovdqu (%r10), %xmm0 + # aes_dec_block + vpxor (%rcx), %xmm0, %xmm0 + vmovdqu 16(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 32(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 48(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 64(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 80(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 96(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 112(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 128(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 144(%rcx), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + cmpl $11, %r8d + vmovdqu 160(%rcx), %xmm5 + jl L_AES_ECB_decrypt_vaes_16_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 176(%rcx), %xmm6 + vaesdec 
%xmm6, %xmm0, %xmm0 + cmpl $13, %r8d + vmovdqu 192(%rcx), %xmm5 + jl L_AES_ECB_decrypt_vaes_16_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 208(%rcx), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 224(%rcx), %xmm5 +L_AES_ECB_decrypt_vaes_16_aes_dec_block_last: + vaesdeclast %xmm5, %xmm0, %xmm0 + leaq (%rsi,%rax,1), %r10 + vmovdqu %xmm0, (%r10) + addl $16, %eax + cmpl %r9d, %eax + jl L_AES_ECB_decrypt_vaes_dec_16 +L_AES_ECB_decrypt_vaes_done_dec: + repz retq +#ifndef __APPLE__ +.size AES_ECB_decrypt_vaes,.-AES_ECB_decrypt_vaes +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_CBC_encrypt_vaes +.type AES_CBC_encrypt_vaes,@function +.align 16 +AES_CBC_encrypt_vaes: +#else +.section __TEXT,__text +.globl _AES_CBC_encrypt_vaes +.p2align 4 +_AES_CBC_encrypt_vaes: +#endif /* __APPLE__ */ + vmovdqu (%rdx), %xmm0 + xorl %eax, %eax + cmpl %ecx, %eax + je L_AES_CBC_encrypt_vaes_done +L_AES_CBC_encrypt_vaes_loop: + # 16 bytes of input + leaq (%rdi,%rax,1), %r10 + vmovdqu (%r10), %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + # aes_enc_block + vpxor (%r8), %xmm1, %xmm1 + vmovdqu 16(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 32(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 48(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 64(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 80(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 96(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 112(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 128(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 144(%r8), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + cmpl $11, %r9d + vmovdqu 160(%r8), %xmm3 + jl L_AES_CBC_encrypt_vaes_aes_enc_block_last + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 176(%r8), %xmm4 + vaesenc %xmm4, %xmm1, %xmm1 + cmpl $13, %r9d + vmovdqu 192(%r8), %xmm3 + jl L_AES_CBC_encrypt_vaes_aes_enc_block_last + vaesenc %xmm3, %xmm1, %xmm1 + vmovdqu 208(%r8), %xmm4 + vaesenc %xmm4, %xmm1, %xmm1 + vmovdqu 224(%r8), %xmm3 
+L_AES_CBC_encrypt_vaes_aes_enc_block_last: + vaesenclast %xmm3, %xmm1, %xmm1 + leaq (%rsi,%rax,1), %r11 + vmovdqu %xmm1, (%r11) + vmovdqa %xmm1, %xmm0 + addl $16, %eax + cmpl %ecx, %eax + jl L_AES_CBC_encrypt_vaes_loop +L_AES_CBC_encrypt_vaes_done: + vmovdqu %xmm0, (%rdx) + repz retq +#ifndef __APPLE__ +.size AES_CBC_encrypt_vaes,.-AES_CBC_encrypt_vaes +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_CBC_decrypt_vaes +.type AES_CBC_decrypt_vaes,@function +.align 16 +AES_CBC_decrypt_vaes: +#else +.section __TEXT,__text +.globl _AES_CBC_decrypt_vaes +.p2align 4 +_AES_CBC_decrypt_vaes: +#endif /* __APPLE__ */ + pushq %r12 + vmovdqu (%rdx), %xmm8 + xorl %eax, %eax + cmpl $0x80, %ecx + movl %ecx, %r10d + jl L_AES_CBC_decrypt_vaes_done_128 + andl $0xffffff80, %r10d +L_AES_CBC_decrypt_vaes_dec_128: + # 128 bytes of input + # aes_cbc_dec_128 + leaq (%rdi,%rax,1), %r11 + leaq (%rsi,%rax,1), %r12 + vmovdqu (%r11), %ymm0 + vmovdqu 32(%r11), %ymm1 + vmovdqu 64(%r11), %ymm2 + vmovdqu 96(%r11), %ymm3 + vinserti128 $0x01, %xmm0, %ymm8, %ymm10 + vmovdqu 16(%r11), %ymm11 + vmovdqu 48(%r11), %ymm12 + vmovdqu 80(%r11), %ymm13 + vextracti128 $0x01, %ymm3, %xmm8 + # aes_dec_block + vbroadcasti128 (%r8), %ymm9 + vpxor %ymm9, %ymm0, %ymm0 + vpxor %ymm9, %ymm1, %ymm1 + vpxor %ymm9, %ymm2, %ymm2 + vpxor %ymm9, %ymm3, %ymm3 + vbroadcasti128 16(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 32(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 48(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 64(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 80(%r8), %ymm9 + vaesdec %ymm9, %ymm0, 
%ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 96(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 112(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 128(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 144(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + cmpl $11, %r9d + vbroadcasti128 160(%r8), %ymm9 + jl L_AES_CBC_decrypt_vaes_128_aes_dec_block_last + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 176(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + cmpl $13, %r9d + vbroadcasti128 192(%r8), %ymm9 + jl L_AES_CBC_decrypt_vaes_128_aes_dec_block_last + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 208(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 224(%r8), %ymm9 +L_AES_CBC_decrypt_vaes_128_aes_dec_block_last: + vaesdeclast %ymm9, %ymm0, %ymm0 + vaesdeclast %ymm9, %ymm1, %ymm1 + vaesdeclast %ymm9, %ymm2, %ymm2 + vaesdeclast %ymm9, %ymm3, %ymm3 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vmovdqu %ymm0, (%r12) + vmovdqu %ymm1, 32(%r12) + vmovdqu %ymm2, 64(%r12) + vmovdqu %ymm3, 96(%r12) + addl $0x80, %eax + cmpl %r10d, %eax + jl L_AES_CBC_decrypt_vaes_dec_128 +L_AES_CBC_decrypt_vaes_done_128: + 
movl %ecx, %r10d + andl $0xffffffe0, %r10d + cmpl %r10d, %eax + je L_AES_CBC_decrypt_vaes_done_32 +L_AES_CBC_decrypt_vaes_dec_32: + # 32 bytes of input + # aes_cbc_dec_32 + leaq (%rdi,%rax,1), %r11 + leaq (%rsi,%rax,1), %r12 + vmovdqu (%r11), %ymm0 + vinserti128 $0x01, %xmm0, %ymm8, %ymm10 + vextracti128 $0x01, %ymm0, %xmm8 + # aes_dec_block + vbroadcasti128 (%r8), %ymm9 + vpxor %ymm9, %ymm0, %ymm0 + vbroadcasti128 16(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 32(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 48(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 64(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 80(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 96(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 112(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 128(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 144(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + cmpl $11, %r9d + vbroadcasti128 160(%r8), %ymm9 + jl L_AES_CBC_decrypt_vaes_32_aes_dec_block_last + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 176(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + cmpl $13, %r9d + vbroadcasti128 192(%r8), %ymm9 + jl L_AES_CBC_decrypt_vaes_32_aes_dec_block_last + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 208(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 224(%r8), %ymm9 +L_AES_CBC_decrypt_vaes_32_aes_dec_block_last: + vaesdeclast %ymm9, %ymm0, %ymm0 + vpxor %ymm10, %ymm0, %ymm0 + vmovdqu %ymm0, (%r12) + addl $32, %eax + cmpl %r10d, %eax + jl L_AES_CBC_decrypt_vaes_dec_32 +L_AES_CBC_decrypt_vaes_done_32: + cmpl %ecx, %eax + movl %ecx, %r10d + je L_AES_CBC_decrypt_vaes_done_dec + andl $0xfffffff0, %r10d +L_AES_CBC_decrypt_vaes_dec_16: + # 16 bytes of input + leaq (%rdi,%rax,1), %r11 + vmovdqu (%r11), %xmm0 + vmovdqa %xmm0, %xmm7 + # aes_dec_block + vpxor (%r8), %xmm0, %xmm0 + vmovdqu 16(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r8), %xmm5 + vaesdec %xmm5, %xmm0, 
%xmm0 + vmovdqu 48(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + cmpl $11, %r9d + vmovdqu 160(%r8), %xmm5 + jl L_AES_CBC_decrypt_vaes_16_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + cmpl $13, %r9d + vmovdqu 192(%r8), %xmm5 + jl L_AES_CBC_decrypt_vaes_16_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r8), %xmm5 +L_AES_CBC_decrypt_vaes_16_aes_dec_block_last: + vaesdeclast %xmm5, %xmm0, %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + vmovdqa %xmm7, %xmm8 + leaq (%rsi,%rax,1), %r11 + vmovdqu %xmm0, (%r11) + addl $16, %eax + cmpl %r10d, %eax + jl L_AES_CBC_decrypt_vaes_dec_16 +L_AES_CBC_decrypt_vaes_done_dec: + vmovdqu %xmm8, (%rdx) + popq %r12 + repz retq +#ifndef __APPLE__ +.size AES_CBC_decrypt_vaes,.-AES_CBC_decrypt_vaes +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_aes_ctr_bswap_vaes: +.quad 0x08090a0b0c0d0e0f,0x0001020304050607 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 32 +#else +.p2align 5 +#endif /* __APPLE__ */ +L_aes_ctr_inc_vaes: +.quad 0x0000000000000000,0x0000000000000000 +.quad 0x0000000000000001,0x0000000000000000 +.quad 0x0000000000000002,0x0000000000000000 +.quad 0x0000000000000003,0x0000000000000000 +.quad 0x0000000000000004,0x0000000000000000 +.quad 0x0000000000000005,0x0000000000000000 +.quad 0x0000000000000006,0x0000000000000000 +.quad 0x0000000000000007,0x0000000000000000 +.quad 
0x0000000000000008,0x0000000000000000
+.quad 0x0000000000000009,0x0000000000000000
+.quad 0x000000000000000a,0x0000000000000000
+.quad 0x000000000000000b,0x0000000000000000
+.quad 0x000000000000000c,0x0000000000000000
+.quad 0x000000000000000d,0x0000000000000000
+.quad 0x000000000000000e,0x0000000000000000
+.quad 0x000000000000000f,0x0000000000000000
+.quad 0x0000000000000010,0x0000000000000000
+/* AES_CTR_encrypt_vaes: counter-mode encryption with 256-bit VAES.
+ *
+ * Registers as used by the code below:
+ *   %rdi  input buffer            %rsi  output buffer
+ *   %edx  length in bytes         %rcx  round keys, 16 bytes each
+ *   %r8d  compared with 11 and 13 to pick the last round key at offset
+ *         160, 192 or 224 (presumably the AES round count - confirm
+ *         against the C prototype)
+ *   %r9   16-byte counter block: loaded on entry, stored back on exit
+ *
+ * The counter is byte-reversed with L_aes_ctr_bswap_vaes so it can be
+ * advanced with 64-bit adds.  The carry from the low into the high qword
+ * is rebuilt after every add as ((a & b) | ((a | b) & ~(a + b))) >> 63.
+ * Data is consumed in 128-byte, then 32-byte, then 16-byte steps; only
+ * whole 16-byte blocks are processed.
+ */
+#ifndef __APPLE__
+.text
+.globl AES_CTR_encrypt_vaes
+.type AES_CTR_encrypt_vaes,@function
+.align 16
+AES_CTR_encrypt_vaes:
+#else
+.section __TEXT,__text
+.globl _AES_CTR_encrypt_vaes
+.p2align 4
+_AES_CTR_encrypt_vaes:
+#endif /* __APPLE__ */
+        pushq %rbx
+        vbroadcasti128 L_aes_ctr_bswap_vaes(%rip), %ymm8
+        vbroadcasti128 (%r9), %ymm7
+        vpshufb %ymm8, %ymm7, %ymm7
+        /* ymm10 = +8, ymm11 = +2, ymm12 = +1 in both 128-bit lanes */
+        vbroadcasti128 128+L_aes_ctr_inc_vaes(%rip), %ymm10
+        vbroadcasti128 32+L_aes_ctr_inc_vaes(%rip), %ymm11
+        vbroadcasti128 16+L_aes_ctr_inc_vaes(%rip), %ymm12
+        xorl %eax, %eax
+        cmpl $0x80, %edx
+        movl %edx, %r10d
+        jl L_AES_CTR_encrypt_vaes_done_128
+        andl $0xffffff80, %r10d
+        /* ymm4..ymm7 = counter + {0,1}, {2,3}, {4,5}, {6,7} */
+        vmovdqa %ymm7, %ymm9
+        vpaddq 0+L_aes_ctr_inc_vaes(%rip), %ymm7, %ymm4
+        vpand 0+L_aes_ctr_inc_vaes(%rip), %ymm9, %ymm14
+        vpor 0+L_aes_ctr_inc_vaes(%rip), %ymm9, %ymm9
+        vpandn %ymm9, %ymm4, %ymm9
+        vpor %ymm14, %ymm9, %ymm9
+        vpsrlq $63, %ymm9, %ymm9
+        vpslldq $8, %ymm9, %ymm9
+        vpaddq %ymm9, %ymm4, %ymm4
+        vmovdqa %ymm7, %ymm9
+        vpaddq 32+L_aes_ctr_inc_vaes(%rip), %ymm7, %ymm5
+        vpand 32+L_aes_ctr_inc_vaes(%rip), %ymm9, %ymm14
+        vpor 32+L_aes_ctr_inc_vaes(%rip), %ymm9, %ymm9
+        vpandn %ymm9, %ymm5, %ymm9
+        vpor %ymm14, %ymm9, %ymm9
+        vpsrlq $63, %ymm9, %ymm9
+        vpslldq $8, %ymm9, %ymm9
+        vpaddq %ymm9, %ymm5, %ymm5
+        vmovdqa %ymm7, %ymm9
+        vpaddq 64+L_aes_ctr_inc_vaes(%rip), %ymm7, %ymm6
+        vpand 64+L_aes_ctr_inc_vaes(%rip), %ymm9, %ymm14
+        vpor 64+L_aes_ctr_inc_vaes(%rip), %ymm9, %ymm9
+        vpandn %ymm9, %ymm6, %ymm9
+        vpor %ymm14, %ymm9, %ymm9
+        vpsrlq $63, %ymm9, %ymm9
+        vpslldq $8, %ymm9, %ymm9
+        vpaddq %ymm9, %ymm6, %ymm6
+        vmovdqa %ymm7, %ymm9
+        vpaddq 96+L_aes_ctr_inc_vaes(%rip), %ymm7, %ymm7
+        vpand 96+L_aes_ctr_inc_vaes(%rip), %ymm9, %ymm14
+        vpor 96+L_aes_ctr_inc_vaes(%rip), %ymm9, %ymm9
+        vpandn %ymm9, %ymm7, %ymm9
+        vpor %ymm14, %ymm9, %ymm9
+        vpsrlq $63, %ymm9, %ymm9
+        vpslldq $8, %ymm9, %ymm9
+        vpaddq %ymm9, %ymm7, %ymm7
+L_AES_CTR_encrypt_vaes_enc_128:
+        # 128 bytes of input
+        leaq (%rdi,%rax,1), %r11
+        leaq (%rsi,%rax,1), %rbx
+        /* Byte-swap the eight counters back, then advance each by 8 */
+        vpshufb %ymm8, %ymm4, %ymm0
+        vpshufb %ymm8, %ymm5, %ymm1
+        vpshufb %ymm8, %ymm6, %ymm2
+        vpshufb %ymm8, %ymm7, %ymm3
+        vmovdqa %ymm4, %ymm9
+        vpaddq %ymm10, %ymm4, %ymm4
+        vpand %ymm10, %ymm9, %ymm14
+        vpor %ymm10, %ymm9, %ymm9
+        vpandn %ymm9, %ymm4, %ymm9
+        vpor %ymm14, %ymm9, %ymm9
+        vpsrlq $63, %ymm9, %ymm9
+        vpslldq $8, %ymm9, %ymm9
+        vpaddq %ymm9, %ymm4, %ymm4
+        vmovdqa %ymm5, %ymm9
+        vpaddq %ymm10, %ymm5, %ymm5
+        vpand %ymm10, %ymm9, %ymm14
+        vpor %ymm10, %ymm9, %ymm9
+        vpandn %ymm9, %ymm5, %ymm9
+        vpor %ymm14, %ymm9, %ymm9
+        vpsrlq $63, %ymm9, %ymm9
+        vpslldq $8, %ymm9, %ymm9
+        vpaddq %ymm9, %ymm5, %ymm5
+        vmovdqa %ymm6, %ymm9
+        vpaddq %ymm10, %ymm6, %ymm6
+        vpand %ymm10, %ymm9, %ymm14
+        vpor %ymm10, %ymm9, %ymm9
+        vpandn %ymm9, %ymm6, %ymm9
+        vpor %ymm14, %ymm9, %ymm9
+        vpsrlq $63, %ymm9, %ymm9
+        vpslldq $8, %ymm9, %ymm9
+        vpaddq %ymm9, %ymm6, %ymm6
+        vmovdqa %ymm7, %ymm9
+        vpaddq %ymm10, %ymm7, %ymm7
+        vpand %ymm10, %ymm9, %ymm14
+        vpor %ymm10, %ymm9, %ymm9
+        vpandn %ymm9, %ymm7, %ymm9
+        vpor %ymm14, %ymm9, %ymm9
+        vpsrlq $63, %ymm9, %ymm9
+        vpslldq $8, %ymm9, %ymm9
+        vpaddq %ymm9, %ymm7, %ymm7
+        # aes_enc_block
+        vbroadcasti128 (%rcx), %ymm13
+        vpxor %ymm13, %ymm0, %ymm0
+        vpxor %ymm13, %ymm1, %ymm1
+        vpxor %ymm13, %ymm2, %ymm2
+        vpxor %ymm13, %ymm3, %ymm3
+        vbroadcasti128 16(%rcx), %ymm13
+        vaesenc %ymm13, %ymm0, %ymm0
+        vaesenc %ymm13, %ymm1, %ymm1
+        vaesenc %ymm13, %ymm2, %ymm2
+        vaesenc %ymm13, %ymm3, %ymm3
+        vbroadcasti128 32(%rcx), %ymm13
+        vaesenc %ymm13, %ymm0, %ymm0
+        vaesenc %ymm13, %ymm1, %ymm1
+        vaesenc %ymm13, %ymm2, %ymm2
+        vaesenc %ymm13, %ymm3, %ymm3
+        vbroadcasti128 48(%rcx), %ymm13
+        vaesenc %ymm13, %ymm0, %ymm0
+        vaesenc %ymm13, %ymm1, %ymm1
+        vaesenc %ymm13, %ymm2, %ymm2
+        vaesenc %ymm13, %ymm3, %ymm3
+        vbroadcasti128 64(%rcx), %ymm13
+        vaesenc %ymm13, %ymm0, %ymm0
+        vaesenc %ymm13, %ymm1, %ymm1
+        vaesenc %ymm13, %ymm2, %ymm2
+        vaesenc %ymm13, %ymm3, %ymm3
+        vbroadcasti128 80(%rcx), %ymm13
+        vaesenc %ymm13, %ymm0, %ymm0
+        vaesenc %ymm13, %ymm1, %ymm1
+        vaesenc %ymm13, %ymm2, %ymm2
+        vaesenc %ymm13, %ymm3, %ymm3
+        vbroadcasti128 96(%rcx), %ymm13
+        vaesenc %ymm13, %ymm0, %ymm0
+        vaesenc %ymm13, %ymm1, %ymm1
+        vaesenc %ymm13, %ymm2, %ymm2
+        vaesenc %ymm13, %ymm3, %ymm3
+        vbroadcasti128 112(%rcx), %ymm13
+        vaesenc %ymm13, %ymm0, %ymm0
+        vaesenc %ymm13, %ymm1, %ymm1
+        vaesenc %ymm13, %ymm2, %ymm2
+        vaesenc %ymm13, %ymm3, %ymm3
+        vbroadcasti128 128(%rcx), %ymm13
+        vaesenc %ymm13, %ymm0, %ymm0
+        vaesenc %ymm13, %ymm1, %ymm1
+        vaesenc %ymm13, %ymm2, %ymm2
+        vaesenc %ymm13, %ymm3, %ymm3
+        vbroadcasti128 144(%rcx), %ymm13
+        vaesenc %ymm13, %ymm0, %ymm0
+        vaesenc %ymm13, %ymm1, %ymm1
+        vaesenc %ymm13, %ymm2, %ymm2
+        vaesenc %ymm13, %ymm3, %ymm3
+        cmpl $11, %r8d
+        vbroadcasti128 160(%rcx), %ymm13
+        jl L_AES_CTR_encrypt_vaes_128_aes_enc_block_last
+        vaesenc %ymm13, %ymm0, %ymm0
+        vaesenc %ymm13, %ymm1, %ymm1
+        vaesenc %ymm13, %ymm2, %ymm2
+        vaesenc %ymm13, %ymm3, %ymm3
+        vbroadcasti128 176(%rcx), %ymm13
+        vaesenc %ymm13, %ymm0, %ymm0
+        vaesenc %ymm13, %ymm1, %ymm1
+        vaesenc %ymm13, %ymm2, %ymm2
+        vaesenc %ymm13, %ymm3, %ymm3
+        cmpl $13, %r8d
+        vbroadcasti128 192(%rcx), %ymm13
+        jl L_AES_CTR_encrypt_vaes_128_aes_enc_block_last
+        vaesenc %ymm13, %ymm0, %ymm0
+        vaesenc %ymm13, %ymm1, %ymm1
+        vaesenc %ymm13, %ymm2, %ymm2
+        vaesenc %ymm13, %ymm3, %ymm3
+        vbroadcasti128 208(%rcx), %ymm13
+        vaesenc %ymm13, %ymm0, %ymm0
+        vaesenc %ymm13, %ymm1, %ymm1
+        vaesenc %ymm13, %ymm2, %ymm2
+        vaesenc %ymm13, %ymm3, %ymm3
+        vbroadcasti128 224(%rcx), %ymm13
+L_AES_CTR_encrypt_vaes_128_aes_enc_block_last:
+        vaesenclast %ymm13, %ymm0, %ymm0
+        vaesenclast %ymm13, %ymm1, %ymm1
+        vaesenclast %ymm13, %ymm2, %ymm2
+        vaesenclast %ymm13, %ymm3, %ymm3
+        vpxor (%r11), %ymm0, %ymm0
+        vpxor 32(%r11), %ymm1, %ymm1
+        vpxor 64(%r11), %ymm2, %ymm2
+        vpxor 96(%r11), %ymm3, %ymm3
+        vmovdqu %ymm0, (%rbx)
+        vmovdqu %ymm1, 32(%rbx)
+        vmovdqu %ymm2, 64(%rbx)
+        vmovdqu %ymm3, 96(%rbx)
+        addl $0x80, %eax
+        cmpl %r10d, %eax
+        jl L_AES_CTR_encrypt_vaes_enc_128
+        /* Low lane of ymm4 is the next unused counter: copy it to both
+         * lanes of ymm7 for the narrower stages. */
+        vperm2i128 $0x00, %ymm4, %ymm4, %ymm7
+L_AES_CTR_encrypt_vaes_done_128:
+        movl %edx, %r10d
+        andl $0xffffffe0, %r10d
+        cmpl %r10d, %eax
+        je L_AES_CTR_encrypt_vaes_done_32
+L_AES_CTR_encrypt_vaes_enc_32:
+        # 32 bytes of input
+        # aes_ctr_enc_32
+        leaq (%rdi,%rax,1), %r11
+        leaq (%rsi,%rax,1), %rbx
+        vpaddq 0+L_aes_ctr_inc_vaes(%rip), %ymm7, %ymm0
+        vmovdqa %ymm7, %ymm9
+        vpand 0+L_aes_ctr_inc_vaes(%rip), %ymm9, %ymm14
+        vpor 0+L_aes_ctr_inc_vaes(%rip), %ymm9, %ymm9
+        vpandn %ymm9, %ymm0, %ymm9
+        vpor %ymm14, %ymm9, %ymm9
+        vpsrlq $63, %ymm9, %ymm9
+        vpslldq $8, %ymm9, %ymm9
+        vpaddq %ymm9, %ymm0, %ymm0
+        vpshufb %ymm8, %ymm0, %ymm0
+        vmovdqa %ymm7, %ymm9
+        vpaddq %ymm11, %ymm7, %ymm7
+        vpand %ymm11, %ymm9, %ymm14
+        vpor %ymm11, %ymm9, %ymm9
+        vpandn %ymm9, %ymm7, %ymm9
+        vpor %ymm14, %ymm9, %ymm9
+        vpsrlq $63, %ymm9, %ymm9
+        vpslldq $8, %ymm9, %ymm9
+        vpaddq %ymm9, %ymm7, %ymm7
+        # aes_enc_block
+        vbroadcasti128 (%rcx), %ymm13
+        vpxor %ymm13, %ymm0, %ymm0
+        vbroadcasti128 16(%rcx), %ymm13
+        vaesenc %ymm13, %ymm0, %ymm0
+        vbroadcasti128 32(%rcx), %ymm13
+        vaesenc %ymm13, %ymm0, %ymm0
+        vbroadcasti128 48(%rcx), %ymm13
+        vaesenc %ymm13, %ymm0, %ymm0
+        vbroadcasti128 64(%rcx), %ymm13
+        vaesenc %ymm13, %ymm0, %ymm0
+        vbroadcasti128 80(%rcx), %ymm13
+        vaesenc %ymm13, %ymm0, %ymm0
+        vbroadcasti128 96(%rcx), %ymm13
+        vaesenc %ymm13, %ymm0, %ymm0
+        vbroadcasti128 112(%rcx), %ymm13
+        vaesenc %ymm13, %ymm0, %ymm0
+        vbroadcasti128 128(%rcx), %ymm13
+        vaesenc %ymm13, %ymm0, %ymm0
+        vbroadcasti128 144(%rcx), %ymm13
+        vaesenc %ymm13, %ymm0, %ymm0
+        cmpl $11, %r8d
+        vbroadcasti128 160(%rcx), %ymm13
+        jl L_AES_CTR_encrypt_vaes_32_aes_enc_block_last
+        vaesenc %ymm13, %ymm0, %ymm0
+        vbroadcasti128 176(%rcx), %ymm13
+        vaesenc %ymm13, %ymm0, %ymm0
+        cmpl $13, %r8d
+        vbroadcasti128 192(%rcx), %ymm13
+        jl L_AES_CTR_encrypt_vaes_32_aes_enc_block_last
+        vaesenc %ymm13, %ymm0, %ymm0
+        vbroadcasti128 208(%rcx), %ymm13
+        vaesenc %ymm13, %ymm0, %ymm0
+        vbroadcasti128 224(%rcx), %ymm13
+L_AES_CTR_encrypt_vaes_32_aes_enc_block_last:
+        vaesenclast %ymm13, %ymm0, %ymm0
+        vpxor (%r11), %ymm0, %ymm0
+        vmovdqu %ymm0, (%rbx)
+        addl $32, %eax
+        cmpl %r10d, %eax
+        jl L_AES_CTR_encrypt_vaes_enc_32
+L_AES_CTR_encrypt_vaes_done_32:
+        /* Round the length down to whole 16-byte blocks BEFORE testing for
+         * remaining work (same guard shape as the 32-byte stage above).
+         * Testing against the unrounded length let a trailing remainder of
+         * fewer than 16 bytes run one full block past the buffers. */
+        movl %edx, %r10d
+        andl $0xfffffff0, %r10d
+        cmpl %r10d, %eax
+        je L_AES_CTR_encrypt_vaes_done_enc
+L_AES_CTR_encrypt_vaes_enc_16:
+        # 16 bytes of input
+        vpshufb %xmm8, %xmm7, %xmm0
+        vmovdqa %ymm7, %ymm9
+        vpaddq %ymm12, %ymm7, %ymm7
+        vpand %ymm12, %ymm9, %ymm14
+        vpor %ymm12, %ymm9, %ymm9
+        vpandn %ymm9, %ymm7, %ymm9
+        vpor %ymm14, %ymm9, %ymm9
+        vpsrlq $63, %ymm9, %ymm9
+        vpslldq $8, %ymm9, %ymm9
+        vpaddq %ymm9, %ymm7, %ymm7
+        # aes_enc_block
+        vpxor (%rcx), %xmm0, %xmm0
+        vmovdqu 16(%rcx), %xmm5
+        vaesenc %xmm5, %xmm0, %xmm0
+        vmovdqu 32(%rcx), %xmm5
+        vaesenc %xmm5, %xmm0, %xmm0
+        vmovdqu 48(%rcx), %xmm5
+        vaesenc %xmm5, %xmm0, %xmm0
+        vmovdqu 64(%rcx), %xmm5
+        vaesenc %xmm5, %xmm0, %xmm0
+        vmovdqu 80(%rcx), %xmm5
+        vaesenc %xmm5, %xmm0, %xmm0
+        vmovdqu 96(%rcx), %xmm5
+        vaesenc %xmm5, %xmm0, %xmm0
+        vmovdqu 112(%rcx), %xmm5
+        vaesenc %xmm5, %xmm0, %xmm0
+        vmovdqu 128(%rcx), %xmm5
+        vaesenc %xmm5, %xmm0, %xmm0
+        vmovdqu 144(%rcx), %xmm5
+        vaesenc %xmm5, %xmm0, %xmm0
+        cmpl $11, %r8d
+        vmovdqu 160(%rcx), %xmm5
+        jl L_AES_CTR_encrypt_vaes_16_aes_enc_block_last
+        vaesenc %xmm5, %xmm0, %xmm0
+        vmovdqu 176(%rcx), %xmm6
+        vaesenc %xmm6, %xmm0, %xmm0
+        cmpl $13, %r8d
+        vmovdqu 192(%rcx), %xmm5
+        jl L_AES_CTR_encrypt_vaes_16_aes_enc_block_last
+        vaesenc %xmm5, %xmm0, %xmm0
+        vmovdqu 208(%rcx), %xmm6
+        vaesenc %xmm6, %xmm0, %xmm0
+        vmovdqu 224(%rcx), %xmm5
+L_AES_CTR_encrypt_vaes_16_aes_enc_block_last:
+        vaesenclast %xmm5, %xmm0, %xmm0
+        leaq (%rdi,%rax,1), %r11
+        vpxor (%r11), %xmm0, %xmm0
+        leaq (%rsi,%rax,1), %r11
+        vmovdqu %xmm0, (%r11)
+        addl $16, %eax
+        cmpl %r10d, %eax
+        jl L_AES_CTR_encrypt_vaes_enc_16
+L_AES_CTR_encrypt_vaes_done_enc:
+        /* Store the next counter value back in its original byte order */
+        vpshufb %xmm8, %xmm7, %xmm0
+        vmovdqu %xmm0, (%r9)
+        /* Clear the upper YMM halves so legacy-SSE code that runs after the
+         * return does not pay an AVX/SSE transition penalty. */
+        vzeroupper
+        popq %rbx
+        repz retq
+#ifndef __APPLE__
+.size AES_CTR_encrypt_vaes,.-AES_CTR_encrypt_vaes
+#endif /* __APPLE__ */
+#endif /* HAVE_INTEL_VAES */
+#ifdef HAVE_INTEL_AVX512
+/* AES_ECB_encrypt_avx512: ECB encryption with 512-bit VAES.
+ * Registers as used by the code: %rdi input, %rsi output, %edx length in
+ * bytes, %rcx round keys, %r8d compared with 11 and 13 to select the last
+ * round key (offset 160, 192 or 224).  For 64 bytes or more, the round
+ * keys are broadcast once into zmm8..zmm22.
+ */
+#ifndef __APPLE__
+.text
+.globl AES_ECB_encrypt_avx512
+.type AES_ECB_encrypt_avx512,@function
+.align 16
+AES_ECB_encrypt_avx512:
+#else
+.section __TEXT,__text
+.globl _AES_ECB_encrypt_avx512
+.p2align 4
+_AES_ECB_encrypt_avx512:
+#endif /* __APPLE__ */
+        xorl %eax, %eax
+        cmpl $0x40, %edx
+        jl L_AES_ECB_encrypt_avx512_done_64
+        vbroadcasti32x4 (%rcx), %zmm8
+        vbroadcasti32x4 16(%rcx), %zmm9
+        vbroadcasti32x4 32(%rcx), %zmm10
+        vbroadcasti32x4 48(%rcx), %zmm11
+        vbroadcasti32x4 64(%rcx), %zmm12
+        vbroadcasti32x4 80(%rcx), %zmm13
+        vbroadcasti32x4 96(%rcx), %zmm14
+        vbroadcasti32x4 112(%rcx), %zmm15
+        vbroadcasti32x4 128(%rcx), %zmm16
+        vbroadcasti32x4 144(%rcx), %zmm17
+        vbroadcasti32x4 160(%rcx), %zmm18
+        cmpl $11, %r8d
+        jl L_AES_ECB_encrypt_avx512_key_cached
+        vbroadcasti32x4 176(%rcx), %zmm19
+        vbroadcasti32x4 192(%rcx), %zmm20
+        cmpl $13, %r8d
+        jl L_AES_ECB_encrypt_avx512_key_cached
+        vbroadcasti32x4 208(%rcx), %zmm21
+        vbroadcasti32x4 224(%rcx), %zmm22
+L_AES_ECB_encrypt_avx512_key_cached:
+        cmpl $0x100, %edx
+        movl %edx, %r9d
+        jl L_AES_ECB_encrypt_avx512_done_256
+        andl $0xffffff00, %r9d
+L_AES_ECB_encrypt_avx512_enc_256:
+        # 256 bytes of input
+        # aes_ecb_enc_256
+        leaq (%rdi,%rax,1), %r10
+        leaq (%rsi,%rax,1), %r11
+        vmovdqu64 (%r10), %zmm0
+        vmovdqu64 64(%r10), %zmm1
+        vmovdqu64 128(%r10), %zmm2
+        vmovdqu64 192(%r10), %zmm3
+        # aes_enc_block
+        vpxorq %zmm8, %zmm0, %zmm0
+        vpxorq %zmm8, %zmm1, %zmm1 +
vpxorq %zmm8, %zmm2, %zmm2
+        vpxorq %zmm8, %zmm3, %zmm3
+        /* Rounds 1..9 on four ZMM registers (16 blocks) using the round
+         * keys cached in zmm9..zmm17 */
+        vaesenc %zmm9, %zmm0, %zmm0
+        vaesenc %zmm9, %zmm1, %zmm1
+        vaesenc %zmm9, %zmm2, %zmm2
+        vaesenc %zmm9, %zmm3, %zmm3
+        vaesenc %zmm10, %zmm0, %zmm0
+        vaesenc %zmm10, %zmm1, %zmm1
+        vaesenc %zmm10, %zmm2, %zmm2
+        vaesenc %zmm10, %zmm3, %zmm3
+        vaesenc %zmm11, %zmm0, %zmm0
+        vaesenc %zmm11, %zmm1, %zmm1
+        vaesenc %zmm11, %zmm2, %zmm2
+        vaesenc %zmm11, %zmm3, %zmm3
+        vaesenc %zmm12, %zmm0, %zmm0
+        vaesenc %zmm12, %zmm1, %zmm1
+        vaesenc %zmm12, %zmm2, %zmm2
+        vaesenc %zmm12, %zmm3, %zmm3
+        vaesenc %zmm13, %zmm0, %zmm0
+        vaesenc %zmm13, %zmm1, %zmm1
+        vaesenc %zmm13, %zmm2, %zmm2
+        vaesenc %zmm13, %zmm3, %zmm3
+        vaesenc %zmm14, %zmm0, %zmm0
+        vaesenc %zmm14, %zmm1, %zmm1
+        vaesenc %zmm14, %zmm2, %zmm2
+        vaesenc %zmm14, %zmm3, %zmm3
+        vaesenc %zmm15, %zmm0, %zmm0
+        vaesenc %zmm15, %zmm1, %zmm1
+        vaesenc %zmm15, %zmm2, %zmm2
+        vaesenc %zmm15, %zmm3, %zmm3
+        vaesenc %zmm16, %zmm0, %zmm0
+        vaesenc %zmm16, %zmm1, %zmm1
+        vaesenc %zmm16, %zmm2, %zmm2
+        vaesenc %zmm16, %zmm3, %zmm3
+        vaesenc %zmm17, %zmm0, %zmm0
+        vaesenc %zmm17, %zmm1, %zmm1
+        vaesenc %zmm17, %zmm2, %zmm2
+        vaesenc %zmm17, %zmm3, %zmm3
+        /* zmm7 receives whichever cached key is the last round key */
+        cmpl $11, %r8d
+        vmovdqa64 %zmm18, %zmm7
+        jl L_AES_ECB_encrypt_avx512_256_aes_enc_block_last
+        vaesenc %zmm18, %zmm0, %zmm0
+        vaesenc %zmm18, %zmm1, %zmm1
+        vaesenc %zmm18, %zmm2, %zmm2
+        vaesenc %zmm18, %zmm3, %zmm3
+        vaesenc %zmm19, %zmm0, %zmm0
+        vaesenc %zmm19, %zmm1, %zmm1
+        vaesenc %zmm19, %zmm2, %zmm2
+        vaesenc %zmm19, %zmm3, %zmm3
+        cmpl $13, %r8d
+        vmovdqa64 %zmm20, %zmm7
+        jl L_AES_ECB_encrypt_avx512_256_aes_enc_block_last
+        vaesenc %zmm20, %zmm0, %zmm0
+        vaesenc %zmm20, %zmm1, %zmm1
+        vaesenc %zmm20, %zmm2, %zmm2
+        vaesenc %zmm20, %zmm3, %zmm3
+        vaesenc %zmm21, %zmm0, %zmm0
+        vaesenc %zmm21, %zmm1, %zmm1
+        vaesenc %zmm21, %zmm2, %zmm2
+        vaesenc %zmm21, %zmm3, %zmm3
+        vmovdqa64 %zmm22, %zmm7
+L_AES_ECB_encrypt_avx512_256_aes_enc_block_last:
+        vaesenclast %zmm7, %zmm0, %zmm0
+        vaesenclast %zmm7, %zmm1, %zmm1
+        vaesenclast %zmm7, %zmm2, %zmm2
+        vaesenclast %zmm7, %zmm3, %zmm3
+        vmovdqu64 %zmm0, (%r11)
+        vmovdqu64 %zmm1, 64(%r11)
+        vmovdqu64 %zmm2, 128(%r11)
+        vmovdqu64 %zmm3, 192(%r11)
+        addl $0x100, %eax
+        cmpl %r9d, %eax
+        jl L_AES_ECB_encrypt_avx512_enc_256
+L_AES_ECB_encrypt_avx512_done_256:
+        /* %r9d = length rounded down to 64; skip the stage if already there */
+        movl %edx, %r9d
+        andl $0xffffffc0, %r9d
+        cmpl %r9d, %eax
+        je L_AES_ECB_encrypt_avx512_done_64
+L_AES_ECB_encrypt_avx512_enc_64:
+        # 64 bytes of input
+        # aes_ecb_enc_64
+        leaq (%rdi,%rax,1), %r10
+        leaq (%rsi,%rax,1), %r11
+        vmovdqu64 (%r10), %zmm0
+        # aes_enc_block
+        vpxorq %zmm8, %zmm0, %zmm0
+        vaesenc %zmm9, %zmm0, %zmm0
+        vaesenc %zmm10, %zmm0, %zmm0
+        vaesenc %zmm11, %zmm0, %zmm0
+        vaesenc %zmm12, %zmm0, %zmm0
+        vaesenc %zmm13, %zmm0, %zmm0
+        vaesenc %zmm14, %zmm0, %zmm0
+        vaesenc %zmm15, %zmm0, %zmm0
+        vaesenc %zmm16, %zmm0, %zmm0
+        vaesenc %zmm17, %zmm0, %zmm0
+        cmpl $11, %r8d
+        vmovdqa64 %zmm18, %zmm7
+        jl L_AES_ECB_encrypt_avx512_64_aes_enc_block_last
+        vaesenc %zmm18, %zmm0, %zmm0
+        vaesenc %zmm19, %zmm0, %zmm0
+        cmpl $13, %r8d
+        vmovdqa64 %zmm20, %zmm7
+        jl L_AES_ECB_encrypt_avx512_64_aes_enc_block_last
+        vaesenc %zmm20, %zmm0, %zmm0
+        vaesenc %zmm21, %zmm0, %zmm0
+        vmovdqa64 %zmm22, %zmm7
+L_AES_ECB_encrypt_avx512_64_aes_enc_block_last:
+        vaesenclast %zmm7, %zmm0, %zmm0
+        vmovdqu64 %zmm0, (%r11)
+        addl $0x40, %eax
+        cmpl %r9d, %eax
+        jl L_AES_ECB_encrypt_avx512_enc_64
+L_AES_ECB_encrypt_avx512_done_64:
+        /* Round the length down to whole 16-byte blocks BEFORE testing for
+         * remaining work (same guard shape as the 64-byte stage above).
+         * Testing against the unrounded length let a trailing remainder of
+         * fewer than 16 bytes run one full block past the buffers. */
+        movl %edx, %r9d
+        andl $0xfffffff0, %r9d
+        cmpl %r9d, %eax
+        je L_AES_ECB_encrypt_avx512_done_enc
+L_AES_ECB_encrypt_avx512_enc_16:
+        # 16 bytes of input
+        leaq (%rdi,%rax,1), %r10
+        vmovdqu (%r10), %xmm0
+        # aes_enc_block
+        /* This stage reads round keys from memory: the ZMM cache is not
+         * filled when the total length is below 64 bytes. */
+        vpxor (%rcx), %xmm0, %xmm0
+        vmovdqu 16(%rcx), %xmm5
+        vaesenc %xmm5, %xmm0, %xmm0
+        vmovdqu 32(%rcx), %xmm5
+        vaesenc %xmm5, %xmm0, %xmm0
+        vmovdqu 48(%rcx), %xmm5
+        vaesenc %xmm5, %xmm0, %xmm0
+        vmovdqu 64(%rcx), %xmm5
+        vaesenc %xmm5, %xmm0, %xmm0
+        vmovdqu 80(%rcx), %xmm5
+        vaesenc %xmm5, %xmm0, %xmm0
+        vmovdqu 96(%rcx), %xmm5
+        vaesenc %xmm5, %xmm0, %xmm0
+        vmovdqu 112(%rcx), %xmm5
+        vaesenc %xmm5, %xmm0, %xmm0
+        vmovdqu 128(%rcx), %xmm5
+        vaesenc %xmm5, %xmm0, %xmm0
+        vmovdqu 144(%rcx), %xmm5
+        vaesenc %xmm5, %xmm0, %xmm0
+        cmpl $11, %r8d
+        vmovdqu 160(%rcx), %xmm5
+        jl L_AES_ECB_encrypt_avx512_16_aes_enc_block_last
+        vaesenc %xmm5, %xmm0, %xmm0
+        vmovdqu 176(%rcx), %xmm6
+        vaesenc %xmm6, %xmm0, %xmm0
+        cmpl $13, %r8d
+        vmovdqu 192(%rcx), %xmm5
+        jl L_AES_ECB_encrypt_avx512_16_aes_enc_block_last
+        vaesenc %xmm5, %xmm0, %xmm0
+        vmovdqu 208(%rcx), %xmm6
+        vaesenc %xmm6, %xmm0, %xmm0
+        vmovdqu 224(%rcx), %xmm5
+L_AES_ECB_encrypt_avx512_16_aes_enc_block_last:
+        vaesenclast %xmm5, %xmm0, %xmm0
+        leaq (%rsi,%rax,1), %r10
+        vmovdqu %xmm0, (%r10)
+        addl $16, %eax
+        cmpl %r9d, %eax
+        jl L_AES_ECB_encrypt_avx512_enc_16
+L_AES_ECB_encrypt_avx512_done_enc:
+        /* Clear the upper halves of zmm0..zmm15 so legacy-SSE code that
+         * runs after the return does not pay an AVX/SSE transition
+         * penalty. */
+        vzeroupper
+        repz retq
+#ifndef __APPLE__
+.size AES_ECB_encrypt_avx512,.-AES_ECB_encrypt_avx512
+#endif /* __APPLE__ */
+/* AES_ECB_decrypt_avx512: ECB decryption with 512-bit VAES.
+ * Registers as used by the code: %rdi input, %rsi output, %edx length in
+ * bytes, %rcx round keys, %r8d compared with 11 and 13 to select the last
+ * round key (offset 160, 192 or 224).  For 64 bytes or more, the round
+ * keys are broadcast once into zmm8..zmm22.
+ */
+#ifndef __APPLE__
+.text
+.globl AES_ECB_decrypt_avx512
+.type AES_ECB_decrypt_avx512,@function
+.align 16
+AES_ECB_decrypt_avx512:
+#else
+.section __TEXT,__text
+.globl _AES_ECB_decrypt_avx512
+.p2align 4
+_AES_ECB_decrypt_avx512:
+#endif /* __APPLE__ */
+        xorl %eax, %eax
+        cmpl $0x40, %edx
+        jl L_AES_ECB_decrypt_avx512_done_64
+        vbroadcasti32x4 (%rcx), %zmm8
+        vbroadcasti32x4 16(%rcx), %zmm9
+        vbroadcasti32x4 32(%rcx), %zmm10
+        vbroadcasti32x4 48(%rcx), %zmm11
+        vbroadcasti32x4 64(%rcx), %zmm12
+        vbroadcasti32x4 80(%rcx), %zmm13
+        vbroadcasti32x4 96(%rcx), %zmm14
+        vbroadcasti32x4 112(%rcx), %zmm15
+        vbroadcasti32x4 128(%rcx), %zmm16
+        vbroadcasti32x4 144(%rcx), %zmm17
+        vbroadcasti32x4 160(%rcx), %zmm18
+        cmpl $11, %r8d
+        jl L_AES_ECB_decrypt_avx512_key_cached
+        vbroadcasti32x4 176(%rcx), %zmm19
+        vbroadcasti32x4 192(%rcx), %zmm20
+        cmpl $13, %r8d
+        jl L_AES_ECB_decrypt_avx512_key_cached
+        vbroadcasti32x4 208(%rcx), %zmm21
+        vbroadcasti32x4 224(%rcx), %zmm22
+L_AES_ECB_decrypt_avx512_key_cached:
+        cmpl $0x100, %edx
+        movl %edx, %r9d
+        jl
L_AES_ECB_decrypt_avx512_done_256
+        andl $0xffffff00, %r9d
+L_AES_ECB_decrypt_avx512_dec_256:
+        # 256 bytes of input
+        # aes_ecb_dec_256
+        leaq (%rdi,%rax,1), %r10
+        leaq (%rsi,%rax,1), %r11
+        vmovdqu64 (%r10), %zmm0
+        vmovdqu64 64(%r10), %zmm1
+        vmovdqu64 128(%r10), %zmm2
+        vmovdqu64 192(%r10), %zmm3
+        # aes_dec_block
+        vpxorq %zmm8, %zmm0, %zmm0
+        vpxorq %zmm8, %zmm1, %zmm1
+        vpxorq %zmm8, %zmm2, %zmm2
+        vpxorq %zmm8, %zmm3, %zmm3
+        /* Rounds 1..9 on four ZMM registers (16 blocks) using the round
+         * keys cached in zmm9..zmm17 */
+        vaesdec %zmm9, %zmm0, %zmm0
+        vaesdec %zmm9, %zmm1, %zmm1
+        vaesdec %zmm9, %zmm2, %zmm2
+        vaesdec %zmm9, %zmm3, %zmm3
+        vaesdec %zmm10, %zmm0, %zmm0
+        vaesdec %zmm10, %zmm1, %zmm1
+        vaesdec %zmm10, %zmm2, %zmm2
+        vaesdec %zmm10, %zmm3, %zmm3
+        vaesdec %zmm11, %zmm0, %zmm0
+        vaesdec %zmm11, %zmm1, %zmm1
+        vaesdec %zmm11, %zmm2, %zmm2
+        vaesdec %zmm11, %zmm3, %zmm3
+        vaesdec %zmm12, %zmm0, %zmm0
+        vaesdec %zmm12, %zmm1, %zmm1
+        vaesdec %zmm12, %zmm2, %zmm2
+        vaesdec %zmm12, %zmm3, %zmm3
+        vaesdec %zmm13, %zmm0, %zmm0
+        vaesdec %zmm13, %zmm1, %zmm1
+        vaesdec %zmm13, %zmm2, %zmm2
+        vaesdec %zmm13, %zmm3, %zmm3
+        vaesdec %zmm14, %zmm0, %zmm0
+        vaesdec %zmm14, %zmm1, %zmm1
+        vaesdec %zmm14, %zmm2, %zmm2
+        vaesdec %zmm14, %zmm3, %zmm3
+        vaesdec %zmm15, %zmm0, %zmm0
+        vaesdec %zmm15, %zmm1, %zmm1
+        vaesdec %zmm15, %zmm2, %zmm2
+        vaesdec %zmm15, %zmm3, %zmm3
+        vaesdec %zmm16, %zmm0, %zmm0
+        vaesdec %zmm16, %zmm1, %zmm1
+        vaesdec %zmm16, %zmm2, %zmm2
+        vaesdec %zmm16, %zmm3, %zmm3
+        vaesdec %zmm17, %zmm0, %zmm0
+        vaesdec %zmm17, %zmm1, %zmm1
+        vaesdec %zmm17, %zmm2, %zmm2
+        vaesdec %zmm17, %zmm3, %zmm3
+        /* zmm7 receives whichever cached key is the last round key */
+        cmpl $11, %r8d
+        vmovdqa64 %zmm18, %zmm7
+        jl L_AES_ECB_decrypt_avx512_256_aes_dec_block_last
+        vaesdec %zmm18, %zmm0, %zmm0
+        vaesdec %zmm18, %zmm1, %zmm1
+        vaesdec %zmm18, %zmm2, %zmm2
+        vaesdec %zmm18, %zmm3, %zmm3
+        vaesdec %zmm19, %zmm0, %zmm0
+        vaesdec %zmm19, %zmm1, %zmm1
+        vaesdec %zmm19, %zmm2, %zmm2
+        vaesdec %zmm19, %zmm3, %zmm3
+        cmpl $13, %r8d
+        vmovdqa64 %zmm20, %zmm7
+        jl L_AES_ECB_decrypt_avx512_256_aes_dec_block_last
+        vaesdec %zmm20, %zmm0, %zmm0
+        vaesdec %zmm20, %zmm1, %zmm1
+        vaesdec %zmm20, %zmm2, %zmm2
+        vaesdec %zmm20, %zmm3, %zmm3
+        vaesdec %zmm21, %zmm0, %zmm0
+        vaesdec %zmm21, %zmm1, %zmm1
+        vaesdec %zmm21, %zmm2, %zmm2
+        vaesdec %zmm21, %zmm3, %zmm3
+        vmovdqa64 %zmm22, %zmm7
+L_AES_ECB_decrypt_avx512_256_aes_dec_block_last:
+        vaesdeclast %zmm7, %zmm0, %zmm0
+        vaesdeclast %zmm7, %zmm1, %zmm1
+        vaesdeclast %zmm7, %zmm2, %zmm2
+        vaesdeclast %zmm7, %zmm3, %zmm3
+        vmovdqu64 %zmm0, (%r11)
+        vmovdqu64 %zmm1, 64(%r11)
+        vmovdqu64 %zmm2, 128(%r11)
+        vmovdqu64 %zmm3, 192(%r11)
+        addl $0x100, %eax
+        cmpl %r9d, %eax
+        jl L_AES_ECB_decrypt_avx512_dec_256
+L_AES_ECB_decrypt_avx512_done_256:
+        /* %r9d = length rounded down to 64; skip the stage if already there */
+        movl %edx, %r9d
+        andl $0xffffffc0, %r9d
+        cmpl %r9d, %eax
+        je L_AES_ECB_decrypt_avx512_done_64
+L_AES_ECB_decrypt_avx512_dec_64:
+        # 64 bytes of input
+        # aes_ecb_dec_64
+        leaq (%rdi,%rax,1), %r10
+        leaq (%rsi,%rax,1), %r11
+        vmovdqu64 (%r10), %zmm0
+        # aes_dec_block
+        vpxorq %zmm8, %zmm0, %zmm0
+        vaesdec %zmm9, %zmm0, %zmm0
+        vaesdec %zmm10, %zmm0, %zmm0
+        vaesdec %zmm11, %zmm0, %zmm0
+        vaesdec %zmm12, %zmm0, %zmm0
+        vaesdec %zmm13, %zmm0, %zmm0
+        vaesdec %zmm14, %zmm0, %zmm0
+        vaesdec %zmm15, %zmm0, %zmm0
+        vaesdec %zmm16, %zmm0, %zmm0
+        vaesdec %zmm17, %zmm0, %zmm0
+        cmpl $11, %r8d
+        vmovdqa64 %zmm18, %zmm7
+        jl L_AES_ECB_decrypt_avx512_64_aes_dec_block_last
+        vaesdec %zmm18, %zmm0, %zmm0
+        vaesdec %zmm19, %zmm0, %zmm0
+        cmpl $13, %r8d
+        vmovdqa64 %zmm20, %zmm7
+        jl L_AES_ECB_decrypt_avx512_64_aes_dec_block_last
+        vaesdec %zmm20, %zmm0, %zmm0
+        vaesdec %zmm21, %zmm0, %zmm0
+        vmovdqa64 %zmm22, %zmm7
+L_AES_ECB_decrypt_avx512_64_aes_dec_block_last:
+        vaesdeclast %zmm7, %zmm0, %zmm0
+        vmovdqu64 %zmm0, (%r11)
+        addl $0x40, %eax
+        cmpl %r9d, %eax
+        jl L_AES_ECB_decrypt_avx512_dec_64
+L_AES_ECB_decrypt_avx512_done_64:
+        /* Round the length down to whole 16-byte blocks BEFORE testing for
+         * remaining work (same guard shape as the 64-byte stage above).
+         * Testing against the unrounded length let a trailing remainder of
+         * fewer than 16 bytes run one full block past the buffers. */
+        movl %edx, %r9d
+        andl $0xfffffff0, %r9d
+        cmpl %r9d, %eax
+        je L_AES_ECB_decrypt_avx512_done_dec
+L_AES_ECB_decrypt_avx512_dec_16:
+        # 16 bytes of input
+        leaq (%rdi,%rax,1), %r10
+        vmovdqu (%r10), %xmm0
+        # aes_dec_block
+        /* This stage reads round keys from memory: the ZMM cache is not
+         * filled when the total length is below 64 bytes. */
+        vpxor (%rcx), %xmm0, %xmm0
+        vmovdqu 16(%rcx), %xmm5
+        vaesdec %xmm5, %xmm0, %xmm0
+        vmovdqu 32(%rcx), %xmm5
+        vaesdec %xmm5, %xmm0, %xmm0
+        vmovdqu 48(%rcx), %xmm5
+        vaesdec %xmm5, %xmm0, %xmm0
+        vmovdqu 64(%rcx), %xmm5
+        vaesdec %xmm5, %xmm0, %xmm0
+        vmovdqu 80(%rcx), %xmm5
+        vaesdec %xmm5, %xmm0, %xmm0
+        vmovdqu 96(%rcx), %xmm5
+        vaesdec %xmm5, %xmm0, %xmm0
+        vmovdqu 112(%rcx), %xmm5
+        vaesdec %xmm5, %xmm0, %xmm0
+        vmovdqu 128(%rcx), %xmm5
+        vaesdec %xmm5, %xmm0, %xmm0
+        vmovdqu 144(%rcx), %xmm5
+        vaesdec %xmm5, %xmm0, %xmm0
+        cmpl $11, %r8d
+        vmovdqu 160(%rcx), %xmm5
+        jl L_AES_ECB_decrypt_avx512_16_aes_dec_block_last
+        vaesdec %xmm5, %xmm0, %xmm0
+        vmovdqu 176(%rcx), %xmm6
+        vaesdec %xmm6, %xmm0, %xmm0
+        cmpl $13, %r8d
+        vmovdqu 192(%rcx), %xmm5
+        jl L_AES_ECB_decrypt_avx512_16_aes_dec_block_last
+        vaesdec %xmm5, %xmm0, %xmm0
+        vmovdqu 208(%rcx), %xmm6
+        vaesdec %xmm6, %xmm0, %xmm0
+        vmovdqu 224(%rcx), %xmm5
+L_AES_ECB_decrypt_avx512_16_aes_dec_block_last:
+        vaesdeclast %xmm5, %xmm0, %xmm0
+        leaq (%rsi,%rax,1), %r10
+        vmovdqu %xmm0, (%r10)
+        addl $16, %eax
+        cmpl %r9d, %eax
+        jl L_AES_ECB_decrypt_avx512_dec_16
+L_AES_ECB_decrypt_avx512_done_dec:
+        /* Clear the upper halves of zmm0..zmm15 so legacy-SSE code that
+         * runs after the return does not pay an AVX/SSE transition
+         * penalty. */
+        vzeroupper
+        repz retq
+#ifndef __APPLE__
+.size AES_ECB_decrypt_avx512,.-AES_ECB_decrypt_avx512
+#endif /* __APPLE__ */
+/* AES_CBC_encrypt_avx512: CBC encryption, one 16-byte block per iteration
+ * (each block depends on the previous ciphertext block).
+ * Registers as used by the code: %rdi input, %rsi output, %rdx 16-byte
+ * chaining value (loaded on entry, stored back on exit), %ecx length in
+ * bytes, %r8 round keys, %r9d compared with 11 and 13 to select the last
+ * round key (offset 160, 192 or 224).
+ */
+#ifndef __APPLE__
+.text
+.globl AES_CBC_encrypt_avx512
+.type AES_CBC_encrypt_avx512,@function
+.align 16
+AES_CBC_encrypt_avx512:
+#else
+.section __TEXT,__text
+.globl _AES_CBC_encrypt_avx512
+.p2align 4
+_AES_CBC_encrypt_avx512:
+#endif /* __APPLE__ */
+        vmovdqu (%rdx), %xmm0
+        xorl %eax, %eax
+        cmpl %ecx, %eax
+        je L_AES_CBC_encrypt_avx512_done
+L_AES_CBC_encrypt_avx512_loop:
+        # 16 bytes of input
+        leaq (%rdi,%rax,1), %r10
+        vmovdqu (%r10), %xmm1
+        /* imm8 0x96 is the three-input XOR truth table:
+         * xmm1 = input ^ chaining value ^ round key 0 */
+        vpternlogq $0x96, (%r8), %xmm0, %xmm1
+        # aes_enc_block
+        vmovdqu 16(%r8), %xmm3
+        vaesenc %xmm3, %xmm1, %xmm1
+        vmovdqu 32(%r8), %xmm3
+        vaesenc %xmm3, %xmm1, %xmm1
+        vmovdqu 48(%r8), %xmm3
+        vaesenc %xmm3, %xmm1,
%xmm1
+        vmovdqu 64(%r8), %xmm3
+        vaesenc %xmm3, %xmm1, %xmm1
+        vmovdqu 80(%r8), %xmm3
+        vaesenc %xmm3, %xmm1, %xmm1
+        vmovdqu 96(%r8), %xmm3
+        vaesenc %xmm3, %xmm1, %xmm1
+        vmovdqu 112(%r8), %xmm3
+        vaesenc %xmm3, %xmm1, %xmm1
+        vmovdqu 128(%r8), %xmm3
+        vaesenc %xmm3, %xmm1, %xmm1
+        vmovdqu 144(%r8), %xmm3
+        vaesenc %xmm3, %xmm1, %xmm1
+        cmpl $11, %r9d                  /* %r9d < 11: key at offset 160 is the last round key */
+        vmovdqu 160(%r8), %xmm3
+        jl L_AES_CBC_encrypt_avx512_aes_enc_block_last
+        vaesenc %xmm3, %xmm1, %xmm1
+        vmovdqu 176(%r8), %xmm4
+        vaesenc %xmm4, %xmm1, %xmm1
+        cmpl $13, %r9d                  /* %r9d < 13: key at offset 192 is the last round key */
+        vmovdqu 192(%r8), %xmm3
+        jl L_AES_CBC_encrypt_avx512_aes_enc_block_last
+        vaesenc %xmm3, %xmm1, %xmm1
+        vmovdqu 208(%r8), %xmm4
+        vaesenc %xmm4, %xmm1, %xmm1
+        vmovdqu 224(%r8), %xmm3         /* otherwise the key at offset 224 is the last one */
+L_AES_CBC_encrypt_avx512_aes_enc_block_last:
+        vaesenclast %xmm3, %xmm1, %xmm1 /* final round: %xmm1 is the output block */
+        leaq (%rsi,%rax,1), %r11
+        vmovdqu %xmm1, (%r11)
+        vmovdqa %xmm1, %xmm0            /* output block is the chaining value for the next block */
+        addl $16, %eax
+        cmpl %ecx, %eax
+        jl L_AES_CBC_encrypt_avx512_loop
+L_AES_CBC_encrypt_avx512_done:
+        vmovdqu %xmm0, (%rdx)           /* store the final chaining value back through %rdx */
+        repz retq
+#ifndef __APPLE__
+.size AES_CBC_encrypt_avx512,.-AES_CBC_encrypt_avx512
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl AES_CBC_decrypt_avx512
+.type AES_CBC_decrypt_avx512,@function
+.align 16
+AES_CBC_decrypt_avx512:
+#else
+.section __TEXT,__text
+.globl _AES_CBC_decrypt_avx512
+.p2align 4
+_AES_CBC_decrypt_avx512:
+#endif /* __APPLE__ */
+        pushq %r12                      /* %r12 is used below as the output pointer */
+        vmovdqu (%rdx), %xmm8           /* %xmm8 = chaining value read from the buffer at %rdx */
+        xorl %eax, %eax                 /* %eax = byte offset processed so far */
+        cmpl $0x40, %ecx
+        jl L_AES_CBC_decrypt_avx512_done_64 /* under 64 bytes: skip the round-key caching below */
+        vbroadcasti32x4 (%r8), %zmm14   /* each round key is copied to all four 128-bit lanes */
+        vbroadcasti32x4 16(%r8), %zmm15
+        vbroadcasti32x4 32(%r8), %zmm16
+        vbroadcasti32x4 48(%r8), %zmm17
+        vbroadcasti32x4 64(%r8), %zmm18
+        vbroadcasti32x4 80(%r8), %zmm19
+        vbroadcasti32x4 96(%r8), %zmm20
+        vbroadcasti32x4 112(%r8), %zmm21
+        vbroadcasti32x4 128(%r8), %zmm22
+        vbroadcasti32x4 144(%r8), %zmm23
+        vbroadcasti32x4 160(%r8), %zmm24
+        cmpl $11, %r9d
+        jl L_AES_CBC_decrypt_avx512_key_cached
+        vbroadcasti32x4 176(%r8), %zmm25
+        vbroadcasti32x4 192(%r8), %zmm26
+        cmpl $13,
%r9d + jl L_AES_CBC_decrypt_avx512_key_cached + vbroadcasti32x4 208(%r8), %zmm27 + vbroadcasti32x4 224(%r8), %zmm28 +L_AES_CBC_decrypt_avx512_key_cached: + cmpl $0x100, %ecx + movl %ecx, %r10d + jl L_AES_CBC_decrypt_avx512_done_256 + andl $0xffffff00, %r10d +L_AES_CBC_decrypt_avx512_dec_256: + # 256 bytes of input + # aes_cbc_dec_256 + leaq (%rdi,%rax,1), %r11 + leaq (%rsi,%rax,1), %r12 + vmovdqu64 (%r11), %zmm0 + vmovdqu64 64(%r11), %zmm1 + vmovdqu64 128(%r11), %zmm2 + vmovdqu64 192(%r11), %zmm3 + vshufi64x2 $0x90, %zmm0, %zmm0, %zmm10 + vinserti32x4 $0x00, %xmm8, %zmm10, %zmm10 + vmovdqu64 48(%r11), %zmm11 + vmovdqu64 112(%r11), %zmm12 + vmovdqu64 176(%r11), %zmm13 + vextracti32x4 $3, %zmm3, %xmm8 + # aes_dec_block + vpxorq %zmm14, %zmm0, %zmm0 + vpxorq %zmm14, %zmm1, %zmm1 + vpxorq %zmm14, %zmm2, %zmm2 + vpxorq %zmm14, %zmm3, %zmm3 + vaesdec %zmm15, %zmm0, %zmm0 + vaesdec %zmm15, %zmm1, %zmm1 + vaesdec %zmm15, %zmm2, %zmm2 + vaesdec %zmm15, %zmm3, %zmm3 + vaesdec %zmm16, %zmm0, %zmm0 + vaesdec %zmm16, %zmm1, %zmm1 + vaesdec %zmm16, %zmm2, %zmm2 + vaesdec %zmm16, %zmm3, %zmm3 + vaesdec %zmm17, %zmm0, %zmm0 + vaesdec %zmm17, %zmm1, %zmm1 + vaesdec %zmm17, %zmm2, %zmm2 + vaesdec %zmm17, %zmm3, %zmm3 + vaesdec %zmm18, %zmm0, %zmm0 + vaesdec %zmm18, %zmm1, %zmm1 + vaesdec %zmm18, %zmm2, %zmm2 + vaesdec %zmm18, %zmm3, %zmm3 + vaesdec %zmm19, %zmm0, %zmm0 + vaesdec %zmm19, %zmm1, %zmm1 + vaesdec %zmm19, %zmm2, %zmm2 + vaesdec %zmm19, %zmm3, %zmm3 + vaesdec %zmm20, %zmm0, %zmm0 + vaesdec %zmm20, %zmm1, %zmm1 + vaesdec %zmm20, %zmm2, %zmm2 + vaesdec %zmm20, %zmm3, %zmm3 + vaesdec %zmm21, %zmm0, %zmm0 + vaesdec %zmm21, %zmm1, %zmm1 + vaesdec %zmm21, %zmm2, %zmm2 + vaesdec %zmm21, %zmm3, %zmm3 + vaesdec %zmm22, %zmm0, %zmm0 + vaesdec %zmm22, %zmm1, %zmm1 + vaesdec %zmm22, %zmm2, %zmm2 + vaesdec %zmm22, %zmm3, %zmm3 + vaesdec %zmm23, %zmm0, %zmm0 + vaesdec %zmm23, %zmm1, %zmm1 + vaesdec %zmm23, %zmm2, %zmm2 + vaesdec %zmm23, %zmm3, %zmm3 + cmpl $11, %r9d + vmovdqa64 %zmm24, 
%zmm9 + jl L_AES_CBC_decrypt_avx512_256_aes_dec_block_last + vaesdec %zmm24, %zmm0, %zmm0 + vaesdec %zmm24, %zmm1, %zmm1 + vaesdec %zmm24, %zmm2, %zmm2 + vaesdec %zmm24, %zmm3, %zmm3 + vaesdec %zmm25, %zmm0, %zmm0 + vaesdec %zmm25, %zmm1, %zmm1 + vaesdec %zmm25, %zmm2, %zmm2 + vaesdec %zmm25, %zmm3, %zmm3 + cmpl $13, %r9d + vmovdqa64 %zmm26, %zmm9 + jl L_AES_CBC_decrypt_avx512_256_aes_dec_block_last + vaesdec %zmm26, %zmm0, %zmm0 + vaesdec %zmm26, %zmm1, %zmm1 + vaesdec %zmm26, %zmm2, %zmm2 + vaesdec %zmm26, %zmm3, %zmm3 + vaesdec %zmm27, %zmm0, %zmm0 + vaesdec %zmm27, %zmm1, %zmm1 + vaesdec %zmm27, %zmm2, %zmm2 + vaesdec %zmm27, %zmm3, %zmm3 + vmovdqa64 %zmm28, %zmm9 +L_AES_CBC_decrypt_avx512_256_aes_dec_block_last: + vaesdeclast %zmm9, %zmm0, %zmm0 + vaesdeclast %zmm9, %zmm1, %zmm1 + vaesdeclast %zmm9, %zmm2, %zmm2 + vaesdeclast %zmm9, %zmm3, %zmm3 + vpxorq %zmm10, %zmm0, %zmm0 + vpxorq %zmm11, %zmm1, %zmm1 + vpxorq %zmm12, %zmm2, %zmm2 + vpxorq %zmm13, %zmm3, %zmm3 + vmovdqu64 %zmm0, (%r12) + vmovdqu64 %zmm1, 64(%r12) + vmovdqu64 %zmm2, 128(%r12) + vmovdqu64 %zmm3, 192(%r12) + addl $0x100, %eax + cmpl %r10d, %eax + jl L_AES_CBC_decrypt_avx512_dec_256 +L_AES_CBC_decrypt_avx512_done_256: + movl %ecx, %r10d + andl $0xffffffc0, %r10d + cmpl %r10d, %eax + je L_AES_CBC_decrypt_avx512_done_64 +L_AES_CBC_decrypt_avx512_dec_64: + # 64 bytes of input + # aes_cbc_dec_64 + leaq (%rdi,%rax,1), %r11 + leaq (%rsi,%rax,1), %r12 + vmovdqu64 (%r11), %zmm0 + vshufi64x2 $0x90, %zmm0, %zmm0, %zmm10 + vinserti32x4 $0x00, %xmm8, %zmm10, %zmm10 + vextracti32x4 $3, %zmm0, %xmm8 + # aes_dec_block + vpxorq %zmm14, %zmm0, %zmm0 + vaesdec %zmm15, %zmm0, %zmm0 + vaesdec %zmm16, %zmm0, %zmm0 + vaesdec %zmm17, %zmm0, %zmm0 + vaesdec %zmm18, %zmm0, %zmm0 + vaesdec %zmm19, %zmm0, %zmm0 + vaesdec %zmm20, %zmm0, %zmm0 + vaesdec %zmm21, %zmm0, %zmm0 + vaesdec %zmm22, %zmm0, %zmm0 + vaesdec %zmm23, %zmm0, %zmm0 + cmpl $11, %r9d + vmovdqa64 %zmm24, %zmm9 + jl 
L_AES_CBC_decrypt_avx512_64_aes_dec_block_last + vaesdec %zmm24, %zmm0, %zmm0 + vaesdec %zmm25, %zmm0, %zmm0 + cmpl $13, %r9d + vmovdqa64 %zmm26, %zmm9 + jl L_AES_CBC_decrypt_avx512_64_aes_dec_block_last + vaesdec %zmm26, %zmm0, %zmm0 + vaesdec %zmm27, %zmm0, %zmm0 + vmovdqa64 %zmm28, %zmm9 +L_AES_CBC_decrypt_avx512_64_aes_dec_block_last: + vaesdeclast %zmm9, %zmm0, %zmm0 + vpxorq %zmm10, %zmm0, %zmm0 + vmovdqu64 %zmm0, (%r12) + addl $0x40, %eax + cmpl %r10d, %eax + jl L_AES_CBC_decrypt_avx512_dec_64 +L_AES_CBC_decrypt_avx512_done_64: + cmpl %ecx, %eax + movl %ecx, %r10d + je L_AES_CBC_decrypt_avx512_done_dec + andl $0xfffffff0, %r10d +L_AES_CBC_decrypt_avx512_dec_16: + # 16 bytes of input + leaq (%rdi,%rax,1), %r11 + vmovdqu (%r11), %xmm0 + vmovdqa %xmm0, %xmm7 + # aes_dec_block + vpxor (%r8), %xmm0, %xmm0 + vmovdqu 16(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + cmpl $11, %r9d + vmovdqu 160(%r8), %xmm5 + jl L_AES_CBC_decrypt_avx512_16_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + cmpl $13, %r9d + vmovdqu 192(%r8), %xmm5 + jl L_AES_CBC_decrypt_avx512_16_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r8), %xmm5 +L_AES_CBC_decrypt_avx512_16_aes_dec_block_last: + vaesdeclast %xmm5, %xmm0, %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + vmovdqa %xmm7, %xmm8 + leaq (%rsi,%rax,1), %r11 + vmovdqu %xmm0, (%r11) + addl $16, %eax + cmpl %r10d, %eax + jl L_AES_CBC_decrypt_avx512_dec_16 +L_AES_CBC_decrypt_avx512_done_dec: 
+        vmovdqu %xmm8, (%rdx)
+        popq %r12
+        repz retq
+#ifndef __APPLE__
+.size AES_CBC_decrypt_avx512,.-AES_CBC_decrypt_avx512
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 16
+#else
+.p2align 4
+#endif /* __APPLE__ */
+/* Byte-shuffle mask that reverses all 16 bytes of a 128-bit lane. */
+L_aes_ctr_bswap_avx512:
+.quad 0x08090a0b0c0d0e0f,0x0001020304050607
+#ifndef __APPLE__
+.data
+#else
+.section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.align 32
+#else
+.p2align 5
+#endif /* __APPLE__ */
+/* Seventeen 128-bit constants 0..16 (low qword first), 16 bytes apart.
+ * A 64-byte load at offset 64*k yields the four values 4k..4k+3; the
+ * entries at byte offsets 16, 64 and 256 are the increments 1, 4 and 16. */
+L_aes_ctr_inc_avx512:
+.quad 0x0000000000000000,0x0000000000000000
+.quad 0x0000000000000001,0x0000000000000000
+.quad 0x0000000000000002,0x0000000000000000
+.quad 0x0000000000000003,0x0000000000000000
+.quad 0x0000000000000004,0x0000000000000000
+.quad 0x0000000000000005,0x0000000000000000
+.quad 0x0000000000000006,0x0000000000000000
+.quad 0x0000000000000007,0x0000000000000000
+.quad 0x0000000000000008,0x0000000000000000
+.quad 0x0000000000000009,0x0000000000000000
+.quad 0x000000000000000a,0x0000000000000000
+.quad 0x000000000000000b,0x0000000000000000
+.quad 0x000000000000000c,0x0000000000000000
+.quad 0x000000000000000d,0x0000000000000000
+.quad 0x000000000000000e,0x0000000000000000
+.quad 0x000000000000000f,0x0000000000000000
+.quad 0x0000000000000010,0x0000000000000000
+/* AES_CTR_encrypt_avx512
+ *
+ * Register use as visible in the body below:
+ *   %rdi  base of the input buffer   (read at %rdi + offset)
+ *   %rsi  base of the output buffer  (written at %rsi + offset)
+ *   %edx  byte count
+ *   %rcx  round-key array, 16 bytes per round key
+ *   %r8d  compared against 11 and 13 to choose which round key is the
+ *         last one (presumably the round count 10/12/14 - confirm with
+ *         the C caller)
+ *   %r9   16-byte counter block: loaded on entry, byte-reversed,
+ *         advanced once per 16-byte block, stored back on exit
+ *
+ * The data is consumed in 256-byte, then 64-byte, then 16-byte steps.
+ * Round keys are broadcast into %zmm14..%zmm28 only when at least 64
+ * bytes are requested; the 16-byte loop reads them from memory.
+ * NOTE(review): the 16-byte loop tests its bound after the body, so a
+ * byte count that is not a multiple of 16 still processes a whole
+ * trailing block - callers presumably pass whole blocks only; confirm.
+ */
+#ifndef __APPLE__
+.text
+.globl AES_CTR_encrypt_avx512
+.type AES_CTR_encrypt_avx512,@function
+.align 16
+AES_CTR_encrypt_avx512:
+#else
+.section __TEXT,__text
+.globl _AES_CTR_encrypt_avx512
+.p2align 4
+_AES_CTR_encrypt_avx512:
+#endif /* __APPLE__ */
+        pushq %rbx
+        /* zmm8 = byte-reverse mask, zmm7 = byte-reversed counter in every lane */
+        vbroadcasti32x4 L_aes_ctr_bswap_avx512(%rip), %zmm8
+        vbroadcasti32x4 (%r9), %zmm7
+        vpshufb %zmm8, %zmm7, %zmm7
+        /* zmm10 = 16, zmm11 = 4, zmm12 = 1 (per 128-bit lane) */
+        vbroadcasti32x4 256+L_aes_ctr_inc_avx512(%rip), %zmm10
+        vbroadcasti32x4 64+L_aes_ctr_inc_avx512(%rip), %zmm11
+        vbroadcasti32x4 16+L_aes_ctr_inc_avx512(%rip), %zmm12
+        /* eax = running byte offset */
+        xorl %eax, %eax
+        cmpl $0x40, %edx
+        jl L_AES_CTR_encrypt_avx512_done_64
+        /* Cache the round keys, one per zmm register, replicated to all lanes */
+        vbroadcasti32x4 (%rcx), %zmm14
+        vbroadcasti32x4 16(%rcx), %zmm15
+        vbroadcasti32x4 32(%rcx), %zmm16
+        vbroadcasti32x4 48(%rcx), %zmm17
+        vbroadcasti32x4 64(%rcx), %zmm18
+        vbroadcasti32x4 80(%rcx), %zmm19
+        vbroadcasti32x4 96(%rcx), %zmm20
+        vbroadcasti32x4 112(%rcx), %zmm21
+        vbroadcasti32x4 128(%rcx), %zmm22
+        vbroadcasti32x4 144(%rcx), %zmm23
+        vbroadcasti32x4 160(%rcx), %zmm24
+        cmpl $11, %r8d
+        jl L_AES_CTR_encrypt_avx512_key_cached
+        vbroadcasti32x4 176(%rcx), %zmm25
+        vbroadcasti32x4 192(%rcx), %zmm26
+        cmpl $13, %r8d
+        jl L_AES_CTR_encrypt_avx512_key_cached
+        vbroadcasti32x4 208(%rcx), %zmm27
+        vbroadcasti32x4 224(%rcx), %zmm28
+L_AES_CTR_encrypt_avx512_key_cached:
+        cmpl $0x100, %edx
+        movl %edx, %r10d
+        jl L_AES_CTR_encrypt_avx512_done_256
+        /* r10d = byte count rounded down to a multiple of 256 */
+        andl $0xffffff00, %r10d
+        /* Build zmm4..zmm7 = counter + {0..3}, {4..7}, {8..11}, {12..15}.
+         * Each group is a 128-bit add done as a 64-bit add plus carry:
+         * vpternlogq 0xb2 forms the bitwise carry-out (a&c)|((a|c)&~sum),
+         * vpsrlq 63 keeps the carry out of bit 63, vpslldq 8 moves the low
+         * qword's carry into the high qword, and the last vpaddq applies it. */
+        vmovdqa64 %zmm7, %zmm9
+        vpaddq 0+L_aes_ctr_inc_avx512(%rip), %zmm7, %zmm4
+        vpternlogq $0xb2, 0+L_aes_ctr_inc_avx512(%rip), %zmm4, %zmm9
+        vpsrlq $63, %zmm9, %zmm9
+        vpslldq $8, %zmm9, %zmm9
+        vpaddq %zmm9, %zmm4, %zmm4
+        vmovdqa64 %zmm7, %zmm9
+        vpaddq 64+L_aes_ctr_inc_avx512(%rip), %zmm7, %zmm5
+        vpternlogq $0xb2, 64+L_aes_ctr_inc_avx512(%rip), %zmm5, %zmm9
+        vpsrlq $63, %zmm9, %zmm9
+        vpslldq $8, %zmm9, %zmm9
+        vpaddq %zmm9, %zmm5, %zmm5
+        vmovdqa64 %zmm7, %zmm9
+        vpaddq 128+L_aes_ctr_inc_avx512(%rip), %zmm7, %zmm6
+        vpternlogq $0xb2, 128+L_aes_ctr_inc_avx512(%rip), %zmm6, %zmm9
+        vpsrlq $63, %zmm9, %zmm9
+        vpslldq $8, %zmm9, %zmm9
+        vpaddq %zmm9, %zmm6, %zmm6
+        vmovdqa64 %zmm7, %zmm9
+        vpaddq 192+L_aes_ctr_inc_avx512(%rip), %zmm7, %zmm7
+        vpternlogq $0xb2, 192+L_aes_ctr_inc_avx512(%rip), %zmm7, %zmm9
+        vpsrlq $63, %zmm9, %zmm9
+        vpslldq $8, %zmm9, %zmm9
+        vpaddq %zmm9, %zmm7, %zmm7
+L_AES_CTR_encrypt_avx512_enc_256:
+        # 256 bytes of input
+        leaq (%rdi,%rax,1), %r11
+        leaq (%rsi,%rax,1), %rbx
+        /* Byte-reverse the 16 counters into zmm0..zmm3, then advance each
+         * counter register by 16 (with carry) for the next iteration. */
+        vpshufb %zmm8, %zmm4, %zmm0
+        vpshufb %zmm8, %zmm5, %zmm1
+        vpshufb %zmm8, %zmm6, %zmm2
+        vpshufb %zmm8, %zmm7, %zmm3
+        vmovdqa64 %zmm4, %zmm9
+        vpaddq %zmm10, %zmm4, %zmm4
+        vpternlogq $0xb2, %zmm10, %zmm4, %zmm9
+        vpsrlq $63, %zmm9, %zmm9
+        vpslldq $8, %zmm9, %zmm9
+        vpaddq %zmm9, %zmm4, %zmm4
+        vmovdqa64 %zmm5, %zmm9
+        vpaddq %zmm10, %zmm5, %zmm5
+        vpternlogq $0xb2, %zmm10, %zmm5, %zmm9
+        vpsrlq $63, %zmm9, %zmm9
+        vpslldq $8, %zmm9, %zmm9
+        vpaddq %zmm9, %zmm5, %zmm5
+        vmovdqa64 %zmm6, %zmm9
+        vpaddq %zmm10, %zmm6, %zmm6
+        vpternlogq $0xb2, %zmm10, %zmm6, %zmm9
+        vpsrlq $63, %zmm9, %zmm9
+        vpslldq $8, %zmm9, %zmm9
+        vpaddq %zmm9, %zmm6, %zmm6
+        vmovdqa64 %zmm7, %zmm9
+        vpaddq %zmm10, %zmm7, %zmm7
+        vpternlogq $0xb2, %zmm10, %zmm7, %zmm9
+        vpsrlq $63, %zmm9, %zmm9
+        vpslldq $8, %zmm9, %zmm9
+        vpaddq %zmm9, %zmm7, %zmm7
+        # aes_enc_block
+        vpxorq %zmm14, %zmm0, %zmm0
+        vpxorq %zmm14, %zmm1, %zmm1
+        vpxorq %zmm14, %zmm2, %zmm2
+        vpxorq %zmm14, %zmm3, %zmm3
+        vaesenc %zmm15, %zmm0, %zmm0
+        vaesenc %zmm15, %zmm1, %zmm1
+        vaesenc %zmm15, %zmm2, %zmm2
+        vaesenc %zmm15, %zmm3, %zmm3
+        vaesenc %zmm16, %zmm0, %zmm0
+        vaesenc %zmm16, %zmm1, %zmm1
+        vaesenc %zmm16, %zmm2, %zmm2
+        vaesenc %zmm16, %zmm3, %zmm3
+        vaesenc %zmm17, %zmm0, %zmm0
+        vaesenc %zmm17, %zmm1, %zmm1
+        vaesenc %zmm17, %zmm2, %zmm2
+        vaesenc %zmm17, %zmm3, %zmm3
+        vaesenc %zmm18, %zmm0, %zmm0
+        vaesenc %zmm18, %zmm1, %zmm1
+        vaesenc %zmm18, %zmm2, %zmm2
+        vaesenc %zmm18, %zmm3, %zmm3
+        vaesenc %zmm19, %zmm0, %zmm0
+        vaesenc %zmm19, %zmm1, %zmm1
+        vaesenc %zmm19, %zmm2, %zmm2
+        vaesenc %zmm19, %zmm3, %zmm3
+        vaesenc %zmm20, %zmm0, %zmm0
+        vaesenc %zmm20, %zmm1, %zmm1
+        vaesenc %zmm20, %zmm2, %zmm2
+        vaesenc %zmm20, %zmm3, %zmm3
+        vaesenc %zmm21, %zmm0, %zmm0
+        vaesenc %zmm21, %zmm1, %zmm1
+        vaesenc %zmm21, %zmm2, %zmm2
+        vaesenc %zmm21, %zmm3, %zmm3
+        vaesenc %zmm22, %zmm0, %zmm0
+        vaesenc %zmm22, %zmm1, %zmm1
+        vaesenc %zmm22, %zmm2, %zmm2
+        vaesenc %zmm22, %zmm3, %zmm3
+        vaesenc %zmm23, %zmm0, %zmm0
+        vaesenc %zmm23, %zmm1, %zmm1
+        vaesenc %zmm23, %zmm2, %zmm2
+        vaesenc %zmm23, %zmm3, %zmm3
+        /* zmm13 receives whichever round key is the final one for %r8d */
+        cmpl $11, %r8d
+        vmovdqa64 %zmm24, %zmm13
+        jl L_AES_CTR_encrypt_avx512_256_aes_enc_block_last
+        vaesenc %zmm24, %zmm0, %zmm0
+        vaesenc %zmm24, %zmm1, %zmm1
+        vaesenc %zmm24, %zmm2, %zmm2
+        vaesenc %zmm24, %zmm3, %zmm3
+        vaesenc %zmm25, %zmm0, %zmm0
+        vaesenc %zmm25, %zmm1, %zmm1
+        vaesenc %zmm25, %zmm2, %zmm2
+        vaesenc %zmm25, %zmm3, %zmm3
+        cmpl $13, %r8d
+        vmovdqa64 %zmm26, %zmm13
+        jl L_AES_CTR_encrypt_avx512_256_aes_enc_block_last
+        vaesenc %zmm26, %zmm0, %zmm0
+        vaesenc %zmm26, %zmm1, %zmm1
+        vaesenc %zmm26, %zmm2, %zmm2
+        vaesenc %zmm26, %zmm3, %zmm3
+        vaesenc %zmm27, %zmm0, %zmm0
+        vaesenc %zmm27, %zmm1, %zmm1
+        vaesenc %zmm27, %zmm2, %zmm2
+        vaesenc %zmm27, %zmm3, %zmm3
+        vmovdqa64 %zmm28, %zmm13
+L_AES_CTR_encrypt_avx512_256_aes_enc_block_last:
+        vaesenclast %zmm13, %zmm0, %zmm0
+        vaesenclast %zmm13, %zmm1, %zmm1
+        vaesenclast %zmm13, %zmm2, %zmm2
+        vaesenclast %zmm13, %zmm3, %zmm3
+        /* XOR the encrypted counters with the input and store the result */
+        vpxorq (%r11), %zmm0, %zmm0
+        vpxorq 64(%r11), %zmm1, %zmm1
+        vpxorq 128(%r11), %zmm2, %zmm2
+        vpxorq 192(%r11), %zmm3, %zmm3
+        vmovdqu64 %zmm0, (%rbx)
+        vmovdqu64 %zmm1, 64(%rbx)
+        vmovdqu64 %zmm2, 128(%rbx)
+        vmovdqu64 %zmm3, 192(%rbx)
+        addl $0x100, %eax
+        cmpl %r10d, %eax
+        jl L_AES_CTR_encrypt_avx512_enc_256
+        /* Lane 0 of zmm4 is the next unused counter; replicate it into zmm7 */
+        vshufi64x2 $0x00, %zmm4, %zmm4, %zmm7
+L_AES_CTR_encrypt_avx512_done_256:
+        /* r10d = byte count rounded down to a multiple of 64 */
+        movl %edx, %r10d
+        andl $0xffffffc0, %r10d
+        cmpl %r10d, %eax
+        je L_AES_CTR_encrypt_avx512_done_64
+L_AES_CTR_encrypt_avx512_enc_64:
+        # 64 bytes of input
+        # aes_ctr_enc_64
+        leaq (%rdi,%rax,1), %r11
+        leaq (%rsi,%rax,1), %rbx
+        /* zmm0 = counter + {0,1,2,3}, byte-reversed; then zmm7 += 4 */
+        vpaddq 0+L_aes_ctr_inc_avx512(%rip), %zmm7, %zmm0
+        vmovdqa64 %zmm7, %zmm9
+        vpternlogq $0xb2, 0+L_aes_ctr_inc_avx512(%rip), %zmm0, %zmm9
+        vpsrlq $63, %zmm9, %zmm9
+        vpslldq $8, %zmm9, %zmm9
+        vpaddq %zmm9, %zmm0, %zmm0
+        vpshufb %zmm8, %zmm0, %zmm0
+        vmovdqa64 %zmm7, %zmm9
+        vpaddq %zmm11, %zmm7, %zmm7
+        vpternlogq $0xb2, %zmm11, %zmm7, %zmm9
+        vpsrlq $63, %zmm9, %zmm9
+        vpslldq $8, %zmm9, %zmm9
+        vpaddq %zmm9, %zmm7, %zmm7
+        # aes_enc_block
+        vpxorq %zmm14, %zmm0, %zmm0
+        vaesenc %zmm15, %zmm0, %zmm0
+        vaesenc %zmm16, %zmm0, %zmm0
+        vaesenc %zmm17, %zmm0, %zmm0
+        vaesenc %zmm18, %zmm0, %zmm0
+        vaesenc %zmm19, %zmm0, %zmm0
+        vaesenc %zmm20, %zmm0, %zmm0
+        vaesenc %zmm21, %zmm0, %zmm0
+        vaesenc %zmm22, %zmm0, %zmm0
+        vaesenc %zmm23, %zmm0, %zmm0
+        cmpl $11, %r8d
+        vmovdqa64 %zmm24, %zmm13
+        jl L_AES_CTR_encrypt_avx512_64_aes_enc_block_last
+        vaesenc %zmm24, %zmm0, %zmm0
+        vaesenc %zmm25, %zmm0, %zmm0
+        cmpl $13, %r8d
+        vmovdqa64 %zmm26, %zmm13
+        jl L_AES_CTR_encrypt_avx512_64_aes_enc_block_last
+        vaesenc %zmm26, %zmm0, %zmm0
+        vaesenc %zmm27, %zmm0, %zmm0
+        vmovdqa64 %zmm28, %zmm13
+L_AES_CTR_encrypt_avx512_64_aes_enc_block_last:
+        vaesenclast %zmm13, %zmm0, %zmm0
+        vpxorq (%r11), %zmm0, %zmm0
+        vmovdqu64 %zmm0, (%rbx)
+        addl $0x40, %eax
+        cmpl %r10d, %eax
+        jl L_AES_CTR_encrypt_avx512_enc_64
+L_AES_CTR_encrypt_avx512_done_64:
+        cmpl %edx, %eax
+        movl %edx, %r10d
+        je L_AES_CTR_encrypt_avx512_done_enc
+        /* r10d = byte count rounded down to a multiple of 16 */
+        andl $0xfffffff0, %r10d
+L_AES_CTR_encrypt_avx512_enc_16:
+        # 16 bytes of input
+        /* xmm0 = current counter, byte-reversed; then zmm7 += 1 */
+        vpshufb %xmm8, %xmm7, %xmm0
+        vmovdqa64 %zmm7, %zmm9
+        vpaddq %zmm12, %zmm7, %zmm7
+        vpternlogq $0xb2, %zmm12, %zmm7, %zmm9
+        vpsrlq $63, %zmm9, %zmm9
+        vpslldq $8, %zmm9, %zmm9
+        vpaddq %zmm9, %zmm7, %zmm7
+        # aes_enc_block
+        /* Round keys are read from memory here: the cached zmm copies are
+         * not set up when fewer than 64 bytes were requested. */
+        vpxor (%rcx), %xmm0, %xmm0
+        vmovdqu 16(%rcx), %xmm5
+        vaesenc %xmm5, %xmm0, %xmm0
+        vmovdqu 32(%rcx), %xmm5
+        vaesenc %xmm5, %xmm0, %xmm0
+        vmovdqu 48(%rcx), %xmm5
+        vaesenc %xmm5, %xmm0, %xmm0
+        vmovdqu 64(%rcx), %xmm5
+        vaesenc %xmm5, %xmm0, %xmm0
+        vmovdqu 80(%rcx), %xmm5
+        vaesenc %xmm5, %xmm0, %xmm0
+        vmovdqu 96(%rcx), %xmm5
+        vaesenc %xmm5, %xmm0, %xmm0
+        vmovdqu 112(%rcx), %xmm5
+        vaesenc %xmm5, %xmm0, %xmm0
+        vmovdqu 128(%rcx), %xmm5
+        vaesenc %xmm5, %xmm0, %xmm0
+        vmovdqu 144(%rcx), %xmm5
+        vaesenc %xmm5, %xmm0, %xmm0
+        cmpl $11, %r8d
+        vmovdqu 160(%rcx), %xmm5
+        jl L_AES_CTR_encrypt_avx512_16_aes_enc_block_last
+        vaesenc %xmm5, %xmm0, %xmm0
+        vmovdqu 176(%rcx), %xmm6
+        vaesenc %xmm6, %xmm0, %xmm0
+        cmpl $13, %r8d
+        vmovdqu 192(%rcx), %xmm5
+        jl L_AES_CTR_encrypt_avx512_16_aes_enc_block_last
+        vaesenc %xmm5, %xmm0, %xmm0
+        vmovdqu 208(%rcx), %xmm6
+        vaesenc %xmm6, %xmm0, %xmm0
+        vmovdqu 224(%rcx), %xmm5
+L_AES_CTR_encrypt_avx512_16_aes_enc_block_last:
+        vaesenclast %xmm5, %xmm0, %xmm0
+        leaq (%rdi,%rax,1), %r11
+        vpxor (%r11), %xmm0, %xmm0
+        leaq (%rsi,%rax,1), %r11
+        vmovdqu %xmm0, (%r11)
+        addl $16, %eax
+        cmpl %r10d, %eax
+        jl L_AES_CTR_encrypt_avx512_enc_16
+L_AES_CTR_encrypt_avx512_done_enc:
+        /* Store the next counter value back through %r9 in its original byte order */
+        vpshufb %xmm8, %xmm7, %xmm0
+        vmovdqu %xmm0, (%r9)
+        popq %rbx
+        repz retq
+#ifndef __APPLE__
+.size AES_CTR_encrypt_avx512,.-AES_CTR_encrypt_avx512
+#endif /* __APPLE__ */
+#endif /* HAVE_INTEL_AVX512 */
+#endif /* WOLFSSL_X86_64_BUILD */
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/wolfcrypt/src/aes_x86_64_asm.asm b/wolfcrypt/src/aes_x86_64_asm.asm
new file mode 100644
index 00000000000..26ccbb5ee8e
--- /dev/null
+++ b/wolfcrypt/src/aes_x86_64_asm.asm
@@ -0,0 +1,4283 @@
+; /* aes_x86_64_asm.asm */
+; /*
+; * Copyright (C) 2006-2026 wolfSSL Inc.
+; *
+; * This file is part of wolfSSL.
+; *
+; * wolfSSL is free software; you can redistribute it and/or modify
+; * it under the terms of the GNU General Public License as published by
+; * the Free Software Foundation; either version 3 of the License, or
+; * (at your option) any later version.
+; *
+; * wolfSSL is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; * GNU General Public License for more details.
+; *
+; * You should have received a copy of the GNU General Public License
+; * along with this program; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+; */
+
+IF @Version LT 1200
+; AVX2 instructions not recognized by old versions of MASM
+IFNDEF NO_AVX2_SUPPORT
+NO_AVX2_SUPPORT = 1
+ENDIF
+; MOVBE instruction not recognized by old versions of MASM
+IFNDEF NO_MOVBE_SUPPORT
+NO_MOVBE_SUPPORT = 1
+ENDIF
+ENDIF
+
+IFNDEF HAVE_INTEL_AVX1
+HAVE_INTEL_AVX1 = 1
+ENDIF
+IFNDEF NO_AVX2_SUPPORT
+HAVE_INTEL_AVX2 = 1
+ENDIF
+
+IFNDEF _WIN64
+_WIN64 = 1
+ENDIF
+
+; AES_128_Key_Expansion_AESNI
+;   rcx : 16-byte cipher key (read)
+;   rdx : round-key array (written); 11 keys at byte offsets 0..160
+; Each step derives the next round key from the previous one in xmm0:
+; aeskeygenassist + pshufd 255 broadcasts the transformed top word into
+; xmm1, the three pslldq/pxor pairs fold the shifted previous key into
+; xmm0, and the final pxor mixes in xmm1.  The immediates used are
+; 1, 2, 4, 8, 16, 32, 64, 128, 27, 54.  Only xmm0-xmm2 are touched.
+_TEXT SEGMENT READONLY PARA
+AES_128_Key_Expansion_AESNI PROC
+        movdqu xmm0, OWORD PTR [rcx]
+        movdqu OWORD PTR [rdx], xmm0
+        ; round key 1
+        aeskeygenassist xmm1, xmm0, 1
+        pshufd xmm1, xmm1, 255
+        movdqa xmm2, xmm0
+        pslldq xmm2, 4
+        pxor xmm0, xmm2
+        pslldq xmm2, 4
+        pxor xmm0, xmm2
+        pslldq xmm2, 4
+        pxor xmm0, xmm2
+        pxor xmm0, xmm1
+        movdqu OWORD PTR [rdx+16], xmm0
+        ; round key 2
+        aeskeygenassist xmm1, xmm0, 2
+        pshufd xmm1, xmm1, 255
+        movdqa xmm2, xmm0
+        pslldq xmm2, 4
+        pxor xmm0, xmm2
+        pslldq xmm2, 4
+        pxor xmm0, xmm2
+        pslldq xmm2, 4
+        pxor xmm0, xmm2
+        pxor xmm0, xmm1
+        movdqu OWORD PTR [rdx+32], xmm0
+        ; round key 3
+        aeskeygenassist xmm1, xmm0, 4
+        pshufd xmm1, xmm1, 255
+        movdqa xmm2, xmm0
+        pslldq xmm2, 4
+        pxor xmm0, xmm2
+        pslldq xmm2, 4
+        pxor xmm0, xmm2
+        pslldq xmm2, 4
+        pxor xmm0, xmm2
+        pxor xmm0, xmm1
+        movdqu OWORD PTR [rdx+48], xmm0
+        ; round key 4
+        aeskeygenassist xmm1, xmm0, 8
+        pshufd xmm1, xmm1, 255
+        movdqa xmm2, xmm0
+        pslldq xmm2, 4
+        pxor xmm0, xmm2
+        pslldq xmm2, 4
+        pxor xmm0, xmm2
+        pslldq xmm2, 4
+        pxor xmm0, xmm2
+        pxor xmm0, xmm1
+        movdqu OWORD PTR [rdx+64], xmm0
+        ; round key 5
+        aeskeygenassist xmm1, xmm0, 16
+        pshufd xmm1, xmm1, 255
+        movdqa xmm2, xmm0
+        pslldq xmm2, 4
+        pxor xmm0, xmm2
+        pslldq xmm2, 4
+        pxor xmm0, xmm2
+        pslldq xmm2, 4
+        pxor xmm0, xmm2
+        pxor xmm0, xmm1
+        movdqu OWORD PTR [rdx+80], xmm0
+        ; round key 6
+        aeskeygenassist xmm1, xmm0, 32
+        pshufd xmm1, xmm1, 255
+        movdqa xmm2, xmm0
+        pslldq xmm2, 4
+        pxor xmm0, xmm2
+        pslldq xmm2, 4
+        pxor xmm0, xmm2
+        pslldq xmm2, 4
+        pxor xmm0, xmm2
+        pxor xmm0, xmm1
+        movdqu OWORD PTR [rdx+96], xmm0
+        ; round key 7
+        aeskeygenassist xmm1, xmm0, 64
+        pshufd xmm1, xmm1, 255
+        movdqa xmm2, xmm0
+        pslldq xmm2, 4
+        pxor xmm0, xmm2
+        pslldq xmm2, 4
+        pxor xmm0, xmm2
+        pslldq xmm2, 4
+        pxor xmm0, xmm2
+        pxor xmm0, xmm1
+        movdqu OWORD PTR [rdx+112], xmm0
+        ; round key 8
+        aeskeygenassist xmm1, xmm0, 128
+        pshufd xmm1, xmm1, 255
+        movdqa xmm2, xmm0
+        pslldq xmm2, 4
+        pxor xmm0, xmm2
+        pslldq xmm2, 4
+        pxor xmm0, xmm2
+        pslldq xmm2, 4
+        pxor xmm0, xmm2
+        pxor xmm0, xmm1
+        movdqu OWORD PTR [rdx+128], xmm0
+        ; round key 9
+        aeskeygenassist xmm1, xmm0, 27
+        pshufd xmm1, xmm1, 255
+        movdqa xmm2, xmm0
+        pslldq xmm2, 4
+        pxor xmm0, xmm2
+        pslldq xmm2, 4
+        pxor xmm0, xmm2
+        pslldq xmm2, 4
+        pxor xmm0, xmm2
+        pxor xmm0, xmm1
+        movdqu OWORD PTR [rdx+144], xmm0
+        ; round key 10
+        aeskeygenassist xmm1, xmm0, 54
+        pshufd xmm1, xmm1, 255
+        movdqa xmm2, xmm0
+        pslldq xmm2, 4
+        pxor xmm0, xmm2
+        pslldq xmm2, 4
+        pxor xmm0, xmm2
+        pslldq xmm2, 4
+        pxor xmm0, xmm2
+        pxor xmm0, xmm1
+        movdqu OWORD PTR [rdx+160], xmm0
+        ret
+AES_128_Key_Expansion_AESNI ENDP
+_TEXT ENDS
+; AES_192_Key_Expansion_AESNI
+;   rcx : 24-byte cipher key (16 bytes read into xmm0, 8 bytes into xmm1)
+;   rdx : round-key array (written); 16-byte stores at offsets 0..208
+; xmm0 carries four key words and the low half of xmm1 the other two.
+; Because each step yields six words, the steps alternate between
+; packing halves with shufpd (two stores) and storing xmm0 directly.
+; Only xmm0-xmm5 are touched.
+_TEXT SEGMENT READONLY PARA
+AES_192_Key_Expansion_AESNI PROC
+        movdqu xmm0, OWORD PTR [rcx]
+        pxor xmm1, xmm1
+        pinsrq xmm1, QWORD PTR [rcx+16], 0
+        movdqu OWORD PTR [rdx], xmm0
+        movdqa xmm4, xmm1
+        ; step with immediate 1: stores at +16 and +32
+        aeskeygenassist xmm2, xmm1, 1
+        pshufd xmm2, xmm2, 85
+        movdqa xmm3, xmm0
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pxor xmm0, xmm2
+        pshufd xmm2, xmm0, 255
+        movdqa xmm3, xmm1
+        pslldq xmm3, 4
+        pxor xmm1, xmm3
+        pxor xmm1, xmm2
+        shufpd xmm4, xmm0, 0
+        movdqu OWORD PTR [rdx+16], xmm4
+        movdqa xmm5, xmm0
+        shufpd xmm5, xmm1, 1
+        movdqu OWORD PTR [rdx+32], xmm5
+        ; step with immediate 2: store at +48
+        aeskeygenassist xmm2, xmm1, 2
+        pshufd xmm2, xmm2, 85
+        movdqa xmm3, xmm0
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pxor xmm0, xmm2
+        pshufd xmm2, xmm0, 255
+        movdqa xmm3, xmm1
+        pslldq xmm3, 4
+        pxor xmm1, xmm3
+        pxor xmm1, xmm2
+        movdqu OWORD PTR [rdx+48], xmm0
+        movdqa xmm4, xmm1
+        ; step with immediate 4: stores at +64 and +80
+        aeskeygenassist xmm2, xmm1, 4
+        pshufd xmm2, xmm2, 85
+        movdqa xmm3, xmm0
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pxor xmm0, xmm2
+        pshufd xmm2, xmm0, 255
+        movdqa xmm3, xmm1
+        pslldq xmm3, 4
+        pxor xmm1, xmm3
+        pxor xmm1, xmm2
+        shufpd xmm4, xmm0, 0
+        movdqu OWORD PTR [rdx+64], xmm4
+        movdqa xmm5, xmm0
+        shufpd xmm5, xmm1, 1
+        movdqu OWORD PTR [rdx+80], xmm5
+        ; step with immediate 8: store at +96
+        aeskeygenassist xmm2, xmm1, 8
+        pshufd xmm2, xmm2, 85
+        movdqa xmm3, xmm0
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pxor xmm0, xmm2
+        pshufd xmm2, xmm0, 255
+        movdqa xmm3, xmm1
+        pslldq xmm3, 4
+        pxor xmm1, xmm3
+        pxor xmm1, xmm2
+        movdqu OWORD PTR [rdx+96], xmm0
+        movdqa xmm4, xmm1
+        ; step with immediate 16: stores at +112 and +128
+        aeskeygenassist xmm2, xmm1, 16
+        pshufd xmm2, xmm2, 85
+        movdqa xmm3, xmm0
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pxor xmm0, xmm2
+        pshufd xmm2, xmm0, 255
+        movdqa xmm3, xmm1
+        pslldq xmm3, 4
+        pxor xmm1, xmm3
+        pxor xmm1, xmm2
+        shufpd xmm4, xmm0, 0
+        movdqu OWORD PTR [rdx+112], xmm4
+        movdqa xmm5, xmm0
+        shufpd xmm5, xmm1, 1
+        movdqu OWORD PTR [rdx+128], xmm5
+        ; step with immediate 32: store at +144
+        aeskeygenassist xmm2, xmm1, 32
+        pshufd xmm2, xmm2, 85
+        movdqa xmm3, xmm0
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pxor xmm0, xmm2
+        pshufd xmm2, xmm0, 255
+        movdqa xmm3, xmm1
+        pslldq xmm3, 4
+        pxor xmm1, xmm3
+        pxor xmm1, xmm2
+        movdqu OWORD PTR [rdx+144], xmm0
+        movdqa xmm4, xmm1
+        ; step with immediate 64: stores at +160 and +176
+        aeskeygenassist xmm2, xmm1, 64
+        pshufd xmm2, xmm2, 85
+        movdqa xmm3, xmm0
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pxor xmm0, xmm2
+        pshufd xmm2, xmm0, 255
+        movdqa xmm3, xmm1
+        pslldq xmm3, 4
+        pxor xmm1, xmm3
+        pxor xmm1, xmm2
+        shufpd xmm4, xmm0, 0
+        movdqu OWORD PTR [rdx+160], xmm4
+        movdqa xmm5, xmm0
+        shufpd xmm5, xmm1, 1
+        movdqu OWORD PTR [rdx+176], xmm5
+        ; step with immediate 128: stores at +192 and +208
+        aeskeygenassist xmm2, xmm1, 128
+        pshufd xmm2, xmm2, 85
+        movdqa xmm3, xmm0
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pxor xmm0, xmm2
+        pshufd xmm2, xmm0, 255
+        movdqa xmm3, xmm1
+        pslldq xmm3, 4
+        pxor xmm1, xmm3
+        pxor xmm1, xmm2
+        movdqu OWORD PTR [rdx+192], xmm0
+        movdqu OWORD PTR [rdx+208], xmm1
+        ret
+AES_192_Key_Expansion_AESNI ENDP
+_TEXT ENDS
+; AES_256_Key_Expansion_AESNI
+;   rcx : 32-byte cipher key (read as two 16-byte halves into xmm0/xmm1)
+;   rdx : round-key array (written); 16-byte stores at offsets 0..224
+; Steps alternate: xmm0 is updated from aeskeygenassist on xmm1 with a
+; non-zero immediate (pshufd 255), then xmm1 from aeskeygenassist on
+; xmm0 with immediate 0 (pshufd 170).
+_TEXT SEGMENT READONLY PARA
+AES_256_Key_Expansion_AESNI PROC
+        movdqu xmm0, OWORD PTR [rcx]
+        movdqu xmm1, OWORD PTR [rcx+16]
+        movdqu OWORD PTR [rdx], xmm0
+        movdqu OWORD PTR [rdx+16], xmm1
+        aeskeygenassist xmm2, xmm1, 1
+        pshufd xmm2, xmm2, 255
+        movdqa xmm3, xmm0
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pxor xmm0, xmm2
+        movdqu OWORD PTR [rdx+32], xmm0
+        aeskeygenassist xmm2, xmm0, 0
+        pshufd xmm2, xmm2, 170
+        movdqa xmm3, xmm1
+        pslldq xmm3, 4
+        pxor xmm1, xmm3
+        pslldq xmm3, 4
+        pxor xmm1, xmm3
+        pslldq xmm3, 4
+        pxor xmm1, xmm3
+        pxor xmm1, xmm2
+        movdqu OWORD PTR [rdx+48], xmm1
+        aeskeygenassist xmm2, xmm1, 2
+        pshufd xmm2, xmm2, 255
+        movdqa xmm3, xmm0
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pxor xmm0, xmm2
+        movdqu OWORD PTR [rdx+64], xmm0
+        aeskeygenassist xmm2, xmm0, 0
+        pshufd xmm2, xmm2, 170
+        movdqa xmm3, xmm1
+        pslldq xmm3, 4
+        pxor xmm1, xmm3
+        pslldq xmm3, 4
+        pxor xmm1, xmm3
+        pslldq xmm3, 4
+        pxor xmm1, xmm3
+        pxor xmm1, xmm2
+        movdqu OWORD PTR [rdx+80], xmm1
+        aeskeygenassist xmm2, xmm1, 4
+        pshufd xmm2, xmm2, 255
+        movdqa xmm3, xmm0
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pxor xmm0, xmm2
+        movdqu OWORD PTR [rdx+96], xmm0
+        aeskeygenassist xmm2, xmm0, 0
+        pshufd xmm2, xmm2, 170
+        movdqa
xmm3, xmm1
+        pslldq xmm3, 4
+        pxor xmm1, xmm3
+        pslldq xmm3, 4
+        pxor xmm1, xmm3
+        pslldq xmm3, 4
+        pxor xmm1, xmm3
+        pxor xmm1, xmm2
+        movdqu OWORD PTR [rdx+112], xmm1
+        aeskeygenassist xmm2, xmm1, 8
+        pshufd xmm2, xmm2, 255
+        movdqa xmm3, xmm0
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pxor xmm0, xmm2
+        movdqu OWORD PTR [rdx+128], xmm0
+        aeskeygenassist xmm2, xmm0, 0
+        pshufd xmm2, xmm2, 170
+        movdqa xmm3, xmm1
+        pslldq xmm3, 4
+        pxor xmm1, xmm3
+        pslldq xmm3, 4
+        pxor xmm1, xmm3
+        pslldq xmm3, 4
+        pxor xmm1, xmm3
+        pxor xmm1, xmm2
+        movdqu OWORD PTR [rdx+144], xmm1
+        aeskeygenassist xmm2, xmm1, 16
+        pshufd xmm2, xmm2, 255
+        movdqa xmm3, xmm0
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pxor xmm0, xmm2
+        movdqu OWORD PTR [rdx+160], xmm0
+        aeskeygenassist xmm2, xmm0, 0
+        pshufd xmm2, xmm2, 170
+        movdqa xmm3, xmm1
+        pslldq xmm3, 4
+        pxor xmm1, xmm3
+        pslldq xmm3, 4
+        pxor xmm1, xmm3
+        pslldq xmm3, 4
+        pxor xmm1, xmm3
+        pxor xmm1, xmm2
+        movdqu OWORD PTR [rdx+176], xmm1
+        aeskeygenassist xmm2, xmm1, 32
+        pshufd xmm2, xmm2, 255
+        movdqa xmm3, xmm0
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pxor xmm0, xmm2
+        movdqu OWORD PTR [rdx+192], xmm0
+        aeskeygenassist xmm2, xmm0, 0
+        pshufd xmm2, xmm2, 170
+        movdqa xmm3, xmm1
+        pslldq xmm3, 4
+        pxor xmm1, xmm3
+        pslldq xmm3, 4
+        pxor xmm1, xmm3
+        pslldq xmm3, 4
+        pxor xmm1, xmm3
+        pxor xmm1, xmm2
+        movdqu OWORD PTR [rdx+208], xmm1
+        aeskeygenassist xmm2, xmm1, 64
+        pshufd xmm2, xmm2, 255
+        movdqa xmm3, xmm0
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pslldq xmm3, 4
+        pxor xmm0, xmm3
+        pxor xmm0, xmm2
+        movdqu OWORD PTR [rdx+224], xmm0
+        ret
+AES_256_Key_Expansion_AESNI ENDP
+_TEXT ENDS
+; AES_ECB_encrypt_AESNI (Win64)
+;   rcx      : input buffer
+;   rdx      : output buffer
+;   r8d      : byte count
+;   r9       : round-key array, 16 bytes per round key
+;   [rsp+40] : round count; fewer than 11 stops at key 160, fewer than
+;              13 stops at key 192, otherwise key 224 is the last
+; Scratch registers:
+;   eax  = running byte offset
+;   r10d = round count
+;   r11d = loop bound (byte count rounded down to 64, later to 16)
+; r9 must stay intact for the whole routine because every round key is
+; addressed relative to it, and the round count needs its own register
+; because eax is the offset.  Input and output are addressed directly
+; as [rcx+rax] / [rdx+rax], so no further scratch registers are needed.
+; xmm6 is callee-saved on Win64 and is therefore spilled and restored.
+_TEXT SEGMENT READONLY PARA
+AES_ECB_encrypt_AESNI PROC
+        mov r10d, DWORD PTR [rsp+40]
+        sub rsp, 16
+        movdqu OWORD PTR [rsp], xmm6
+        xor eax, eax
+        cmp r8d, 64
+        mov r11d, r8d
+        jl L_AES_ECB_encrypt_AESNI_done_64
+        and r11d, 4294967232
+L_AES_ECB_encrypt_AESNI_enc_64:
+        ; 64 bytes of input
+        ; aes_ecb_enc_64
+        movdqu xmm0, OWORD PTR [rcx+rax]
+        movdqu xmm1, OWORD PTR [rcx+rax+16]
+        movdqu xmm2, OWORD PTR [rcx+rax+32]
+        movdqu xmm3, OWORD PTR [rcx+rax+48]
+        ; aes_enc_block
+        movdqu xmm4, OWORD PTR [r9]
+        pxor xmm0, xmm4
+        pxor xmm1, xmm4
+        pxor xmm2, xmm4
+        pxor xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r9+16]
+        aesenc xmm0, xmm4
+        aesenc xmm1, xmm4
+        aesenc xmm2, xmm4
+        aesenc xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r9+32]
+        aesenc xmm0, xmm4
+        aesenc xmm1, xmm4
+        aesenc xmm2, xmm4
+        aesenc xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r9+48]
+        aesenc xmm0, xmm4
+        aesenc xmm1, xmm4
+        aesenc xmm2, xmm4
+        aesenc xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r9+64]
+        aesenc xmm0, xmm4
+        aesenc xmm1, xmm4
+        aesenc xmm2, xmm4
+        aesenc xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r9+80]
+        aesenc xmm0, xmm4
+        aesenc xmm1, xmm4
+        aesenc xmm2, xmm4
+        aesenc xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r9+96]
+        aesenc xmm0, xmm4
+        aesenc xmm1, xmm4
+        aesenc xmm2, xmm4
+        aesenc xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r9+112]
+        aesenc xmm0, xmm4
+        aesenc xmm1, xmm4
+        aesenc xmm2, xmm4
+        aesenc xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r9+128]
+        aesenc xmm0, xmm4
+        aesenc xmm1, xmm4
+        aesenc xmm2, xmm4
+        aesenc xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r9+144]
+        aesenc xmm0, xmm4
+        aesenc xmm1, xmm4
+        aesenc xmm2, xmm4
+        aesenc xmm3, xmm4
+        ; movdqu does not modify flags, so jl still tests the cmp result
+        cmp r10d, 11
+        movdqu xmm4, OWORD PTR [r9+160]
+        jl L_AES_ECB_encrypt_AESNI_64_aes_enc_block_last
+        aesenc xmm0, xmm4
+        aesenc xmm1, xmm4
+        aesenc xmm2, xmm4
+        aesenc xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r9+176]
+        aesenc xmm0, xmm4
+        aesenc xmm1, xmm4
+        aesenc xmm2, xmm4
+        aesenc xmm3, xmm4
+        cmp r10d, 13
+        movdqu xmm4, OWORD PTR [r9+192]
+        jl L_AES_ECB_encrypt_AESNI_64_aes_enc_block_last
+        aesenc xmm0, xmm4
+        aesenc xmm1, xmm4
+        aesenc xmm2, xmm4
+        aesenc xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r9+208]
+        aesenc xmm0, xmm4
+        aesenc xmm1, xmm4
+        aesenc xmm2, xmm4
+        aesenc xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r9+224]
+L_AES_ECB_encrypt_AESNI_64_aes_enc_block_last:
+        aesenclast xmm0, xmm4
+        aesenclast xmm1, xmm4
+        aesenclast xmm2, xmm4
+        aesenclast xmm3, xmm4
+        movdqu OWORD PTR [rdx+rax], xmm0
+        movdqu OWORD PTR [rdx+rax+16], xmm1
+        movdqu OWORD PTR [rdx+rax+32], xmm2
+        movdqu OWORD PTR [rdx+rax+48], xmm3
+        add eax, 64
+        cmp eax, r11d
+        jl L_AES_ECB_encrypt_AESNI_enc_64
+L_AES_ECB_encrypt_AESNI_done_64:
+        cmp eax, r8d
+        mov r11d, r8d
+        je L_AES_ECB_encrypt_AESNI_done_enc
+        and r11d, 4294967280
+L_AES_ECB_encrypt_AESNI_enc_16:
+        ; 16 bytes of input
+        movdqu xmm0, OWORD PTR [rcx+rax]
+        ; aes_enc_block
+        pxor xmm0, [r9]
+        movdqu xmm5, OWORD PTR [r9+16]
+        aesenc xmm0, xmm5
+        movdqu xmm5, OWORD PTR [r9+32]
+        aesenc xmm0, xmm5
+        movdqu xmm5, OWORD PTR [r9+48]
+        aesenc xmm0, xmm5
+        movdqu xmm5, OWORD PTR [r9+64]
+        aesenc xmm0, xmm5
+        movdqu xmm5, OWORD PTR [r9+80]
+        aesenc xmm0, xmm5
+        movdqu xmm5, OWORD PTR [r9+96]
+        aesenc xmm0, xmm5
+        movdqu xmm5, OWORD PTR [r9+112]
+        aesenc xmm0, xmm5
+        movdqu xmm5, OWORD PTR [r9+128]
+        aesenc xmm0, xmm5
+        movdqu xmm5, OWORD PTR [r9+144]
+        aesenc xmm0, xmm5
+        cmp r10d, 11
+        movdqu xmm5, OWORD PTR [r9+160]
+        jl L_AES_ECB_encrypt_AESNI_16_aes_enc_block_last
+        aesenc xmm0, xmm5
+        movdqu xmm6, OWORD PTR [r9+176]
+        aesenc xmm0, xmm6
+        cmp r10d, 13
+        movdqu xmm5, OWORD PTR [r9+192]
+        jl L_AES_ECB_encrypt_AESNI_16_aes_enc_block_last
+        aesenc xmm0, xmm5
+        movdqu xmm6, OWORD PTR [r9+208]
+        aesenc xmm0, xmm6
+        movdqu xmm5, OWORD PTR [r9+224]
+L_AES_ECB_encrypt_AESNI_16_aes_enc_block_last:
+        aesenclast xmm0, xmm5
+        movdqu OWORD PTR [rdx+rax], xmm0
+        add eax, 16
+        cmp eax, r11d
+        jl L_AES_ECB_encrypt_AESNI_enc_16
+L_AES_ECB_encrypt_AESNI_done_enc:
+        movdqu xmm6, OWORD PTR [rsp]
+        add rsp, 16
+        ret
+AES_ECB_encrypt_AESNI ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_ECB_decrypt_AESNI PROC
+        ; Win64 arguments: rcx = input buffer, rdx = output buffer,
+        ; r8d = byte count, r9 = round-key array (16 bytes per round key),
+        ; [rsp+40] = round count (fewer than 11 stops at key 160, fewer
+        ; than 13 stops at key 192, otherwise key 224 is the last).
+        ; Scratch registers: eax = running byte offset, r10d = round count,
+        ; r11d = loop bound.  r9 is never written, because every round key
+        ; is addressed relative to it, and the round count has its own
+        ; register because eax is the offset.  Buffers are addressed
+        ; directly as [rcx+rax] / [rdx+rax].  xmm6 is callee-saved on Win64
+        ; and is therefore spilled and restored.
+        mov r10d, DWORD PTR [rsp+40]
+        sub rsp, 16
+        movdqu OWORD PTR [rsp], xmm6
+        xor eax, eax
+        cmp r8d, 64
+        mov r11d, r8d
+        jl L_AES_ECB_decrypt_AESNI_done_64
+        and r11d, 4294967232
+L_AES_ECB_decrypt_AESNI_dec_64:
+        ; 64 bytes of input
+        ; aes_ecb_dec_64
+        movdqu xmm0, OWORD PTR [rcx+rax]
+        movdqu xmm1, OWORD PTR [rcx+rax+16]
+        movdqu xmm2, OWORD PTR [rcx+rax+32]
+        movdqu xmm3, OWORD PTR [rcx+rax+48]
+        ; aes_dec_block
+        movdqu xmm4, OWORD PTR [r9]
+        pxor xmm0, xmm4
+        pxor xmm1, xmm4
+        pxor xmm2, xmm4
+        pxor xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r9+16]
+        aesdec xmm0, xmm4
+        aesdec xmm1, xmm4
+        aesdec xmm2, xmm4
+        aesdec xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r9+32]
+        aesdec xmm0, xmm4
+        aesdec xmm1, xmm4
+        aesdec xmm2, xmm4
+        aesdec xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r9+48]
+        aesdec xmm0, xmm4
+        aesdec xmm1, xmm4
+        aesdec xmm2, xmm4
+        aesdec xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r9+64]
+        aesdec xmm0, xmm4
+        aesdec xmm1, xmm4
+        aesdec xmm2, xmm4
+        aesdec xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r9+80]
+        aesdec xmm0, xmm4
+        aesdec xmm1, xmm4
+        aesdec xmm2, xmm4
+        aesdec xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r9+96]
+        aesdec xmm0, xmm4
+        aesdec xmm1, xmm4
+        aesdec xmm2, xmm4
+        aesdec xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r9+112]
+        aesdec xmm0, xmm4
+        aesdec xmm1, xmm4
+        aesdec xmm2, xmm4
+        aesdec xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r9+128]
+        aesdec xmm0, xmm4
+        aesdec xmm1, xmm4
+        aesdec xmm2, xmm4
+        aesdec xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r9+144]
+        aesdec xmm0, xmm4
+        aesdec xmm1, xmm4
+        aesdec xmm2, xmm4
+        aesdec xmm3, xmm4
+        ; movdqu does not modify flags, so jl still tests the cmp result
+        cmp r10d, 11
+        movdqu xmm4, OWORD PTR [r9+160]
+        jl L_AES_ECB_decrypt_AESNI_64_aes_dec_block_last
+        aesdec xmm0, xmm4
+        aesdec xmm1, xmm4
+        aesdec xmm2, xmm4
+        aesdec xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r9+176]
+        aesdec xmm0, xmm4
+        aesdec xmm1, xmm4
+        aesdec xmm2, xmm4
+        aesdec xmm3, xmm4
+        cmp r10d, 13
+        movdqu xmm4, OWORD PTR [r9+192]
+        jl L_AES_ECB_decrypt_AESNI_64_aes_dec_block_last
+        aesdec xmm0, xmm4
+        aesdec xmm1, xmm4
+        aesdec xmm2, xmm4
+        aesdec xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r9+208]
+        aesdec xmm0, xmm4
+        aesdec xmm1, xmm4
+        aesdec xmm2, xmm4
+        aesdec xmm3, xmm4
+        movdqu xmm4, OWORD PTR [r9+224]
+L_AES_ECB_decrypt_AESNI_64_aes_dec_block_last:
+        aesdeclast xmm0, xmm4
+        aesdeclast xmm1, xmm4
+        aesdeclast xmm2, xmm4
+        aesdeclast xmm3, xmm4
+        movdqu OWORD PTR [rdx+rax], xmm0
+        movdqu OWORD PTR [rdx+rax+16], xmm1
+        movdqu OWORD PTR [rdx+rax+32], xmm2
+        movdqu OWORD PTR [rdx+rax+48], xmm3
+        add eax, 64
+        cmp eax, r11d
+        jl L_AES_ECB_decrypt_AESNI_dec_64
+L_AES_ECB_decrypt_AESNI_done_64:
+        cmp eax, r8d
+        mov r11d, r8d
+        je L_AES_ECB_decrypt_AESNI_done_dec
+        and r11d, 4294967280
+L_AES_ECB_decrypt_AESNI_dec_16:
+        ; 16 bytes of input
+        movdqu xmm0, OWORD PTR [rcx+rax]
+        ; aes_dec_block
+        pxor xmm0, [r9]
+        movdqu xmm5, OWORD PTR [r9+16]
+        aesdec xmm0, xmm5
+        movdqu xmm5, OWORD PTR [r9+32]
+        aesdec xmm0, xmm5
+        movdqu xmm5, OWORD PTR [r9+48]
+        aesdec xmm0, xmm5
+        movdqu xmm5, OWORD PTR [r9+64]
+        aesdec xmm0, xmm5
+        movdqu xmm5, OWORD PTR [r9+80]
+        aesdec xmm0, xmm5
+        movdqu xmm5, OWORD PTR [r9+96]
+        aesdec xmm0, xmm5
+        movdqu xmm5, OWORD PTR [r9+112]
+        aesdec xmm0, xmm5
+        movdqu xmm5, OWORD PTR [r9+128]
+        aesdec xmm0, xmm5
+        movdqu xmm5, OWORD PTR [r9+144]
+        aesdec xmm0, xmm5
+        cmp r10d, 11
+        movdqu xmm5, OWORD PTR [r9+160]
+        jl L_AES_ECB_decrypt_AESNI_16_aes_dec_block_last
+        aesdec xmm0, xmm5
+        movdqu xmm6, OWORD PTR [r9+176]
+        aesdec xmm0, xmm6
+        cmp r10d, 13
+        movdqu xmm5, OWORD PTR [r9+192]
+        jl L_AES_ECB_decrypt_AESNI_16_aes_dec_block_last
+        aesdec xmm0, xmm5
+        movdqu xmm6, OWORD PTR [r9+208]
+        aesdec xmm0, xmm6
+        movdqu xmm5, OWORD PTR [r9+224]
+L_AES_ECB_decrypt_AESNI_16_aes_dec_block_last:
+        aesdeclast xmm0, xmm5
+        movdqu OWORD PTR [rdx+rax], xmm0
+        add eax, 16
+        cmp eax, r11d
+        jl L_AES_ECB_decrypt_AESNI_dec_16
+L_AES_ECB_decrypt_AESNI_done_dec:
+        movdqu xmm6, OWORD PTR [rsp]
+        add rsp, 16
+        ret
+AES_ECB_decrypt_AESNI ENDP
+_TEXT ENDS
+; AES_CBC_encrypt_AESNI (Win64)
+;   rcx      : input buffer
+;   rdx      : output buffer
+;   r8       : 16-byte chaining value; read on entry, rewritten with the
+;              last block produced on exit
+;   r9d      : byte count
+;   [rsp+40] : round-key array, 16 bytes per round key
+;   [rsp+48] : round count; fewer than 11 stops at key 160, fewer than
+;              13 stops at key 192, otherwise key 224 is the last
+; Scratch registers:
+;   eax  = running byte offset
+;   r10  = round-key pointer
+;   r11d = round count
+; The key pointer and round count are kept away from rax because rax is
+; the offset, and the buffers are addressed directly as [rcx+rax] /
+; [rdx+rax] so nothing else is clobbered.  Only volatile xmm0-xmm4 are
+; used, so no xmm spill is needed.
+_TEXT SEGMENT READONLY PARA
+AES_CBC_encrypt_AESNI PROC
+        mov r10, QWORD PTR [rsp+40]
+        mov r11d, DWORD PTR [rsp+48]
+        movdqu xmm0, OWORD PTR [r8]
+        xor eax, eax
+        cmp eax, r9d
+        je L_AES_CBC_encrypt_AESNI_done
+L_AES_CBC_encrypt_AESNI_loop:
+        ; 16 bytes of input
+        movdqu xmm1, OWORD PTR [rcx+rax]
+        ; chain: XOR the block with the previous output block
+        pxor xmm1, xmm0
+        ; aes_enc_block
+        pxor xmm1, [r10]
+        movdqu xmm3, OWORD PTR [r10+16]
+        aesenc xmm1, xmm3
+        movdqu xmm3, OWORD PTR [r10+32]
+        aesenc xmm1, xmm3
+        movdqu xmm3, OWORD PTR [r10+48]
+        aesenc xmm1, xmm3
+        movdqu xmm3, OWORD PTR [r10+64]
+        aesenc xmm1, xmm3
+        movdqu xmm3, OWORD PTR [r10+80]
+        aesenc xmm1, xmm3
+        movdqu xmm3, OWORD PTR [r10+96]
+        aesenc xmm1, xmm3
+        movdqu xmm3, OWORD PTR [r10+112]
+        aesenc xmm1, xmm3
+        movdqu xmm3, OWORD PTR [r10+128]
+        aesenc xmm1, xmm3
+        movdqu xmm3, OWORD PTR [r10+144]
+        aesenc xmm1, xmm3
+        ; movdqu does not modify flags, so jl still tests the cmp result
+        cmp r11d, 11
+        movdqu xmm3, OWORD PTR [r10+160]
+        jl L_AES_CBC_encrypt_AESNI_aes_enc_block_last
+        aesenc xmm1, xmm3
+        movdqu xmm4, OWORD PTR [r10+176]
+        aesenc xmm1, xmm4
+        cmp r11d, 13
+        movdqu xmm3, OWORD PTR [r10+192]
+        jl L_AES_CBC_encrypt_AESNI_aes_enc_block_last
+        aesenc xmm1, xmm3
+        movdqu xmm4, OWORD PTR [r10+208]
+        aesenc xmm1, xmm4
+        movdqu xmm3, OWORD PTR [r10+224]
+L_AES_CBC_encrypt_AESNI_aes_enc_block_last:
+        aesenclast xmm1, xmm3
+        movdqu OWORD PTR [rdx+rax], xmm1
+        ; the block just produced chains into the next one
+        movdqa xmm0, xmm1
+        add eax, 16
+        cmp eax, r9d
+        jl L_AES_CBC_encrypt_AESNI_loop
+L_AES_CBC_encrypt_AESNI_done:
+        movdqu OWORD PTR [r8], xmm0
+        ret
+AES_CBC_encrypt_AESNI ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_CBC_decrypt_AESNI PROC
+        ; NOTE(review): rax is loaded from [rsp+48] and r10d from [rsp+56],
+        ; but rax is then cleared by "xor eax, eax" and r10d overwritten by
+        ; "mov r10d, r9d" before either value is used.  This looks like
+        ; scratch registers aliasing the stack arguments - confirm and
+        ; re-allocate registers across the whole routine.
+        push r12
+        mov rax, QWORD PTR [rsp+48]
+        mov r10d, DWORD PTR [rsp+56]
+        sub rsp, 48
+        movdqu OWORD PTR [rsp], xmm6
+        movdqu OWORD PTR [rsp+16], xmm7
+        movdqu OWORD PTR [rsp+32], xmm8
+        movdqu xmm4, OWORD PTR [r8]
+        xor eax, eax
+        cmp r9d, 64
+        mov r10d, r9d
+        jl L_AES_CBC_decrypt_AESNI_done_64
+        and r10d, 4294967232
+L_AES_CBC_decrypt_AESNI_dec_64:
+        ;
64 bytes of input + ; aes_cbc_dec_64 + lea r11, QWORD PTR [rcx+rax] + lea r12, QWORD PTR [rdx+rax] + movdqu xmm0, OWORD PTR [r11] + movdqu xmm1, OWORD PTR [r11+16] + movdqu xmm2, OWORD PTR [r11+32] + movdqu xmm3, OWORD PTR [r11+48] + ; aes_dec_block + movdqu xmm5, OWORD PTR [rax] + pxor xmm0, xmm5 + pxor xmm1, xmm5 + pxor xmm2, xmm5 + pxor xmm3, xmm5 + movdqu xmm5, OWORD PTR [rax+16] + aesdec xmm0, xmm5 + aesdec xmm1, xmm5 + aesdec xmm2, xmm5 + aesdec xmm3, xmm5 + movdqu xmm5, OWORD PTR [rax+32] + aesdec xmm0, xmm5 + aesdec xmm1, xmm5 + aesdec xmm2, xmm5 + aesdec xmm3, xmm5 + movdqu xmm5, OWORD PTR [rax+48] + aesdec xmm0, xmm5 + aesdec xmm1, xmm5 + aesdec xmm2, xmm5 + aesdec xmm3, xmm5 + movdqu xmm5, OWORD PTR [rax+64] + aesdec xmm0, xmm5 + aesdec xmm1, xmm5 + aesdec xmm2, xmm5 + aesdec xmm3, xmm5 + movdqu xmm5, OWORD PTR [rax+80] + aesdec xmm0, xmm5 + aesdec xmm1, xmm5 + aesdec xmm2, xmm5 + aesdec xmm3, xmm5 + movdqu xmm5, OWORD PTR [rax+96] + aesdec xmm0, xmm5 + aesdec xmm1, xmm5 + aesdec xmm2, xmm5 + aesdec xmm3, xmm5 + movdqu xmm5, OWORD PTR [rax+112] + aesdec xmm0, xmm5 + aesdec xmm1, xmm5 + aesdec xmm2, xmm5 + aesdec xmm3, xmm5 + movdqu xmm5, OWORD PTR [rax+128] + aesdec xmm0, xmm5 + aesdec xmm1, xmm5 + aesdec xmm2, xmm5 + aesdec xmm3, xmm5 + movdqu xmm5, OWORD PTR [rax+144] + aesdec xmm0, xmm5 + aesdec xmm1, xmm5 + aesdec xmm2, xmm5 + aesdec xmm3, xmm5 + cmp r10d, 11 + movdqu xmm5, OWORD PTR [rax+160] + jl L_AES_CBC_decrypt_AESNI_64_aes_dec_block_last + aesdec xmm0, xmm5 + aesdec xmm1, xmm5 + aesdec xmm2, xmm5 + aesdec xmm3, xmm5 + movdqu xmm5, OWORD PTR [rax+176] + aesdec xmm0, xmm5 + aesdec xmm1, xmm5 + aesdec xmm2, xmm5 + aesdec xmm3, xmm5 + cmp r10d, 13 + movdqu xmm5, OWORD PTR [rax+192] + jl L_AES_CBC_decrypt_AESNI_64_aes_dec_block_last + aesdec xmm0, xmm5 + aesdec xmm1, xmm5 + aesdec xmm2, xmm5 + aesdec xmm3, xmm5 + movdqu xmm5, OWORD PTR [rax+208] + aesdec xmm0, xmm5 + aesdec xmm1, xmm5 + aesdec xmm2, xmm5 + aesdec xmm3, xmm5 + movdqu xmm5, OWORD PTR 
[rax+224] +L_AES_CBC_decrypt_AESNI_64_aes_dec_block_last: + aesdeclast xmm0, xmm5 + aesdeclast xmm1, xmm5 + aesdeclast xmm2, xmm5 + aesdeclast xmm3, xmm5 + pxor xmm0, xmm4 + movdqu xmm5, OWORD PTR [r11] + pxor xmm1, xmm5 + movdqu xmm5, OWORD PTR [r11+16] + pxor xmm2, xmm5 + movdqu xmm5, OWORD PTR [r11+32] + pxor xmm3, xmm5 + movdqu xmm4, OWORD PTR [r11+48] + movdqu OWORD PTR [r12], xmm0 + movdqu OWORD PTR [r12+16], xmm1 + movdqu OWORD PTR [r12+32], xmm2 + movdqu OWORD PTR [r12+48], xmm3 + add eax, 64 + cmp eax, r10d + jl L_AES_CBC_decrypt_AESNI_dec_64 +L_AES_CBC_decrypt_AESNI_done_64: + cmp eax, r9d + mov r10d, r9d + je L_AES_CBC_decrypt_AESNI_done_dec + and r10d, 4294967280 +L_AES_CBC_decrypt_AESNI_dec_16: + ; 16 bytes of input + lea r11, QWORD PTR [rcx+rax] + movdqu xmm0, OWORD PTR [r11] + movdqa xmm8, xmm0 + ; aes_dec_block + pxor xmm0, [rax] + movdqu xmm6, OWORD PTR [rax+16] + aesdec xmm0, xmm6 + movdqu xmm6, OWORD PTR [rax+32] + aesdec xmm0, xmm6 + movdqu xmm6, OWORD PTR [rax+48] + aesdec xmm0, xmm6 + movdqu xmm6, OWORD PTR [rax+64] + aesdec xmm0, xmm6 + movdqu xmm6, OWORD PTR [rax+80] + aesdec xmm0, xmm6 + movdqu xmm6, OWORD PTR [rax+96] + aesdec xmm0, xmm6 + movdqu xmm6, OWORD PTR [rax+112] + aesdec xmm0, xmm6 + movdqu xmm6, OWORD PTR [rax+128] + aesdec xmm0, xmm6 + movdqu xmm6, OWORD PTR [rax+144] + aesdec xmm0, xmm6 + cmp r10d, 11 + movdqu xmm6, OWORD PTR [rax+160] + jl L_AES_CBC_decrypt_AESNI_16_aes_dec_block_last + aesdec xmm0, xmm6 + movdqu xmm7, OWORD PTR [rax+176] + aesdec xmm0, xmm7 + cmp r10d, 13 + movdqu xmm6, OWORD PTR [rax+192] + jl L_AES_CBC_decrypt_AESNI_16_aes_dec_block_last + aesdec xmm0, xmm6 + movdqu xmm7, OWORD PTR [rax+208] + aesdec xmm0, xmm7 + movdqu xmm6, OWORD PTR [rax+224] +L_AES_CBC_decrypt_AESNI_16_aes_dec_block_last: + aesdeclast xmm0, xmm6 + pxor xmm0, xmm4 + movdqa xmm4, xmm8 + lea r11, QWORD PTR [rdx+rax] + movdqu OWORD PTR [r11], xmm0 + add eax, 16 + cmp eax, r10d + jl L_AES_CBC_decrypt_AESNI_dec_16 
+L_AES_CBC_decrypt_AESNI_done_dec: + movdqu OWORD PTR [r8], xmm4 + movdqu xmm6, OWORD PTR [rsp] + movdqu xmm7, OWORD PTR [rsp+16] + movdqu xmm8, OWORD PTR [rsp+32] + add rsp, 48 + pop r12 + ret +AES_CBC_decrypt_AESNI ENDP +_TEXT ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_ctr_aesni_bswap QWORD \ + 08090a0b0c0d0e0fh, 0001020304050607h +ptr_L_aes_ctr_aesni_bswap QWORD L_aes_ctr_aesni_bswap +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_ctr_aesni_one QWORD \ + 0000000000000001h, 0000000000000000h +ptr_L_aes_ctr_aesni_one QWORD L_aes_ctr_aesni_one +_DATA ENDS +_TEXT SEGMENT READONLY PARA +AES_CTR_encrypt_AESNI PROC + push rbx + mov eax, DWORD PTR [rsp+48] + mov r10, QWORD PTR [rsp+56] + sub rsp, 96 + movdqu OWORD PTR [rsp], xmm6 + movdqu OWORD PTR [rsp+16], xmm7 + movdqu OWORD PTR [rsp+32], xmm8 + movdqu OWORD PTR [rsp+48], xmm9 + movdqu OWORD PTR [rsp+64], xmm10 + movdqu OWORD PTR [rsp+80], xmm11 + movdqu xmm8, OWORD PTR L_aes_ctr_aesni_bswap + movdqu xmm9, OWORD PTR L_aes_ctr_aesni_one + pxor xmm10, xmm10 + movdqu xmm7, OWORD PTR [r10] + pshufb xmm7, xmm8 + xor eax, eax + cmp r8d, 64 + mov r10d, r8d + jl L_AES_CTR_encrypt_AESNI_done_64 + and r10d, 4294967232 +L_AES_CTR_encrypt_AESNI_enc_64: + ; 64 bytes of input + ; aes_ctr_enc_64 + lea r11, QWORD PTR [rcx+rax] + lea rbx, QWORD PTR [rdx+rax] + movdqa xmm0, xmm7 + pshufb xmm0, xmm8 + paddq xmm7, xmm9 + movdqa xmm11, xmm7 + pcmpeqq xmm11, xmm10 + pslldq xmm11, 8 + psrlq xmm11, 63 + paddq xmm7, xmm11 + movdqa xmm1, xmm7 + pshufb xmm1, xmm8 + paddq xmm7, xmm9 + movdqa xmm11, xmm7 + pcmpeqq xmm11, xmm10 + pslldq xmm11, 8 + psrlq xmm11, 63 + paddq xmm7, xmm11 + movdqa xmm2, xmm7 + pshufb xmm2, xmm8 + paddq xmm7, xmm9 + movdqa xmm11, xmm7 + pcmpeqq xmm11, xmm10 + pslldq xmm11, 8 + psrlq xmm11, 63 + paddq xmm7, xmm11 + movdqa xmm3, xmm7 + pshufb xmm3, xmm8 + paddq xmm7, xmm9 + movdqa xmm11, xmm7 + pcmpeqq xmm11, xmm10 + pslldq xmm11, 8 + psrlq xmm11, 63 + paddq xmm7, xmm11 + ; aes_enc_block + movdqu xmm4, OWORD PTR [r9] + pxor xmm0, 
xmm4 + pxor xmm1, xmm4 + pxor xmm2, xmm4 + pxor xmm3, xmm4 + movdqu xmm4, OWORD PTR [r9+16] + aesenc xmm0, xmm4 + aesenc xmm1, xmm4 + aesenc xmm2, xmm4 + aesenc xmm3, xmm4 + movdqu xmm4, OWORD PTR [r9+32] + aesenc xmm0, xmm4 + aesenc xmm1, xmm4 + aesenc xmm2, xmm4 + aesenc xmm3, xmm4 + movdqu xmm4, OWORD PTR [r9+48] + aesenc xmm0, xmm4 + aesenc xmm1, xmm4 + aesenc xmm2, xmm4 + aesenc xmm3, xmm4 + movdqu xmm4, OWORD PTR [r9+64] + aesenc xmm0, xmm4 + aesenc xmm1, xmm4 + aesenc xmm2, xmm4 + aesenc xmm3, xmm4 + movdqu xmm4, OWORD PTR [r9+80] + aesenc xmm0, xmm4 + aesenc xmm1, xmm4 + aesenc xmm2, xmm4 + aesenc xmm3, xmm4 + movdqu xmm4, OWORD PTR [r9+96] + aesenc xmm0, xmm4 + aesenc xmm1, xmm4 + aesenc xmm2, xmm4 + aesenc xmm3, xmm4 + movdqu xmm4, OWORD PTR [r9+112] + aesenc xmm0, xmm4 + aesenc xmm1, xmm4 + aesenc xmm2, xmm4 + aesenc xmm3, xmm4 + movdqu xmm4, OWORD PTR [r9+128] + aesenc xmm0, xmm4 + aesenc xmm1, xmm4 + aesenc xmm2, xmm4 + aesenc xmm3, xmm4 + movdqu xmm4, OWORD PTR [r9+144] + aesenc xmm0, xmm4 + aesenc xmm1, xmm4 + aesenc xmm2, xmm4 + aesenc xmm3, xmm4 + cmp eax, 11 + movdqu xmm4, OWORD PTR [r9+160] + jl L_AES_CTR_encrypt_AESNI_64_aes_enc_block_last + aesenc xmm0, xmm4 + aesenc xmm1, xmm4 + aesenc xmm2, xmm4 + aesenc xmm3, xmm4 + movdqu xmm4, OWORD PTR [r9+176] + aesenc xmm0, xmm4 + aesenc xmm1, xmm4 + aesenc xmm2, xmm4 + aesenc xmm3, xmm4 + cmp eax, 13 + movdqu xmm4, OWORD PTR [r9+192] + jl L_AES_CTR_encrypt_AESNI_64_aes_enc_block_last + aesenc xmm0, xmm4 + aesenc xmm1, xmm4 + aesenc xmm2, xmm4 + aesenc xmm3, xmm4 + movdqu xmm4, OWORD PTR [r9+208] + aesenc xmm0, xmm4 + aesenc xmm1, xmm4 + aesenc xmm2, xmm4 + aesenc xmm3, xmm4 + movdqu xmm4, OWORD PTR [r9+224] +L_AES_CTR_encrypt_AESNI_64_aes_enc_block_last: + aesenclast xmm0, xmm4 + aesenclast xmm1, xmm4 + aesenclast xmm2, xmm4 + aesenclast xmm3, xmm4 + movdqu xmm4, OWORD PTR [r11] + pxor xmm0, xmm4 + movdqu xmm4, OWORD PTR [r11+16] + pxor xmm1, xmm4 + movdqu xmm4, OWORD PTR [r11+32] + pxor xmm2, xmm4 + 
movdqu xmm4, OWORD PTR [r11+48] + pxor xmm3, xmm4 + movdqu OWORD PTR [rbx], xmm0 + movdqu OWORD PTR [rbx+16], xmm1 + movdqu OWORD PTR [rbx+32], xmm2 + movdqu OWORD PTR [rbx+48], xmm3 + add eax, 64 + cmp eax, r10d + jl L_AES_CTR_encrypt_AESNI_enc_64 +L_AES_CTR_encrypt_AESNI_done_64: + cmp eax, r8d + mov r10d, r8d + je L_AES_CTR_encrypt_AESNI_done_enc + and r10d, 4294967280 +L_AES_CTR_encrypt_AESNI_enc_16: + ; 16 bytes of input + movdqa xmm0, xmm7 + pshufb xmm0, xmm8 + paddq xmm7, xmm9 + movdqa xmm11, xmm7 + pcmpeqq xmm11, xmm10 + pslldq xmm11, 8 + psrlq xmm11, 63 + paddq xmm7, xmm11 + ; aes_enc_block + pxor xmm0, [r9] + movdqu xmm5, OWORD PTR [r9+16] + aesenc xmm0, xmm5 + movdqu xmm5, OWORD PTR [r9+32] + aesenc xmm0, xmm5 + movdqu xmm5, OWORD PTR [r9+48] + aesenc xmm0, xmm5 + movdqu xmm5, OWORD PTR [r9+64] + aesenc xmm0, xmm5 + movdqu xmm5, OWORD PTR [r9+80] + aesenc xmm0, xmm5 + movdqu xmm5, OWORD PTR [r9+96] + aesenc xmm0, xmm5 + movdqu xmm5, OWORD PTR [r9+112] + aesenc xmm0, xmm5 + movdqu xmm5, OWORD PTR [r9+128] + aesenc xmm0, xmm5 + movdqu xmm5, OWORD PTR [r9+144] + aesenc xmm0, xmm5 + cmp eax, 11 + movdqu xmm5, OWORD PTR [r9+160] + jl L_AES_CTR_encrypt_AESNI_16_aes_enc_block_last + aesenc xmm0, xmm5 + movdqu xmm6, OWORD PTR [r9+176] + aesenc xmm0, xmm6 + cmp eax, 13 + movdqu xmm5, OWORD PTR [r9+192] + jl L_AES_CTR_encrypt_AESNI_16_aes_enc_block_last + aesenc xmm0, xmm5 + movdqu xmm6, OWORD PTR [r9+208] + aesenc xmm0, xmm6 + movdqu xmm5, OWORD PTR [r9+224] +L_AES_CTR_encrypt_AESNI_16_aes_enc_block_last: + aesenclast xmm0, xmm5 + lea r11, QWORD PTR [rcx+rax] + movdqu xmm4, OWORD PTR [r11] + pxor xmm0, xmm4 + lea r11, QWORD PTR [rdx+rax] + movdqu OWORD PTR [r11], xmm0 + add eax, 16 + cmp eax, r10d + jl L_AES_CTR_encrypt_AESNI_enc_16 +L_AES_CTR_encrypt_AESNI_done_enc: + pshufb xmm7, xmm8 + movdqu OWORD PTR [r10], xmm7 + movdqu xmm6, OWORD PTR [rsp] + movdqu xmm7, OWORD PTR [rsp+16] + movdqu xmm8, OWORD PTR [rsp+32] + movdqu xmm9, OWORD PTR [rsp+48] + movdqu xmm10, 
OWORD PTR [rsp+64] + movdqu xmm11, OWORD PTR [rsp+80] + add rsp, 96 + pop rbx + ret +AES_CTR_encrypt_AESNI ENDP +_TEXT ENDS +IFDEF HAVE_INTEL_AVX1 +_TEXT SEGMENT READONLY PARA +AES_ECB_encrypt_avx1 PROC + mov eax, DWORD PTR [rsp+40] + sub rsp, 16 + vmovdqu OWORD PTR [rsp], xmm6 + xor eax, eax + cmp r8d, 64 + mov r9d, r8d + jl L_AES_ECB_encrypt_avx1_done_64 + and r9d, 4294967232 +L_AES_ECB_encrypt_avx1_enc_64: + ; 64 bytes of input + ; aes_ecb_enc_64 + lea r10, QWORD PTR [rcx+rax] + lea r11, QWORD PTR [rdx+rax] + vmovdqu xmm0, OWORD PTR [r10] + vmovdqu xmm1, OWORD PTR [r10+16] + vmovdqu xmm2, OWORD PTR [r10+32] + vmovdqu xmm3, OWORD PTR [r10+48] + ; aes_enc_block + vmovdqu xmm4, OWORD PTR [r9] + vpxor xmm0, xmm0, xmm4 + vpxor xmm1, xmm1, xmm4 + vpxor xmm2, xmm2, xmm4 + vpxor xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+16] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+32] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+48] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+64] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+80] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+96] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+112] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+128] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+144] + vaesenc xmm0, xmm0, 
xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + cmp eax, 11 + vmovdqu xmm4, OWORD PTR [r9+160] + jl L_AES_ECB_encrypt_avx1_64_aes_enc_block_last + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+176] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + cmp eax, 13 + vmovdqu xmm4, OWORD PTR [r9+192] + jl L_AES_ECB_encrypt_avx1_64_aes_enc_block_last + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+208] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+224] +L_AES_ECB_encrypt_avx1_64_aes_enc_block_last: + vaesenclast xmm0, xmm0, xmm4 + vaesenclast xmm1, xmm1, xmm4 + vaesenclast xmm2, xmm2, xmm4 + vaesenclast xmm3, xmm3, xmm4 + vmovdqu OWORD PTR [r11], xmm0 + vmovdqu OWORD PTR [r11+16], xmm1 + vmovdqu OWORD PTR [r11+32], xmm2 + vmovdqu OWORD PTR [r11+48], xmm3 + add eax, 64 + cmp eax, r9d + jl L_AES_ECB_encrypt_avx1_enc_64 +L_AES_ECB_encrypt_avx1_done_64: + cmp eax, r8d + mov r9d, r8d + je L_AES_ECB_encrypt_avx1_done_enc + and r9d, 4294967280 +L_AES_ECB_encrypt_avx1_enc_16: + ; 16 bytes of input + lea r10, QWORD PTR [rcx+rax] + vmovdqu xmm0, OWORD PTR [r10] + ; aes_enc_block + vpxor xmm0, xmm0, [r9] + vmovdqu xmm5, OWORD PTR [r9+16] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+32] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+48] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+64] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+80] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+96] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+112] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+128] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+144] 
+ vaesenc xmm0, xmm0, xmm5 + cmp eax, 11 + vmovdqu xmm5, OWORD PTR [r9+160] + jl L_AES_ECB_encrypt_avx1_16_aes_enc_block_last + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r9+176] + vaesenc xmm0, xmm0, xmm6 + cmp eax, 13 + vmovdqu xmm5, OWORD PTR [r9+192] + jl L_AES_ECB_encrypt_avx1_16_aes_enc_block_last + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r9+208] + vaesenc xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r9+224] +L_AES_ECB_encrypt_avx1_16_aes_enc_block_last: + vaesenclast xmm0, xmm0, xmm5 + lea r10, QWORD PTR [rdx+rax] + vmovdqu OWORD PTR [r10], xmm0 + add eax, 16 + cmp eax, r9d + jl L_AES_ECB_encrypt_avx1_enc_16 +L_AES_ECB_encrypt_avx1_done_enc: + vmovdqu xmm6, OWORD PTR [rsp] + add rsp, 16 + ret +AES_ECB_encrypt_avx1 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +AES_ECB_decrypt_avx1 PROC + mov eax, DWORD PTR [rsp+40] + sub rsp, 16 + vmovdqu OWORD PTR [rsp], xmm6 + xor eax, eax + cmp r8d, 64 + mov r9d, r8d + jl L_AES_ECB_decrypt_avx1_done_64 + and r9d, 4294967232 +L_AES_ECB_decrypt_avx1_dec_64: + ; 64 bytes of input + ; aes_ecb_dec_64 + lea r10, QWORD PTR [rcx+rax] + lea r11, QWORD PTR [rdx+rax] + vmovdqu xmm0, OWORD PTR [r10] + vmovdqu xmm1, OWORD PTR [r10+16] + vmovdqu xmm2, OWORD PTR [r10+32] + vmovdqu xmm3, OWORD PTR [r10+48] + ; aes_dec_block + vmovdqu xmm4, OWORD PTR [r9] + vpxor xmm0, xmm0, xmm4 + vpxor xmm1, xmm1, xmm4 + vpxor xmm2, xmm2, xmm4 + vpxor xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+16] + vaesdec xmm0, xmm0, xmm4 + vaesdec xmm1, xmm1, xmm4 + vaesdec xmm2, xmm2, xmm4 + vaesdec xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+32] + vaesdec xmm0, xmm0, xmm4 + vaesdec xmm1, xmm1, xmm4 + vaesdec xmm2, xmm2, xmm4 + vaesdec xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+48] + vaesdec xmm0, xmm0, xmm4 + vaesdec xmm1, xmm1, xmm4 + vaesdec xmm2, xmm2, xmm4 + vaesdec xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+64] + vaesdec xmm0, xmm0, xmm4 + vaesdec xmm1, xmm1, xmm4 + vaesdec xmm2, xmm2, xmm4 + vaesdec xmm3, xmm3, xmm4 + 
vmovdqu xmm4, OWORD PTR [r9+80] + vaesdec xmm0, xmm0, xmm4 + vaesdec xmm1, xmm1, xmm4 + vaesdec xmm2, xmm2, xmm4 + vaesdec xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+96] + vaesdec xmm0, xmm0, xmm4 + vaesdec xmm1, xmm1, xmm4 + vaesdec xmm2, xmm2, xmm4 + vaesdec xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+112] + vaesdec xmm0, xmm0, xmm4 + vaesdec xmm1, xmm1, xmm4 + vaesdec xmm2, xmm2, xmm4 + vaesdec xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+128] + vaesdec xmm0, xmm0, xmm4 + vaesdec xmm1, xmm1, xmm4 + vaesdec xmm2, xmm2, xmm4 + vaesdec xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+144] + vaesdec xmm0, xmm0, xmm4 + vaesdec xmm1, xmm1, xmm4 + vaesdec xmm2, xmm2, xmm4 + vaesdec xmm3, xmm3, xmm4 + cmp eax, 11 + vmovdqu xmm4, OWORD PTR [r9+160] + jl L_AES_ECB_decrypt_avx1_64_aes_dec_block_last + vaesdec xmm0, xmm0, xmm4 + vaesdec xmm1, xmm1, xmm4 + vaesdec xmm2, xmm2, xmm4 + vaesdec xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+176] + vaesdec xmm0, xmm0, xmm4 + vaesdec xmm1, xmm1, xmm4 + vaesdec xmm2, xmm2, xmm4 + vaesdec xmm3, xmm3, xmm4 + cmp eax, 13 + vmovdqu xmm4, OWORD PTR [r9+192] + jl L_AES_ECB_decrypt_avx1_64_aes_dec_block_last + vaesdec xmm0, xmm0, xmm4 + vaesdec xmm1, xmm1, xmm4 + vaesdec xmm2, xmm2, xmm4 + vaesdec xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+208] + vaesdec xmm0, xmm0, xmm4 + vaesdec xmm1, xmm1, xmm4 + vaesdec xmm2, xmm2, xmm4 + vaesdec xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+224] +L_AES_ECB_decrypt_avx1_64_aes_dec_block_last: + vaesdeclast xmm0, xmm0, xmm4 + vaesdeclast xmm1, xmm1, xmm4 + vaesdeclast xmm2, xmm2, xmm4 + vaesdeclast xmm3, xmm3, xmm4 + vmovdqu OWORD PTR [r11], xmm0 + vmovdqu OWORD PTR [r11+16], xmm1 + vmovdqu OWORD PTR [r11+32], xmm2 + vmovdqu OWORD PTR [r11+48], xmm3 + add eax, 64 + cmp eax, r9d + jl L_AES_ECB_decrypt_avx1_dec_64 +L_AES_ECB_decrypt_avx1_done_64: + cmp eax, r8d + mov r9d, r8d + je L_AES_ECB_decrypt_avx1_done_dec + and r9d, 4294967280 +L_AES_ECB_decrypt_avx1_dec_16: + ; 16 bytes of input + lea 
r10, QWORD PTR [rcx+rax] + vmovdqu xmm0, OWORD PTR [r10] + ; aes_dec_block + vpxor xmm0, xmm0, [r9] + vmovdqu xmm5, OWORD PTR [r9+16] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+32] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+48] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+64] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+80] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+96] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+112] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+128] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+144] + vaesdec xmm0, xmm0, xmm5 + cmp eax, 11 + vmovdqu xmm5, OWORD PTR [r9+160] + jl L_AES_ECB_decrypt_avx1_16_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r9+176] + vaesdec xmm0, xmm0, xmm6 + cmp eax, 13 + vmovdqu xmm5, OWORD PTR [r9+192] + jl L_AES_ECB_decrypt_avx1_16_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r9+208] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r9+224] +L_AES_ECB_decrypt_avx1_16_aes_dec_block_last: + vaesdeclast xmm0, xmm0, xmm5 + lea r10, QWORD PTR [rdx+rax] + vmovdqu OWORD PTR [r10], xmm0 + add eax, 16 + cmp eax, r9d + jl L_AES_ECB_decrypt_avx1_dec_16 +L_AES_ECB_decrypt_avx1_done_dec: + vmovdqu xmm6, OWORD PTR [rsp] + add rsp, 16 + ret +AES_ECB_decrypt_avx1 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +AES_CBC_encrypt_avx1 PROC + mov rax, QWORD PTR [rsp+40] + mov r10d, DWORD PTR [rsp+48] + vmovdqu xmm0, OWORD PTR [r8] + xor eax, eax + cmp eax, r9d + je L_AES_CBC_encrypt_avx1_done +L_AES_CBC_encrypt_avx1_loop: + ; 16 bytes of input + lea r10, QWORD PTR [rcx+rax] + vmovdqu xmm1, OWORD PTR [r10] + vpxor xmm1, xmm1, xmm0 + ; aes_enc_block + vpxor xmm1, xmm1, [rax] + vmovdqu xmm3, OWORD PTR [rax+16] + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm3, OWORD PTR [rax+32] + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm3, OWORD PTR [rax+48] + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm3, OWORD PTR 
[rax+64] + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm3, OWORD PTR [rax+80] + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm3, OWORD PTR [rax+96] + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm3, OWORD PTR [rax+112] + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm3, OWORD PTR [rax+128] + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm3, OWORD PTR [rax+144] + vaesenc xmm1, xmm1, xmm3 + cmp r10d, 11 + vmovdqu xmm3, OWORD PTR [rax+160] + jl L_AES_CBC_encrypt_avx1_aes_enc_block_last + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm4, OWORD PTR [rax+176] + vaesenc xmm1, xmm1, xmm4 + cmp r10d, 13 + vmovdqu xmm3, OWORD PTR [rax+192] + jl L_AES_CBC_encrypt_avx1_aes_enc_block_last + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm4, OWORD PTR [rax+208] + vaesenc xmm1, xmm1, xmm4 + vmovdqu xmm3, OWORD PTR [rax+224] +L_AES_CBC_encrypt_avx1_aes_enc_block_last: + vaesenclast xmm1, xmm1, xmm3 + lea r11, QWORD PTR [rdx+rax] + vmovdqu OWORD PTR [r11], xmm1 + vmovdqa xmm0, xmm1 + add eax, 16 + cmp eax, r9d + jl L_AES_CBC_encrypt_avx1_loop +L_AES_CBC_encrypt_avx1_done: + vmovdqu OWORD PTR [r8], xmm0 + ret +AES_CBC_encrypt_avx1 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +AES_CBC_decrypt_avx1 PROC + push r12 + mov rax, QWORD PTR [rsp+48] + mov r10d, DWORD PTR [rsp+56] + sub rsp, 48 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu xmm4, OWORD PTR [r8] + xor eax, eax + cmp r9d, 64 + mov r10d, r9d + jl L_AES_CBC_decrypt_avx1_done_64 + and r10d, 4294967232 +L_AES_CBC_decrypt_avx1_dec_64: + ; 64 bytes of input + ; aes_cbc_dec_64 + lea r11, QWORD PTR [rcx+rax] + lea r12, QWORD PTR [rdx+rax] + vmovdqu xmm0, OWORD PTR [r11] + vmovdqu xmm1, OWORD PTR [r11+16] + vmovdqu xmm2, OWORD PTR [r11+32] + vmovdqu xmm3, OWORD PTR [r11+48] + ; aes_dec_block + vmovdqu xmm5, OWORD PTR [rax] + vpxor xmm0, xmm0, xmm5 + vpxor xmm1, xmm1, xmm5 + vpxor xmm2, xmm2, xmm5 + vpxor xmm3, xmm3, xmm5 + vmovdqu xmm5, OWORD PTR [rax+16] + vaesdec xmm0, xmm0, xmm5 + vaesdec xmm1, xmm1, xmm5 + vaesdec xmm2, xmm2, 
xmm5 + vaesdec xmm3, xmm3, xmm5 + vmovdqu xmm5, OWORD PTR [rax+32] + vaesdec xmm0, xmm0, xmm5 + vaesdec xmm1, xmm1, xmm5 + vaesdec xmm2, xmm2, xmm5 + vaesdec xmm3, xmm3, xmm5 + vmovdqu xmm5, OWORD PTR [rax+48] + vaesdec xmm0, xmm0, xmm5 + vaesdec xmm1, xmm1, xmm5 + vaesdec xmm2, xmm2, xmm5 + vaesdec xmm3, xmm3, xmm5 + vmovdqu xmm5, OWORD PTR [rax+64] + vaesdec xmm0, xmm0, xmm5 + vaesdec xmm1, xmm1, xmm5 + vaesdec xmm2, xmm2, xmm5 + vaesdec xmm3, xmm3, xmm5 + vmovdqu xmm5, OWORD PTR [rax+80] + vaesdec xmm0, xmm0, xmm5 + vaesdec xmm1, xmm1, xmm5 + vaesdec xmm2, xmm2, xmm5 + vaesdec xmm3, xmm3, xmm5 + vmovdqu xmm5, OWORD PTR [rax+96] + vaesdec xmm0, xmm0, xmm5 + vaesdec xmm1, xmm1, xmm5 + vaesdec xmm2, xmm2, xmm5 + vaesdec xmm3, xmm3, xmm5 + vmovdqu xmm5, OWORD PTR [rax+112] + vaesdec xmm0, xmm0, xmm5 + vaesdec xmm1, xmm1, xmm5 + vaesdec xmm2, xmm2, xmm5 + vaesdec xmm3, xmm3, xmm5 + vmovdqu xmm5, OWORD PTR [rax+128] + vaesdec xmm0, xmm0, xmm5 + vaesdec xmm1, xmm1, xmm5 + vaesdec xmm2, xmm2, xmm5 + vaesdec xmm3, xmm3, xmm5 + vmovdqu xmm5, OWORD PTR [rax+144] + vaesdec xmm0, xmm0, xmm5 + vaesdec xmm1, xmm1, xmm5 + vaesdec xmm2, xmm2, xmm5 + vaesdec xmm3, xmm3, xmm5 + cmp r10d, 11 + vmovdqu xmm5, OWORD PTR [rax+160] + jl L_AES_CBC_decrypt_avx1_64_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vaesdec xmm1, xmm1, xmm5 + vaesdec xmm2, xmm2, xmm5 + vaesdec xmm3, xmm3, xmm5 + vmovdqu xmm5, OWORD PTR [rax+176] + vaesdec xmm0, xmm0, xmm5 + vaesdec xmm1, xmm1, xmm5 + vaesdec xmm2, xmm2, xmm5 + vaesdec xmm3, xmm3, xmm5 + cmp r10d, 13 + vmovdqu xmm5, OWORD PTR [rax+192] + jl L_AES_CBC_decrypt_avx1_64_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vaesdec xmm1, xmm1, xmm5 + vaesdec xmm2, xmm2, xmm5 + vaesdec xmm3, xmm3, xmm5 + vmovdqu xmm5, OWORD PTR [rax+208] + vaesdec xmm0, xmm0, xmm5 + vaesdec xmm1, xmm1, xmm5 + vaesdec xmm2, xmm2, xmm5 + vaesdec xmm3, xmm3, xmm5 + vmovdqu xmm5, OWORD PTR [rax+224] +L_AES_CBC_decrypt_avx1_64_aes_dec_block_last: + vaesdeclast xmm0, xmm0, xmm5 
+ vaesdeclast xmm1, xmm1, xmm5 + vaesdeclast xmm2, xmm2, xmm5 + vaesdeclast xmm3, xmm3, xmm5 + vpxor xmm0, xmm0, xmm4 + vpxor xmm1, xmm1, [r11] + vpxor xmm2, xmm2, [r11+16] + vpxor xmm3, xmm3, [r11+32] + vmovdqu xmm4, OWORD PTR [r11+48] + vmovdqu OWORD PTR [r12], xmm0 + vmovdqu OWORD PTR [r12+16], xmm1 + vmovdqu OWORD PTR [r12+32], xmm2 + vmovdqu OWORD PTR [r12+48], xmm3 + add eax, 64 + cmp eax, r10d + jl L_AES_CBC_decrypt_avx1_dec_64 +L_AES_CBC_decrypt_avx1_done_64: + cmp eax, r9d + mov r10d, r9d + je L_AES_CBC_decrypt_avx1_done_dec + and r10d, 4294967280 +L_AES_CBC_decrypt_avx1_dec_16: + ; 16 bytes of input + lea r11, QWORD PTR [rcx+rax] + vmovdqu xmm0, OWORD PTR [r11] + vmovdqa xmm8, xmm0 + ; aes_dec_block + vpxor xmm0, xmm0, [rax] + vmovdqu xmm6, OWORD PTR [rax+16] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm6, OWORD PTR [rax+32] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm6, OWORD PTR [rax+48] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm6, OWORD PTR [rax+64] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm6, OWORD PTR [rax+80] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm6, OWORD PTR [rax+96] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm6, OWORD PTR [rax+112] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm6, OWORD PTR [rax+128] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm6, OWORD PTR [rax+144] + vaesdec xmm0, xmm0, xmm6 + cmp r10d, 11 + vmovdqu xmm6, OWORD PTR [rax+160] + jl L_AES_CBC_decrypt_avx1_16_aes_dec_block_last + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm7, OWORD PTR [rax+176] + vaesdec xmm0, xmm0, xmm7 + cmp r10d, 13 + vmovdqu xmm6, OWORD PTR [rax+192] + jl L_AES_CBC_decrypt_avx1_16_aes_dec_block_last + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm7, OWORD PTR [rax+208] + vaesdec xmm0, xmm0, xmm7 + vmovdqu xmm6, OWORD PTR [rax+224] +L_AES_CBC_decrypt_avx1_16_aes_dec_block_last: + vaesdeclast xmm0, xmm0, xmm6 + vpxor xmm0, xmm0, xmm4 + vmovdqa xmm4, xmm8 + lea r11, QWORD PTR [rdx+rax] + vmovdqu OWORD PTR [r11], xmm0 + add eax, 16 + cmp eax, r10d + jl L_AES_CBC_decrypt_avx1_dec_16 
+L_AES_CBC_decrypt_avx1_done_dec: + vmovdqu OWORD PTR [r8], xmm4 + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + add rsp, 48 + pop r12 + ret +AES_CBC_decrypt_avx1 ENDP +_TEXT ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_ctr_avx1_bswap QWORD \ + 08090a0b0c0d0e0fh, 0001020304050607h +ptr_L_aes_ctr_avx1_bswap QWORD L_aes_ctr_avx1_bswap +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_ctr_avx1_one QWORD \ + 0000000000000001h, 0000000000000000h +ptr_L_aes_ctr_avx1_one QWORD L_aes_ctr_avx1_one +_DATA ENDS +_TEXT SEGMENT READONLY PARA +AES_CTR_encrypt_avx1 PROC + push rbx + mov eax, DWORD PTR [rsp+48] + mov r10, QWORD PTR [rsp+56] + sub rsp, 96 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu xmm8, OWORD PTR L_aes_ctr_avx1_bswap + vmovdqu xmm9, OWORD PTR L_aes_ctr_avx1_one + vpxor xmm10, xmm10, xmm10 + vmovdqu xmm7, OWORD PTR [r10] + vpshufb xmm7, xmm7, xmm8 + xor eax, eax + cmp r8d, 64 + mov r10d, r8d + jl L_AES_CTR_encrypt_avx1_done_64 + and r10d, 4294967232 +L_AES_CTR_encrypt_avx1_enc_64: + ; 64 bytes of input + ; aes_ctr_enc_64 + lea r11, QWORD PTR [rcx+rax] + lea rbx, QWORD PTR [rdx+rax] + vpshufb xmm0, xmm7, xmm8 + vpaddq xmm7, xmm7, xmm9 + vpcmpeqq xmm11, xmm7, xmm10 + vpslldq xmm11, xmm11, 8 + vpsrlq xmm11, xmm11, 63 + vpaddq xmm7, xmm7, xmm11 + vpshufb xmm1, xmm7, xmm8 + vpaddq xmm7, xmm7, xmm9 + vpcmpeqq xmm11, xmm7, xmm10 + vpslldq xmm11, xmm11, 8 + vpsrlq xmm11, xmm11, 63 + vpaddq xmm7, xmm7, xmm11 + vpshufb xmm2, xmm7, xmm8 + vpaddq xmm7, xmm7, xmm9 + vpcmpeqq xmm11, xmm7, xmm10 + vpslldq xmm11, xmm11, 8 + vpsrlq xmm11, xmm11, 63 + vpaddq xmm7, xmm7, xmm11 + vpshufb xmm3, xmm7, xmm8 + vpaddq xmm7, xmm7, xmm9 + vpcmpeqq xmm11, xmm7, xmm10 + vpslldq xmm11, xmm11, 8 + vpsrlq xmm11, xmm11, 63 + vpaddq xmm7, xmm7, xmm11 + ; aes_enc_block + vmovdqu xmm4, 
OWORD PTR [r9] + vpxor xmm0, xmm0, xmm4 + vpxor xmm1, xmm1, xmm4 + vpxor xmm2, xmm2, xmm4 + vpxor xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+16] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+32] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+48] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+64] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+80] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+96] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+112] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+128] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+144] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + cmp eax, 11 + vmovdqu xmm4, OWORD PTR [r9+160] + jl L_AES_CTR_encrypt_avx1_64_aes_enc_block_last + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+176] + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + cmp eax, 13 + vmovdqu xmm4, OWORD PTR [r9+192] + jl L_AES_CTR_encrypt_avx1_64_aes_enc_block_last + vaesenc xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+208] + vaesenc 
xmm0, xmm0, xmm4 + vaesenc xmm1, xmm1, xmm4 + vaesenc xmm2, xmm2, xmm4 + vaesenc xmm3, xmm3, xmm4 + vmovdqu xmm4, OWORD PTR [r9+224] +L_AES_CTR_encrypt_avx1_64_aes_enc_block_last: + vaesenclast xmm0, xmm0, xmm4 + vaesenclast xmm1, xmm1, xmm4 + vaesenclast xmm2, xmm2, xmm4 + vaesenclast xmm3, xmm3, xmm4 + vpxor xmm0, xmm0, [r11] + vpxor xmm1, xmm1, [r11+16] + vpxor xmm2, xmm2, [r11+32] + vpxor xmm3, xmm3, [r11+48] + vmovdqu OWORD PTR [rbx], xmm0 + vmovdqu OWORD PTR [rbx+16], xmm1 + vmovdqu OWORD PTR [rbx+32], xmm2 + vmovdqu OWORD PTR [rbx+48], xmm3 + add eax, 64 + cmp eax, r10d + jl L_AES_CTR_encrypt_avx1_enc_64 +L_AES_CTR_encrypt_avx1_done_64: + cmp eax, r8d + mov r10d, r8d + je L_AES_CTR_encrypt_avx1_done_enc + and r10d, 4294967280 +L_AES_CTR_encrypt_avx1_enc_16: + ; 16 bytes of input + vpshufb xmm0, xmm7, xmm8 + vpaddq xmm7, xmm7, xmm9 + vpcmpeqq xmm11, xmm7, xmm10 + vpslldq xmm11, xmm11, 8 + vpsrlq xmm11, xmm11, 63 + vpaddq xmm7, xmm7, xmm11 + ; aes_enc_block + vpxor xmm0, xmm0, [r9] + vmovdqu xmm5, OWORD PTR [r9+16] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+32] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+48] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+64] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+80] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+96] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+112] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+128] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+144] + vaesenc xmm0, xmm0, xmm5 + cmp eax, 11 + vmovdqu xmm5, OWORD PTR [r9+160] + jl L_AES_CTR_encrypt_avx1_16_aes_enc_block_last + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r9+176] + vaesenc xmm0, xmm0, xmm6 + cmp eax, 13 + vmovdqu xmm5, OWORD PTR [r9+192] + jl L_AES_CTR_encrypt_avx1_16_aes_enc_block_last + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r9+208] + vaesenc xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r9+224] 
+L_AES_CTR_encrypt_avx1_16_aes_enc_block_last:
+        vaesenclast xmm0, xmm0, xmm5
+        lea r11, QWORD PTR [rcx+rax]
+        vpxor xmm0, xmm0, [r11]
+        lea r11, QWORD PTR [rdx+rax]
+        vmovdqu OWORD PTR [r11], xmm0
+        add eax, 16
+        cmp eax, r10d
+        jl L_AES_CTR_encrypt_avx1_enc_16
+L_AES_CTR_encrypt_avx1_done_enc:
+        vpshufb xmm7, xmm7, xmm8
+        ; NOTE(review): r10d is the loop limit in the compare above, yet r10 is
+        ; used as a store address here; the start of this procedure is outside
+        ; this view - confirm the register is not aliased.
+        vmovdqu OWORD PTR [r10], xmm7
+        vmovdqu xmm6, OWORD PTR [rsp]
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        vmovdqu xmm8, OWORD PTR [rsp+32]
+        vmovdqu xmm9, OWORD PTR [rsp+48]
+        vmovdqu xmm10, OWORD PTR [rsp+64]
+        vmovdqu xmm11, OWORD PTR [rsp+80]
+        add rsp, 96
+        pop rbx
+        ret
+AES_CTR_encrypt_avx1 ENDP
+_TEXT ENDS
+ENDIF
+IFDEF HAVE_INTEL_VAES
+_TEXT SEGMENT READONLY PARA
+AES_ECB_encrypt_vaes PROC
+        ; AES-ECB encryption using 256-bit VAES: 128 bytes per pass, then
+        ; 32 bytes per pass, then single 16-byte blocks.
+        ; Win64 arguments (roles inferred from how they are used below):
+        ;   rcx = input, rdx = output, r8d = length in bytes,
+        ;   r9 = round-key table, [rsp+40] = round count.
+        ; Register plan: rax = byte offset, r10d = round count,
+        ; r11d = limit of the current loop. Each value has its own register;
+        ; previously the round count shared eax with the offset and the key
+        ; pointer shared r9 with the limit, so both were destroyed before use.
+        mov r10d, DWORD PTR [rsp+40]
+        sub rsp, 32
+        ; xmm6/xmm7 are callee-saved on Win64.
+        vmovdqu OWORD PTR [rsp], xmm6
+        vmovdqu OWORD PTR [rsp+16], xmm7
+        xor eax, eax
+        cmp r8d, 128
+        mov r11d, r8d
+        jl L_AES_ECB_encrypt_vaes_done_128
+        ; Round the limit down to a multiple of 128.
+        and r11d, 4294967168
+L_AES_ECB_encrypt_vaes_enc_128:
+        ; 128 bytes of input
+        ; aes_ecb_enc_128
+        vmovdqu ymm0, YMMWORD PTR [rcx+rax]
+        vmovdqu ymm1, YMMWORD PTR [rcx+rax+32]
+        vmovdqu ymm2, YMMWORD PTR [rcx+rax+64]
+        vmovdqu ymm3, YMMWORD PTR [rcx+rax+96]
+        ; aes_enc_block
+        vbroadcasti128 ymm7, [r9]
+        vpxor ymm0, ymm0, ymm7
+        vpxor ymm1, ymm1, ymm7
+        vpxor ymm2, ymm2, ymm7
+        vpxor ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+16]
+        vaesenc ymm0, ymm0, ymm7
+        vaesenc ymm1, ymm1, ymm7
+        vaesenc ymm2, ymm2, ymm7
+        vaesenc ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+32]
+        vaesenc ymm0, ymm0, ymm7
+        vaesenc ymm1, ymm1, ymm7
+        vaesenc ymm2, ymm2, ymm7
+        vaesenc ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+48]
+        vaesenc ymm0, ymm0, ymm7
+        vaesenc ymm1, ymm1, ymm7
+        vaesenc ymm2, ymm2, ymm7
+        vaesenc ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+64]
+        vaesenc ymm0, ymm0, ymm7
+        vaesenc ymm1, ymm1, ymm7
+        vaesenc ymm2, ymm2, ymm7
+        vaesenc ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+80]
+        vaesenc ymm0, ymm0, ymm7
+        vaesenc ymm1, ymm1, ymm7
+        vaesenc ymm2, ymm2, ymm7
+        vaesenc ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+96]
+        vaesenc ymm0, ymm0, ymm7
+        vaesenc ymm1, ymm1, ymm7
+        vaesenc ymm2, ymm2, ymm7
+        vaesenc ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+112]
+        vaesenc ymm0, ymm0, ymm7
+        vaesenc ymm1, ymm1, ymm7
+        vaesenc ymm2, ymm2, ymm7
+        vaesenc ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+128]
+        vaesenc ymm0, ymm0, ymm7
+        vaesenc ymm1, ymm1, ymm7
+        vaesenc ymm2, ymm2, ymm7
+        vaesenc ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+144]
+        vaesenc ymm0, ymm0, ymm7
+        vaesenc ymm1, ymm1, ymm7
+        vaesenc ymm2, ymm2, ymm7
+        vaesenc ymm3, ymm3, ymm7
+        ; Fewer than 11 rounds: the key at +160 is the last one.
+        cmp r10d, 11
+        vbroadcasti128 ymm7, [r9+160]
+        jl L_AES_ECB_encrypt_vaes_128_aes_enc_block_last
+        vaesenc ymm0, ymm0, ymm7
+        vaesenc ymm1, ymm1, ymm7
+        vaesenc ymm2, ymm2, ymm7
+        vaesenc ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+176]
+        vaesenc ymm0, ymm0, ymm7
+        vaesenc ymm1, ymm1, ymm7
+        vaesenc ymm2, ymm2, ymm7
+        vaesenc ymm3, ymm3, ymm7
+        ; Fewer than 13 rounds: the key at +192 is the last one.
+        cmp r10d, 13
+        vbroadcasti128 ymm7, [r9+192]
+        jl L_AES_ECB_encrypt_vaes_128_aes_enc_block_last
+        vaesenc ymm0, ymm0, ymm7
+        vaesenc ymm1, ymm1, ymm7
+        vaesenc ymm2, ymm2, ymm7
+        vaesenc ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+208]
+        vaesenc ymm0, ymm0, ymm7
+        vaesenc ymm1, ymm1, ymm7
+        vaesenc ymm2, ymm2, ymm7
+        vaesenc ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+224]
+L_AES_ECB_encrypt_vaes_128_aes_enc_block_last:
+        vaesenclast ymm0, ymm0, ymm7
+        vaesenclast ymm1, ymm1, ymm7
+        vaesenclast ymm2, ymm2, ymm7
+        vaesenclast ymm3, ymm3, ymm7
+        vmovdqu YMMWORD PTR [rdx+rax], ymm0
+        vmovdqu YMMWORD PTR [rdx+rax+32], ymm1
+        vmovdqu YMMWORD PTR [rdx+rax+64], ymm2
+        vmovdqu YMMWORD PTR [rdx+rax+96], ymm3
+        add eax, 128
+        cmp eax, r11d
+        jl L_AES_ECB_encrypt_vaes_enc_128
+L_AES_ECB_encrypt_vaes_done_128:
+        ; Limit for the 32-byte loop: length rounded down to 32.
+        mov r11d, r8d
+        and r11d, 4294967264
+        cmp eax, r11d
+        je L_AES_ECB_encrypt_vaes_done_32
+L_AES_ECB_encrypt_vaes_enc_32:
+        ; 32 bytes of input
+        ; aes_ecb_enc_32
+        vmovdqu ymm0, YMMWORD PTR [rcx+rax]
+        ; aes_enc_block
+        vbroadcasti128 ymm7, [r9]
+        vpxor ymm0, ymm0, ymm7
+        vbroadcasti128 ymm7, [r9+16]
+        vaesenc ymm0, ymm0, ymm7
+        vbroadcasti128 ymm7, [r9+32]
+        vaesenc ymm0, ymm0, ymm7
+        vbroadcasti128 ymm7, [r9+48]
+        vaesenc ymm0, ymm0, ymm7
+        vbroadcasti128 ymm7, [r9+64]
+        vaesenc ymm0, ymm0, ymm7
+        vbroadcasti128 ymm7, [r9+80]
+        vaesenc ymm0, ymm0, ymm7
+        vbroadcasti128 ymm7, [r9+96]
+        vaesenc ymm0, ymm0, ymm7
+        vbroadcasti128 ymm7, [r9+112]
+        vaesenc ymm0, ymm0, ymm7
+        vbroadcasti128 ymm7, [r9+128]
+        vaesenc ymm0, ymm0, ymm7
+        vbroadcasti128 ymm7, [r9+144]
+        vaesenc ymm0, ymm0, ymm7
+        cmp r10d, 11
+        vbroadcasti128 ymm7, [r9+160]
+        jl L_AES_ECB_encrypt_vaes_32_aes_enc_block_last
+        vaesenc ymm0, ymm0, ymm7
+        vbroadcasti128 ymm7, [r9+176]
+        vaesenc ymm0, ymm0, ymm7
+        cmp r10d, 13
+        vbroadcasti128 ymm7, [r9+192]
+        jl L_AES_ECB_encrypt_vaes_32_aes_enc_block_last
+        vaesenc ymm0, ymm0, ymm7
+        vbroadcasti128 ymm7, [r9+208]
+        vaesenc ymm0, ymm0, ymm7
+        vbroadcasti128 ymm7, [r9+224]
+L_AES_ECB_encrypt_vaes_32_aes_enc_block_last:
+        vaesenclast ymm0, ymm0, ymm7
+        vmovdqu YMMWORD PTR [rdx+rax], ymm0
+        add eax, 32
+        cmp eax, r11d
+        jl L_AES_ECB_encrypt_vaes_enc_32
+L_AES_ECB_encrypt_vaes_done_32:
+        cmp eax, r8d
+        mov r11d, r8d
+        je L_AES_ECB_encrypt_vaes_done_enc
+        ; Limit for the single-block loop: length rounded down to 16.
+        and r11d, 4294967280
+L_AES_ECB_encrypt_vaes_enc_16:
+        ; 16 bytes of input
+        vmovdqu xmm0, OWORD PTR [rcx+rax]
+        ; aes_enc_block
+        vpxor xmm0, xmm0, [r9]
+        vmovdqu xmm5, OWORD PTR [r9+16]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+32]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+48]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+64]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+80]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+96]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+112]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+128]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+144]
+        vaesenc xmm0, xmm0, xmm5
+        cmp r10d, 11
+        vmovdqu xmm5, OWORD PTR [r9+160]
+        jl L_AES_ECB_encrypt_vaes_16_aes_enc_block_last
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [r9+176]
+        vaesenc xmm0, xmm0, xmm6
+        cmp r10d, 13
+        vmovdqu xmm5, OWORD PTR [r9+192]
+        jl L_AES_ECB_encrypt_vaes_16_aes_enc_block_last
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [r9+208]
+        vaesenc xmm0, xmm0, xmm6
+        vmovdqu xmm5, OWORD PTR [r9+224]
+L_AES_ECB_encrypt_vaes_16_aes_enc_block_last:
+        vaesenclast xmm0, xmm0, xmm5
+        vmovdqu OWORD PTR [rdx+rax], xmm0
+        add eax, 16
+        cmp eax, r11d
+        jl L_AES_ECB_encrypt_vaes_enc_16
+L_AES_ECB_encrypt_vaes_done_enc:
+        vmovdqu xmm6, OWORD PTR [rsp]
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        add rsp, 32
+        ret
+AES_ECB_encrypt_vaes ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_ECB_decrypt_vaes PROC
+        ; AES-ECB decryption using 256-bit VAES: 128 bytes per pass, then
+        ; 32 bytes per pass, then single 16-byte blocks.
+        ; Win64 arguments (roles inferred from how they are used below):
+        ;   rcx = input, rdx = output, r8d = length in bytes,
+        ;   r9 = round-key table, [rsp+40] = round count.
+        ; Register plan: rax = byte offset, r10d = round count,
+        ; r11d = limit of the current loop (kept apart so that neither the
+        ; round count nor the key pointer is overwritten).
+        mov r10d, DWORD PTR [rsp+40]
+        sub rsp, 32
+        ; xmm6/xmm7 are callee-saved on Win64.
+        vmovdqu OWORD PTR [rsp], xmm6
+        vmovdqu OWORD PTR [rsp+16], xmm7
+        xor eax, eax
+        cmp r8d, 128
+        mov r11d, r8d
+        jl L_AES_ECB_decrypt_vaes_done_128
+        ; Round the limit down to a multiple of 128.
+        and r11d, 4294967168
+L_AES_ECB_decrypt_vaes_dec_128:
+        ; 128 bytes of input
+        ; aes_ecb_dec_128
+        vmovdqu ymm0, YMMWORD PTR [rcx+rax]
+        vmovdqu ymm1, YMMWORD PTR [rcx+rax+32]
+        vmovdqu ymm2, YMMWORD PTR [rcx+rax+64]
+        vmovdqu ymm3, YMMWORD PTR [rcx+rax+96]
+        ; aes_dec_block
+        vbroadcasti128 ymm7, [r9]
+        vpxor ymm0, ymm0, ymm7
+        vpxor ymm1, ymm1, ymm7
+        vpxor ymm2, ymm2, ymm7
+        vpxor ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+16]
+        vaesdec ymm0, ymm0, ymm7
+        vaesdec ymm1, ymm1, ymm7
+        vaesdec ymm2, ymm2, ymm7
+        vaesdec ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+32]
+        vaesdec ymm0, ymm0, ymm7
+        vaesdec ymm1, ymm1, ymm7
+        vaesdec ymm2, ymm2, ymm7
+        vaesdec ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+48]
+        vaesdec ymm0, ymm0, ymm7
+        vaesdec ymm1, ymm1, ymm7
+        vaesdec ymm2, ymm2, ymm7
+        vaesdec ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+64]
+        vaesdec ymm0, ymm0, ymm7
+        vaesdec ymm1, ymm1, ymm7
+        vaesdec ymm2, ymm2, ymm7
+        vaesdec ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+80]
+        vaesdec ymm0, ymm0, ymm7
+        vaesdec ymm1, ymm1, ymm7
+        vaesdec ymm2, ymm2, ymm7
+        vaesdec ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+96]
+        vaesdec ymm0, ymm0, ymm7
+        vaesdec ymm1, ymm1, ymm7
+        vaesdec ymm2, ymm2, ymm7
+        vaesdec ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+112]
+        vaesdec ymm0, ymm0, ymm7
+        vaesdec ymm1, ymm1, ymm7
+        vaesdec ymm2, ymm2, ymm7
+        vaesdec ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+128]
+        vaesdec ymm0, ymm0, ymm7
+        vaesdec ymm1, ymm1, ymm7
+        vaesdec ymm2, ymm2, ymm7
+        vaesdec ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+144]
+        vaesdec ymm0, ymm0, ymm7
+        vaesdec ymm1, ymm1, ymm7
+        vaesdec ymm2, ymm2, ymm7
+        vaesdec ymm3, ymm3, ymm7
+        ; Fewer than 11 rounds: the key at +160 is the last one.
+        cmp r10d, 11
+        vbroadcasti128 ymm7, [r9+160]
+        jl L_AES_ECB_decrypt_vaes_128_aes_dec_block_last
+        vaesdec ymm0, ymm0, ymm7
+        vaesdec ymm1, ymm1, ymm7
+        vaesdec ymm2, ymm2, ymm7
+        vaesdec ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+176]
+        vaesdec ymm0, ymm0, ymm7
+        vaesdec ymm1, ymm1, ymm7
+        vaesdec ymm2, ymm2, ymm7
+        vaesdec ymm3, ymm3, ymm7
+        ; Fewer than 13 rounds: the key at +192 is the last one.
+        cmp r10d, 13
+        vbroadcasti128 ymm7, [r9+192]
+        jl L_AES_ECB_decrypt_vaes_128_aes_dec_block_last
+        vaesdec ymm0, ymm0, ymm7
+        vaesdec ymm1, ymm1, ymm7
+        vaesdec ymm2, ymm2, ymm7
+        vaesdec ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+208]
+        vaesdec ymm0, ymm0, ymm7
+        vaesdec ymm1, ymm1, ymm7
+        vaesdec ymm2, ymm2, ymm7
+        vaesdec ymm3, ymm3, ymm7
+        vbroadcasti128 ymm7, [r9+224]
+L_AES_ECB_decrypt_vaes_128_aes_dec_block_last:
+        vaesdeclast ymm0, ymm0, ymm7
+        vaesdeclast ymm1, ymm1, ymm7
+        vaesdeclast ymm2, ymm2, ymm7
+        vaesdeclast ymm3, ymm3, ymm7
+        vmovdqu YMMWORD PTR [rdx+rax], ymm0
+        vmovdqu YMMWORD PTR [rdx+rax+32], ymm1
+        vmovdqu YMMWORD PTR [rdx+rax+64], ymm2
+        vmovdqu YMMWORD PTR [rdx+rax+96], ymm3
+        add eax, 128
+        cmp eax, r11d
+        jl L_AES_ECB_decrypt_vaes_dec_128
+L_AES_ECB_decrypt_vaes_done_128:
+        ; Limit for the 32-byte loop: length rounded down to 32.
+        mov r11d, r8d
+        and r11d, 4294967264
+        cmp eax, r11d
+        je L_AES_ECB_decrypt_vaes_done_32
+L_AES_ECB_decrypt_vaes_dec_32:
+        ; 32 bytes of input
+        ; aes_ecb_dec_32
+        vmovdqu ymm0, YMMWORD PTR [rcx+rax]
+        ; aes_dec_block
+        vbroadcasti128 ymm7, [r9]
+        vpxor ymm0, ymm0, ymm7
+        vbroadcasti128 ymm7, [r9+16]
+        vaesdec ymm0, ymm0, ymm7
+        vbroadcasti128 ymm7, [r9+32]
+        vaesdec ymm0, ymm0, ymm7
+        vbroadcasti128 ymm7, [r9+48]
+        vaesdec ymm0, ymm0, ymm7
+        vbroadcasti128 ymm7, [r9+64]
+        vaesdec ymm0, ymm0, ymm7
+        vbroadcasti128 ymm7, [r9+80]
+        vaesdec ymm0, ymm0, ymm7
+        vbroadcasti128 ymm7, [r9+96]
+        vaesdec ymm0, ymm0, ymm7
+        vbroadcasti128 ymm7, [r9+112]
+        vaesdec ymm0, ymm0, ymm7
+        vbroadcasti128 ymm7, [r9+128]
+        vaesdec ymm0, ymm0, ymm7
+        vbroadcasti128 ymm7, [r9+144]
+        vaesdec ymm0, ymm0, ymm7
+        cmp r10d, 11
+        vbroadcasti128 ymm7, [r9+160]
+        jl L_AES_ECB_decrypt_vaes_32_aes_dec_block_last
+        vaesdec ymm0, ymm0, ymm7
+        vbroadcasti128 ymm7, [r9+176]
+        vaesdec ymm0, ymm0, ymm7
+        cmp r10d, 13
+        vbroadcasti128 ymm7, [r9+192]
+        jl L_AES_ECB_decrypt_vaes_32_aes_dec_block_last
+        vaesdec ymm0, ymm0, ymm7
+        vbroadcasti128 ymm7, [r9+208]
+        vaesdec ymm0, ymm0, ymm7
+        vbroadcasti128 ymm7, [r9+224]
+L_AES_ECB_decrypt_vaes_32_aes_dec_block_last:
+        vaesdeclast ymm0, ymm0, ymm7
+        vmovdqu YMMWORD PTR [rdx+rax], ymm0
+        add eax, 32
+        cmp eax, r11d
+        jl L_AES_ECB_decrypt_vaes_dec_32
+L_AES_ECB_decrypt_vaes_done_32:
+        cmp eax, r8d
+        mov r11d, r8d
+        je L_AES_ECB_decrypt_vaes_done_dec
+        ; Limit for the single-block loop: length rounded down to 16.
+        and r11d, 4294967280
+L_AES_ECB_decrypt_vaes_dec_16:
+        ; 16 bytes of input
+        vmovdqu xmm0, OWORD PTR [rcx+rax]
+        ; aes_dec_block
+        vpxor xmm0, xmm0, [r9]
+        vmovdqu xmm5, OWORD PTR [r9+16]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+32]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+48]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+64]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+80]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+96]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+112]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+128]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+144]
+        vaesdec xmm0, xmm0, xmm5
+        cmp r10d, 11
+        vmovdqu xmm5, OWORD PTR [r9+160]
+        jl L_AES_ECB_decrypt_vaes_16_aes_dec_block_last
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [r9+176]
+        vaesdec xmm0, xmm0, xmm6
+        cmp r10d, 13
+        vmovdqu xmm5, OWORD PTR [r9+192]
+        jl L_AES_ECB_decrypt_vaes_16_aes_dec_block_last
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [r9+208]
+        vaesdec xmm0, xmm0, xmm6
+        vmovdqu xmm5, OWORD PTR [r9+224]
+L_AES_ECB_decrypt_vaes_16_aes_dec_block_last:
+        vaesdeclast xmm0, xmm0, xmm5
+        vmovdqu OWORD PTR [rdx+rax], xmm0
+        add eax, 16
+        cmp eax, r11d
+        jl L_AES_ECB_decrypt_vaes_dec_16
+L_AES_ECB_decrypt_vaes_done_dec:
+        vmovdqu xmm6, OWORD PTR [rsp]
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        add rsp, 32
+        ret
+AES_ECB_decrypt_vaes ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_CBC_encrypt_vaes PROC
+        ; AES-CBC encryption, one 16-byte block per iteration (each block
+        ; depends on the previous ciphertext block held in xmm0).
+        ; Win64 arguments (roles inferred from how they are used below):
+        ;   rcx = input, rdx = output, r8 = IV (read at entry, written with
+        ;   the last ciphertext block at exit), r9d = length in bytes,
+        ;   [rsp+40] = round-key table, [rsp+48] = round count.
+        ; Register plan: rax = byte offset, r10 = key table, r11d = round
+        ; count. Previously the key pointer lived in rax and was zeroed by
+        ; the offset initialisation, and the round count in r10d was
+        ; overwritten by an address temporary.
+        mov r10, QWORD PTR [rsp+40]
+        mov r11d, DWORD PTR [rsp+48]
+        vmovdqu xmm0, OWORD PTR [r8]
+        xor eax, eax
+        cmp eax, r9d
+        je L_AES_CBC_encrypt_vaes_done
+L_AES_CBC_encrypt_vaes_loop:
+        ; 16 bytes of input
+        vmovdqu xmm1, OWORD PTR [rcx+rax]
+        ; Chain with the previous ciphertext block (or the IV).
+        vpxor xmm1, xmm1, xmm0
+        ; aes_enc_block
+        vpxor xmm1, xmm1, [r10]
+        vmovdqu xmm3, OWORD PTR [r10+16]
+        vaesenc xmm1, xmm1, xmm3
+        vmovdqu xmm3, OWORD PTR [r10+32]
+        vaesenc xmm1, xmm1, xmm3
+        vmovdqu xmm3, OWORD PTR [r10+48]
+        vaesenc xmm1, xmm1, xmm3
+        vmovdqu xmm3, OWORD PTR [r10+64]
+        vaesenc xmm1, xmm1, xmm3
+        vmovdqu xmm3, OWORD PTR [r10+80]
+        vaesenc xmm1, xmm1, xmm3
+        vmovdqu xmm3, OWORD PTR [r10+96]
+        vaesenc xmm1, xmm1, xmm3
+        vmovdqu xmm3, OWORD PTR [r10+112]
+        vaesenc xmm1, xmm1, xmm3
+        vmovdqu xmm3, OWORD PTR [r10+128]
+        vaesenc xmm1, xmm1, xmm3
+        vmovdqu xmm3, OWORD PTR [r10+144]
+        vaesenc xmm1, xmm1, xmm3
+        cmp r11d, 11
+        vmovdqu xmm3, OWORD PTR [r10+160]
+        jl L_AES_CBC_encrypt_vaes_aes_enc_block_last
+        vaesenc xmm1, xmm1, xmm3
+        vmovdqu xmm4, OWORD PTR [r10+176]
+        vaesenc xmm1, xmm1, xmm4
+        cmp r11d, 13
+        vmovdqu xmm3, OWORD PTR [r10+192]
+        jl L_AES_CBC_encrypt_vaes_aes_enc_block_last
+        vaesenc xmm1, xmm1, xmm3
+        vmovdqu xmm4, OWORD PTR [r10+208]
+        vaesenc xmm1, xmm1, xmm4
+        vmovdqu xmm3, OWORD PTR [r10+224]
+L_AES_CBC_encrypt_vaes_aes_enc_block_last:
+        vaesenclast xmm1, xmm1, xmm3
+        vmovdqu OWORD PTR [rdx+rax], xmm1
+        vmovdqa xmm0, xmm1
+        add eax, 16
+        cmp eax, r9d
+        jl L_AES_CBC_encrypt_vaes_loop
+L_AES_CBC_encrypt_vaes_done:
+        vmovdqu OWORD PTR [r8], xmm0
+        ret
+AES_CBC_encrypt_vaes ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_CBC_decrypt_vaes PROC
+        ; AES-CBC decryption using 256-bit VAES: 128 bytes per pass, then
+        ; 32 bytes per pass, then single 16-byte blocks.
+        ; Win64 arguments (roles inferred from how they are used below):
+        ;   rcx = input, rdx = output, r8 = IV (read at entry, written with
+        ;   the last ciphertext block at exit), r9d = length in bytes,
+        ;   5th argument = round-key table, 6th argument = round count.
+        ; Register plan: rax = byte offset, r10 = key table, r11d = round
+        ; count, r12d = limit of the current loop, xmm8 = previous
+        ; ciphertext block. Previously rax held both key pointer and
+        ; offset, and r10d held both round count and limit.
+        push r12
+        ; The push moved the stack arguments up by 8 bytes.
+        mov r10, QWORD PTR [rsp+48]
+        mov r11d, DWORD PTR [rsp+56]
+        sub rsp, 128
+        ; xmm6-xmm13 are callee-saved on Win64.
+        vmovdqu OWORD PTR [rsp], xmm6
+        vmovdqu OWORD PTR [rsp+16], xmm7
+        vmovdqu OWORD PTR [rsp+32], xmm8
+        vmovdqu OWORD PTR [rsp+48], xmm9
+        vmovdqu OWORD PTR [rsp+64], xmm10
+        vmovdqu OWORD PTR [rsp+80], xmm11
+        vmovdqu OWORD PTR [rsp+96], xmm12
+        vmovdqu OWORD PTR [rsp+112], xmm13
+        vmovdqu xmm8, OWORD PTR [r8]
+        xor eax, eax
+        cmp r9d, 128
+        mov r12d, r9d
+        jl L_AES_CBC_decrypt_vaes_done_128
+        ; Round the limit down to a multiple of 128.
+        and r12d, 4294967168
+L_AES_CBC_decrypt_vaes_dec_128:
+        ; 128 bytes of input
+        ; aes_cbc_dec_128
+        vmovdqu ymm0, YMMWORD PTR [rcx+rax]
+        vmovdqu ymm1, YMMWORD PTR [rcx+rax+32]
+        vmovdqu ymm2, YMMWORD PTR [rcx+rax+64]
+        vmovdqu ymm3, YMMWORD PTR [rcx+rax+96]
+        ; ymm10-ymm13 hold, for every block, the ciphertext block that
+        ; precedes it: ymm10 = previous block | block 0, the others are
+        ; the input re-read 16 bytes earlier.
+        vinserti128 ymm10, ymm8, xmm0, 1
+        vmovdqu ymm11, YMMWORD PTR [rcx+rax+16]
+        vmovdqu ymm12, YMMWORD PTR [rcx+rax+48]
+        vmovdqu ymm13, YMMWORD PTR [rcx+rax+80]
+        ; Last ciphertext block of this pass chains into the next pass.
+        vextracti128 xmm8, ymm3, 1
+        ; aes_dec_block
+        vbroadcasti128 ymm9, [r10]
+        vpxor ymm0, ymm0, ymm9
+        vpxor ymm1, ymm1, ymm9
+        vpxor ymm2, ymm2, ymm9
+        vpxor ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [r10+16]
+        vaesdec ymm0, ymm0, ymm9
+        vaesdec ymm1, ymm1, ymm9
+        vaesdec ymm2, ymm2, ymm9
+        vaesdec ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [r10+32]
+        vaesdec ymm0, ymm0, ymm9
+        vaesdec ymm1, ymm1, ymm9
+        vaesdec ymm2, ymm2, ymm9
+        vaesdec ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [r10+48]
+        vaesdec ymm0, ymm0, ymm9
+        vaesdec ymm1, ymm1, ymm9
+        vaesdec ymm2, ymm2, ymm9
+        vaesdec ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [r10+64]
+        vaesdec ymm0, ymm0, ymm9
+        vaesdec ymm1, ymm1, ymm9
+        vaesdec ymm2, ymm2, ymm9
+        vaesdec ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [r10+80]
+        vaesdec ymm0, ymm0, ymm9
+        vaesdec ymm1, ymm1, ymm9
+        vaesdec ymm2, ymm2, ymm9
+        vaesdec ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [r10+96]
+        vaesdec ymm0, ymm0, ymm9
+        vaesdec ymm1, ymm1, ymm9
+        vaesdec ymm2, ymm2, ymm9
+        vaesdec ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [r10+112]
+        vaesdec ymm0, ymm0, ymm9
+        vaesdec ymm1, ymm1, ymm9
+        vaesdec ymm2, ymm2, ymm9
+        vaesdec ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [r10+128]
+        vaesdec ymm0, ymm0, ymm9
+        vaesdec ymm1, ymm1, ymm9
+        vaesdec ymm2, ymm2, ymm9
+        vaesdec ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [r10+144]
+        vaesdec ymm0, ymm0, ymm9
+        vaesdec ymm1, ymm1, ymm9
+        vaesdec ymm2, ymm2, ymm9
+        vaesdec ymm3, ymm3, ymm9
+        ; Fewer than 11 rounds: the key at +160 is the last one.
+        cmp r11d, 11
+        vbroadcasti128 ymm9, [r10+160]
+        jl L_AES_CBC_decrypt_vaes_128_aes_dec_block_last
+        vaesdec ymm0, ymm0, ymm9
+        vaesdec ymm1, ymm1, ymm9
+        vaesdec ymm2, ymm2, ymm9
+        vaesdec ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [r10+176]
+        vaesdec ymm0, ymm0, ymm9
+        vaesdec ymm1, ymm1, ymm9
+        vaesdec ymm2, ymm2, ymm9
+        vaesdec ymm3, ymm3, ymm9
+        ; Fewer than 13 rounds: the key at +192 is the last one.
+        cmp r11d, 13
+        vbroadcasti128 ymm9, [r10+192]
+        jl L_AES_CBC_decrypt_vaes_128_aes_dec_block_last
+        vaesdec ymm0, ymm0, ymm9
+        vaesdec ymm1, ymm1, ymm9
+        vaesdec ymm2, ymm2, ymm9
+        vaesdec ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [r10+208]
+        vaesdec ymm0, ymm0, ymm9
+        vaesdec ymm1, ymm1, ymm9
+        vaesdec ymm2, ymm2, ymm9
+        vaesdec ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [r10+224]
+L_AES_CBC_decrypt_vaes_128_aes_dec_block_last:
+        vaesdeclast ymm0, ymm0, ymm9
+        vaesdeclast ymm1, ymm1, ymm9
+        vaesdeclast ymm2, ymm2, ymm9
+        vaesdeclast ymm3, ymm3, ymm9
+        ; Undo the chaining with the preceding ciphertext blocks.
+        vpxor ymm0, ymm0, ymm10
+        vpxor ymm1, ymm1, ymm11
+        vpxor ymm2, ymm2, ymm12
+        vpxor ymm3, ymm3, ymm13
+        vmovdqu YMMWORD PTR [rdx+rax], ymm0
+        vmovdqu YMMWORD PTR [rdx+rax+32], ymm1
+        vmovdqu YMMWORD PTR [rdx+rax+64], ymm2
+        vmovdqu YMMWORD PTR [rdx+rax+96], ymm3
+        add eax, 128
+        cmp eax, r12d
+        jl L_AES_CBC_decrypt_vaes_dec_128
+L_AES_CBC_decrypt_vaes_done_128:
+        ; Limit for the 32-byte loop: length rounded down to 32.
+        mov r12d, r9d
+        and r12d, 4294967264
+        cmp eax, r12d
+        je L_AES_CBC_decrypt_vaes_done_32
+L_AES_CBC_decrypt_vaes_dec_32:
+        ; 32 bytes of input
+        ; aes_cbc_dec_32
+        vmovdqu ymm0, YMMWORD PTR [rcx+rax]
+        vinserti128 ymm10, ymm8, xmm0, 1
+        vextracti128 xmm8, ymm0, 1
+        ; aes_dec_block
+        vbroadcasti128 ymm9, [r10]
+        vpxor ymm0, ymm0, ymm9
+        vbroadcasti128 ymm9, [r10+16]
+        vaesdec ymm0, ymm0, ymm9
+        vbroadcasti128 ymm9, [r10+32]
+        vaesdec ymm0, ymm0, ymm9
+        vbroadcasti128 ymm9, [r10+48]
+        vaesdec ymm0, ymm0, ymm9
+        vbroadcasti128 ymm9, [r10+64]
+        vaesdec ymm0, ymm0, ymm9
+        vbroadcasti128 ymm9, [r10+80]
+        vaesdec ymm0, ymm0, ymm9
+        vbroadcasti128 ymm9, [r10+96]
+        vaesdec ymm0, ymm0, ymm9
+        vbroadcasti128 ymm9, [r10+112]
+        vaesdec ymm0, ymm0, ymm9
+        vbroadcasti128 ymm9, [r10+128]
+        vaesdec ymm0, ymm0, ymm9
+        vbroadcasti128 ymm9, [r10+144]
+        vaesdec ymm0, ymm0, ymm9
+        cmp r11d, 11
+        vbroadcasti128 ymm9, [r10+160]
+        jl L_AES_CBC_decrypt_vaes_32_aes_dec_block_last
+        vaesdec ymm0, ymm0, ymm9
+        vbroadcasti128 ymm9, [r10+176]
+        vaesdec ymm0, ymm0, ymm9
+        cmp r11d, 13
+        vbroadcasti128 ymm9, [r10+192]
+        jl L_AES_CBC_decrypt_vaes_32_aes_dec_block_last
+        vaesdec ymm0, ymm0, ymm9
+        vbroadcasti128 ymm9, [r10+208]
+        vaesdec ymm0, ymm0, ymm9
+        vbroadcasti128 ymm9, [r10+224]
+L_AES_CBC_decrypt_vaes_32_aes_dec_block_last:
+        vaesdeclast ymm0, ymm0, ymm9
+        vpxor ymm0, ymm0, ymm10
+        vmovdqu YMMWORD PTR [rdx+rax], ymm0
+        add eax, 32
+        cmp eax, r12d
+        jl L_AES_CBC_decrypt_vaes_dec_32
+L_AES_CBC_decrypt_vaes_done_32:
+        cmp eax, r9d
+        mov r12d, r9d
+        je L_AES_CBC_decrypt_vaes_done_dec
+        ; Limit for the single-block loop: length rounded down to 16.
+        and r12d, 4294967280
+L_AES_CBC_decrypt_vaes_dec_16:
+        ; 16 bytes of input
+        vmovdqu xmm0, OWORD PTR [rcx+rax]
+        ; Keep the ciphertext block; it chains into the next block.
+        vmovdqa xmm7, xmm0
+        ; aes_dec_block
+        vpxor xmm0, xmm0, [r10]
+        vmovdqu xmm5, OWORD PTR [r10+16]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+32]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+48]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+64]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+80]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+96]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+112]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+128]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+144]
+        vaesdec xmm0, xmm0, xmm5
+        cmp r11d, 11
+        vmovdqu xmm5, OWORD PTR [r10+160]
+        jl L_AES_CBC_decrypt_vaes_16_aes_dec_block_last
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [r10+176]
+        vaesdec xmm0, xmm0, xmm6
+        cmp r11d, 13
+        vmovdqu xmm5, OWORD PTR [r10+192]
+        jl L_AES_CBC_decrypt_vaes_16_aes_dec_block_last
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [r10+208]
+        vaesdec xmm0, xmm0, xmm6
+        vmovdqu xmm5, OWORD PTR [r10+224]
+L_AES_CBC_decrypt_vaes_16_aes_dec_block_last:
+        vaesdeclast xmm0, xmm0, xmm5
+        vpxor xmm0, xmm0, xmm8
+        vmovdqa xmm8, xmm7
+        vmovdqu OWORD PTR [rdx+rax], xmm0
+        add eax, 16
+        cmp eax, r12d
+        jl L_AES_CBC_decrypt_vaes_dec_16
+L_AES_CBC_decrypt_vaes_done_dec:
+        ; Hand the last ciphertext block back through the IV buffer.
+        vmovdqu OWORD PTR [r8], xmm8
+        vmovdqu xmm6, OWORD PTR [rsp]
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        vmovdqu xmm8, OWORD PTR [rsp+32]
+        vmovdqu xmm9, OWORD PTR [rsp+48]
+        vmovdqu xmm10, OWORD PTR [rsp+64]
+        vmovdqu xmm11, OWORD PTR [rsp+80]
+        vmovdqu xmm12, OWORD PTR [rsp+96]
+        vmovdqu xmm13, OWORD PTR [rsp+112]
+        add rsp, 128
+        pop r12
+        ret
+AES_CBC_decrypt_vaes ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+L_aes_ctr_bswap_vaes QWORD \
+        08090a0b0c0d0e0fh, 0001020304050607h
+ptr_L_aes_ctr_bswap_vaes QWORD L_aes_ctr_bswap_vaes +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_ctr_inc_vaes QWORD \ + 0000000000000000h, 0000000000000000h, + 0000000000000001h, 0000000000000000h, + 0000000000000002h, 0000000000000000h, + 0000000000000003h, 0000000000000000h, + 0000000000000004h, 0000000000000000h, + 0000000000000005h, 0000000000000000h, + 0000000000000006h, 0000000000000000h, + 0000000000000007h, 0000000000000000h, + 0000000000000008h, 0000000000000000h, + 0000000000000009h, 0000000000000000h, + 000000000000000ah, 0000000000000000h, + 000000000000000bh, 0000000000000000h, + 000000000000000ch, 0000000000000000h, + 000000000000000dh, 0000000000000000h, + 000000000000000eh, 0000000000000000h, + 000000000000000fh, 0000000000000000h, + 0000000000000010h, 0000000000000000h +ptr_L_aes_ctr_inc_vaes QWORD L_aes_ctr_inc_vaes +_DATA ENDS +_TEXT SEGMENT READONLY PARA +AES_CTR_encrypt_vaes PROC + push rbx + mov eax, DWORD PTR [rsp+48] + mov r10, QWORD PTR [rsp+56] + sub rsp, 144 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu OWORD PTR [rsp+96], xmm12 + vmovdqu OWORD PTR [rsp+112], xmm13 + vmovdqu OWORD PTR [rsp+128], xmm14 + vbroadcasti128 ymm8, ptr_L_aes_ctr_bswap_vaes + vbroadcasti128 ymm7, [r10] + vpshufb ymm7, ymm7, ymm8 + vbroadcasti128 ymm10, [ptr_L_aes_ctr_inc_vaes+128] + vbroadcasti128 ymm11, [ptr_L_aes_ctr_inc_vaes+32] + vbroadcasti128 ymm12, [ptr_L_aes_ctr_inc_vaes+16] + xor eax, eax + cmp r8d, 128 + mov r10d, r8d + jl L_AES_CTR_encrypt_vaes_done_128 + and r10d, 4294967168 + vmovdqa ymm9, ymm7 + vpaddq ymm4, ymm7, [ptr_L_aes_ctr_inc_vaes] + vpand ymm14, ymm9, [ptr_L_aes_ctr_inc_vaes] + vpor ymm9, ymm9, [ptr_L_aes_ctr_inc_vaes] + vpandn ymm9, ymm4, ymm9 + vpor ymm9, ymm9, ymm14 + vpsrlq ymm9, ymm9, 63 + vpslldq ymm9, ymm9, 8 + vpaddq ymm4, ymm4, ymm9 + vmovdqa ymm9, ymm7 + vpaddq 
ymm5, ymm7, [ptr_L_aes_ctr_inc_vaes+32] + vpand ymm14, ymm9, [ptr_L_aes_ctr_inc_vaes+32] + vpor ymm9, ymm9, [ptr_L_aes_ctr_inc_vaes+32] + vpandn ymm9, ymm5, ymm9 + vpor ymm9, ymm9, ymm14 + vpsrlq ymm9, ymm9, 63 + vpslldq ymm9, ymm9, 8 + vpaddq ymm5, ymm5, ymm9 + vmovdqa ymm9, ymm7 + vpaddq ymm6, ymm7, [ptr_L_aes_ctr_inc_vaes+64] + vpand ymm14, ymm9, [ptr_L_aes_ctr_inc_vaes+64] + vpor ymm9, ymm9, [ptr_L_aes_ctr_inc_vaes+64] + vpandn ymm9, ymm6, ymm9 + vpor ymm9, ymm9, ymm14 + vpsrlq ymm9, ymm9, 63 + vpslldq ymm9, ymm9, 8 + vpaddq ymm6, ymm6, ymm9 + vmovdqa ymm9, ymm7 + vpaddq ymm7, ymm7, [ptr_L_aes_ctr_inc_vaes+96] + vpand ymm14, ymm9, [ptr_L_aes_ctr_inc_vaes+96] + vpor ymm9, ymm9, [ptr_L_aes_ctr_inc_vaes+96] + vpandn ymm9, ymm7, ymm9 + vpor ymm9, ymm9, ymm14 + vpsrlq ymm9, ymm9, 63 + vpslldq ymm9, ymm9, 8 + vpaddq ymm7, ymm7, ymm9 +L_AES_CTR_encrypt_vaes_enc_128: + ; 128 bytes of input + lea r11, QWORD PTR [rcx+rax] + lea rbx, QWORD PTR [rdx+rax] + vpshufb ymm0, ymm4, ymm8 + vpshufb ymm1, ymm5, ymm8 + vpshufb ymm2, ymm6, ymm8 + vpshufb ymm3, ymm7, ymm8 + vmovdqa ymm9, ymm4 + vpaddq ymm4, ymm4, ymm10 + vpand ymm14, ymm9, ymm10 + vpor ymm9, ymm9, ymm10 + vpandn ymm9, ymm4, ymm9 + vpor ymm9, ymm9, ymm14 + vpsrlq ymm9, ymm9, 63 + vpslldq ymm9, ymm9, 8 + vpaddq ymm4, ymm4, ymm9 + vmovdqa ymm9, ymm5 + vpaddq ymm5, ymm5, ymm10 + vpand ymm14, ymm9, ymm10 + vpor ymm9, ymm9, ymm10 + vpandn ymm9, ymm5, ymm9 + vpor ymm9, ymm9, ymm14 + vpsrlq ymm9, ymm9, 63 + vpslldq ymm9, ymm9, 8 + vpaddq ymm5, ymm5, ymm9 + vmovdqa ymm9, ymm6 + vpaddq ymm6, ymm6, ymm10 + vpand ymm14, ymm9, ymm10 + vpor ymm9, ymm9, ymm10 + vpandn ymm9, ymm6, ymm9 + vpor ymm9, ymm9, ymm14 + vpsrlq ymm9, ymm9, 63 + vpslldq ymm9, ymm9, 8 + vpaddq ymm6, ymm6, ymm9 + vmovdqa ymm9, ymm7 + vpaddq ymm7, ymm7, ymm10 + vpand ymm14, ymm9, ymm10 + vpor ymm9, ymm9, ymm10 + vpandn ymm9, ymm7, ymm9 + vpor ymm9, ymm9, ymm14 + vpsrlq ymm9, ymm9, 63 + vpslldq ymm9, ymm9, 8 + vpaddq ymm7, ymm7, ymm9 + ; aes_enc_block + 
vbroadcasti128 ymm13, [r9] + vpxor ymm0, ymm0, ymm13 + vpxor ymm1, ymm1, ymm13 + vpxor ymm2, ymm2, ymm13 + vpxor ymm3, ymm3, ymm13 + vbroadcasti128 ymm13, [r9+16] + vaesenc ymm0, ymm0, ymm13 + vaesenc ymm1, ymm1, ymm13 + vaesenc ymm2, ymm2, ymm13 + vaesenc ymm3, ymm3, ymm13 + vbroadcasti128 ymm13, [r9+32] + vaesenc ymm0, ymm0, ymm13 + vaesenc ymm1, ymm1, ymm13 + vaesenc ymm2, ymm2, ymm13 + vaesenc ymm3, ymm3, ymm13 + vbroadcasti128 ymm13, [r9+48] + vaesenc ymm0, ymm0, ymm13 + vaesenc ymm1, ymm1, ymm13 + vaesenc ymm2, ymm2, ymm13 + vaesenc ymm3, ymm3, ymm13 + vbroadcasti128 ymm13, [r9+64] + vaesenc ymm0, ymm0, ymm13 + vaesenc ymm1, ymm1, ymm13 + vaesenc ymm2, ymm2, ymm13 + vaesenc ymm3, ymm3, ymm13 + vbroadcasti128 ymm13, [r9+80] + vaesenc ymm0, ymm0, ymm13 + vaesenc ymm1, ymm1, ymm13 + vaesenc ymm2, ymm2, ymm13 + vaesenc ymm3, ymm3, ymm13 + vbroadcasti128 ymm13, [r9+96] + vaesenc ymm0, ymm0, ymm13 + vaesenc ymm1, ymm1, ymm13 + vaesenc ymm2, ymm2, ymm13 + vaesenc ymm3, ymm3, ymm13 + vbroadcasti128 ymm13, [r9+112] + vaesenc ymm0, ymm0, ymm13 + vaesenc ymm1, ymm1, ymm13 + vaesenc ymm2, ymm2, ymm13 + vaesenc ymm3, ymm3, ymm13 + vbroadcasti128 ymm13, [r9+128] + vaesenc ymm0, ymm0, ymm13 + vaesenc ymm1, ymm1, ymm13 + vaesenc ymm2, ymm2, ymm13 + vaesenc ymm3, ymm3, ymm13 + vbroadcasti128 ymm13, [r9+144] + vaesenc ymm0, ymm0, ymm13 + vaesenc ymm1, ymm1, ymm13 + vaesenc ymm2, ymm2, ymm13 + vaesenc ymm3, ymm3, ymm13 + cmp eax, 11 + vbroadcasti128 ymm13, [r9+160] + jl L_AES_CTR_encrypt_vaes_128_aes_enc_block_last + vaesenc ymm0, ymm0, ymm13 + vaesenc ymm1, ymm1, ymm13 + vaesenc ymm2, ymm2, ymm13 + vaesenc ymm3, ymm3, ymm13 + vbroadcasti128 ymm13, [r9+176] + vaesenc ymm0, ymm0, ymm13 + vaesenc ymm1, ymm1, ymm13 + vaesenc ymm2, ymm2, ymm13 + vaesenc ymm3, ymm3, ymm13 + cmp eax, 13 + vbroadcasti128 ymm13, [r9+192] + jl L_AES_CTR_encrypt_vaes_128_aes_enc_block_last + vaesenc ymm0, ymm0, ymm13 + vaesenc ymm1, ymm1, ymm13 + vaesenc ymm2, ymm2, ymm13 + vaesenc ymm3, ymm3, ymm13 + 
vbroadcasti128 ymm13, [r9+208] + vaesenc ymm0, ymm0, ymm13 + vaesenc ymm1, ymm1, ymm13 + vaesenc ymm2, ymm2, ymm13 + vaesenc ymm3, ymm3, ymm13 + vbroadcasti128 ymm13, [r9+224] +L_AES_CTR_encrypt_vaes_128_aes_enc_block_last: + vaesenclast ymm0, ymm0, ymm13 + vaesenclast ymm1, ymm1, ymm13 + vaesenclast ymm2, ymm2, ymm13 + vaesenclast ymm3, ymm3, ymm13 + vpxor ymm0, ymm0, [r11] + vpxor ymm1, ymm1, [r11+32] + vpxor ymm2, ymm2, [r11+64] + vpxor ymm3, ymm3, [r11+96] + vmovdqu YMMWORD PTR [rbx], ymm0 + vmovdqu YMMWORD PTR [rbx+32], ymm1 + vmovdqu YMMWORD PTR [rbx+64], ymm2 + vmovdqu YMMWORD PTR [rbx+96], ymm3 + add eax, 128 + cmp eax, r10d + jl L_AES_CTR_encrypt_vaes_enc_128 + vperm2i128 ymm7, ymm4, ymm4, 0 +L_AES_CTR_encrypt_vaes_done_128: + mov r10d, r8d + and r10d, 4294967264 + cmp eax, r10d + je L_AES_CTR_encrypt_vaes_done_32 +L_AES_CTR_encrypt_vaes_enc_32: + ; 32 bytes of input + ; aes_ctr_enc_32 + lea r11, QWORD PTR [rcx+rax] + lea rbx, QWORD PTR [rdx+rax] + vpaddq ymm0, ymm7, [ptr_L_aes_ctr_inc_vaes] + vmovdqa ymm9, ymm7 + vpand ymm14, ymm9, [ptr_L_aes_ctr_inc_vaes] + vpor ymm9, ymm9, [ptr_L_aes_ctr_inc_vaes] + vpandn ymm9, ymm0, ymm9 + vpor ymm9, ymm9, ymm14 + vpsrlq ymm9, ymm9, 63 + vpslldq ymm9, ymm9, 8 + vpaddq ymm0, ymm0, ymm9 + vpshufb ymm0, ymm0, ymm8 + vmovdqa ymm9, ymm7 + vpaddq ymm7, ymm7, ymm11 + vpand ymm14, ymm9, ymm11 + vpor ymm9, ymm9, ymm11 + vpandn ymm9, ymm7, ymm9 + vpor ymm9, ymm9, ymm14 + vpsrlq ymm9, ymm9, 63 + vpslldq ymm9, ymm9, 8 + vpaddq ymm7, ymm7, ymm9 + ; aes_enc_block + vbroadcasti128 ymm13, [r9] + vpxor ymm0, ymm0, ymm13 + vbroadcasti128 ymm13, [r9+16] + vaesenc ymm0, ymm0, ymm13 + vbroadcasti128 ymm13, [r9+32] + vaesenc ymm0, ymm0, ymm13 + vbroadcasti128 ymm13, [r9+48] + vaesenc ymm0, ymm0, ymm13 + vbroadcasti128 ymm13, [r9+64] + vaesenc ymm0, ymm0, ymm13 + vbroadcasti128 ymm13, [r9+80] + vaesenc ymm0, ymm0, ymm13 + vbroadcasti128 ymm13, [r9+96] + vaesenc ymm0, ymm0, ymm13 + vbroadcasti128 ymm13, [r9+112] + vaesenc ymm0, ymm0, ymm13 + 
vbroadcasti128 ymm13, [r9+128] + vaesenc ymm0, ymm0, ymm13 + vbroadcasti128 ymm13, [r9+144] + vaesenc ymm0, ymm0, ymm13 + cmp eax, 11 + vbroadcasti128 ymm13, [r9+160] + jl L_AES_CTR_encrypt_vaes_32_aes_enc_block_last + vaesenc ymm0, ymm0, ymm13 + vbroadcasti128 ymm13, [r9+176] + vaesenc ymm0, ymm0, ymm13 + cmp eax, 13 + vbroadcasti128 ymm13, [r9+192] + jl L_AES_CTR_encrypt_vaes_32_aes_enc_block_last + vaesenc ymm0, ymm0, ymm13 + vbroadcasti128 ymm13, [r9+208] + vaesenc ymm0, ymm0, ymm13 + vbroadcasti128 ymm13, [r9+224] +L_AES_CTR_encrypt_vaes_32_aes_enc_block_last: + vaesenclast ymm0, ymm0, ymm13 + vpxor ymm0, ymm0, [r11] + vmovdqu YMMWORD PTR [rbx], ymm0 + add eax, 32 + cmp eax, r10d + jl L_AES_CTR_encrypt_vaes_enc_32 +L_AES_CTR_encrypt_vaes_done_32: + cmp eax, r8d + mov r10d, r8d + je L_AES_CTR_encrypt_vaes_done_enc + and r10d, 4294967280 +L_AES_CTR_encrypt_vaes_enc_16: + ; 16 bytes of input + vpshufb xmm0, xmm7, xmm8 + vmovdqa ymm9, ymm7 + vpaddq ymm7, ymm7, ymm12 + vpand ymm14, ymm9, ymm12 + vpor ymm9, ymm9, ymm12 + vpandn ymm9, ymm7, ymm9 + vpor ymm9, ymm9, ymm14 + vpsrlq ymm9, ymm9, 63 + vpslldq ymm9, ymm9, 8 + vpaddq ymm7, ymm7, ymm9 + ; aes_enc_block + vpxor xmm0, xmm0, [r9] + vmovdqu xmm5, OWORD PTR [r9+16] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+32] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+48] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+64] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+80] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+96] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+112] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+128] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+144] + vaesenc xmm0, xmm0, xmm5 + cmp eax, 11 + vmovdqu xmm5, OWORD PTR [r9+160] + jl L_AES_CTR_encrypt_vaes_16_aes_enc_block_last + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r9+176] + vaesenc xmm0, xmm0, xmm6 + cmp eax, 13 + vmovdqu xmm5, OWORD PTR [r9+192] + jl 
L_AES_CTR_encrypt_vaes_16_aes_enc_block_last
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [r9+208]
+        vaesenc xmm0, xmm0, xmm6
+        vmovdqu xmm5, OWORD PTR [r9+224]
+L_AES_CTR_encrypt_vaes_16_aes_enc_block_last:
+        vaesenclast xmm0, xmm0, xmm5
+        lea r11, QWORD PTR [rcx+rax]
+        vpxor xmm0, xmm0, [r11]
+        lea r11, QWORD PTR [rdx+rax]
+        vmovdqu OWORD PTR [r11], xmm0
+        add eax, 16
+        cmp eax, r10d
+        jl L_AES_CTR_encrypt_vaes_enc_16
+L_AES_CTR_encrypt_vaes_done_enc:
+        vpshufb xmm0, xmm7, xmm8
+        ; NOTE(review): r10d is the loop limit in the compare above, yet r10 is
+        ; used as a store address here - confirm the register is not aliased.
+        vmovdqu OWORD PTR [r10], xmm0
+        vmovdqu xmm6, OWORD PTR [rsp]
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        vmovdqu xmm8, OWORD PTR [rsp+32]
+        vmovdqu xmm9, OWORD PTR [rsp+48]
+        vmovdqu xmm10, OWORD PTR [rsp+64]
+        vmovdqu xmm11, OWORD PTR [rsp+80]
+        vmovdqu xmm12, OWORD PTR [rsp+96]
+        vmovdqu xmm13, OWORD PTR [rsp+112]
+        vmovdqu xmm14, OWORD PTR [rsp+128]
+        add rsp, 144
+        pop rbx
+        ret
+AES_CTR_encrypt_vaes ENDP
+_TEXT ENDS
+ENDIF
+IFDEF HAVE_INTEL_AVX512
+_TEXT SEGMENT READONLY PARA
+AES_ECB_encrypt_avx512 PROC
+        ; AES-ECB encryption using 512-bit VAES: 256 bytes per pass, then
+        ; 64 bytes per pass, then single 16-byte blocks.
+        ; Win64 arguments (roles inferred from how they are used below):
+        ;   rcx = input, rdx = output, r8d = length in bytes,
+        ;   r9 = round-key table, [rsp+40] = round count.
+        ; Register plan: rax = byte offset, r10d = round count,
+        ; r11d = limit of the current loop. Each value has its own register;
+        ; previously the round count shared eax with the offset and the key
+        ; pointer shared r9 with the limit, so both were destroyed before use
+        ; (the 16-byte tail still reads the keys through r9).
+        mov r10d, DWORD PTR [rsp+40]
+        sub rsp, 160
+        ; xmm6-xmm15 are callee-saved on Win64.
+        vmovdqu OWORD PTR [rsp], xmm6
+        vmovdqu OWORD PTR [rsp+16], xmm7
+        vmovdqu OWORD PTR [rsp+32], xmm8
+        vmovdqu OWORD PTR [rsp+48], xmm9
+        vmovdqu OWORD PTR [rsp+64], xmm10
+        vmovdqu OWORD PTR [rsp+80], xmm11
+        vmovdqu OWORD PTR [rsp+96], xmm12
+        vmovdqu OWORD PTR [rsp+112], xmm13
+        vmovdqu OWORD PTR [rsp+128], xmm14
+        vmovdqu OWORD PTR [rsp+144], xmm15
+        xor eax, eax
+        ; Less than one 64-byte pass: skip caching the keys in zmm8-zmm22.
+        cmp r8d, 64
+        jl L_AES_ECB_encrypt_avx512_done_64
+        vbroadcasti32x4 zmm8, [r9]
+        vbroadcasti32x4 zmm9, [r9+16]
+        vbroadcasti32x4 zmm10, [r9+32]
+        vbroadcasti32x4 zmm11, [r9+48]
+        vbroadcasti32x4 zmm12, [r9+64]
+        vbroadcasti32x4 zmm13, [r9+80]
+        vbroadcasti32x4 zmm14, [r9+96]
+        vbroadcasti32x4 zmm15, [r9+112]
+        vbroadcasti32x4 zmm16, [r9+128]
+        vbroadcasti32x4 zmm17, [r9+144]
+        vbroadcasti32x4 zmm18, [r9+160]
+        cmp r10d, 11
+        jl L_AES_ECB_encrypt_avx512_key_cached
+        vbroadcasti32x4 zmm19, [r9+176]
+        vbroadcasti32x4 zmm20, [r9+192]
+        cmp r10d, 13
+        jl L_AES_ECB_encrypt_avx512_key_cached
+        vbroadcasti32x4 zmm21, [r9+208]
+        vbroadcasti32x4 zmm22, [r9+224]
+L_AES_ECB_encrypt_avx512_key_cached:
+        cmp r8d, 256
+        mov r11d, r8d
+        jl L_AES_ECB_encrypt_avx512_done_256
+        ; Round the limit down to a multiple of 256.
+        and r11d, 4294967040
+L_AES_ECB_encrypt_avx512_enc_256:
+        ; 256 bytes of input
+        ; aes_ecb_enc_256
+        vmovdqu64 zmm0, [rcx+rax]
+        vmovdqu64 zmm1, [rcx+rax+64]
+        vmovdqu64 zmm2, [rcx+rax+128]
+        vmovdqu64 zmm3, [rcx+rax+192]
+        ; aes_enc_block
+        vpxorq zmm0, zmm0, zmm8
+        vpxorq zmm1, zmm1, zmm8
+        vpxorq zmm2, zmm2, zmm8
+        vpxorq zmm3, zmm3, zmm8
+        vaesenc zmm0, zmm0, zmm9
+        vaesenc zmm1, zmm1, zmm9
+        vaesenc zmm2, zmm2, zmm9
+        vaesenc zmm3, zmm3, zmm9
+        vaesenc zmm0, zmm0, zmm10
+        vaesenc zmm1, zmm1, zmm10
+        vaesenc zmm2, zmm2, zmm10
+        vaesenc zmm3, zmm3, zmm10
+        vaesenc zmm0, zmm0, zmm11
+        vaesenc zmm1, zmm1, zmm11
+        vaesenc zmm2, zmm2, zmm11
+        vaesenc zmm3, zmm3, zmm11
+        vaesenc zmm0, zmm0, zmm12
+        vaesenc zmm1, zmm1, zmm12
+        vaesenc zmm2, zmm2, zmm12
+        vaesenc zmm3, zmm3, zmm12
+        vaesenc zmm0, zmm0, zmm13
+        vaesenc zmm1, zmm1, zmm13
+        vaesenc zmm2, zmm2, zmm13
+        vaesenc zmm3, zmm3, zmm13
+        vaesenc zmm0, zmm0, zmm14
+        vaesenc zmm1, zmm1, zmm14
+        vaesenc zmm2, zmm2, zmm14
+        vaesenc zmm3, zmm3, zmm14
+        vaesenc zmm0, zmm0, zmm15
+        vaesenc zmm1, zmm1, zmm15
+        vaesenc zmm2, zmm2, zmm15
+        vaesenc zmm3, zmm3, zmm15
+        vaesenc zmm0, zmm0, zmm16
+        vaesenc zmm1, zmm1, zmm16
+        vaesenc zmm2, zmm2, zmm16
+        vaesenc zmm3, zmm3, zmm16
+        vaesenc zmm0, zmm0, zmm17
+        vaesenc zmm1, zmm1, zmm17
+        vaesenc zmm2, zmm2, zmm17
+        vaesenc zmm3, zmm3, zmm17
+        ; zmm7 receives whichever cached key is the final one.
+        cmp r10d, 11
+        vmovdqa64 zmm7, zmm18
+        jl L_AES_ECB_encrypt_avx512_256_aes_enc_block_last
+        vaesenc zmm0, zmm0, zmm18
+        vaesenc zmm1, zmm1, zmm18
+        vaesenc zmm2, zmm2, zmm18
+        vaesenc zmm3, zmm3, zmm18
+        vaesenc zmm0, zmm0, zmm19
+        vaesenc zmm1, zmm1, zmm19
+        vaesenc zmm2, zmm2, zmm19
+        vaesenc zmm3, zmm3, zmm19
+        cmp r10d, 13
+        vmovdqa64 zmm7, zmm20
+        jl L_AES_ECB_encrypt_avx512_256_aes_enc_block_last
+        vaesenc zmm0, zmm0, zmm20
+        vaesenc zmm1, zmm1, zmm20
+        vaesenc zmm2, zmm2, zmm20
+        vaesenc zmm3, zmm3, zmm20
+        vaesenc zmm0, zmm0, zmm21
+        vaesenc zmm1, zmm1, zmm21
+        vaesenc zmm2, zmm2, zmm21
+        vaesenc zmm3, zmm3, zmm21
+        vmovdqa64 zmm7, zmm22
+L_AES_ECB_encrypt_avx512_256_aes_enc_block_last:
+        vaesenclast zmm0, zmm0, zmm7
+        vaesenclast zmm1, zmm1, zmm7
+        vaesenclast zmm2, zmm2, zmm7
+        vaesenclast zmm3, zmm3, zmm7
+        vmovdqu64 [rdx+rax], zmm0
+        vmovdqu64 [rdx+rax+64], zmm1
+        vmovdqu64 [rdx+rax+128], zmm2
+        vmovdqu64 [rdx+rax+192], zmm3
+        add eax, 256
+        cmp eax, r11d
+        jl L_AES_ECB_encrypt_avx512_enc_256
+L_AES_ECB_encrypt_avx512_done_256:
+        ; Limit for the 64-byte loop: length rounded down to 64.
+        mov r11d, r8d
+        and r11d, 4294967232
+        cmp eax, r11d
+        je L_AES_ECB_encrypt_avx512_done_64
+L_AES_ECB_encrypt_avx512_enc_64:
+        ; 64 bytes of input
+        ; aes_ecb_enc_64
+        vmovdqu64 zmm0, [rcx+rax]
+        ; aes_enc_block
+        vpxorq zmm0, zmm0, zmm8
+        vaesenc zmm0, zmm0, zmm9
+        vaesenc zmm0, zmm0, zmm10
+        vaesenc zmm0, zmm0, zmm11
+        vaesenc zmm0, zmm0, zmm12
+        vaesenc zmm0, zmm0, zmm13
+        vaesenc zmm0, zmm0, zmm14
+        vaesenc zmm0, zmm0, zmm15
+        vaesenc zmm0, zmm0, zmm16
+        vaesenc zmm0, zmm0, zmm17
+        cmp r10d, 11
+        vmovdqa64 zmm7, zmm18
+        jl L_AES_ECB_encrypt_avx512_64_aes_enc_block_last
+        vaesenc zmm0, zmm0, zmm18
+        vaesenc zmm0, zmm0, zmm19
+        cmp r10d, 13
+        vmovdqa64 zmm7, zmm20
+        jl L_AES_ECB_encrypt_avx512_64_aes_enc_block_last
+        vaesenc zmm0, zmm0, zmm20
+        vaesenc zmm0, zmm0, zmm21
+        vmovdqa64 zmm7, zmm22
+L_AES_ECB_encrypt_avx512_64_aes_enc_block_last:
+        vaesenclast zmm0, zmm0, zmm7
+        vmovdqu64 [rdx+rax], zmm0
+        add eax, 64
+        cmp eax, r11d
+        jl L_AES_ECB_encrypt_avx512_enc_64
+L_AES_ECB_encrypt_avx512_done_64:
+        cmp eax, r8d
+        mov r11d, r8d
+        je L_AES_ECB_encrypt_avx512_done_enc
+        ; Limit for the single-block loop: length rounded down to 16.
+        and r11d, 4294967280
+L_AES_ECB_encrypt_avx512_enc_16:
+        ; 16 bytes of input
+        vmovdqu xmm0, OWORD PTR [rcx+rax]
+        ; aes_enc_block
+        ; Keys are read from memory here because the cached copies are
+        ; not loaded when the length is below 64 bytes.
+        vpxor xmm0, xmm0, [r9]
+        vmovdqu xmm5, OWORD PTR [r9+16]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+32]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+48]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+64]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+80]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+96]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+112]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+128]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+144]
+        vaesenc xmm0, xmm0, xmm5
+        cmp r10d, 11
+        vmovdqu xmm5, OWORD PTR [r9+160]
+        jl L_AES_ECB_encrypt_avx512_16_aes_enc_block_last
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [r9+176]
+        vaesenc xmm0, xmm0, xmm6
+        cmp r10d, 13
+        vmovdqu xmm5, OWORD PTR [r9+192]
+        jl L_AES_ECB_encrypt_avx512_16_aes_enc_block_last
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [r9+208]
+        vaesenc xmm0, xmm0, xmm6
+        vmovdqu xmm5, OWORD PTR [r9+224]
+L_AES_ECB_encrypt_avx512_16_aes_enc_block_last:
+        vaesenclast xmm0, xmm0, xmm5
+        vmovdqu OWORD PTR [rdx+rax], xmm0
+        add eax, 16
+        cmp eax, r11d
+        jl L_AES_ECB_encrypt_avx512_enc_16
+L_AES_ECB_encrypt_avx512_done_enc:
+        vmovdqu xmm6, OWORD PTR [rsp]
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        vmovdqu xmm8, OWORD PTR [rsp+32]
+        vmovdqu xmm9, OWORD PTR [rsp+48]
+        vmovdqu xmm10, OWORD PTR [rsp+64]
+        vmovdqu xmm11, OWORD PTR [rsp+80]
+        vmovdqu xmm12, OWORD PTR [rsp+96]
+        vmovdqu xmm13, OWORD PTR [rsp+112]
+        vmovdqu xmm14, OWORD PTR [rsp+128]
+        vmovdqu xmm15, OWORD PTR [rsp+144]
+        add rsp, 160
+        ret
+AES_ECB_encrypt_avx512 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_ECB_decrypt_avx512 PROC
+        mov eax, DWORD PTR [rsp+40]
+        sub rsp, 160
+        vmovdqu OWORD PTR [rsp], xmm6
+        vmovdqu OWORD PTR [rsp+16], xmm7
+        vmovdqu OWORD PTR [rsp+32], xmm8
+        vmovdqu OWORD PTR [rsp+48], xmm9
+        vmovdqu OWORD PTR [rsp+64], xmm10
+        vmovdqu OWORD PTR [rsp+80], xmm11
+        vmovdqu OWORD PTR [rsp+96], xmm12
+        vmovdqu OWORD
PTR [rsp+112], xmm13 + vmovdqu OWORD PTR [rsp+128], xmm14 + vmovdqu OWORD PTR [rsp+144], xmm15 + xor eax, eax + cmp r8d, 64 + jl L_AES_ECB_decrypt_avx512_done_64 + vbroadcasti32x4 zmm8, [r9] + vbroadcasti32x4 zmm9, [r9+16] + vbroadcasti32x4 zmm10, [r9+32] + vbroadcasti32x4 zmm11, [r9+48] + vbroadcasti32x4 zmm12, [r9+64] + vbroadcasti32x4 zmm13, [r9+80] + vbroadcasti32x4 zmm14, [r9+96] + vbroadcasti32x4 zmm15, [r9+112] + vbroadcasti32x4 zmm16, [r9+128] + vbroadcasti32x4 zmm17, [r9+144] + vbroadcasti32x4 zmm18, [r9+160] + cmp eax, 11 + jl L_AES_ECB_decrypt_avx512_key_cached + vbroadcasti32x4 zmm19, [r9+176] + vbroadcasti32x4 zmm20, [r9+192] + cmp eax, 13 + jl L_AES_ECB_decrypt_avx512_key_cached + vbroadcasti32x4 zmm21, [r9+208] + vbroadcasti32x4 zmm22, [r9+224] +L_AES_ECB_decrypt_avx512_key_cached: + cmp r8d, 256 + mov r9d, r8d + jl L_AES_ECB_decrypt_avx512_done_256 + and r9d, 4294967040 +L_AES_ECB_decrypt_avx512_dec_256: + ; 256 bytes of input + ; aes_ecb_dec_256 + lea r10, QWORD PTR [rcx+rax] + lea r11, QWORD PTR [rdx+rax] + vmovdqu64 zmm0, [r10] + vmovdqu64 zmm1, [r10+64] + vmovdqu64 zmm2, [r10+128] + vmovdqu64 zmm3, [r10+192] + ; aes_dec_block + vpxorq zmm0, zmm0, zmm8 + vpxorq zmm1, zmm1, zmm8 + vpxorq zmm2, zmm2, zmm8 + vpxorq zmm3, zmm3, zmm8 + vaesdec zmm0, zmm0, zmm9 + vaesdec zmm1, zmm1, zmm9 + vaesdec zmm2, zmm2, zmm9 + vaesdec zmm3, zmm3, zmm9 + vaesdec zmm0, zmm0, zmm10 + vaesdec zmm1, zmm1, zmm10 + vaesdec zmm2, zmm2, zmm10 + vaesdec zmm3, zmm3, zmm10 + vaesdec zmm0, zmm0, zmm11 + vaesdec zmm1, zmm1, zmm11 + vaesdec zmm2, zmm2, zmm11 + vaesdec zmm3, zmm3, zmm11 + vaesdec zmm0, zmm0, zmm12 + vaesdec zmm1, zmm1, zmm12 + vaesdec zmm2, zmm2, zmm12 + vaesdec zmm3, zmm3, zmm12 + vaesdec zmm0, zmm0, zmm13 + vaesdec zmm1, zmm1, zmm13 + vaesdec zmm2, zmm2, zmm13 + vaesdec zmm3, zmm3, zmm13 + vaesdec zmm0, zmm0, zmm14 + vaesdec zmm1, zmm1, zmm14 + vaesdec zmm2, zmm2, zmm14 + vaesdec zmm3, zmm3, zmm14 + vaesdec zmm0, zmm0, zmm15 + vaesdec zmm1, zmm1, zmm15 + 
vaesdec zmm2, zmm2, zmm15 + vaesdec zmm3, zmm3, zmm15 + vaesdec zmm0, zmm0, zmm16 + vaesdec zmm1, zmm1, zmm16 + vaesdec zmm2, zmm2, zmm16 + vaesdec zmm3, zmm3, zmm16 + vaesdec zmm0, zmm0, zmm17 + vaesdec zmm1, zmm1, zmm17 + vaesdec zmm2, zmm2, zmm17 + vaesdec zmm3, zmm3, zmm17 + cmp eax, 11 + vmovdqa64 zmm7, zmm18 + jl L_AES_ECB_decrypt_avx512_256_aes_dec_block_last + vaesdec zmm0, zmm0, zmm18 + vaesdec zmm1, zmm1, zmm18 + vaesdec zmm2, zmm2, zmm18 + vaesdec zmm3, zmm3, zmm18 + vaesdec zmm0, zmm0, zmm19 + vaesdec zmm1, zmm1, zmm19 + vaesdec zmm2, zmm2, zmm19 + vaesdec zmm3, zmm3, zmm19 + cmp eax, 13 + vmovdqa64 zmm7, zmm20 + jl L_AES_ECB_decrypt_avx512_256_aes_dec_block_last + vaesdec zmm0, zmm0, zmm20 + vaesdec zmm1, zmm1, zmm20 + vaesdec zmm2, zmm2, zmm20 + vaesdec zmm3, zmm3, zmm20 + vaesdec zmm0, zmm0, zmm21 + vaesdec zmm1, zmm1, zmm21 + vaesdec zmm2, zmm2, zmm21 + vaesdec zmm3, zmm3, zmm21 + vmovdqa64 zmm7, zmm22 +L_AES_ECB_decrypt_avx512_256_aes_dec_block_last: + vaesdeclast zmm0, zmm0, zmm7 + vaesdeclast zmm1, zmm1, zmm7 + vaesdeclast zmm2, zmm2, zmm7 + vaesdeclast zmm3, zmm3, zmm7 + vmovdqu64 [r11], zmm0 + vmovdqu64 [r11+64], zmm1 + vmovdqu64 [r11+128], zmm2 + vmovdqu64 [r11+192], zmm3 + add eax, 256 + cmp eax, r9d + jl L_AES_ECB_decrypt_avx512_dec_256 +L_AES_ECB_decrypt_avx512_done_256: + mov r9d, r8d + and r9d, 4294967232 + cmp eax, r9d + je L_AES_ECB_decrypt_avx512_done_64 +L_AES_ECB_decrypt_avx512_dec_64: + ; 64 bytes of input + ; aes_ecb_dec_64 + lea r10, QWORD PTR [rcx+rax] + lea r11, QWORD PTR [rdx+rax] + vmovdqu64 zmm0, [r10] + ; aes_dec_block + vpxorq zmm0, zmm0, zmm8 + vaesdec zmm0, zmm0, zmm9 + vaesdec zmm0, zmm0, zmm10 + vaesdec zmm0, zmm0, zmm11 + vaesdec zmm0, zmm0, zmm12 + vaesdec zmm0, zmm0, zmm13 + vaesdec zmm0, zmm0, zmm14 + vaesdec zmm0, zmm0, zmm15 + vaesdec zmm0, zmm0, zmm16 + vaesdec zmm0, zmm0, zmm17 + cmp eax, 11 + vmovdqa64 zmm7, zmm18 + jl L_AES_ECB_decrypt_avx512_64_aes_dec_block_last + vaesdec zmm0, zmm0, zmm18 + vaesdec zmm0, 
zmm0, zmm19 + cmp eax, 13 + vmovdqa64 zmm7, zmm20 + jl L_AES_ECB_decrypt_avx512_64_aes_dec_block_last + vaesdec zmm0, zmm0, zmm20 + vaesdec zmm0, zmm0, zmm21 + vmovdqa64 zmm7, zmm22 +L_AES_ECB_decrypt_avx512_64_aes_dec_block_last: + vaesdeclast zmm0, zmm0, zmm7 + vmovdqu64 [r11], zmm0 + add eax, 64 + cmp eax, r9d + jl L_AES_ECB_decrypt_avx512_dec_64 +L_AES_ECB_decrypt_avx512_done_64: + cmp eax, r8d + mov r9d, r8d + je L_AES_ECB_decrypt_avx512_done_dec + and r9d, 4294967280 +L_AES_ECB_decrypt_avx512_dec_16: + ; 16 bytes of input + lea r10, QWORD PTR [rcx+rax] + vmovdqu xmm0, OWORD PTR [r10] + ; aes_dec_block + vpxor xmm0, xmm0, [r9] + vmovdqu xmm5, OWORD PTR [r9+16] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+32] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+48] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+64] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+80] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+96] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+112] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+128] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+144] + vaesdec xmm0, xmm0, xmm5 + cmp eax, 11 + vmovdqu xmm5, OWORD PTR [r9+160] + jl L_AES_ECB_decrypt_avx512_16_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r9+176] + vaesdec xmm0, xmm0, xmm6 + cmp eax, 13 + vmovdqu xmm5, OWORD PTR [r9+192] + jl L_AES_ECB_decrypt_avx512_16_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r9+208] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r9+224] +L_AES_ECB_decrypt_avx512_16_aes_dec_block_last: + vaesdeclast xmm0, xmm0, xmm5 + lea r10, QWORD PTR [rdx+rax] + vmovdqu OWORD PTR [r10], xmm0 + add eax, 16 + cmp eax, r9d + jl L_AES_ECB_decrypt_avx512_dec_16 +L_AES_ECB_decrypt_avx512_done_dec: + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu 
xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + vmovdqu xmm12, OWORD PTR [rsp+96] + vmovdqu xmm13, OWORD PTR [rsp+112] + vmovdqu xmm14, OWORD PTR [rsp+128] + vmovdqu xmm15, OWORD PTR [rsp+144] + add rsp, 160 + ret +AES_ECB_decrypt_avx512 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +; AES-CBC encrypt, one 16-byte block per iteration. Arguments as this code reads them: rcx = input, rdx = output, r8 = 16-byte chaining value (loaded on entry, written back on exit), r9d = byte count, [rsp+40] = round-key pointer, [rsp+48] = round count. +; Register plan: r10 = round keys, r11d = round count, rax = byte offset. rax is zeroed to serve as the offset, so the key pointer must not live in it, and r10/r11 are never reused as scratch pointers. +AES_CBC_encrypt_avx512 PROC + mov r10, QWORD PTR [rsp+40] + mov r11d, DWORD PTR [rsp+48] + vmovdqu xmm0, OWORD PTR [r8] + xor eax, eax + cmp eax, r9d ; zero byte count: skip the loop + je L_AES_CBC_encrypt_avx512_done +L_AES_CBC_encrypt_avx512_loop: + ; 16 bytes of input + vmovdqu xmm1, OWORD PTR [rcx+rax] + vpternlogq xmm1, xmm0, [r10], 150 ; imm 150 (96h) is a three-way XOR: input block, chaining value, round key 0 + ; aes_enc_block + vmovdqu xmm3, OWORD PTR [r10+16] + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm3, OWORD PTR [r10+32] + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm3, OWORD PTR [r10+48] + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm3, OWORD PTR [r10+64] + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm3, OWORD PTR [r10+80] + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm3, OWORD PTR [r10+96] + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm3, OWORD PTR [r10+112] + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm3, OWORD PTR [r10+128] + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm3, OWORD PTR [r10+144] + vaesenc xmm1, xmm1, xmm3 + cmp r11d, 11 ; fewer than 11 rounds: the key at offset 160 is the last one + vmovdqu xmm3, OWORD PTR [r10+160] + jl L_AES_CBC_encrypt_avx512_aes_enc_block_last + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm4, OWORD PTR [r10+176] + vaesenc xmm1, xmm1, xmm4 + cmp r11d, 13 ; fewer than 13 rounds: the key at offset 192 is the last one + vmovdqu xmm3, OWORD PTR [r10+192] + jl L_AES_CBC_encrypt_avx512_aes_enc_block_last + vaesenc xmm1, xmm1, xmm3 + vmovdqu xmm4, OWORD PTR [r10+208] + vaesenc xmm1, xmm1, xmm4 + vmovdqu xmm3, OWORD PTR [r10+224] +L_AES_CBC_encrypt_avx512_aes_enc_block_last: + vaesenclast xmm1, xmm1, xmm3 + vmovdqu OWORD PTR [rdx+rax], xmm1 + vmovdqa xmm0, xmm1 ; this output block is the chaining value for the next one + add eax, 16 + cmp eax, r9d + jl L_AES_CBC_encrypt_avx512_loop +L_AES_CBC_encrypt_avx512_done: + vmovdqu OWORD PTR [r8], xmm0 + ret +AES_CBC_encrypt_avx512 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA 
+AES_CBC_decrypt_avx512 PROC + push r12 + mov rax, QWORD PTR [rsp+48] + mov r10d, DWORD PTR [rsp+56] + sub rsp, 160 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu OWORD PTR [rsp+96], xmm12 + vmovdqu OWORD PTR [rsp+112], xmm13 + vmovdqu OWORD PTR [rsp+128], xmm14 + vmovdqu OWORD PTR [rsp+144], xmm15 + vmovdqu xmm8, OWORD PTR [r8] + xor eax, eax + cmp r9d, 64 + jl L_AES_CBC_decrypt_avx512_done_64 + vbroadcasti32x4 zmm14, [rax] + vbroadcasti32x4 zmm15, [rax+16] + vbroadcasti32x4 zmm16, [rax+32] + vbroadcasti32x4 zmm17, [rax+48] + vbroadcasti32x4 zmm18, [rax+64] + vbroadcasti32x4 zmm19, [rax+80] + vbroadcasti32x4 zmm20, [rax+96] + vbroadcasti32x4 zmm21, [rax+112] + vbroadcasti32x4 zmm22, [rax+128] + vbroadcasti32x4 zmm23, [rax+144] + vbroadcasti32x4 zmm24, [rax+160] + cmp r10d, 11 + jl L_AES_CBC_decrypt_avx512_key_cached + vbroadcasti32x4 zmm25, [rax+176] + vbroadcasti32x4 zmm26, [rax+192] + cmp r10d, 13 + jl L_AES_CBC_decrypt_avx512_key_cached + vbroadcasti32x4 zmm27, [rax+208] + vbroadcasti32x4 zmm28, [rax+224] +L_AES_CBC_decrypt_avx512_key_cached: + cmp r9d, 256 + mov r10d, r9d + jl L_AES_CBC_decrypt_avx512_done_256 + and r10d, 4294967040 +L_AES_CBC_decrypt_avx512_dec_256: + ; 256 bytes of input + ; aes_cbc_dec_256 + lea r11, QWORD PTR [rcx+rax] + lea r12, QWORD PTR [rdx+rax] + vmovdqu64 zmm0, [r11] + vmovdqu64 zmm1, [r11+64] + vmovdqu64 zmm2, [r11+128] + vmovdqu64 zmm3, [r11+192] + vshufi64x2 zmm10, zmm0, zmm0, 144 + vinserti32x4 zmm10, zmm10, xmm8, 0 + vmovdqu64 zmm11, [r11+48] + vmovdqu64 zmm12, [r11+112] + vmovdqu64 zmm13, [r11+176] + vextracti32x4 xmm8, zmm3, 3 + ; aes_dec_block + vpxorq zmm0, zmm0, zmm14 + vpxorq zmm1, zmm1, zmm14 + vpxorq zmm2, zmm2, zmm14 + vpxorq zmm3, zmm3, zmm14 + vaesdec zmm0, zmm0, zmm15 + vaesdec zmm1, zmm1, zmm15 + vaesdec zmm2, zmm2, zmm15 + vaesdec zmm3, zmm3, 
zmm15 + vaesdec zmm0, zmm0, zmm16 + vaesdec zmm1, zmm1, zmm16 + vaesdec zmm2, zmm2, zmm16 + vaesdec zmm3, zmm3, zmm16 + vaesdec zmm0, zmm0, zmm17 + vaesdec zmm1, zmm1, zmm17 + vaesdec zmm2, zmm2, zmm17 + vaesdec zmm3, zmm3, zmm17 + vaesdec zmm0, zmm0, zmm18 + vaesdec zmm1, zmm1, zmm18 + vaesdec zmm2, zmm2, zmm18 + vaesdec zmm3, zmm3, zmm18 + vaesdec zmm0, zmm0, zmm19 + vaesdec zmm1, zmm1, zmm19 + vaesdec zmm2, zmm2, zmm19 + vaesdec zmm3, zmm3, zmm19 + vaesdec zmm0, zmm0, zmm20 + vaesdec zmm1, zmm1, zmm20 + vaesdec zmm2, zmm2, zmm20 + vaesdec zmm3, zmm3, zmm20 + vaesdec zmm0, zmm0, zmm21 + vaesdec zmm1, zmm1, zmm21 + vaesdec zmm2, zmm2, zmm21 + vaesdec zmm3, zmm3, zmm21 + vaesdec zmm0, zmm0, zmm22 + vaesdec zmm1, zmm1, zmm22 + vaesdec zmm2, zmm2, zmm22 + vaesdec zmm3, zmm3, zmm22 + vaesdec zmm0, zmm0, zmm23 + vaesdec zmm1, zmm1, zmm23 + vaesdec zmm2, zmm2, zmm23 + vaesdec zmm3, zmm3, zmm23 + cmp r10d, 11 + vmovdqa64 zmm9, zmm24 + jl L_AES_CBC_decrypt_avx512_256_aes_dec_block_last + vaesdec zmm0, zmm0, zmm24 + vaesdec zmm1, zmm1, zmm24 + vaesdec zmm2, zmm2, zmm24 + vaesdec zmm3, zmm3, zmm24 + vaesdec zmm0, zmm0, zmm25 + vaesdec zmm1, zmm1, zmm25 + vaesdec zmm2, zmm2, zmm25 + vaesdec zmm3, zmm3, zmm25 + cmp r10d, 13 + vmovdqa64 zmm9, zmm26 + jl L_AES_CBC_decrypt_avx512_256_aes_dec_block_last + vaesdec zmm0, zmm0, zmm26 + vaesdec zmm1, zmm1, zmm26 + vaesdec zmm2, zmm2, zmm26 + vaesdec zmm3, zmm3, zmm26 + vaesdec zmm0, zmm0, zmm27 + vaesdec zmm1, zmm1, zmm27 + vaesdec zmm2, zmm2, zmm27 + vaesdec zmm3, zmm3, zmm27 + vmovdqa64 zmm9, zmm28 +L_AES_CBC_decrypt_avx512_256_aes_dec_block_last: + vaesdeclast zmm0, zmm0, zmm9 + vaesdeclast zmm1, zmm1, zmm9 + vaesdeclast zmm2, zmm2, zmm9 + vaesdeclast zmm3, zmm3, zmm9 + vpxorq zmm0, zmm0, zmm10 + vpxorq zmm1, zmm1, zmm11 + vpxorq zmm2, zmm2, zmm12 + vpxorq zmm3, zmm3, zmm13 + vmovdqu64 [r12], zmm0 + vmovdqu64 [r12+64], zmm1 + vmovdqu64 [r12+128], zmm2 + vmovdqu64 [r12+192], zmm3 + add eax, 256 + cmp eax, r10d + jl 
L_AES_CBC_decrypt_avx512_dec_256 +L_AES_CBC_decrypt_avx512_done_256: + mov r10d, r9d + and r10d, 4294967232 + cmp eax, r10d + je L_AES_CBC_decrypt_avx512_done_64 +L_AES_CBC_decrypt_avx512_dec_64: + ; 64 bytes of input + ; aes_cbc_dec_64 + lea r11, QWORD PTR [rcx+rax] + lea r12, QWORD PTR [rdx+rax] + vmovdqu64 zmm0, [r11] + vshufi64x2 zmm10, zmm0, zmm0, 144 + vinserti32x4 zmm10, zmm10, xmm8, 0 + vextracti32x4 xmm8, zmm0, 3 + ; aes_dec_block + vpxorq zmm0, zmm0, zmm14 + vaesdec zmm0, zmm0, zmm15 + vaesdec zmm0, zmm0, zmm16 + vaesdec zmm0, zmm0, zmm17 + vaesdec zmm0, zmm0, zmm18 + vaesdec zmm0, zmm0, zmm19 + vaesdec zmm0, zmm0, zmm20 + vaesdec zmm0, zmm0, zmm21 + vaesdec zmm0, zmm0, zmm22 + vaesdec zmm0, zmm0, zmm23 + cmp r10d, 11 + vmovdqa64 zmm9, zmm24 + jl L_AES_CBC_decrypt_avx512_64_aes_dec_block_last + vaesdec zmm0, zmm0, zmm24 + vaesdec zmm0, zmm0, zmm25 + cmp r10d, 13 + vmovdqa64 zmm9, zmm26 + jl L_AES_CBC_decrypt_avx512_64_aes_dec_block_last + vaesdec zmm0, zmm0, zmm26 + vaesdec zmm0, zmm0, zmm27 + vmovdqa64 zmm9, zmm28 +L_AES_CBC_decrypt_avx512_64_aes_dec_block_last: + vaesdeclast zmm0, zmm0, zmm9 + vpxorq zmm0, zmm0, zmm10 + vmovdqu64 [r12], zmm0 + add eax, 64 + cmp eax, r10d + jl L_AES_CBC_decrypt_avx512_dec_64 +L_AES_CBC_decrypt_avx512_done_64: + cmp eax, r9d + mov r10d, r9d + je L_AES_CBC_decrypt_avx512_done_dec + and r10d, 4294967280 +L_AES_CBC_decrypt_avx512_dec_16: + ; 16 bytes of input + lea r11, QWORD PTR [rcx+rax] + vmovdqu xmm0, OWORD PTR [r11] + vmovdqa xmm7, xmm0 + ; aes_dec_block + vpxor xmm0, xmm0, [rax] + vmovdqu xmm5, OWORD PTR [rax+16] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [rax+32] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [rax+48] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [rax+64] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [rax+80] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [rax+96] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [rax+112] + vaesdec xmm0, xmm0, xmm5 + vmovdqu 
xmm5, OWORD PTR [rax+128] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [rax+144] + vaesdec xmm0, xmm0, xmm5 + cmp r10d, 11 + vmovdqu xmm5, OWORD PTR [rax+160] + jl L_AES_CBC_decrypt_avx512_16_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [rax+176] + vaesdec xmm0, xmm0, xmm6 + cmp r10d, 13 + vmovdqu xmm5, OWORD PTR [rax+192] + jl L_AES_CBC_decrypt_avx512_16_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [rax+208] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [rax+224] +L_AES_CBC_decrypt_avx512_16_aes_dec_block_last: + vaesdeclast xmm0, xmm0, xmm5 + vpxor xmm0, xmm0, xmm8 + vmovdqa xmm8, xmm7 + lea r11, QWORD PTR [rdx+rax] + vmovdqu OWORD PTR [r11], xmm0 + add eax, 16 + cmp eax, r10d + jl L_AES_CBC_decrypt_avx512_dec_16 +L_AES_CBC_decrypt_avx512_done_dec: + vmovdqu OWORD PTR [r8], xmm8 + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + vmovdqu xmm12, OWORD PTR [rsp+96] + vmovdqu xmm13, OWORD PTR [rsp+112] + vmovdqu xmm14, OWORD PTR [rsp+128] + vmovdqu xmm15, OWORD PTR [rsp+144] + add rsp, 160 + pop r12 + ret +AES_CBC_decrypt_avx512 ENDP +_TEXT ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_ctr_bswap_avx512 QWORD \ + 08090a0b0c0d0e0fh, 0001020304050607h +ptr_L_aes_ctr_bswap_avx512 QWORD L_aes_ctr_bswap_avx512 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_ctr_inc_avx512 QWORD \ + 0000000000000000h, 0000000000000000h, + 0000000000000001h, 0000000000000000h, + 0000000000000002h, 0000000000000000h, + 0000000000000003h, 0000000000000000h, + 0000000000000004h, 0000000000000000h, + 0000000000000005h, 0000000000000000h, + 0000000000000006h, 0000000000000000h, + 0000000000000007h, 0000000000000000h, + 0000000000000008h, 0000000000000000h, + 0000000000000009h, 0000000000000000h, + 000000000000000ah, 0000000000000000h, + 000000000000000bh, 0000000000000000h, + 
000000000000000ch, 0000000000000000h, + 000000000000000dh, 0000000000000000h, + 000000000000000eh, 0000000000000000h, + 000000000000000fh, 0000000000000000h, + 0000000000000010h, 0000000000000000h +ptr_L_aes_ctr_inc_avx512 QWORD L_aes_ctr_inc_avx512 +_DATA ENDS +_TEXT SEGMENT READONLY PARA +AES_CTR_encrypt_avx512 PROC + push rbx + mov eax, DWORD PTR [rsp+48] + mov r10, QWORD PTR [rsp+56] + sub rsp, 160 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu OWORD PTR [rsp+96], xmm12 + vmovdqu OWORD PTR [rsp+112], xmm13 + vmovdqu OWORD PTR [rsp+128], xmm14 + vmovdqu OWORD PTR [rsp+144], xmm15 + vbroadcasti32x4 zmm8, ptr_L_aes_ctr_bswap_avx512 + vbroadcasti32x4 zmm7, [r10] + vpshufb zmm7, zmm7, zmm8 + vbroadcasti32x4 zmm10, [ptr_L_aes_ctr_inc_avx512+256] + vbroadcasti32x4 zmm11, [ptr_L_aes_ctr_inc_avx512+64] + vbroadcasti32x4 zmm12, [ptr_L_aes_ctr_inc_avx512+16] + xor eax, eax + cmp r8d, 64 + jl L_AES_CTR_encrypt_avx512_done_64 + vbroadcasti32x4 zmm14, [r9] + vbroadcasti32x4 zmm15, [r9+16] + vbroadcasti32x4 zmm16, [r9+32] + vbroadcasti32x4 zmm17, [r9+48] + vbroadcasti32x4 zmm18, [r9+64] + vbroadcasti32x4 zmm19, [r9+80] + vbroadcasti32x4 zmm20, [r9+96] + vbroadcasti32x4 zmm21, [r9+112] + vbroadcasti32x4 zmm22, [r9+128] + vbroadcasti32x4 zmm23, [r9+144] + vbroadcasti32x4 zmm24, [r9+160] + cmp eax, 11 + jl L_AES_CTR_encrypt_avx512_key_cached + vbroadcasti32x4 zmm25, [r9+176] + vbroadcasti32x4 zmm26, [r9+192] + cmp eax, 13 + jl L_AES_CTR_encrypt_avx512_key_cached + vbroadcasti32x4 zmm27, [r9+208] + vbroadcasti32x4 zmm28, [r9+224] +L_AES_CTR_encrypt_avx512_key_cached: + cmp r8d, 256 + mov r10d, r8d + jl L_AES_CTR_encrypt_avx512_done_256 + and r10d, 4294967040 + vmovdqa64 zmm9, zmm7 + vpaddq zmm4, zmm7, [ptr_L_aes_ctr_inc_avx512] + vpternlogq zmm9, zmm4, [ptr_L_aes_ctr_inc_avx512], 178 + vpsrlq zmm9, zmm9, 63 + 
vpslldq zmm9, zmm9, 8 + vpaddq zmm4, zmm4, zmm9 + vmovdqa64 zmm9, zmm7 + vpaddq zmm5, zmm7, [ptr_L_aes_ctr_inc_avx512+64] + vpternlogq zmm9, zmm5, [ptr_L_aes_ctr_inc_avx512+64], 178 + vpsrlq zmm9, zmm9, 63 + vpslldq zmm9, zmm9, 8 + vpaddq zmm5, zmm5, zmm9 + vmovdqa64 zmm9, zmm7 + vpaddq zmm6, zmm7, [ptr_L_aes_ctr_inc_avx512+128] + vpternlogq zmm9, zmm6, [ptr_L_aes_ctr_inc_avx512+128], 178 + vpsrlq zmm9, zmm9, 63 + vpslldq zmm9, zmm9, 8 + vpaddq zmm6, zmm6, zmm9 + vmovdqa64 zmm9, zmm7 + vpaddq zmm7, zmm7, [ptr_L_aes_ctr_inc_avx512+192] + vpternlogq zmm9, zmm7, [ptr_L_aes_ctr_inc_avx512+192], 178 + vpsrlq zmm9, zmm9, 63 + vpslldq zmm9, zmm9, 8 + vpaddq zmm7, zmm7, zmm9 +L_AES_CTR_encrypt_avx512_enc_256: + ; 256 bytes of input + lea r11, QWORD PTR [rcx+rax] + lea rbx, QWORD PTR [rdx+rax] + vpshufb zmm0, zmm4, zmm8 + vpshufb zmm1, zmm5, zmm8 + vpshufb zmm2, zmm6, zmm8 + vpshufb zmm3, zmm7, zmm8 + vmovdqa64 zmm9, zmm4 + vpaddq zmm4, zmm4, zmm10 + vpternlogq zmm9, zmm4, zmm10, 178 + vpsrlq zmm9, zmm9, 63 + vpslldq zmm9, zmm9, 8 + vpaddq zmm4, zmm4, zmm9 + vmovdqa64 zmm9, zmm5 + vpaddq zmm5, zmm5, zmm10 + vpternlogq zmm9, zmm5, zmm10, 178 + vpsrlq zmm9, zmm9, 63 + vpslldq zmm9, zmm9, 8 + vpaddq zmm5, zmm5, zmm9 + vmovdqa64 zmm9, zmm6 + vpaddq zmm6, zmm6, zmm10 + vpternlogq zmm9, zmm6, zmm10, 178 + vpsrlq zmm9, zmm9, 63 + vpslldq zmm9, zmm9, 8 + vpaddq zmm6, zmm6, zmm9 + vmovdqa64 zmm9, zmm7 + vpaddq zmm7, zmm7, zmm10 + vpternlogq zmm9, zmm7, zmm10, 178 + vpsrlq zmm9, zmm9, 63 + vpslldq zmm9, zmm9, 8 + vpaddq zmm7, zmm7, zmm9 + ; aes_enc_block + vpxorq zmm0, zmm0, zmm14 + vpxorq zmm1, zmm1, zmm14 + vpxorq zmm2, zmm2, zmm14 + vpxorq zmm3, zmm3, zmm14 + vaesenc zmm0, zmm0, zmm15 + vaesenc zmm1, zmm1, zmm15 + vaesenc zmm2, zmm2, zmm15 + vaesenc zmm3, zmm3, zmm15 + vaesenc zmm0, zmm0, zmm16 + vaesenc zmm1, zmm1, zmm16 + vaesenc zmm2, zmm2, zmm16 + vaesenc zmm3, zmm3, zmm16 + vaesenc zmm0, zmm0, zmm17 + vaesenc zmm1, zmm1, zmm17 + vaesenc zmm2, zmm2, zmm17 + vaesenc zmm3, zmm3, 
zmm17 + vaesenc zmm0, zmm0, zmm18 + vaesenc zmm1, zmm1, zmm18 + vaesenc zmm2, zmm2, zmm18 + vaesenc zmm3, zmm3, zmm18 + vaesenc zmm0, zmm0, zmm19 + vaesenc zmm1, zmm1, zmm19 + vaesenc zmm2, zmm2, zmm19 + vaesenc zmm3, zmm3, zmm19 + vaesenc zmm0, zmm0, zmm20 + vaesenc zmm1, zmm1, zmm20 + vaesenc zmm2, zmm2, zmm20 + vaesenc zmm3, zmm3, zmm20 + vaesenc zmm0, zmm0, zmm21 + vaesenc zmm1, zmm1, zmm21 + vaesenc zmm2, zmm2, zmm21 + vaesenc zmm3, zmm3, zmm21 + vaesenc zmm0, zmm0, zmm22 + vaesenc zmm1, zmm1, zmm22 + vaesenc zmm2, zmm2, zmm22 + vaesenc zmm3, zmm3, zmm22 + vaesenc zmm0, zmm0, zmm23 + vaesenc zmm1, zmm1, zmm23 + vaesenc zmm2, zmm2, zmm23 + vaesenc zmm3, zmm3, zmm23 + cmp eax, 11 + vmovdqa64 zmm13, zmm24 + jl L_AES_CTR_encrypt_avx512_256_aes_enc_block_last + vaesenc zmm0, zmm0, zmm24 + vaesenc zmm1, zmm1, zmm24 + vaesenc zmm2, zmm2, zmm24 + vaesenc zmm3, zmm3, zmm24 + vaesenc zmm0, zmm0, zmm25 + vaesenc zmm1, zmm1, zmm25 + vaesenc zmm2, zmm2, zmm25 + vaesenc zmm3, zmm3, zmm25 + cmp eax, 13 + vmovdqa64 zmm13, zmm26 + jl L_AES_CTR_encrypt_avx512_256_aes_enc_block_last + vaesenc zmm0, zmm0, zmm26 + vaesenc zmm1, zmm1, zmm26 + vaesenc zmm2, zmm2, zmm26 + vaesenc zmm3, zmm3, zmm26 + vaesenc zmm0, zmm0, zmm27 + vaesenc zmm1, zmm1, zmm27 + vaesenc zmm2, zmm2, zmm27 + vaesenc zmm3, zmm3, zmm27 + vmovdqa64 zmm13, zmm28 +L_AES_CTR_encrypt_avx512_256_aes_enc_block_last: + vaesenclast zmm0, zmm0, zmm13 + vaesenclast zmm1, zmm1, zmm13 + vaesenclast zmm2, zmm2, zmm13 + vaesenclast zmm3, zmm3, zmm13 + vpxorq zmm0, zmm0, [r11] + vpxorq zmm1, zmm1, [r11+64] + vpxorq zmm2, zmm2, [r11+128] + vpxorq zmm3, zmm3, [r11+192] + vmovdqu64 [rbx], zmm0 + vmovdqu64 [rbx+64], zmm1 + vmovdqu64 [rbx+128], zmm2 + vmovdqu64 [rbx+192], zmm3 + add eax, 256 + cmp eax, r10d + jl L_AES_CTR_encrypt_avx512_enc_256 + vshufi64x2 zmm7, zmm4, zmm4, 0 +L_AES_CTR_encrypt_avx512_done_256: + mov r10d, r8d + and r10d, 4294967232 + cmp eax, r10d + je L_AES_CTR_encrypt_avx512_done_64 
+L_AES_CTR_encrypt_avx512_enc_64: + ; 64 bytes of input + ; aes_ctr_enc_64 + lea r11, QWORD PTR [rcx+rax] + lea rbx, QWORD PTR [rdx+rax] + vpaddq zmm0, zmm7, [ptr_L_aes_ctr_inc_avx512] + vmovdqa64 zmm9, zmm7 + vpternlogq zmm9, zmm0, [ptr_L_aes_ctr_inc_avx512], 178 + vpsrlq zmm9, zmm9, 63 + vpslldq zmm9, zmm9, 8 + vpaddq zmm0, zmm0, zmm9 + vpshufb zmm0, zmm0, zmm8 + vmovdqa64 zmm9, zmm7 + vpaddq zmm7, zmm7, zmm11 + vpternlogq zmm9, zmm7, zmm11, 178 + vpsrlq zmm9, zmm9, 63 + vpslldq zmm9, zmm9, 8 + vpaddq zmm7, zmm7, zmm9 + ; aes_enc_block + vpxorq zmm0, zmm0, zmm14 + vaesenc zmm0, zmm0, zmm15 + vaesenc zmm0, zmm0, zmm16 + vaesenc zmm0, zmm0, zmm17 + vaesenc zmm0, zmm0, zmm18 + vaesenc zmm0, zmm0, zmm19 + vaesenc zmm0, zmm0, zmm20 + vaesenc zmm0, zmm0, zmm21 + vaesenc zmm0, zmm0, zmm22 + vaesenc zmm0, zmm0, zmm23 + cmp eax, 11 + vmovdqa64 zmm13, zmm24 + jl L_AES_CTR_encrypt_avx512_64_aes_enc_block_last + vaesenc zmm0, zmm0, zmm24 + vaesenc zmm0, zmm0, zmm25 + cmp eax, 13 + vmovdqa64 zmm13, zmm26 + jl L_AES_CTR_encrypt_avx512_64_aes_enc_block_last + vaesenc zmm0, zmm0, zmm26 + vaesenc zmm0, zmm0, zmm27 + vmovdqa64 zmm13, zmm28 +L_AES_CTR_encrypt_avx512_64_aes_enc_block_last: + vaesenclast zmm0, zmm0, zmm13 + vpxorq zmm0, zmm0, [r11] + vmovdqu64 [rbx], zmm0 + add eax, 64 + cmp eax, r10d + jl L_AES_CTR_encrypt_avx512_enc_64 +L_AES_CTR_encrypt_avx512_done_64: + cmp eax, r8d + mov r10d, r8d + je L_AES_CTR_encrypt_avx512_done_enc + and r10d, 4294967280 +L_AES_CTR_encrypt_avx512_enc_16: + ; 16 bytes of input + vpshufb xmm0, xmm7, xmm8 + vmovdqa64 zmm9, zmm7 + vpaddq zmm7, zmm7, zmm12 + vpternlogq zmm9, zmm7, zmm12, 178 + vpsrlq zmm9, zmm9, 63 + vpslldq zmm9, zmm9, 8 + vpaddq zmm7, zmm7, zmm9 + ; aes_enc_block + vpxor xmm0, xmm0, [r9] + vmovdqu xmm5, OWORD PTR [r9+16] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+32] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+48] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+64] + vaesenc xmm0, xmm0, xmm5 
+ vmovdqu xmm5, OWORD PTR [r9+80] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+96] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+112] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+128] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r9+144] + vaesenc xmm0, xmm0, xmm5 + cmp eax, 11 + vmovdqu xmm5, OWORD PTR [r9+160] + jl L_AES_CTR_encrypt_avx512_16_aes_enc_block_last + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r9+176] + vaesenc xmm0, xmm0, xmm6 + cmp eax, 13 + vmovdqu xmm5, OWORD PTR [r9+192] + jl L_AES_CTR_encrypt_avx512_16_aes_enc_block_last + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r9+208] + vaesenc xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r9+224] +L_AES_CTR_encrypt_avx512_16_aes_enc_block_last: + vaesenclast xmm0, xmm0, xmm5 + lea r11, QWORD PTR [rcx+rax] + vpxor xmm0, xmm0, [r11] + lea r11, QWORD PTR [rdx+rax] + vmovdqu OWORD PTR [r11], xmm0 + add eax, 16 + cmp eax, r10d + jl L_AES_CTR_encrypt_avx512_enc_16 +L_AES_CTR_encrypt_avx512_done_enc: + vpshufb xmm0, xmm7, xmm8 + vmovdqu OWORD PTR [r10], xmm0 + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + vmovdqu xmm12, OWORD PTR [rsp+96] + vmovdqu xmm13, OWORD PTR [rsp+112] + vmovdqu xmm14, OWORD PTR [rsp+128] + vmovdqu xmm15, OWORD PTR [rsp+144] + add rsp, 160 + pop rbx + ret +AES_CTR_encrypt_avx512 ENDP +_TEXT ENDS +ENDIF +END diff --git a/wolfcrypt/src/aes_xts_asm.S b/wolfcrypt/src/aes_xts_asm.S index 09045c6d8f7..29f3a0174b4 100644 --- a/wolfcrypt/src/aes_xts_asm.S +++ b/wolfcrypt/src/aes_xts_asm.S @@ -46,6 +46,16 @@ #define HAVE_INTEL_AVX2 #endif /* HAVE_INTEL_AVX2 */ #endif /* NO_AVX2_SUPPORT */ +#ifndef NO_VAES_SUPPORT +#ifndef HAVE_INTEL_VAES +#define HAVE_INTEL_VAES +#endif /* HAVE_INTEL_VAES */ +#endif /* NO_VAES_SUPPORT */ +#ifndef NO_AVX512_SUPPORT +#ifndef HAVE_INTEL_AVX512 
+#define HAVE_INTEL_AVX512 +#endif /* HAVE_INTEL_AVX512 */ +#endif /* NO_AVX512_SUPPORT */ #ifdef WOLFSSL_AES_XTS #ifdef WOLFSSL_X86_64_BUILD @@ -2785,6 +2795,4408 @@ L_AES_XTS_decrypt_update_avx1_done_dec: .size AES_XTS_decrypt_update_avx1,.-AES_XTS_decrypt_update_avx1 #endif /* __APPLE__ */ #endif /* HAVE_INTEL_AVX1 */ +#ifdef HAVE_INTEL_VAES +#ifndef __APPLE__ +.text +.globl AES_XTS_init_vaes +.type AES_XTS_init_vaes,@function +.align 16 +AES_XTS_init_vaes: +#else +.section __TEXT,__text +.globl _AES_XTS_init_vaes +.p2align 4 +_AES_XTS_init_vaes: +#endif /* __APPLE__ */ + vmovdqu (%rdi), %xmm0 + # aes_enc_block + vpxor (%rsi), %xmm0, %xmm0 + vmovdqu 16(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 32(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 48(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 64(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 80(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 96(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 112(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 128(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 144(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + cmpl $11, %edx + vmovdqu 160(%rsi), %xmm2 + jl L_AES_XTS_init_vaes_tweak_aes_enc_block_last + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 176(%rsi), %xmm3 + vaesenc %xmm3, %xmm0, %xmm0 + cmpl $13, %edx + vmovdqu 192(%rsi), %xmm2 + jl L_AES_XTS_init_vaes_tweak_aes_enc_block_last + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 208(%rsi), %xmm3 + vaesenc %xmm3, %xmm0, %xmm0 + vmovdqu 224(%rsi), %xmm2 +L_AES_XTS_init_vaes_tweak_aes_enc_block_last: + vaesenclast %xmm2, %xmm0, %xmm0 + vmovdqu %xmm0, (%rdi) + repz retq +#ifndef __APPLE__ +.size AES_XTS_init_vaes,.-AES_XTS_init_vaes +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_vaes_aes_xts_gc_xts: +.long 0x00000087,0x00000000,0x00000001,0x00000000 +#ifndef __APPLE__ 
+.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_vaes_aes_xts_poly: +.long 0x00000087,0x00000000,0x00000000,0x00000000 /* 0x87 in the low qword; broadcast into ymm13 and used as the vpclmulqdq multiplier when reducing shifted tweaks */ +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_vaes_aes_xts_shl: +.long 0x00000000,0x00000000,0x00000000,0x00000000 +.long 0x00000001,0x00000000,0x00000001,0x00000000 /* per-qword counts for vpsllvq: 0 for the low 128-bit lane, 1 for the high lane */ +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_vaes_aes_xts_shr: +.long 0x00000040,0x00000000,0x00000040,0x00000000 +.long 0x0000003f,0x00000000,0x0000003f,0x00000000 /* per-qword counts for vpsrlvq: 64 for the low lane, 63 for the high lane */ +#ifndef __APPLE__ +.text +.globl AES_XTS_encrypt_vaes +.type AES_XTS_encrypt_vaes,@function +.align 16 +AES_XTS_encrypt_vaes: +#else +.section __TEXT,__text +.globl _AES_XTS_encrypt_vaes +.p2align 4 +_AES_XTS_encrypt_vaes: +#endif /* __APPLE__ */ + pushq %r12 /* AES_XTS_encrypt_vaes: reads input at (%rdi), writes output at (%rsi), %rdx is the byte length, %rcx points at the 16-byte tweak input, %r8 holds the round keys used on the data, %r9 the round keys used on the tweak, and the round count is passed on the stack */ + pushq %r13 + movq %rdx, %rax + movq %rcx, %r12 + movl 24(%rsp), %r10d /* round count: the stack slot that was 8(%rsp) on entry, now 16 bytes further up after the two pushes */ + subq $0x40, %rsp /* 64 bytes of scratch; the partial-block path uses the first 16 */ + vmovdqu L_vaes_aes_xts_gc_xts(%rip), %xmm12 + vbroadcasti128 L_vaes_aes_xts_poly(%rip), %ymm13 + vmovdqu L_vaes_aes_xts_shl(%rip), %ymm14 + vmovdqu L_vaes_aes_xts_shr(%rip), %ymm15 + vmovdqu (%r12), %xmm8 /* xmm8 = tweak input, encrypted next with the round keys at (%r9) */ + # aes_enc_block + vpxor (%r9), %xmm8, %xmm8 + vmovdqu 16(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 32(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 48(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 64(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 80(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 96(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 112(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 128(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 144(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + cmpl $11, %r10d /* fewer than 11 rounds: the key at offset 160 is used as the last round key */ + vmovdqu 160(%r9), %xmm5 + jl L_AES_XTS_encrypt_vaes_tweak_aes_enc_block_last + vaesenc %xmm5, %xmm8, %xmm8 + 
vmovdqu 176(%r9), %xmm6 + vaesenc %xmm6, %xmm8, %xmm8 + cmpl $13, %r10d + vmovdqu 192(%r9), %xmm5 + jl L_AES_XTS_encrypt_vaes_tweak_aes_enc_block_last + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 208(%r9), %xmm6 + vaesenc %xmm6, %xmm8, %xmm8 + vmovdqu 224(%r9), %xmm5 +L_AES_XTS_encrypt_vaes_tweak_aes_enc_block_last: + vaesenclast %xmm5, %xmm8, %xmm8 /* xmm8 = encrypted tweak, used for the first data block */ + xorl %r13d, %r13d /* r13 = byte offset into input and output */ + cmpl $32, %eax + jl L_AES_XTS_encrypt_vaes_done_128 + cmpl $0x80, %eax + movl %eax, %r11d + jl L_AES_XTS_encrypt_vaes_done_128 + andl $0xffffff80, %r11d /* r11d = length rounded down to a multiple of 128 */ + vperm2i128 $0x00, %ymm8, %ymm8, %ymm5 /* copy the tweak into both 128-bit lanes */ + vpsrlvq %ymm15, %ymm5, %ymm6 + vpclmulqdq $0x01, %ymm13, %ymm6, %ymm7 + vpslldq $8, %ymm6, %ymm6 + vpsllvq %ymm14, %ymm5, %ymm4 + vpxor %ymm7, %ymm4, %ymm4 + vpxor %ymm6, %ymm4, %ymm4 /* ymm4 = tweak in the low lane, tweak doubled (reduced with 0x87) in the high lane */ + vpsrlq $62, %ymm4, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $2, %ymm4, %ymm5 + vpxor %ymm10, %ymm5, %ymm5 + vpxor %ymm9, %ymm5, %ymm5 + vpsrlq $62, %ymm5, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $2, %ymm5, %ymm6 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm9, %ymm6, %ymm6 + vpsrlq $62, %ymm6, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $2, %ymm6, %ymm7 + vpxor %ymm10, %ymm7, %ymm7 + vpxor %ymm9, %ymm7, %ymm7 /* ymm5, ymm6, ymm7 = each the previous pair shifted left 2 bits with the same reduction: tweak pairs for blocks 2-3, 4-5 and 6-7 */ +L_AES_XTS_encrypt_vaes_enc_128: + # 128 bytes of input + # aes_enc_128 + leaq (%rdi,%r13,1), %rcx + leaq (%rsi,%r13,1), %rdx + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 /* eight 16-byte input blocks now sit in ymm0..ymm3 */ + # aes_enc_block + vbroadcasti128 (%r8), %ymm9 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm9, %ymm0, %ymm0 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm9, %ymm1, %ymm1 + vpxor %ymm6, %ymm2, %ymm2 + vpxor %ymm9, %ymm2, %ymm2 + vpxor %ymm7, %ymm3, %ymm3 + vpxor %ymm9, %ymm3, %ymm3 + vbroadcasti128 16(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 32(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 
+ vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 48(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 64(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 80(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 96(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 112(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 128(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 144(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + cmpl $11, %r10d /* same round-count dispatch as the tweak encryption, here on the data keys at (%r8) */ + vbroadcasti128 160(%r8), %ymm9 + jl L_AES_XTS_encrypt_vaes_aes_enc_128_aes_enc_block_last + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 176(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + cmpl $13, %r10d + vbroadcasti128 192(%r8), %ymm9 + jl L_AES_XTS_encrypt_vaes_aes_enc_128_aes_enc_block_last + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 208(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 224(%r8), %ymm9 
+L_AES_XTS_encrypt_vaes_aes_enc_128_aes_enc_block_last: + vaesenclast %ymm9, %ymm0, %ymm0 + vaesenclast %ymm9, %ymm1, %ymm1 + vaesenclast %ymm9, %ymm2, %ymm2 + vaesenclast %ymm9, %ymm3, %ymm3 + vpxor %ymm4, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) /* XOR with the tweak pair again after the last round, then store */ + vpsrlq $56, %ymm4, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $8, %ymm4, %ymm4 + vpxor %ymm10, %ymm4, %ymm4 + vpxor %ymm9, %ymm4, %ymm4 /* advance this tweak pair by 8 blocks: shift left 8 bits and fold the 8 carried-out bits back in via 0x87 */ + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vpsrlq $56, %ymm5, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $8, %ymm5, %ymm5 + vpxor %ymm10, %ymm5, %ymm5 + vpxor %ymm9, %ymm5, %ymm5 + vpxor %ymm6, %ymm2, %ymm2 + vmovdqu %ymm2, 64(%rdx) + vpsrlq $56, %ymm6, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $8, %ymm6, %ymm6 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm9, %ymm6, %ymm6 + vpxor %ymm7, %ymm3, %ymm3 + vmovdqu %ymm3, 96(%rdx) + vpsrlq $56, %ymm7, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $8, %ymm7, %ymm7 + vpxor %ymm10, %ymm7, %ymm7 + vpxor %ymm9, %ymm7, %ymm7 + addl $0x80, %r13d + cmpl %r11d, %r13d + jl L_AES_XTS_encrypt_vaes_enc_128 + vextracti128 $0x00, %ymm4, %xmm8 /* xmm8 = tweak for the next unprocessed block */ +L_AES_XTS_encrypt_vaes_done_128: + movl %eax, %r11d + andl $0xffffffc0, %r11d /* the 64-byte step runs only when the offset differs from the length rounded down to 64 */ + cmpl %r11d, %r13d + je L_AES_XTS_encrypt_vaes_done_64 + # 64 bytes of input + # aes_enc_64 + leaq (%rdi,%r13,1), %rcx + leaq (%rsi,%r13,1), %rdx + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vperm2i128 $0x00, %ymm8, %ymm8, %ymm5 + vpsrlvq %ymm15, %ymm5, %ymm6 + vpclmulqdq $0x01, %ymm13, %ymm6, %ymm7 + vpslldq $8, %ymm6, %ymm6 + vpsllvq %ymm14, %ymm5, %ymm4 + vpxor %ymm7, %ymm4, %ymm4 + vpxor %ymm6, %ymm4, %ymm4 + vpsrlq $62, %ymm4, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $2, %ymm4, %ymm5 + vpxor %ymm10, %ymm5, %ymm5 + vpxor %ymm9, %ymm5, %ymm5 + # aes_enc_block + vbroadcasti128 (%r8), %ymm9 + vpxor %ymm4, %ymm0, %ymm0 + vpxor 
%ymm9, %ymm0, %ymm0 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm9, %ymm1, %ymm1 + vbroadcasti128 16(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 32(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 48(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 64(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 80(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 96(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 112(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 128(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 144(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + cmpl $11, %r10d + vbroadcasti128 160(%r8), %ymm9 + jl L_AES_XTS_encrypt_vaes_aes_enc_64_aes_enc_block_last + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 176(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + cmpl $13, %r10d + vbroadcasti128 192(%r8), %ymm9 + jl L_AES_XTS_encrypt_vaes_aes_enc_64_aes_enc_block_last + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 208(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 224(%r8), %ymm9 +L_AES_XTS_encrypt_vaes_aes_enc_64_aes_enc_block_last: + vaesenclast %ymm9, %ymm0, %ymm0 + vaesenclast %ymm9, %ymm1, %ymm1 + vpxor %ymm4, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vextracti128 $0x01, %ymm5, %xmm8 /* take the last tweak used (high lane of ymm5) and double it below for the next block */ + vpshufd $19, %xmm8, %xmm9 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm9, %xmm9 + vpand %xmm12, %xmm9, %xmm9 /* the sign bits of dwords 3 and 1 select 0x87 and the carry into the high qword */ + vpxor %xmm9, %xmm8, %xmm8 + addl $0x40, %r13d +L_AES_XTS_encrypt_vaes_done_64: + movl %eax, %r11d + andl $0xffffffe0, %r11d + cmpl %r11d, %r13d + je 
L_AES_XTS_encrypt_vaes_done_32 + # 32 bytes of input + # aes_enc_32 + leaq (%rdi,%r13,1), %rcx + leaq (%rsi,%r13,1), %rdx + vmovdqu (%rcx), %ymm0 + vperm2i128 $0x00, %ymm8, %ymm8, %ymm5 + vpsrlvq %ymm15, %ymm5, %ymm6 + vpclmulqdq $0x01, %ymm13, %ymm6, %ymm7 + vpslldq $8, %ymm6, %ymm6 + vpsllvq %ymm14, %ymm5, %ymm4 + vpxor %ymm7, %ymm4, %ymm4 + vpxor %ymm6, %ymm4, %ymm4 + # aes_enc_block + vbroadcasti128 (%r8), %ymm9 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm9, %ymm0, %ymm0 + vbroadcasti128 16(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 32(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 48(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 64(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 80(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 96(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 112(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 128(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 144(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + cmpl $11, %r10d + vbroadcasti128 160(%r8), %ymm9 + jl L_AES_XTS_encrypt_vaes_aes_enc_32_aes_enc_block_last + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 176(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + cmpl $13, %r10d + vbroadcasti128 192(%r8), %ymm9 + jl L_AES_XTS_encrypt_vaes_aes_enc_32_aes_enc_block_last + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 208(%r8), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 224(%r8), %ymm9 +L_AES_XTS_encrypt_vaes_aes_enc_32_aes_enc_block_last: + vaesenclast %ymm9, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vextracti128 $0x01, %ymm4, %xmm8 + vpshufd $19, %xmm8, %xmm9 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm9, %xmm9 + vpand %xmm12, %xmm9, %xmm9 + vpxor %xmm9, %xmm8, %xmm8 + addl $32, %r13d +L_AES_XTS_encrypt_vaes_done_32: + cmpl %eax, %r13d + movl %eax, %r11d + je L_AES_XTS_encrypt_vaes_done_enc + subl %r13d, %r11d + cmpl $16, %r11d /* fewer than 16 bytes left: go straight to the partial-block path */ + movl %eax, %r11d + jl 
L_AES_XTS_encrypt_vaes_last_15 + andl $0xfffffff0, %r11d + # 16 bytes of input +L_AES_XTS_encrypt_vaes_enc_16: + leaq (%rdi,%r13,1), %rcx + vmovdqu (%rcx), %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + # aes_enc_block + vpxor (%r8), %xmm0, %xmm0 + vmovdqu 16(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + cmpl $11, %r10d + vmovdqu 160(%r8), %xmm5 + jl L_AES_XTS_encrypt_vaes_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r8), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + cmpl $13, %r10d + vmovdqu 192(%r8), %xmm5 + jl L_AES_XTS_encrypt_vaes_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r8), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r8), %xmm5 +L_AES_XTS_encrypt_vaes_aes_enc_block_last: + vaesenclast %xmm5, %xmm0, %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + leaq (%rsi,%r13,1), %rcx + vmovdqu %xmm0, (%rcx) + vpshufd $19, %xmm8, %xmm4 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm4, %xmm4 + vpand %xmm12, %xmm4, %xmm4 + vpxor %xmm4, %xmm8, %xmm8 /* xmm8 doubled for the next block */ + addl $16, %r13d + cmpl %r11d, %r13d + jl L_AES_XTS_encrypt_vaes_enc_16 + cmpl %eax, %r13d + je L_AES_XTS_encrypt_vaes_done_enc +L_AES_XTS_encrypt_vaes_last_15: + subq $16, %r13 + leaq (%rsi,%r13,1), %rcx + vmovdqu (%rcx), %xmm0 /* reload the 16-byte output block written just before the current offset. NOTE(review): this reads output at r13 - 16, so at least one full block must already have been written - confirm callers never pass a length below 16 */ + addq $16, %r13 + vmovdqu %xmm0, (%rsp) + xorq %rdx, %rdx +L_AES_XTS_encrypt_vaes_last_15_byte_loop: + movb (%rsp,%rdx,1), %r11b + movb (%rdi,%r13,1), %cl + movb %r11b, (%rsi,%r13,1) + movb %cl, (%rsp,%rdx,1) /* byte swap: the output tail receives leading bytes of the previous output block, and the stack copy receives the remaining input bytes */ + incl %r13d + incl %edx + cmpl %eax, %r13d + jl L_AES_XTS_encrypt_vaes_last_15_byte_loop + subq %rdx, %r13 + vmovdqu (%rsp), %xmm0 + subq $16, 
%r13 + vpxor %xmm8, %xmm0, %xmm0 + # aes_enc_block + vpxor (%r8), %xmm0, %xmm0 + vmovdqu 16(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + cmpl $11, %r10d + vmovdqu 160(%r8), %xmm5 + jl L_AES_XTS_encrypt_vaes_last_15_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r8), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + cmpl $13, %r10d + vmovdqu 192(%r8), %xmm5 + jl L_AES_XTS_encrypt_vaes_last_15_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r8), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r8), %xmm5 +L_AES_XTS_encrypt_vaes_last_15_aes_enc_block_last: + vaesenclast %xmm5, %xmm0, %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + leaq (%rsi,%r13,1), %rcx + vmovdqu %xmm0, (%rcx) /* the re-encrypted block overwrites the previous output block */ +L_AES_XTS_encrypt_vaes_done_enc: + addq $0x40, %rsp + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size AES_XTS_encrypt_vaes,.-AES_XTS_encrypt_vaes +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_XTS_encrypt_update_vaes +.type AES_XTS_encrypt_update_vaes,@function +.align 16 +AES_XTS_encrypt_update_vaes: +#else +.section __TEXT,__text +.globl _AES_XTS_encrypt_update_vaes +.p2align 4 +_AES_XTS_encrypt_update_vaes: +#endif /* __APPLE__ */ + pushq %r12 /* AES_XTS_encrypt_update_vaes: input at (%rdi), output at (%rsi), byte length in %rdx, round keys at %rcx, 16-byte tweak at (%r8) which is loaded here and written back at exit, round count in %r9d */ + movq %rdx, %rax + movq %rcx, %r10 + subq $0x40, %rsp + vmovdqu L_vaes_aes_xts_gc_xts(%rip), %xmm12 + vbroadcasti128 L_vaes_aes_xts_poly(%rip), %ymm13 + vmovdqu L_vaes_aes_xts_shl(%rip), %ymm14 + vmovdqu L_vaes_aes_xts_shr(%rip), %ymm15 + vmovdqu (%r8), %xmm8 + xorl %r12d, %r12d + cmpl $32, %eax + jl L_AES_XTS_encrypt_update_vaes_done_128 + cmpl $0x80, %eax + movl %eax, %r11d + 
jl L_AES_XTS_encrypt_update_vaes_done_128 + andl $0xffffff80, %r11d /* r11d = length rounded down to a multiple of 128 */ + vperm2i128 $0x00, %ymm8, %ymm8, %ymm5 /* copy the tweak in xmm8 into both 128-bit lanes */ + vpsrlvq %ymm15, %ymm5, %ymm6 + vpclmulqdq $0x01, %ymm13, %ymm6, %ymm7 + vpslldq $8, %ymm6, %ymm6 + vpsllvq %ymm14, %ymm5, %ymm4 + vpxor %ymm7, %ymm4, %ymm4 + vpxor %ymm6, %ymm4, %ymm4 /* ymm4 = tweak in the low lane, tweak doubled (reduced with 0x87) in the high lane */ + vpsrlq $62, %ymm4, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $2, %ymm4, %ymm5 + vpxor %ymm10, %ymm5, %ymm5 + vpxor %ymm9, %ymm5, %ymm5 + vpsrlq $62, %ymm5, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $2, %ymm5, %ymm6 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm9, %ymm6, %ymm6 + vpsrlq $62, %ymm6, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $2, %ymm6, %ymm7 + vpxor %ymm10, %ymm7, %ymm7 + vpxor %ymm9, %ymm7, %ymm7 /* ymm5, ymm6, ymm7 = each the previous pair shifted left 2 bits with the same reduction: tweak pairs for blocks 2-3, 4-5 and 6-7 */ +L_AES_XTS_encrypt_update_vaes_enc_128: + # 128 bytes of input + # aes_enc_128 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + # aes_enc_block + vbroadcasti128 (%r10), %ymm9 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm9, %ymm0, %ymm0 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm9, %ymm1, %ymm1 + vpxor %ymm6, %ymm2, %ymm2 + vpxor %ymm9, %ymm2, %ymm2 + vpxor %ymm7, %ymm3, %ymm3 + vpxor %ymm9, %ymm3, %ymm3 + vbroadcasti128 16(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 32(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 48(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 64(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 80(%r10), %ymm9 + vaesenc 
%ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 96(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 112(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 128(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 144(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + cmpl $11, %r9d /* fewer than 11 rounds: the key at offset 160 is used as the last round key */ + vbroadcasti128 160(%r10), %ymm9 + jl L_AES_XTS_encrypt_update_vaes_aes_enc_128_aes_enc_block_last + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 176(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + cmpl $13, %r9d /* fewer than 13 rounds: the key at offset 192 is used as the last round key */ + vbroadcasti128 192(%r10), %ymm9 + jl L_AES_XTS_encrypt_update_vaes_aes_enc_128_aes_enc_block_last + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 208(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vaesenc %ymm9, %ymm2, %ymm2 + vaesenc %ymm9, %ymm3, %ymm3 + vbroadcasti128 224(%r10), %ymm9 +L_AES_XTS_encrypt_update_vaes_aes_enc_128_aes_enc_block_last: + vaesenclast %ymm9, %ymm0, %ymm0 + vaesenclast %ymm9, %ymm1, %ymm1 + vaesenclast %ymm9, %ymm2, %ymm2 + vaesenclast %ymm9, %ymm3, %ymm3 + vpxor %ymm4, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vpsrlq $56, %ymm4, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $8, %ymm4, %ymm4 + vpxor %ymm10, %ymm4, %ymm4 + vpxor %ymm9, %ymm4, %ymm4 /* advance this tweak pair by 8 blocks: shift left 8 bits and fold the 8 carried-out bits back in via 0x87 */ + vpxor %ymm5, %ymm1, %ymm1 
+ vmovdqu %ymm1, 32(%rdx) + vpsrlq $56, %ymm5, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $8, %ymm5, %ymm5 + vpxor %ymm10, %ymm5, %ymm5 + vpxor %ymm9, %ymm5, %ymm5 + vpxor %ymm6, %ymm2, %ymm2 + vmovdqu %ymm2, 64(%rdx) + vpsrlq $56, %ymm6, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $8, %ymm6, %ymm6 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm9, %ymm6, %ymm6 + vpxor %ymm7, %ymm3, %ymm3 + vmovdqu %ymm3, 96(%rdx) + vpsrlq $56, %ymm7, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $8, %ymm7, %ymm7 + vpxor %ymm10, %ymm7, %ymm7 + vpxor %ymm9, %ymm7, %ymm7 + addl $0x80, %r12d + cmpl %r11d, %r12d + jl L_AES_XTS_encrypt_update_vaes_enc_128 + vextracti128 $0x00, %ymm4, %xmm8 /* xmm8 = tweak for the next unprocessed block */ +L_AES_XTS_encrypt_update_vaes_done_128: + movl %eax, %r11d + andl $0xffffffc0, %r11d /* the 64-byte step runs only when the offset differs from the length rounded down to 64 */ + cmpl %r11d, %r12d + je L_AES_XTS_encrypt_update_vaes_done_64 + # 64 bytes of input + # aes_enc_64 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vperm2i128 $0x00, %ymm8, %ymm8, %ymm5 + vpsrlvq %ymm15, %ymm5, %ymm6 + vpclmulqdq $0x01, %ymm13, %ymm6, %ymm7 + vpslldq $8, %ymm6, %ymm6 + vpsllvq %ymm14, %ymm5, %ymm4 + vpxor %ymm7, %ymm4, %ymm4 + vpxor %ymm6, %ymm4, %ymm4 + vpsrlq $62, %ymm4, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $2, %ymm4, %ymm5 + vpxor %ymm10, %ymm5, %ymm5 + vpxor %ymm9, %ymm5, %ymm5 + # aes_enc_block + vbroadcasti128 (%r10), %ymm9 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm9, %ymm0, %ymm0 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm9, %ymm1, %ymm1 + vbroadcasti128 16(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 32(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 48(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 64(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc 
%ymm9, %ymm1, %ymm1 + vbroadcasti128 80(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 96(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 112(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 128(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 144(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + cmpl $11, %r9d + vbroadcasti128 160(%r10), %ymm9 + jl L_AES_XTS_encrypt_update_vaes_aes_enc_64_aes_enc_block_last + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 176(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + cmpl $13, %r9d + vbroadcasti128 192(%r10), %ymm9 + jl L_AES_XTS_encrypt_update_vaes_aes_enc_64_aes_enc_block_last + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 208(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vaesenc %ymm9, %ymm1, %ymm1 + vbroadcasti128 224(%r10), %ymm9 +L_AES_XTS_encrypt_update_vaes_aes_enc_64_aes_enc_block_last: + vaesenclast %ymm9, %ymm0, %ymm0 + vaesenclast %ymm9, %ymm1, %ymm1 + vpxor %ymm4, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vextracti128 $0x01, %ymm5, %xmm8 /* take the last tweak used (high lane of ymm5) and double it below for the next block */ + vpshufd $19, %xmm8, %xmm9 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm9, %xmm9 + vpand %xmm12, %xmm9, %xmm9 /* the sign bits of dwords 3 and 1 select 0x87 and the carry into the high qword */ + vpxor %xmm9, %xmm8, %xmm8 + addl $0x40, %r12d +L_AES_XTS_encrypt_update_vaes_done_64: + movl %eax, %r11d + andl $0xffffffe0, %r11d + cmpl %r11d, %r12d + je L_AES_XTS_encrypt_update_vaes_done_32 + # 32 bytes of input + # aes_enc_32 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu (%rcx), %ymm0 + vperm2i128 $0x00, %ymm8, %ymm8, %ymm5 + vpsrlvq %ymm15, %ymm5, %ymm6 + vpclmulqdq $0x01, %ymm13, %ymm6, %ymm7 + vpslldq $8, %ymm6, %ymm6 + vpsllvq %ymm14, %ymm5, %ymm4 + vpxor %ymm7, %ymm4, %ymm4 + vpxor %ymm6, %ymm4, %ymm4 + # 
aes_enc_block + vbroadcasti128 (%r10), %ymm9 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm9, %ymm0, %ymm0 + vbroadcasti128 16(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 32(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 48(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 64(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 80(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 96(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 112(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 128(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 144(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + cmpl $11, %r9d + vbroadcasti128 160(%r10), %ymm9 + jl L_AES_XTS_encrypt_update_vaes_aes_enc_32_aes_enc_block_last + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 176(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + cmpl $13, %r9d + vbroadcasti128 192(%r10), %ymm9 + jl L_AES_XTS_encrypt_update_vaes_aes_enc_32_aes_enc_block_last + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 208(%r10), %ymm9 + vaesenc %ymm9, %ymm0, %ymm0 + vbroadcasti128 224(%r10), %ymm9 +L_AES_XTS_encrypt_update_vaes_aes_enc_32_aes_enc_block_last: + vaesenclast %ymm9, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vextracti128 $0x01, %ymm4, %xmm8 + vpshufd $19, %xmm8, %xmm9 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm9, %xmm9 + vpand %xmm12, %xmm9, %xmm9 + vpxor %xmm9, %xmm8, %xmm8 + addl $32, %r12d +L_AES_XTS_encrypt_update_vaes_done_32: + cmpl %eax, %r12d + movl %eax, %r11d + je L_AES_XTS_encrypt_update_vaes_done_enc + subl %r12d, %r11d + cmpl $16, %r11d /* fewer than 16 bytes left: go straight to the partial-block path */ + movl %eax, %r11d + jl L_AES_XTS_encrypt_update_vaes_last_15 + andl $0xfffffff0, %r11d + # 16 bytes of input +L_AES_XTS_encrypt_update_vaes_enc_16: + leaq (%rdi,%r12,1), %rcx + vmovdqu (%rcx), %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + # aes_enc_block + vpxor (%r10), %xmm0, %xmm0 + vmovdqu 16(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r10), %xmm5 + vaesenc 
%xmm5, %xmm0, %xmm0 + vmovdqu 48(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + cmpl $11, %r9d + vmovdqu 160(%r10), %xmm5 + jl L_AES_XTS_encrypt_update_vaes_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r10), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + cmpl $13, %r9d + vmovdqu 192(%r10), %xmm5 + jl L_AES_XTS_encrypt_update_vaes_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r10), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r10), %xmm5 +L_AES_XTS_encrypt_update_vaes_aes_enc_block_last: + vaesenclast %xmm5, %xmm0, %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + leaq (%rsi,%r12,1), %rcx + vmovdqu %xmm0, (%rcx) + vpshufd $19, %xmm8, %xmm4 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm4, %xmm4 + vpand %xmm12, %xmm4, %xmm4 + vpxor %xmm4, %xmm8, %xmm8 /* xmm8 doubled for the next block */ + addl $16, %r12d + cmpl %r11d, %r12d + jl L_AES_XTS_encrypt_update_vaes_enc_16 + cmpl %eax, %r12d + je L_AES_XTS_encrypt_update_vaes_done_enc +L_AES_XTS_encrypt_update_vaes_last_15: + subq $16, %r12 + leaq (%rsi,%r12,1), %rcx + vmovdqu (%rcx), %xmm0 /* reload the 16-byte output block written just before the current offset. NOTE(review): this reads output at r12 - 16, so at least one full block must already have been written in this call - confirm callers never pass a length below 16 */ + addq $16, %r12 + vmovdqu %xmm0, (%rsp) + xorq %rdx, %rdx +L_AES_XTS_encrypt_update_vaes_last_15_byte_loop: + movb (%rsp,%rdx,1), %r11b + movb (%rdi,%r12,1), %cl + movb %r11b, (%rsi,%r12,1) + movb %cl, (%rsp,%rdx,1) /* byte swap: the output tail receives leading bytes of the previous output block, and the stack copy receives the remaining input bytes */ + incl %r12d + incl %edx + cmpl %eax, %r12d + jl L_AES_XTS_encrypt_update_vaes_last_15_byte_loop + subq %rdx, %r12 + vmovdqu (%rsp), %xmm0 + subq $16, %r12 + vpxor %xmm8, %xmm0, %xmm0 + # aes_enc_block + vpxor (%r10), %xmm0, %xmm0 + vmovdqu 16(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 
64(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + cmpl $11, %r9d + vmovdqu 160(%r10), %xmm5 + jl L_AES_XTS_encrypt_update_vaes_last_15_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r10), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + cmpl $13, %r9d + vmovdqu 192(%r10), %xmm5 + jl L_AES_XTS_encrypt_update_vaes_last_15_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r10), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r10), %xmm5 +L_AES_XTS_encrypt_update_vaes_last_15_aes_enc_block_last: + vaesenclast %xmm5, %xmm0, %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + leaq (%rsi,%r12,1), %rcx + vmovdqu %xmm0, (%rcx) /* the re-encrypted block overwrites the previous output block */ +L_AES_XTS_encrypt_update_vaes_done_enc: + vmovdqu %xmm8, (%r8) /* store the tweak register xmm8 back to (%r8) */ + addq $0x40, %rsp + popq %r12 + repz retq +#ifndef __APPLE__ +.size AES_XTS_encrypt_update_vaes,.-AES_XTS_encrypt_update_vaes +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_XTS_decrypt_vaes +.type AES_XTS_decrypt_vaes,@function +.align 16 +AES_XTS_decrypt_vaes: +#else +.section __TEXT,__text +.globl _AES_XTS_decrypt_vaes +.p2align 4 +_AES_XTS_decrypt_vaes: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + movq %rdx, %rax + movq %rcx, %r12 + movl 24(%rsp), %r10d + subq $0x40, %rsp + vmovdqu L_vaes_aes_xts_gc_xts(%rip), %xmm12 + vbroadcasti128 L_vaes_aes_xts_poly(%rip), %ymm13 + vmovdqu L_vaes_aes_xts_shl(%rip), %ymm14 + vmovdqu L_vaes_aes_xts_shr(%rip), %ymm15 + vmovdqu (%r12), %xmm8 + # aes_enc_block + vpxor (%r9), %xmm8, %xmm8 + vmovdqu 16(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 32(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 48(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 64(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 80(%r9), %xmm5 + vaesenc %xmm5, 
%xmm8, %xmm8 + vmovdqu 96(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 112(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 128(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 144(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + cmpl $11, %r10d + vmovdqu 160(%r9), %xmm5 + jl L_AES_XTS_decrypt_vaes_tweak_aes_enc_block_last + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 176(%r9), %xmm6 + vaesenc %xmm6, %xmm8, %xmm8 + cmpl $13, %r10d + vmovdqu 192(%r9), %xmm5 + jl L_AES_XTS_decrypt_vaes_tweak_aes_enc_block_last + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 208(%r9), %xmm6 + vaesenc %xmm6, %xmm8, %xmm8 + vmovdqu 224(%r9), %xmm5 +L_AES_XTS_decrypt_vaes_tweak_aes_enc_block_last: + vaesenclast %xmm5, %xmm8, %xmm8 + xorl %r13d, %r13d + movl %eax, %r11d + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_vaes_mul16_128 + subl $16, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_vaes_last_31_start +L_AES_XTS_decrypt_vaes_mul16_128: + cmpl $32, %r11d + jl L_AES_XTS_decrypt_vaes_done_128 + cmpl $0x80, %r11d + jl L_AES_XTS_decrypt_vaes_done_128 + andl $0xffffff80, %r11d + vperm2i128 $0x00, %ymm8, %ymm8, %ymm5 + vpsrlvq %ymm15, %ymm5, %ymm6 + vpclmulqdq $0x01, %ymm13, %ymm6, %ymm7 + vpslldq $8, %ymm6, %ymm6 + vpsllvq %ymm14, %ymm5, %ymm4 + vpxor %ymm7, %ymm4, %ymm4 + vpxor %ymm6, %ymm4, %ymm4 + vpsrlq $62, %ymm4, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $2, %ymm4, %ymm5 + vpxor %ymm10, %ymm5, %ymm5 + vpxor %ymm9, %ymm5, %ymm5 + vpsrlq $62, %ymm5, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $2, %ymm5, %ymm6 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm9, %ymm6, %ymm6 + vpsrlq $62, %ymm6, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $2, %ymm6, %ymm7 + vpxor %ymm10, %ymm7, %ymm7 + vpxor %ymm9, %ymm7, %ymm7 +L_AES_XTS_decrypt_vaes_dec_128: + # 128 bytes of input + # aes_dec_128 + leaq (%rdi,%r13,1), %rcx + leaq (%rsi,%r13,1), %rdx + vmovdqu (%rcx), %ymm0 + 
vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + # aes_dec_block + vbroadcasti128 (%r8), %ymm9 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm9, %ymm0, %ymm0 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm9, %ymm1, %ymm1 + vpxor %ymm6, %ymm2, %ymm2 + vpxor %ymm9, %ymm2, %ymm2 + vpxor %ymm7, %ymm3, %ymm3 + vpxor %ymm9, %ymm3, %ymm3 + vbroadcasti128 16(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 32(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 48(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 64(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 80(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 96(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 112(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 128(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 144(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + cmpl $11, %r10d + vbroadcasti128 160(%r8), %ymm9 + jl L_AES_XTS_decrypt_vaes_aes_dec_128_aes_dec_block_last + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 176(%r8), %ymm9 + vaesdec %ymm9, 
%ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + cmpl $13, %r10d + vbroadcasti128 192(%r8), %ymm9 + jl L_AES_XTS_decrypt_vaes_aes_dec_128_aes_dec_block_last + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 208(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 224(%r8), %ymm9 +L_AES_XTS_decrypt_vaes_aes_dec_128_aes_dec_block_last: + vaesdeclast %ymm9, %ymm0, %ymm0 + vaesdeclast %ymm9, %ymm1, %ymm1 + vaesdeclast %ymm9, %ymm2, %ymm2 + vaesdeclast %ymm9, %ymm3, %ymm3 + vpxor %ymm4, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vpsrlq $56, %ymm4, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $8, %ymm4, %ymm4 + vpxor %ymm10, %ymm4, %ymm4 + vpxor %ymm9, %ymm4, %ymm4 + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vpsrlq $56, %ymm5, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $8, %ymm5, %ymm5 + vpxor %ymm10, %ymm5, %ymm5 + vpxor %ymm9, %ymm5, %ymm5 + vpxor %ymm6, %ymm2, %ymm2 + vmovdqu %ymm2, 64(%rdx) + vpsrlq $56, %ymm6, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $8, %ymm6, %ymm6 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm9, %ymm6, %ymm6 + vpxor %ymm7, %ymm3, %ymm3 + vmovdqu %ymm3, 96(%rdx) + vpsrlq $56, %ymm7, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $8, %ymm7, %ymm7 + vpxor %ymm10, %ymm7, %ymm7 + vpxor %ymm9, %ymm7, %ymm7 + addl $0x80, %r13d + cmpl %r11d, %r13d + jl L_AES_XTS_decrypt_vaes_dec_128 + vextracti128 $0x00, %ymm4, %xmm8 +L_AES_XTS_decrypt_vaes_done_128: + cmpl %eax, %r13d + movl %eax, %r11d + je L_AES_XTS_decrypt_vaes_done_dec + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_vaes_mul16_64 + subl $16, %r11d + subl %r13d, %r11d + cmpl $16, %r11d 
+ jl L_AES_XTS_decrypt_vaes_last_31_start + addl %r13d, %r11d +L_AES_XTS_decrypt_vaes_mul16_64: + andl $0xffffffc0, %r11d + cmpl %r11d, %r13d + je L_AES_XTS_decrypt_vaes_done_64 + # 64 bytes of input + # aes_dec_64 + leaq (%rdi,%r13,1), %rcx + leaq (%rsi,%r13,1), %rdx + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vperm2i128 $0x00, %ymm8, %ymm8, %ymm5 + vpsrlvq %ymm15, %ymm5, %ymm6 + vpclmulqdq $0x01, %ymm13, %ymm6, %ymm7 + vpslldq $8, %ymm6, %ymm6 + vpsllvq %ymm14, %ymm5, %ymm4 + vpxor %ymm7, %ymm4, %ymm4 + vpxor %ymm6, %ymm4, %ymm4 + vpsrlq $62, %ymm4, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $2, %ymm4, %ymm5 + vpxor %ymm10, %ymm5, %ymm5 + vpxor %ymm9, %ymm5, %ymm5 + # aes_dec_block + vbroadcasti128 (%r8), %ymm9 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm9, %ymm0, %ymm0 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm9, %ymm1, %ymm1 + vbroadcasti128 16(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 32(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 48(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 64(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 80(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 96(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 112(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 128(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 144(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + cmpl $11, %r10d + vbroadcasti128 160(%r8), %ymm9 + jl L_AES_XTS_decrypt_vaes_aes_dec_64_aes_dec_block_last + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 176(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + cmpl $13, %r10d + 
vbroadcasti128 192(%r8), %ymm9 + jl L_AES_XTS_decrypt_vaes_aes_dec_64_aes_dec_block_last + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 208(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 224(%r8), %ymm9 +L_AES_XTS_decrypt_vaes_aes_dec_64_aes_dec_block_last: + vaesdeclast %ymm9, %ymm0, %ymm0 + vaesdeclast %ymm9, %ymm1, %ymm1 + vpxor %ymm4, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vextracti128 $0x01, %ymm5, %xmm8 + vpshufd $19, %xmm8, %xmm9 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm9, %xmm9 + vpand %xmm12, %xmm9, %xmm9 + vpxor %xmm9, %xmm8, %xmm8 + addl $0x40, %r13d +L_AES_XTS_decrypt_vaes_done_64: + cmpl %eax, %r13d + movl %eax, %r11d + je L_AES_XTS_decrypt_vaes_done_dec + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_vaes_mul16_32 + subl $16, %r11d + subl %r13d, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_vaes_last_31_start + addl %r13d, %r11d +L_AES_XTS_decrypt_vaes_mul16_32: + andl $0xffffffe0, %r11d + cmpl %r11d, %r13d + je L_AES_XTS_decrypt_vaes_done_32 + # 32 bytes of input + # aes_dec_32 + leaq (%rdi,%r13,1), %rcx + leaq (%rsi,%r13,1), %rdx + vmovdqu (%rcx), %ymm0 + vperm2i128 $0x00, %ymm8, %ymm8, %ymm5 + vpsrlvq %ymm15, %ymm5, %ymm6 + vpclmulqdq $0x01, %ymm13, %ymm6, %ymm7 + vpslldq $8, %ymm6, %ymm6 + vpsllvq %ymm14, %ymm5, %ymm4 + vpxor %ymm7, %ymm4, %ymm4 + vpxor %ymm6, %ymm4, %ymm4 + # aes_dec_block + vbroadcasti128 (%r8), %ymm9 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm9, %ymm0, %ymm0 + vbroadcasti128 16(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 32(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 48(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 64(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 80(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 96(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 112(%r8), %ymm9 + vaesdec %ymm9, %ymm0, 
%ymm0 + vbroadcasti128 128(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 144(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + cmpl $11, %r10d + vbroadcasti128 160(%r8), %ymm9 + jl L_AES_XTS_decrypt_vaes_aes_dec_32_aes_dec_block_last + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 176(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + cmpl $13, %r10d + vbroadcasti128 192(%r8), %ymm9 + jl L_AES_XTS_decrypt_vaes_aes_dec_32_aes_dec_block_last + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 208(%r8), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 224(%r8), %ymm9 +L_AES_XTS_decrypt_vaes_aes_dec_32_aes_dec_block_last: + vaesdeclast %ymm9, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vextracti128 $0x01, %ymm4, %xmm8 + vpshufd $19, %xmm8, %xmm9 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm9, %xmm9 + vpand %xmm12, %xmm9, %xmm9 + vpxor %xmm9, %xmm8, %xmm8 + addl $32, %r13d +L_AES_XTS_decrypt_vaes_done_32: + cmpl %eax, %r13d + movl %eax, %r11d + je L_AES_XTS_decrypt_vaes_done_dec + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_vaes_mul16 + subl $16, %r11d + subl %r13d, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_vaes_last_31_start + addl %r13d, %r11d +L_AES_XTS_decrypt_vaes_mul16: +L_AES_XTS_decrypt_vaes_dec_16: + # 16 bytes of input + leaq (%rdi,%r13,1), %rcx + vmovdqu (%rcx), %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + # aes_dec_block + vpxor (%r8), %xmm0, %xmm0 + vmovdqu 16(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + cmpl $11, %r10d + vmovdqu 160(%r8), %xmm5 + jl 
L_AES_XTS_decrypt_vaes_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + cmpl $13, %r10d + vmovdqu 192(%r8), %xmm5 + jl L_AES_XTS_decrypt_vaes_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r8), %xmm5 +L_AES_XTS_decrypt_vaes_aes_dec_block_last: + vaesdeclast %xmm5, %xmm0, %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + leaq (%rsi,%r13,1), %rcx + vmovdqu %xmm0, (%rcx) + vpshufd $19, %xmm8, %xmm4 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm4, %xmm4 + vpand %xmm12, %xmm4, %xmm4 + vpxor %xmm4, %xmm8, %xmm8 + addl $16, %r13d + cmpl %r11d, %r13d + jl L_AES_XTS_decrypt_vaes_dec_16 + cmpl %eax, %r13d + je L_AES_XTS_decrypt_vaes_done_dec +L_AES_XTS_decrypt_vaes_last_31_start: + vpshufd $19, %xmm8, %xmm4 + vpaddq %xmm8, %xmm8, %xmm7 + vpsrad $31, %xmm4, %xmm4 + vpand %xmm12, %xmm4, %xmm4 + vpxor %xmm4, %xmm7, %xmm7 + leaq (%rdi,%r13,1), %rcx + vmovdqu (%rcx), %xmm0 + vpxor %xmm7, %xmm0, %xmm0 + # aes_dec_block + vpxor (%r8), %xmm0, %xmm0 + vmovdqu 16(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + cmpl $11, %r10d + vmovdqu 160(%r8), %xmm5 + jl L_AES_XTS_decrypt_vaes_last_31_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + cmpl $13, %r10d + vmovdqu 192(%r8), %xmm5 + jl L_AES_XTS_decrypt_vaes_last_31_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r8), %xmm5 
+L_AES_XTS_decrypt_vaes_last_31_aes_dec_block_last: + vaesdeclast %xmm5, %xmm0, %xmm0 + vpxor %xmm7, %xmm0, %xmm0 + vmovdqu %xmm0, (%rsp) + addq $16, %r13 + xorq %rdx, %rdx +L_AES_XTS_decrypt_vaes_last_31_byte_loop: + movb (%rsp,%rdx,1), %r11b + movb (%rdi,%r13,1), %cl + movb %r11b, (%rsi,%r13,1) + movb %cl, (%rsp,%rdx,1) + incl %r13d + incl %edx + cmpl %eax, %r13d + jl L_AES_XTS_decrypt_vaes_last_31_byte_loop + subq %rdx, %r13 + vmovdqu (%rsp), %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + # aes_dec_block + vpxor (%r8), %xmm0, %xmm0 + vmovdqu 16(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + cmpl $11, %r10d + vmovdqu 160(%r8), %xmm5 + jl L_AES_XTS_decrypt_vaes_last_31_2_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + cmpl $13, %r10d + vmovdqu 192(%r8), %xmm5 + jl L_AES_XTS_decrypt_vaes_last_31_2_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r8), %xmm5 +L_AES_XTS_decrypt_vaes_last_31_2_aes_dec_block_last: + vaesdeclast %xmm5, %xmm0, %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + subq $16, %r13 + leaq (%rsi,%r13,1), %rcx + vmovdqu %xmm0, (%rcx) +L_AES_XTS_decrypt_vaes_done_dec: + addq $0x40, %rsp + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size AES_XTS_decrypt_vaes,.-AES_XTS_decrypt_vaes +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_XTS_decrypt_update_vaes +.type AES_XTS_decrypt_update_vaes,@function +.align 16 +AES_XTS_decrypt_update_vaes: +#else +.section __TEXT,__text +.globl 
_AES_XTS_decrypt_update_vaes +.p2align 4 +_AES_XTS_decrypt_update_vaes: +#endif /* __APPLE__ */ + pushq %r12 /* AES_XTS_decrypt_update_vaes: AES-XTS decrypt of %edx bytes from (%rdi) to (%rsi); %rcx = round keys, %r9d = round count, (%r8) = 16-byte tweak, loaded below and stored back before return. r12 is saved because it is used as the running byte offset. */ + movq %rdx, %rax /* byte count; only the low 32 bits (%eax) are used below */ + movq %rcx, %r10 /* r10 = round-key base */ + subq $0x40, %rsp /* 64-byte stack scratch; (%rsp) is used by the partial-block tail */ + vmovdqu L_vaes_aes_xts_gc_xts(%rip), %xmm12 + vbroadcasti128 L_vaes_aes_xts_poly(%rip), %ymm13 + vmovdqu L_vaes_aes_xts_shl(%rip), %ymm14 + vmovdqu L_vaes_aes_xts_shr(%rip), %ymm15 /* xmm12, ymm13, ymm14, ymm15 = tweak-update constants; the tables are defined outside this part of the file */ + vmovdqu (%r8), %xmm8 /* xmm8 = current tweak */ + xorl %r12d, %r12d /* r12 = byte offset into input and output */ + movl %eax, %r11d + andl $0xfffffff0, %r11d /* r11 = byte count rounded down to a multiple of 16 */ + cmpl %eax, %r11d + je L_AES_XTS_decrypt_update_vaes_mul16_128 + subl $16, %r11d /* partial trailing block: hold back the last full block for the tail code */ + cmpl $16, %r11d + jl L_AES_XTS_decrypt_update_vaes_last_31_start +L_AES_XTS_decrypt_update_vaes_mul16_128: + cmpl $32, %r11d /* NOTE(review): redundant, the 0x80 test below branches to the same label */ + jl L_AES_XTS_decrypt_update_vaes_done_128 + cmpl $0x80, %r11d + jl L_AES_XTS_decrypt_update_vaes_done_128 + andl $0xffffff80, %r11d /* r11 = end offset of the 128-byte loop */ + vperm2i128 $0x00, %ymm8, %ymm8, %ymm5 /* tweak copied to both 128-bit lanes */ + vpsrlvq %ymm15, %ymm5, %ymm6 + vpclmulqdq $0x01, %ymm13, %ymm6, %ymm7 + vpslldq $8, %ymm6, %ymm6 + vpsllvq %ymm14, %ymm5, %ymm4 + vpxor %ymm7, %ymm4, %ymm4 + vpxor %ymm6, %ymm4, %ymm4 /* ymm4 = tweak shifted left per lane by the ymm14 counts; bits shifted out (ymm6) are folded back: high-qword overflow times the low qword of ymm13, low-qword overflow moved up into the high qword. Presumably the lanes hold consecutive block tweaks - shift tables not visible here, TODO confirm */ + vpsrlq $62, %ymm4, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $2, %ymm4, %ymm5 + vpxor %ymm10, %ymm5, %ymm5 + vpxor %ymm9, %ymm5, %ymm5 /* ymm5 = ymm4 shifted left 2 bits with the same fold */ + vpsrlq $62, %ymm5, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $2, %ymm5, %ymm6 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm9, %ymm6, %ymm6 + vpsrlq $62, %ymm6, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $2, %ymm6, %ymm7 + vpxor %ymm10, %ymm7, %ymm7 + vpxor %ymm9, %ymm7, %ymm7 /* ymm6 and ymm7 are built the same way, each 2 more bits along */ +L_AES_XTS_decrypt_update_vaes_dec_128: /* loop: 128 bytes (8 blocks) per pass, tweaks in ymm4..ymm7 */ + # 128 bytes of input + # aes_dec_128 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + # aes_dec_block + vbroadcasti128 (%r10), %ymm9 /* round key 0 in both lanes; each data vector gets its tweak and this key XORed in */ + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm9, %ymm0, %ymm0 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm9, %ymm1, %ymm1 + vpxor %ymm6, %ymm2, %ymm2 + vpxor %ymm9, %ymm2, %ymm2 + 
vpxor %ymm7, %ymm3, %ymm3 + vpxor %ymm9, %ymm3, %ymm3 + vbroadcasti128 16(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 32(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 48(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 64(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 80(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 96(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 112(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 128(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 144(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + cmpl $11, %r9d /* round count below 11: the key at offset 160 is the last round key */ + vbroadcasti128 160(%r10), %ymm9 + jl L_AES_XTS_decrypt_update_vaes_aes_dec_128_aes_dec_block_last + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 176(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + cmpl $13, %r9d /* round count below 13: last round key at offset 192, otherwise at 224 */ + vbroadcasti128 192(%r10), %ymm9 + jl L_AES_XTS_decrypt_update_vaes_aes_dec_128_aes_dec_block_last + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, 
%ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 208(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vaesdec %ymm9, %ymm2, %ymm2 + vaesdec %ymm9, %ymm3, %ymm3 + vbroadcasti128 224(%r10), %ymm9 +L_AES_XTS_decrypt_update_vaes_aes_dec_128_aes_dec_block_last: + vaesdeclast %ymm9, %ymm0, %ymm0 + vaesdeclast %ymm9, %ymm1, %ymm1 + vaesdeclast %ymm9, %ymm2, %ymm2 + vaesdeclast %ymm9, %ymm3, %ymm3 + vpxor %ymm4, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vpsrlq $56, %ymm4, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $8, %ymm4, %ymm4 + vpxor %ymm10, %ymm4, %ymm4 + vpxor %ymm9, %ymm4, %ymm4 /* ymm4 shifted left 8 bits with the fold: tweaks for the next 128-byte pass; ymm5..ymm7 get the same update below */ + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vpsrlq $56, %ymm5, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $8, %ymm5, %ymm5 + vpxor %ymm10, %ymm5, %ymm5 + vpxor %ymm9, %ymm5, %ymm5 + vpxor %ymm6, %ymm2, %ymm2 + vmovdqu %ymm2, 64(%rdx) + vpsrlq $56, %ymm6, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $8, %ymm6, %ymm6 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm9, %ymm6, %ymm6 + vpxor %ymm7, %ymm3, %ymm3 + vmovdqu %ymm3, 96(%rdx) + vpsrlq $56, %ymm7, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $8, %ymm7, %ymm7 + vpxor %ymm10, %ymm7, %ymm7 + vpxor %ymm9, %ymm7, %ymm7 + addl $0x80, %r12d + cmpl %r11d, %r12d + jl L_AES_XTS_decrypt_update_vaes_dec_128 + vextracti128 $0x00, %ymm4, %xmm8 /* low lane of ymm4 becomes the current tweak */ +L_AES_XTS_decrypt_update_vaes_done_128: /* finished when r12 equals the byte count; otherwise r11 is rebuilt as the limit for whole blocks, again holding back one block when the count is not a multiple of 16 */ + cmpl %eax, %r12d + movl %eax, %r11d + je L_AES_XTS_decrypt_update_vaes_done_dec + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_update_vaes_mul16_64 + subl $16, %r11d + subl %r12d, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_update_vaes_last_31_start + addl %r12d, %r11d +L_AES_XTS_decrypt_update_vaes_mul16_64: + andl $0xffffffc0, %r11d /* single 64-byte step (no loop); skipped when r11 rounded down to a multiple of 64 equals r12 */ + cmpl %r11d, %r12d + je L_AES_XTS_decrypt_update_vaes_done_64 + # 64 bytes of input + # 
aes_dec_64 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 /* 4 blocks in ymm0, ymm1; their tweaks are rebuilt from xmm8 into ymm4, ymm5 */ + vperm2i128 $0x00, %ymm8, %ymm8, %ymm5 + vpsrlvq %ymm15, %ymm5, %ymm6 + vpclmulqdq $0x01, %ymm13, %ymm6, %ymm7 + vpslldq $8, %ymm6, %ymm6 + vpsllvq %ymm14, %ymm5, %ymm4 + vpxor %ymm7, %ymm4, %ymm4 + vpxor %ymm6, %ymm4, %ymm4 + vpsrlq $62, %ymm4, %ymm9 + vpclmulqdq $0x01, %ymm13, %ymm9, %ymm10 + vpslldq $8, %ymm9, %ymm9 + vpsllq $2, %ymm4, %ymm5 + vpxor %ymm10, %ymm5, %ymm5 + vpxor %ymm9, %ymm5, %ymm5 + # aes_dec_block + vbroadcasti128 (%r10), %ymm9 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm9, %ymm0, %ymm0 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm9, %ymm1, %ymm1 + vbroadcasti128 16(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 32(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 48(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 64(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 80(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 96(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 112(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 128(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 144(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + cmpl $11, %r9d + vbroadcasti128 160(%r10), %ymm9 + jl L_AES_XTS_decrypt_update_vaes_aes_dec_64_aes_dec_block_last + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 176(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + cmpl $13, %r9d + vbroadcasti128 192(%r10), %ymm9 + jl L_AES_XTS_decrypt_update_vaes_aes_dec_64_aes_dec_block_last + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 208(%r10), 
%ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vaesdec %ymm9, %ymm1, %ymm1 + vbroadcasti128 224(%r10), %ymm9 +L_AES_XTS_decrypt_update_vaes_aes_dec_64_aes_dec_block_last: + vaesdeclast %ymm9, %ymm0, %ymm0 + vaesdeclast %ymm9, %ymm1, %ymm1 + vpxor %ymm4, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vpxor %ymm5, %ymm1, %ymm1 + vmovdqu %ymm1, 32(%rdx) + vextracti128 $0x01, %ymm5, %xmm8 /* high lane of ymm5: the tweak of the last block just stored */ + vpshufd $19, %xmm8, %xmm9 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm9, %xmm9 + vpand %xmm12, %xmm9, %xmm9 + vpxor %xmm9, %xmm8, %xmm8 /* xmm8 advanced one step: each qword doubled by vpaddq, then the sign bits gathered by vpshufd/vpsrad are masked with xmm12 and XORed in */ + addl $0x40, %r12d +L_AES_XTS_decrypt_update_vaes_done_64: + cmpl %eax, %r12d + movl %eax, %r11d + je L_AES_XTS_decrypt_update_vaes_done_dec + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_update_vaes_mul16_32 + subl $16, %r11d + subl %r12d, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_update_vaes_last_31_start + addl %r12d, %r11d +L_AES_XTS_decrypt_update_vaes_mul16_32: + andl $0xffffffe0, %r11d /* same scheme for a single 32-byte step */ + cmpl %r11d, %r12d + je L_AES_XTS_decrypt_update_vaes_done_32 + # 32 bytes of input + # aes_dec_32 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu (%rcx), %ymm0 + vperm2i128 $0x00, %ymm8, %ymm8, %ymm5 + vpsrlvq %ymm15, %ymm5, %ymm6 + vpclmulqdq $0x01, %ymm13, %ymm6, %ymm7 + vpslldq $8, %ymm6, %ymm6 + vpsllvq %ymm14, %ymm5, %ymm4 + vpxor %ymm7, %ymm4, %ymm4 + vpxor %ymm6, %ymm4, %ymm4 + # aes_dec_block + vbroadcasti128 (%r10), %ymm9 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm9, %ymm0, %ymm0 + vbroadcasti128 16(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 32(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 48(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 64(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 80(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 96(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 112(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 128(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 144(%r10), %ymm9 + vaesdec 
%ymm9, %ymm0, %ymm0 + cmpl $11, %r9d + vbroadcasti128 160(%r10), %ymm9 + jl L_AES_XTS_decrypt_update_vaes_aes_dec_32_aes_dec_block_last + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 176(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + cmpl $13, %r9d + vbroadcasti128 192(%r10), %ymm9 + jl L_AES_XTS_decrypt_update_vaes_aes_dec_32_aes_dec_block_last + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 208(%r10), %ymm9 + vaesdec %ymm9, %ymm0, %ymm0 + vbroadcasti128 224(%r10), %ymm9 +L_AES_XTS_decrypt_update_vaes_aes_dec_32_aes_dec_block_last: + vaesdeclast %ymm9, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vmovdqu %ymm0, (%rdx) + vextracti128 $0x01, %ymm4, %xmm8 /* high lane of ymm4, advanced one step by the next five instructions */ + vpshufd $19, %xmm8, %xmm9 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm9, %xmm9 + vpand %xmm12, %xmm9, %xmm9 + vpxor %xmm9, %xmm8, %xmm8 + addl $32, %r12d +L_AES_XTS_decrypt_update_vaes_done_32: + cmpl %eax, %r12d + movl %eax, %r11d + je L_AES_XTS_decrypt_update_vaes_done_dec + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_update_vaes_mul16 + subl $16, %r11d + subl %r12d, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_update_vaes_last_31_start + addl %r12d, %r11d +L_AES_XTS_decrypt_update_vaes_mul16: +L_AES_XTS_decrypt_update_vaes_dec_16: /* loop: one 16-byte block per pass with tweak xmm8, repeated while r12 is below r11 */ + # 16 bytes of input + leaq (%rdi,%r12,1), %rcx + vmovdqu (%rcx), %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + # aes_dec_block + vpxor (%r10), %xmm0, %xmm0 + vmovdqu 16(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + cmpl $11, %r9d + vmovdqu 160(%r10), %xmm5 + jl L_AES_XTS_decrypt_update_vaes_aes_dec_block_last + vaesdec %xmm5, 
%xmm0, %xmm0 + vmovdqu 176(%r10), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + cmpl $13, %r9d + vmovdqu 192(%r10), %xmm5 + jl L_AES_XTS_decrypt_update_vaes_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r10), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r10), %xmm5 +L_AES_XTS_decrypt_update_vaes_aes_dec_block_last: + vaesdeclast %xmm5, %xmm0, %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + leaq (%rsi,%r12,1), %rcx + vmovdqu %xmm0, (%rcx) + vpshufd $19, %xmm8, %xmm4 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm4, %xmm4 + vpand %xmm12, %xmm4, %xmm4 + vpxor %xmm4, %xmm8, %xmm8 /* tweak advanced one step for the next block */ + addl $16, %r12d + cmpl %r11d, %r12d + jl L_AES_XTS_decrypt_update_vaes_dec_16 + cmpl %eax, %r12d + je L_AES_XTS_decrypt_update_vaes_done_dec +L_AES_XTS_decrypt_update_vaes_last_31_start: /* tail for a byte count that is not a multiple of 16: the last full block is decrypted with the next tweak (xmm7) while xmm8 is kept for the final merged block */ + vpshufd $19, %xmm8, %xmm4 + vpaddq %xmm8, %xmm8, %xmm7 + vpsrad $31, %xmm4, %xmm4 + vpand %xmm12, %xmm4, %xmm4 + vpxor %xmm4, %xmm7, %xmm7 + leaq (%rdi,%r12,1), %rcx + vmovdqu (%rcx), %xmm0 + vpxor %xmm7, %xmm0, %xmm0 + # aes_dec_block + vpxor (%r10), %xmm0, %xmm0 + vmovdqu 16(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + cmpl $11, %r9d + vmovdqu 160(%r10), %xmm5 + jl L_AES_XTS_decrypt_update_vaes_last_31_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r10), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + cmpl $13, %r9d + vmovdqu 192(%r10), %xmm5 + jl L_AES_XTS_decrypt_update_vaes_last_31_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r10), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r10), %xmm5 
+L_AES_XTS_decrypt_update_vaes_last_31_aes_dec_block_last: + vaesdeclast %xmm5, %xmm0, %xmm0 + vpxor %xmm7, %xmm0, %xmm0 + vmovdqu %xmm0, (%rsp) /* decrypted block parked in the stack scratch */ + addq $16, %r12 + xorq %rdx, %rdx +L_AES_XTS_decrypt_update_vaes_last_31_byte_loop: /* for each trailing input byte: the scratch byte goes to the output tail and the input byte takes its place in scratch */ + movb (%rsp,%rdx,1), %r11b + movb (%rdi,%r12,1), %cl + movb %r11b, (%rsi,%r12,1) + movb %cl, (%rsp,%rdx,1) + incl %r12d + incl %edx + cmpl %eax, %r12d + jl L_AES_XTS_decrypt_update_vaes_last_31_byte_loop + subq %rdx, %r12 + vmovdqu (%rsp), %xmm0 /* merged block, decrypted below with tweak xmm8 */ + vpxor %xmm8, %xmm0, %xmm0 + # aes_dec_block + vpxor (%r10), %xmm0, %xmm0 + vmovdqu 16(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + cmpl $11, %r9d + vmovdqu 160(%r10), %xmm5 + jl L_AES_XTS_decrypt_update_vaes_last_31_2_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r10), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + cmpl $13, %r9d + vmovdqu 192(%r10), %xmm5 + jl L_AES_XTS_decrypt_update_vaes_last_31_2_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r10), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r10), %xmm5 +L_AES_XTS_decrypt_update_vaes_last_31_2_aes_dec_block_last: + vaesdeclast %xmm5, %xmm0, %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + subq $16, %r12 /* back to the offset of the last full block */ + leaq (%rsi,%r12,1), %rcx + vmovdqu %xmm0, (%rcx) +L_AES_XTS_decrypt_update_vaes_done_dec: + vmovdqu %xmm8, (%r8) /* store xmm8 back to the tweak buffer */ + addq $0x40, %rsp + popq %r12 + repz retq +#ifndef __APPLE__ +.size AES_XTS_decrypt_update_vaes,.-AES_XTS_decrypt_update_vaes +#endif /* __APPLE__ */ +#endif /* HAVE_INTEL_VAES */ +#ifdef HAVE_INTEL_AVX512 +#ifndef __APPLE__ +.text +.globl AES_XTS_init_avx512 +.type 
AES_XTS_init_avx512,@function +.align 16 +AES_XTS_init_avx512: +#else +.section __TEXT,__text +.globl _AES_XTS_init_avx512 +.p2align 4 +_AES_XTS_init_avx512: +#endif /* __APPLE__ */ + vmovdqu (%rdi), %xmm0 /* AES_XTS_init_avx512: AES-encrypt the 16-byte block at (%rdi) in place; %rsi = round keys, %edx = round count. The local label names call this block the tweak. Only xmm registers are touched. */ + # aes_enc_block + vpxor (%rsi), %xmm0, %xmm0 + vmovdqu 16(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 32(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 48(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 64(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 80(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 96(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 112(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 128(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 144(%rsi), %xmm2 + vaesenc %xmm2, %xmm0, %xmm0 + cmpl $11, %edx /* round count below 11: the key at offset 160 is the last round key */ + vmovdqu 160(%rsi), %xmm2 + jl L_AES_XTS_init_avx512_tweak_aes_enc_block_last + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 176(%rsi), %xmm3 + vaesenc %xmm3, %xmm0, %xmm0 + cmpl $13, %edx /* round count below 13: last round key at offset 192, otherwise at 224 */ + vmovdqu 192(%rsi), %xmm2 + jl L_AES_XTS_init_avx512_tweak_aes_enc_block_last + vaesenc %xmm2, %xmm0, %xmm0 + vmovdqu 208(%rsi), %xmm3 + vaesenc %xmm3, %xmm0, %xmm0 + vmovdqu 224(%rsi), %xmm2 +L_AES_XTS_init_avx512_tweak_aes_enc_block_last: + vaesenclast %xmm2, %xmm0, %xmm0 + vmovdqu %xmm0, (%rdi) /* result overwrites the input block */ + repz retq +#ifndef __APPLE__ +.size AES_XTS_init_avx512,.-AES_XTS_init_avx512 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_avx512_aes_xts_gc_xts: /* dword mask 0x87, 0, 1, 0; loaded into xmm12 by AES_XTS_encrypt_avx512 for the single-step tweak update */ +.long 0x00000087,0x00000000,0x00000001,0x00000000 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_avx512_aes_xts_poly: /* low qword 0x87, high qword 0; broadcast to every lane of zmm13 and used as the vpclmulqdq multiplier */ +.long 0x00000087,0x00000000,0x00000000,0x00000000 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ 
+L_avx512_aes_xts_shl: /* qword left-shift counts for vpsllvq via zmm14: 128-bit lane n (0..3) holds the count n in both qwords */ +.long 0x00000000,0x00000000,0x00000000,0x00000000 +.long 0x00000001,0x00000000,0x00000001,0x00000000 +.long 0x00000002,0x00000000,0x00000002,0x00000000 +.long 0x00000003,0x00000000,0x00000003,0x00000000 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_avx512_aes_xts_shr: /* matching qword right-shift counts for vpsrlvq via zmm15: 64, 63, 62, 61 per lane, i.e. 64 minus the left-shift count; a count of 64 yields zero */ +.long 0x00000040,0x00000000,0x00000040,0x00000000 +.long 0x0000003f,0x00000000,0x0000003f,0x00000000 +.long 0x0000003e,0x00000000,0x0000003e,0x00000000 +.long 0x0000003d,0x00000000,0x0000003d,0x00000000 +#ifndef __APPLE__ +.text +.globl AES_XTS_encrypt_avx512 +.type AES_XTS_encrypt_avx512,@function +.align 16 +AES_XTS_encrypt_avx512: +#else +.section __TEXT,__text +.globl _AES_XTS_encrypt_avx512 +.p2align 4 +_AES_XTS_encrypt_avx512: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + movq %rdx, %rax + movq %rcx, %r12 + movl 24(%rsp), %r10d + subq $0x40, %rsp + vmovdqu L_avx512_aes_xts_gc_xts(%rip), %xmm12 + vbroadcasti32x4 L_avx512_aes_xts_poly(%rip), %zmm13 + vmovdqu64 L_avx512_aes_xts_shl(%rip), %zmm14 + vmovdqu64 L_avx512_aes_xts_shr(%rip), %zmm15 + vmovdqu (%r12), %xmm8 + # aes_enc_block + vpxor (%r9), %xmm8, %xmm8 + vmovdqu 16(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 32(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 48(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 64(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 80(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 96(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 112(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 128(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 144(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + cmpl $11, %r10d + vmovdqu 160(%r9), %xmm5 + jl L_AES_XTS_encrypt_avx512_tweak_aes_enc_block_last + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 176(%r9), %xmm6 + vaesenc %xmm6, %xmm8, %xmm8 + cmpl $13, %r10d + vmovdqu 192(%r9), %xmm5 + jl 
L_AES_XTS_encrypt_avx512_tweak_aes_enc_block_last + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 208(%r9), %xmm6 + vaesenc %xmm6, %xmm8, %xmm8 + vmovdqu 224(%r9), %xmm5 +L_AES_XTS_encrypt_avx512_tweak_aes_enc_block_last: + vaesenclast %xmm5, %xmm8, %xmm8 + xorl %r13d, %r13d + cmpl $32, %eax + jl L_AES_XTS_encrypt_avx512_done_128 + vbroadcasti32x4 (%r8), %zmm16 + vbroadcasti32x4 16(%r8), %zmm17 + vbroadcasti32x4 32(%r8), %zmm18 + vbroadcasti32x4 48(%r8), %zmm19 + vbroadcasti32x4 64(%r8), %zmm20 + vbroadcasti32x4 80(%r8), %zmm21 + vbroadcasti32x4 96(%r8), %zmm22 + vbroadcasti32x4 112(%r8), %zmm23 + vbroadcasti32x4 128(%r8), %zmm24 + vbroadcasti32x4 144(%r8), %zmm25 + vbroadcasti32x4 160(%r8), %zmm26 + cmpl $11, %r10d + jl L_AES_XTS_encrypt_avx512_key_cached + vbroadcasti32x4 176(%r8), %zmm27 + vbroadcasti32x4 192(%r8), %zmm28 + cmpl $13, %r10d + jl L_AES_XTS_encrypt_avx512_key_cached + vbroadcasti32x4 208(%r8), %zmm29 + vbroadcasti32x4 224(%r8), %zmm30 +L_AES_XTS_encrypt_avx512_key_cached: + cmpl $0x100, %eax + movl %eax, %r11d + jl L_AES_XTS_encrypt_avx512_done_256 + andl $0xffffff00, %r11d + vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 + vpsrlvq %zmm15, %zmm5, %zmm6 + vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7 + vpslldq $8, %zmm6, %zmm6 + vpsllvq %zmm14, %zmm5, %zmm4 + vpternlogq $0x96, %zmm6, %zmm7, %zmm4 + vpsrlq $60, %zmm4, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $4, %zmm4, %zmm5 + vpternlogq $0x96, %zmm9, %zmm10, %zmm5 + vpsrlq $60, %zmm5, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $4, %zmm5, %zmm6 + vpternlogq $0x96, %zmm9, %zmm10, %zmm6 + vpsrlq $60, %zmm6, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $4, %zmm6, %zmm7 + vpternlogq $0x96, %zmm9, %zmm10, %zmm7 +L_AES_XTS_encrypt_avx512_enc_256: + # 256 bytes of input + # aes_enc_256 + leaq (%rdi,%r13,1), %rcx + leaq (%rsi,%r13,1), %rdx + vmovdqu64 (%rcx), %zmm0 + vmovdqu64 64(%rcx), %zmm1 + vmovdqu64 
128(%rcx), %zmm2 + vmovdqu64 192(%rcx), %zmm3 + # aes_enc_block + vpternlogq $0x96, %zmm4, %zmm16, %zmm0 + vpternlogq $0x96, %zmm5, %zmm16, %zmm1 + vpternlogq $0x96, %zmm6, %zmm16, %zmm2 + vpternlogq $0x96, %zmm7, %zmm16, %zmm3 + vaesenc %zmm17, %zmm0, %zmm0 + vaesenc %zmm17, %zmm1, %zmm1 + vaesenc %zmm17, %zmm2, %zmm2 + vaesenc %zmm17, %zmm3, %zmm3 + vaesenc %zmm18, %zmm0, %zmm0 + vaesenc %zmm18, %zmm1, %zmm1 + vaesenc %zmm18, %zmm2, %zmm2 + vaesenc %zmm18, %zmm3, %zmm3 + vaesenc %zmm19, %zmm0, %zmm0 + vaesenc %zmm19, %zmm1, %zmm1 + vaesenc %zmm19, %zmm2, %zmm2 + vaesenc %zmm19, %zmm3, %zmm3 + vaesenc %zmm20, %zmm0, %zmm0 + vaesenc %zmm20, %zmm1, %zmm1 + vaesenc %zmm20, %zmm2, %zmm2 + vaesenc %zmm20, %zmm3, %zmm3 + vaesenc %zmm21, %zmm0, %zmm0 + vaesenc %zmm21, %zmm1, %zmm1 + vaesenc %zmm21, %zmm2, %zmm2 + vaesenc %zmm21, %zmm3, %zmm3 + vaesenc %zmm22, %zmm0, %zmm0 + vaesenc %zmm22, %zmm1, %zmm1 + vaesenc %zmm22, %zmm2, %zmm2 + vaesenc %zmm22, %zmm3, %zmm3 + vaesenc %zmm23, %zmm0, %zmm0 + vaesenc %zmm23, %zmm1, %zmm1 + vaesenc %zmm23, %zmm2, %zmm2 + vaesenc %zmm23, %zmm3, %zmm3 + vaesenc %zmm24, %zmm0, %zmm0 + vaesenc %zmm24, %zmm1, %zmm1 + vaesenc %zmm24, %zmm2, %zmm2 + vaesenc %zmm24, %zmm3, %zmm3 + vaesenc %zmm25, %zmm0, %zmm0 + vaesenc %zmm25, %zmm1, %zmm1 + vaesenc %zmm25, %zmm2, %zmm2 + vaesenc %zmm25, %zmm3, %zmm3 + cmpl $11, %r10d + vmovdqa64 %zmm26, %zmm9 + jl L_AES_XTS_encrypt_avx512_aes_enc_256_aes_enc_block_last + vaesenc %zmm26, %zmm0, %zmm0 + vaesenc %zmm26, %zmm1, %zmm1 + vaesenc %zmm26, %zmm2, %zmm2 + vaesenc %zmm26, %zmm3, %zmm3 + vaesenc %zmm27, %zmm0, %zmm0 + vaesenc %zmm27, %zmm1, %zmm1 + vaesenc %zmm27, %zmm2, %zmm2 + vaesenc %zmm27, %zmm3, %zmm3 + cmpl $13, %r10d + vmovdqa64 %zmm28, %zmm9 + jl L_AES_XTS_encrypt_avx512_aes_enc_256_aes_enc_block_last + vaesenc %zmm28, %zmm0, %zmm0 + vaesenc %zmm28, %zmm1, %zmm1 + vaesenc %zmm28, %zmm2, %zmm2 + vaesenc %zmm28, %zmm3, %zmm3 + vaesenc %zmm29, %zmm0, %zmm0 + vaesenc %zmm29, %zmm1, %zmm1 + vaesenc 
%zmm29, %zmm2, %zmm2 + vaesenc %zmm29, %zmm3, %zmm3 + vmovdqa64 %zmm30, %zmm9 +L_AES_XTS_encrypt_avx512_aes_enc_256_aes_enc_block_last: + vaesenclast %zmm9, %zmm0, %zmm0 + vaesenclast %zmm9, %zmm1, %zmm1 + vaesenclast %zmm9, %zmm2, %zmm2 + vaesenclast %zmm9, %zmm3, %zmm3 + vpxorq %zmm4, %zmm0, %zmm0 + vmovdqu64 %zmm0, (%rdx) + vpsrlq $48, %zmm4, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $16, %zmm4, %zmm4 + vpternlogq $0x96, %zmm9, %zmm10, %zmm4 + vpxorq %zmm5, %zmm1, %zmm1 + vmovdqu64 %zmm1, 64(%rdx) + vpsrlq $48, %zmm5, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $16, %zmm5, %zmm5 + vpternlogq $0x96, %zmm9, %zmm10, %zmm5 + vpxorq %zmm6, %zmm2, %zmm2 + vmovdqu64 %zmm2, 128(%rdx) + vpsrlq $48, %zmm6, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $16, %zmm6, %zmm6 + vpternlogq $0x96, %zmm9, %zmm10, %zmm6 + vpxorq %zmm7, %zmm3, %zmm3 + vmovdqu64 %zmm3, 192(%rdx) + vpsrlq $48, %zmm7, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $16, %zmm7, %zmm7 + vpternlogq $0x96, %zmm9, %zmm10, %zmm7 + addl $0x100, %r13d + cmpl %r11d, %r13d + jl L_AES_XTS_encrypt_avx512_enc_256 + vextracti32x4 $0x00, %zmm4, %xmm8 +L_AES_XTS_encrypt_avx512_done_256: + movl %eax, %r11d + andl $0xffffff80, %r11d + cmpl %r11d, %r13d + je L_AES_XTS_encrypt_avx512_done_128 + # 128 bytes of input + # aes_enc_128 + leaq (%rdi,%r13,1), %rcx + leaq (%rsi,%r13,1), %rdx + vmovdqu64 (%rcx), %zmm0 + vmovdqu64 64(%rcx), %zmm1 + vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 + vpsrlvq %zmm15, %zmm5, %zmm6 + vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7 + vpslldq $8, %zmm6, %zmm6 + vpsllvq %zmm14, %zmm5, %zmm4 + vpternlogq $0x96, %zmm6, %zmm7, %zmm4 + vpsrlq $60, %zmm4, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $4, %zmm4, %zmm5 + vpternlogq $0x96, %zmm9, %zmm10, %zmm5 + # aes_enc_block + vpternlogq $0x96, %zmm4, %zmm16, %zmm0 + 
vpternlogq $0x96, %zmm5, %zmm16, %zmm1 + vaesenc %zmm17, %zmm0, %zmm0 + vaesenc %zmm17, %zmm1, %zmm1 + vaesenc %zmm18, %zmm0, %zmm0 + vaesenc %zmm18, %zmm1, %zmm1 + vaesenc %zmm19, %zmm0, %zmm0 + vaesenc %zmm19, %zmm1, %zmm1 + vaesenc %zmm20, %zmm0, %zmm0 + vaesenc %zmm20, %zmm1, %zmm1 + vaesenc %zmm21, %zmm0, %zmm0 + vaesenc %zmm21, %zmm1, %zmm1 + vaesenc %zmm22, %zmm0, %zmm0 + vaesenc %zmm22, %zmm1, %zmm1 + vaesenc %zmm23, %zmm0, %zmm0 + vaesenc %zmm23, %zmm1, %zmm1 + vaesenc %zmm24, %zmm0, %zmm0 + vaesenc %zmm24, %zmm1, %zmm1 + vaesenc %zmm25, %zmm0, %zmm0 + vaesenc %zmm25, %zmm1, %zmm1 + cmpl $11, %r10d + vmovdqa64 %zmm26, %zmm9 + jl L_AES_XTS_encrypt_avx512_aes_enc_128_aes_enc_block_last + vaesenc %zmm26, %zmm0, %zmm0 + vaesenc %zmm26, %zmm1, %zmm1 + vaesenc %zmm27, %zmm0, %zmm0 + vaesenc %zmm27, %zmm1, %zmm1 + cmpl $13, %r10d + vmovdqa64 %zmm28, %zmm9 + jl L_AES_XTS_encrypt_avx512_aes_enc_128_aes_enc_block_last + vaesenc %zmm28, %zmm0, %zmm0 + vaesenc %zmm28, %zmm1, %zmm1 + vaesenc %zmm29, %zmm0, %zmm0 + vaesenc %zmm29, %zmm1, %zmm1 + vmovdqa64 %zmm30, %zmm9 +L_AES_XTS_encrypt_avx512_aes_enc_128_aes_enc_block_last: + vaesenclast %zmm9, %zmm0, %zmm0 + vaesenclast %zmm9, %zmm1, %zmm1 + vpxorq %zmm4, %zmm0, %zmm0 + vmovdqu64 %zmm0, (%rdx) + vpxorq %zmm5, %zmm1, %zmm1 + vmovdqu64 %zmm1, 64(%rdx) + vextracti32x4 $3, %zmm5, %xmm8 + vpshufd $19, %xmm8, %xmm9 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm9, %xmm9 + vpternlogd $0x78, %xmm12, %xmm9, %xmm8 + addl $0x80, %r13d +L_AES_XTS_encrypt_avx512_done_128: + movl %eax, %r11d + andl $0xffffffc0, %r11d + cmpl %r11d, %r13d + je L_AES_XTS_encrypt_avx512_done_64 + # 64 bytes of input + # aes_enc_64 + leaq (%rdi,%r13,1), %rcx + leaq (%rsi,%r13,1), %rdx + vmovdqu64 (%rcx), %zmm0 + vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 + vpsrlvq %zmm15, %zmm5, %zmm6 + vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7 + vpslldq $8, %zmm6, %zmm6 + vpsllvq %zmm14, %zmm5, %zmm4 + vpternlogq $0x96, %zmm6, %zmm7, %zmm4 + # aes_enc_block + vpternlogq 
$0x96, %zmm4, %zmm16, %zmm0 + vaesenc %zmm17, %zmm0, %zmm0 + vaesenc %zmm18, %zmm0, %zmm0 + vaesenc %zmm19, %zmm0, %zmm0 + vaesenc %zmm20, %zmm0, %zmm0 + vaesenc %zmm21, %zmm0, %zmm0 + vaesenc %zmm22, %zmm0, %zmm0 + vaesenc %zmm23, %zmm0, %zmm0 + vaesenc %zmm24, %zmm0, %zmm0 + vaesenc %zmm25, %zmm0, %zmm0 + cmpl $11, %r10d + vmovdqa64 %zmm26, %zmm9 + jl L_AES_XTS_encrypt_avx512_aes_enc_64_aes_enc_block_last + vaesenc %zmm26, %zmm0, %zmm0 + vaesenc %zmm27, %zmm0, %zmm0 + cmpl $13, %r10d + vmovdqa64 %zmm28, %zmm9 + jl L_AES_XTS_encrypt_avx512_aes_enc_64_aes_enc_block_last + vaesenc %zmm28, %zmm0, %zmm0 + vaesenc %zmm29, %zmm0, %zmm0 + vmovdqa64 %zmm30, %zmm9 +L_AES_XTS_encrypt_avx512_aes_enc_64_aes_enc_block_last: + vaesenclast %zmm9, %zmm0, %zmm0 + vpxorq %zmm4, %zmm0, %zmm0 + vmovdqu64 %zmm0, (%rdx) + vextracti32x4 $3, %zmm4, %xmm8 + vpshufd $19, %xmm8, %xmm9 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm9, %xmm9 + vpternlogd $0x78, %xmm12, %xmm9, %xmm8 + addl $0x40, %r13d +L_AES_XTS_encrypt_avx512_done_64: + movl %eax, %r11d + andl $0xffffffe0, %r11d + cmpl %r11d, %r13d + je L_AES_XTS_encrypt_avx512_done_32 + # 32 bytes of input + # aes_enc_32 + leaq (%rdi,%r13,1), %rcx + leaq (%rsi,%r13,1), %rdx + vmovdqu64 (%rcx), %ymm0 + vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 + vpsrlvq %zmm15, %zmm5, %zmm6 + vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7 + vpslldq $8, %zmm6, %zmm6 + vpsllvq %zmm14, %zmm5, %zmm4 + vpternlogq $0x96, %zmm6, %zmm7, %zmm4 + # aes_enc_block + vpternlogq $0x96, %ymm4, %ymm16, %ymm0 + vaesenc %ymm17, %ymm0, %ymm0 + vaesenc %ymm18, %ymm0, %ymm0 + vaesenc %ymm19, %ymm0, %ymm0 + vaesenc %ymm20, %ymm0, %ymm0 + vaesenc %ymm21, %ymm0, %ymm0 + vaesenc %ymm22, %ymm0, %ymm0 + vaesenc %ymm23, %ymm0, %ymm0 + vaesenc %ymm24, %ymm0, %ymm0 + vaesenc %ymm25, %ymm0, %ymm0 + cmpl $11, %r10d + vmovdqa64 %ymm26, %ymm9 + jl L_AES_XTS_encrypt_avx512_aes_enc_32_aes_enc_block_last + vaesenc %ymm26, %ymm0, %ymm0 + vaesenc %ymm27, %ymm0, %ymm0 + cmpl $13, %r10d + vmovdqa64 %ymm28, 
%ymm9 + jl L_AES_XTS_encrypt_avx512_aes_enc_32_aes_enc_block_last + vaesenc %ymm28, %ymm0, %ymm0 + vaesenc %ymm29, %ymm0, %ymm0 + vmovdqa64 %ymm30, %ymm9 +L_AES_XTS_encrypt_avx512_aes_enc_32_aes_enc_block_last: + vaesenclast %ymm9, %ymm0, %ymm0 + vpxorq %ymm4, %ymm0, %ymm0 + vmovdqu64 %ymm0, (%rdx) + vextracti32x4 $2, %zmm4, %xmm8 + addl $32, %r13d +L_AES_XTS_encrypt_avx512_done_32: + cmpl %eax, %r13d + movl %eax, %r11d + je L_AES_XTS_encrypt_avx512_done_enc + subl %r13d, %r11d + cmpl $16, %r11d + movl %eax, %r11d + jl L_AES_XTS_encrypt_avx512_last_15 + andl $0xfffffff0, %r11d + # 16 bytes of input +L_AES_XTS_encrypt_avx512_enc_16: + leaq (%rdi,%r13,1), %rcx + vmovdqu (%rcx), %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + # aes_enc_block + vpxor (%r8), %xmm0, %xmm0 + vmovdqu 16(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + cmpl $11, %r10d + vmovdqu 160(%r8), %xmm5 + jl L_AES_XTS_encrypt_avx512_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r8), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + cmpl $13, %r10d + vmovdqu 192(%r8), %xmm5 + jl L_AES_XTS_encrypt_avx512_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r8), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r8), %xmm5 +L_AES_XTS_encrypt_avx512_aes_enc_block_last: + vaesenclast %xmm5, %xmm0, %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + leaq (%rsi,%r13,1), %rcx + vmovdqu %xmm0, (%rcx) + vpshufd $19, %xmm8, %xmm4 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm4, %xmm4 + vpternlogd $0x78, %xmm12, %xmm4, %xmm8 + addl $16, %r13d + cmpl %r11d, %r13d + jl 
L_AES_XTS_encrypt_avx512_enc_16 + cmpl %eax, %r13d + je L_AES_XTS_encrypt_avx512_done_enc +L_AES_XTS_encrypt_avx512_last_15: + subq $16, %r13 + leaq (%rsi,%r13,1), %rcx + vmovdqu (%rcx), %xmm0 + addq $16, %r13 + vmovdqu %xmm0, (%rsp) + xorq %rdx, %rdx +L_AES_XTS_encrypt_avx512_last_15_byte_loop: + movb (%rsp,%rdx,1), %r11b + movb (%rdi,%r13,1), %cl + movb %r11b, (%rsi,%r13,1) + movb %cl, (%rsp,%rdx,1) + incl %r13d + incl %edx + cmpl %eax, %r13d + jl L_AES_XTS_encrypt_avx512_last_15_byte_loop + subq %rdx, %r13 + vmovdqu (%rsp), %xmm0 + subq $16, %r13 + vpxor %xmm8, %xmm0, %xmm0 + # aes_enc_block + vpxor (%r8), %xmm0, %xmm0 + vmovdqu 16(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r8), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + cmpl $11, %r10d + vmovdqu 160(%r8), %xmm5 + jl L_AES_XTS_encrypt_avx512_last_15_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r8), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + cmpl $13, %r10d + vmovdqu 192(%r8), %xmm5 + jl L_AES_XTS_encrypt_avx512_last_15_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r8), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r8), %xmm5 +L_AES_XTS_encrypt_avx512_last_15_aes_enc_block_last: + vaesenclast %xmm5, %xmm0, %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + leaq (%rsi,%r13,1), %rcx + vmovdqu %xmm0, (%rcx) +L_AES_XTS_encrypt_avx512_done_enc: + addq $0x40, %rsp + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size AES_XTS_encrypt_avx512,.-AES_XTS_encrypt_avx512 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_XTS_encrypt_update_avx512 +.type AES_XTS_encrypt_update_avx512,@function
+.align 16 +AES_XTS_encrypt_update_avx512: +#else +.section __TEXT,__text +.globl _AES_XTS_encrypt_update_avx512 +.p2align 4 +_AES_XTS_encrypt_update_avx512: +#endif /* __APPLE__ */ + pushq %r12 /* AES-XTS encryption of a byte range using VAES on 512-bit registers. Registers on entry, as used below: rdi = input, rsi = output, edx = byte count, rcx = round keys, r8 = 16-byte tweak (loaded here, stored back on exit), r9d = round-count selector. r12 is saved because it becomes the running byte offset. */ + movq %rdx, %rax /* eax = byte count for the rest of the function */ + movq %rcx, %r10 /* r10 = round key base, read at 16-byte strides */ + subq $0x40, %rsp /* scratch space; (%rsp) stages the partial final block */ + vmovdqu L_avx512_aes_xts_gc_xts(%rip), %xmm12 /* mask ANDed with the carry lanes when the tweak is doubled; the table is defined outside this view */ + vbroadcasti32x4 L_avx512_aes_xts_poly(%rip), %zmm13 /* carry-less multiplier that folds shifted-out bits back in; presumably the XTS reduction polynomial - confirm against the table */ + vmovdqu64 L_avx512_aes_xts_shl(%rip), %zmm14 /* per-element left shift counts used when deriving four tweaks at once */ + vmovdqu64 L_avx512_aes_xts_shr(%rip), %zmm15 /* matching per-element right shift counts */ + vmovdqu (%r8), %xmm8 /* xmm8 = current tweak */ + xorl %r12d, %r12d /* byte offset = 0 */ + cmpl $32, %eax + jl L_AES_XTS_encrypt_update_avx512_done_128 /* under 32 bytes: no wide path will run, so skip the key broadcast */ + vbroadcasti32x4 (%r10), %zmm16 /* zmm16..zmm30 = each round key replicated into all four 128-bit lanes */ + vbroadcasti32x4 16(%r10), %zmm17 + vbroadcasti32x4 32(%r10), %zmm18 + vbroadcasti32x4 48(%r10), %zmm19 + vbroadcasti32x4 64(%r10), %zmm20 + vbroadcasti32x4 80(%r10), %zmm21 + vbroadcasti32x4 96(%r10), %zmm22 + vbroadcasti32x4 112(%r10), %zmm23 + vbroadcasti32x4 128(%r10), %zmm24 + vbroadcasti32x4 144(%r10), %zmm25 + vbroadcasti32x4 160(%r10), %zmm26 + cmpl $11, %r9d /* r9d below 11: the key at offset 160 is the last one used */ + jl L_AES_XTS_encrypt_update_avx512_key_cached + vbroadcasti32x4 176(%r10), %zmm27 + vbroadcasti32x4 192(%r10), %zmm28 + cmpl $13, %r9d /* r9d below 13: keys end at offset 192, otherwise at 224 */ + jl L_AES_XTS_encrypt_update_avx512_key_cached + vbroadcasti32x4 208(%r10), %zmm29 + vbroadcasti32x4 224(%r10), %zmm30 +L_AES_XTS_encrypt_update_avx512_key_cached: + cmpl $0x100, %eax + movl %eax, %r11d /* movl leaves the flags of the compare above intact for the jl */ + jl L_AES_XTS_encrypt_update_avx512_done_256 + andl $0xffffff00, %r11d /* r11d = byte count rounded down to a multiple of 256 */ + vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 /* replicate the tweak into all four lanes */ + vpsrlvq %zmm15, %zmm5, %zmm6 + vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7 + vpslldq $8, %zmm6, %zmm6 + vpsllvq %zmm14, %zmm5, %zmm4 + vpternlogq $0x96, %zmm6, %zmm7, %zmm4 /* zmm4 = per-lane shifted tweak with the shifted-out bits folded back in */ + vpsrlq $60, %zmm4, %zmm9 /* next three groups: shift each 128-bit tweak left by 4 bits, reusing the same fold-back scheme */ + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $4, %zmm4, %zmm5 + vpternlogq $0x96, %zmm9, %zmm10, %zmm5 + vpsrlq $60, %zmm5, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $4, %zmm5, %zmm6 + vpternlogq $0x96, %zmm9, %zmm10, %zmm6 + vpsrlq $60, %zmm6, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9,
%zmm9 + vpsllq $4, %zmm6, %zmm7 + vpternlogq $0x96, %zmm9, %zmm10, %zmm7 /* zmm4..zmm7 presumably now hold the tweaks for 16 consecutive blocks - depends on the shl/shr table contents */ +L_AES_XTS_encrypt_update_avx512_enc_256: + # 256 bytes of input + # aes_enc_256 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu64 (%rcx), %zmm0 + vmovdqu64 64(%rcx), %zmm1 + vmovdqu64 128(%rcx), %zmm2 + vmovdqu64 192(%rcx), %zmm3 + # aes_enc_block + vpternlogq $0x96, %zmm4, %zmm16, %zmm0 /* imm 0x96 is a three-way XOR: data ^= first round key ^ tweak */ + vpternlogq $0x96, %zmm5, %zmm16, %zmm1 + vpternlogq $0x96, %zmm6, %zmm16, %zmm2 + vpternlogq $0x96, %zmm7, %zmm16, %zmm3 + vaesenc %zmm17, %zmm0, %zmm0 + vaesenc %zmm17, %zmm1, %zmm1 + vaesenc %zmm17, %zmm2, %zmm2 + vaesenc %zmm17, %zmm3, %zmm3 + vaesenc %zmm18, %zmm0, %zmm0 + vaesenc %zmm18, %zmm1, %zmm1 + vaesenc %zmm18, %zmm2, %zmm2 + vaesenc %zmm18, %zmm3, %zmm3 + vaesenc %zmm19, %zmm0, %zmm0 + vaesenc %zmm19, %zmm1, %zmm1 + vaesenc %zmm19, %zmm2, %zmm2 + vaesenc %zmm19, %zmm3, %zmm3 + vaesenc %zmm20, %zmm0, %zmm0 + vaesenc %zmm20, %zmm1, %zmm1 + vaesenc %zmm20, %zmm2, %zmm2 + vaesenc %zmm20, %zmm3, %zmm3 + vaesenc %zmm21, %zmm0, %zmm0 + vaesenc %zmm21, %zmm1, %zmm1 + vaesenc %zmm21, %zmm2, %zmm2 + vaesenc %zmm21, %zmm3, %zmm3 + vaesenc %zmm22, %zmm0, %zmm0 + vaesenc %zmm22, %zmm1, %zmm1 + vaesenc %zmm22, %zmm2, %zmm2 + vaesenc %zmm22, %zmm3, %zmm3 + vaesenc %zmm23, %zmm0, %zmm0 + vaesenc %zmm23, %zmm1, %zmm1 + vaesenc %zmm23, %zmm2, %zmm2 + vaesenc %zmm23, %zmm3, %zmm3 + vaesenc %zmm24, %zmm0, %zmm0 + vaesenc %zmm24, %zmm1, %zmm1 + vaesenc %zmm24, %zmm2, %zmm2 + vaesenc %zmm24, %zmm3, %zmm3 + vaesenc %zmm25, %zmm0, %zmm0 + vaesenc %zmm25, %zmm1, %zmm1 + vaesenc %zmm25, %zmm2, %zmm2 + vaesenc %zmm25, %zmm3, %zmm3 + cmpl $11, %r9d + vmovdqa64 %zmm26, %zmm9 /* zmm9 = key for vaesenclast if the branch below is taken */ + jl L_AES_XTS_encrypt_update_avx512_aes_enc_256_aes_enc_block_last + vaesenc %zmm26, %zmm0, %zmm0 + vaesenc %zmm26, %zmm1, %zmm1 + vaesenc %zmm26, %zmm2, %zmm2 + vaesenc %zmm26, %zmm3, %zmm3 + vaesenc %zmm27, %zmm0, %zmm0 + vaesenc %zmm27, %zmm1, %zmm1 + vaesenc %zmm27, %zmm2, %zmm2 + vaesenc %zmm27, %zmm3, %zmm3 + cmpl $13, %r9d +
vmovdqa64 %zmm28, %zmm9 + jl L_AES_XTS_encrypt_update_avx512_aes_enc_256_aes_enc_block_last + vaesenc %zmm28, %zmm0, %zmm0 + vaesenc %zmm28, %zmm1, %zmm1 + vaesenc %zmm28, %zmm2, %zmm2 + vaesenc %zmm28, %zmm3, %zmm3 + vaesenc %zmm29, %zmm0, %zmm0 + vaesenc %zmm29, %zmm1, %zmm1 + vaesenc %zmm29, %zmm2, %zmm2 + vaesenc %zmm29, %zmm3, %zmm3 + vmovdqa64 %zmm30, %zmm9 +L_AES_XTS_encrypt_update_avx512_aes_enc_256_aes_enc_block_last: + vaesenclast %zmm9, %zmm0, %zmm0 + vaesenclast %zmm9, %zmm1, %zmm1 + vaesenclast %zmm9, %zmm2, %zmm2 + vaesenclast %zmm9, %zmm3, %zmm3 + vpxorq %zmm4, %zmm0, %zmm0 /* second tweak XOR, applied to the cipher output */ + vmovdqu64 %zmm0, (%rdx) + vpsrlq $48, %zmm4, %zmm9 /* advance these four tweaks for the next iteration: 128-bit left shift by 16 bits, shifted-out bits folded back in through zmm13 */ + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $16, %zmm4, %zmm4 + vpternlogq $0x96, %zmm9, %zmm10, %zmm4 + vpxorq %zmm5, %zmm1, %zmm1 + vmovdqu64 %zmm1, 64(%rdx) + vpsrlq $48, %zmm5, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $16, %zmm5, %zmm5 + vpternlogq $0x96, %zmm9, %zmm10, %zmm5 + vpxorq %zmm6, %zmm2, %zmm2 + vmovdqu64 %zmm2, 128(%rdx) + vpsrlq $48, %zmm6, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $16, %zmm6, %zmm6 + vpternlogq $0x96, %zmm9, %zmm10, %zmm6 + vpxorq %zmm7, %zmm3, %zmm3 + vmovdqu64 %zmm3, 192(%rdx) + vpsrlq $48, %zmm7, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $16, %zmm7, %zmm7 + vpternlogq $0x96, %zmm9, %zmm10, %zmm7 + addl $0x100, %r12d + cmpl %r11d, %r12d + jl L_AES_XTS_encrypt_update_avx512_enc_256 + vextracti32x4 $0x00, %zmm4, %xmm8 /* lane 0 of the advanced tweaks is the next unused tweak */ +L_AES_XTS_encrypt_update_avx512_done_256: + movl %eax, %r11d + andl $0xffffff80, %r11d /* r11d = byte count rounded down to a multiple of 128 */ + cmpl %r11d, %r12d + je L_AES_XTS_encrypt_update_avx512_done_128 /* offset already there: no 128-byte chunk pending */ + # 128 bytes of input + # aes_enc_128 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu64 (%rcx), %zmm0 + vmovdqu64 64(%rcx), %zmm1 + vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 + vpsrlvq %zmm15, %zmm5, %zmm6 + vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7 + vpslldq $8,
%zmm6, %zmm6 + vpsllvq %zmm14, %zmm5, %zmm4 + vpternlogq $0x96, %zmm6, %zmm7, %zmm4 + vpsrlq $60, %zmm4, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $4, %zmm4, %zmm5 + vpternlogq $0x96, %zmm9, %zmm10, %zmm5 /* zmm4 and zmm5 = tweaks for these eight blocks */ + # aes_enc_block + vpternlogq $0x96, %zmm4, %zmm16, %zmm0 + vpternlogq $0x96, %zmm5, %zmm16, %zmm1 + vaesenc %zmm17, %zmm0, %zmm0 + vaesenc %zmm17, %zmm1, %zmm1 + vaesenc %zmm18, %zmm0, %zmm0 + vaesenc %zmm18, %zmm1, %zmm1 + vaesenc %zmm19, %zmm0, %zmm0 + vaesenc %zmm19, %zmm1, %zmm1 + vaesenc %zmm20, %zmm0, %zmm0 + vaesenc %zmm20, %zmm1, %zmm1 + vaesenc %zmm21, %zmm0, %zmm0 + vaesenc %zmm21, %zmm1, %zmm1 + vaesenc %zmm22, %zmm0, %zmm0 + vaesenc %zmm22, %zmm1, %zmm1 + vaesenc %zmm23, %zmm0, %zmm0 + vaesenc %zmm23, %zmm1, %zmm1 + vaesenc %zmm24, %zmm0, %zmm0 + vaesenc %zmm24, %zmm1, %zmm1 + vaesenc %zmm25, %zmm0, %zmm0 + vaesenc %zmm25, %zmm1, %zmm1 + cmpl $11, %r9d + vmovdqa64 %zmm26, %zmm9 + jl L_AES_XTS_encrypt_update_avx512_aes_enc_128_aes_enc_block_last + vaesenc %zmm26, %zmm0, %zmm0 + vaesenc %zmm26, %zmm1, %zmm1 + vaesenc %zmm27, %zmm0, %zmm0 + vaesenc %zmm27, %zmm1, %zmm1 + cmpl $13, %r9d + vmovdqa64 %zmm28, %zmm9 + jl L_AES_XTS_encrypt_update_avx512_aes_enc_128_aes_enc_block_last + vaesenc %zmm28, %zmm0, %zmm0 + vaesenc %zmm28, %zmm1, %zmm1 + vaesenc %zmm29, %zmm0, %zmm0 + vaesenc %zmm29, %zmm1, %zmm1 + vmovdqa64 %zmm30, %zmm9 +L_AES_XTS_encrypt_update_avx512_aes_enc_128_aes_enc_block_last: + vaesenclast %zmm9, %zmm0, %zmm0 + vaesenclast %zmm9, %zmm1, %zmm1 + vpxorq %zmm4, %zmm0, %zmm0 + vmovdqu64 %zmm0, (%rdx) + vpxorq %zmm5, %zmm1, %zmm1 + vmovdqu64 %zmm1, 64(%rdx) + vextracti32x4 $3, %zmm5, %xmm8 /* start from the last tweak used by this chunk, then step it once */ + vpshufd $19, %xmm8, %xmm9 + vpaddq %xmm8, %xmm8, %xmm8 /* shift both 64-bit halves left by one bit */ + vpsrad $31, %xmm9, %xmm9 /* turn the shuffled sign bits into all-ones or all-zero carry masks */ + vpternlogd $0x78, %xmm12, %xmm9, %xmm8 /* imm 0x78: xmm8 ^= xmm9 & xmm12, folding the carries back in */ + addl $0x80, %r12d +L_AES_XTS_encrypt_update_avx512_done_128: + movl %eax, %r11d + andl $0xffffffc0, %r11d /* r11d = byte count rounded down to a multiple of 64 */ + cmpl %r11d, %r12d + je L_AES_XTS_encrypt_update_avx512_done_64 + # 64 bytes of input
+ # aes_enc_64 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu64 (%rcx), %zmm0 + vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 + vpsrlvq %zmm15, %zmm5, %zmm6 + vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7 + vpslldq $8, %zmm6, %zmm6 + vpsllvq %zmm14, %zmm5, %zmm4 + vpternlogq $0x96, %zmm6, %zmm7, %zmm4 /* zmm4 = tweaks for these four blocks */ + # aes_enc_block + vpternlogq $0x96, %zmm4, %zmm16, %zmm0 + vaesenc %zmm17, %zmm0, %zmm0 + vaesenc %zmm18, %zmm0, %zmm0 + vaesenc %zmm19, %zmm0, %zmm0 + vaesenc %zmm20, %zmm0, %zmm0 + vaesenc %zmm21, %zmm0, %zmm0 + vaesenc %zmm22, %zmm0, %zmm0 + vaesenc %zmm23, %zmm0, %zmm0 + vaesenc %zmm24, %zmm0, %zmm0 + vaesenc %zmm25, %zmm0, %zmm0 + cmpl $11, %r9d + vmovdqa64 %zmm26, %zmm9 + jl L_AES_XTS_encrypt_update_avx512_aes_enc_64_aes_enc_block_last + vaesenc %zmm26, %zmm0, %zmm0 + vaesenc %zmm27, %zmm0, %zmm0 + cmpl $13, %r9d + vmovdqa64 %zmm28, %zmm9 + jl L_AES_XTS_encrypt_update_avx512_aes_enc_64_aes_enc_block_last + vaesenc %zmm28, %zmm0, %zmm0 + vaesenc %zmm29, %zmm0, %zmm0 + vmovdqa64 %zmm30, %zmm9 +L_AES_XTS_encrypt_update_avx512_aes_enc_64_aes_enc_block_last: + vaesenclast %zmm9, %zmm0, %zmm0 + vpxorq %zmm4, %zmm0, %zmm0 + vmovdqu64 %zmm0, (%rdx) + vextracti32x4 $3, %zmm4, %xmm8 /* last tweak used here, stepped once by the four instructions below */ + vpshufd $19, %xmm8, %xmm9 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm9, %xmm9 + vpternlogd $0x78, %xmm12, %xmm9, %xmm8 + addl $0x40, %r12d +L_AES_XTS_encrypt_update_avx512_done_64: + movl %eax, %r11d + andl $0xffffffe0, %r11d /* r11d = byte count rounded down to a multiple of 32 */ + cmpl %r11d, %r12d + je L_AES_XTS_encrypt_update_avx512_done_32 + # 32 bytes of input + # aes_enc_32 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu64 (%rcx), %ymm0 + vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 + vpsrlvq %zmm15, %zmm5, %zmm6 + vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7 + vpslldq $8, %zmm6, %zmm6 + vpsllvq %zmm14, %zmm5, %zmm4 + vpternlogq $0x96, %zmm6, %zmm7, %zmm4 /* four tweaks are derived although only the low two lanes are consumed by this chunk */ + # aes_enc_block + vpternlogq $0x96, %ymm4, %ymm16, %ymm0 + vaesenc %ymm17, %ymm0, %ymm0 + vaesenc %ymm18, %ymm0, %ymm0 + vaesenc %ymm19, %ymm0, %ymm0 + vaesenc %ymm20, %ymm0,
%ymm0 + vaesenc %ymm21, %ymm0, %ymm0 + vaesenc %ymm22, %ymm0, %ymm0 + vaesenc %ymm23, %ymm0, %ymm0 + vaesenc %ymm24, %ymm0, %ymm0 + vaesenc %ymm25, %ymm0, %ymm0 + cmpl $11, %r9d + vmovdqa64 %ymm26, %ymm9 + jl L_AES_XTS_encrypt_update_avx512_aes_enc_32_aes_enc_block_last + vaesenc %ymm26, %ymm0, %ymm0 + vaesenc %ymm27, %ymm0, %ymm0 + cmpl $13, %r9d + vmovdqa64 %ymm28, %ymm9 + jl L_AES_XTS_encrypt_update_avx512_aes_enc_32_aes_enc_block_last + vaesenc %ymm28, %ymm0, %ymm0 + vaesenc %ymm29, %ymm0, %ymm0 + vmovdqa64 %ymm30, %ymm9 +L_AES_XTS_encrypt_update_avx512_aes_enc_32_aes_enc_block_last: + vaesenclast %ymm9, %ymm0, %ymm0 + vpxorq %ymm4, %ymm0, %ymm0 + vmovdqu64 %ymm0, (%rdx) + vextracti32x4 $2, %zmm4, %xmm8 /* lane 2 is the first derived tweak not consumed above */ + addl $32, %r12d +L_AES_XTS_encrypt_update_avx512_done_32: + cmpl %eax, %r12d + movl %eax, %r11d + je L_AES_XTS_encrypt_update_avx512_done_enc /* every byte has been processed */ + subl %r12d, %r11d /* r11d = bytes still to process */ + cmpl $16, %r11d + movl %eax, %r11d /* does not disturb the flags tested by the jl */ + jl L_AES_XTS_encrypt_update_avx512_last_15 /* less than one block left: go to the partial-block tail */ + andl $0xfffffff0, %r11d /* r11d = byte count rounded down to whole 16-byte blocks */ + # 16 bytes of input +L_AES_XTS_encrypt_update_avx512_enc_16: + leaq (%rdi,%r12,1), %rcx + vmovdqu (%rcx), %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + # aes_enc_block + vpxor (%r10), %xmm0, %xmm0 /* this path reads the round keys from memory instead of zmm16..zmm30, which may not have been loaded */ + vmovdqu 16(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + cmpl $11, %r9d + vmovdqu 160(%r10), %xmm5 + jl L_AES_XTS_encrypt_update_avx512_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r10), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + cmpl $13, %r9d + vmovdqu 192(%r10), %xmm5 + jl L_AES_XTS_encrypt_update_avx512_aes_enc_block_last + vaesenc %xmm5, %xmm0,
%xmm0 + vmovdqu 208(%r10), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r10), %xmm5 +L_AES_XTS_encrypt_update_avx512_aes_enc_block_last: + vaesenclast %xmm5, %xmm0, %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + leaq (%rsi,%r12,1), %rcx + vmovdqu %xmm0, (%rcx) + vpshufd $19, %xmm8, %xmm4 /* step the tweak once for the next block */ + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm4, %xmm4 + vpternlogd $0x78, %xmm12, %xmm4, %xmm8 + addl $16, %r12d + cmpl %r11d, %r12d + jl L_AES_XTS_encrypt_update_avx512_enc_16 + cmpl %eax, %r12d + je L_AES_XTS_encrypt_update_avx512_done_enc /* length was a multiple of 16: no partial block */ +L_AES_XTS_encrypt_update_avx512_last_15: + subq $16, %r12 /* step back to the output block written most recently. NOTE(review): if the whole call covers fewer than 16 bytes this reads before the output buffer; presumably callers guarantee at least one full block - confirm */ + leaq (%rsi,%r12,1), %rcx + vmovdqu (%rcx), %xmm0 + addq $16, %r12 + vmovdqu %xmm0, (%rsp) /* stage that output block on the stack */ + xorq %rdx, %rdx +L_AES_XTS_encrypt_update_avx512_last_15_byte_loop: + movb (%rsp,%rdx,1), %r11b /* swap byte by byte: the staged output byte moves to the tail of the output, the trailing input byte takes its place in the staged block */ + movb (%rdi,%r12,1), %cl + movb %r11b, (%rsi,%r12,1) + movb %cl, (%rsp,%rdx,1) + incl %r12d + incl %edx + cmpl %eax, %r12d + jl L_AES_XTS_encrypt_update_avx512_last_15_byte_loop + subq %rdx, %r12 + vmovdqu (%rsp), %xmm0 /* staged block = trailing input bytes followed by the untouched rest of the earlier output block */ + subq $16, %r12 /* offset of the block that was staged */ + vpxor %xmm8, %xmm0, %xmm0 + # aes_enc_block + vpxor (%r10), %xmm0, %xmm0 + vmovdqu 16(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r10), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + cmpl $11, %r9d + vmovdqu 160(%r10), %xmm5 + jl L_AES_XTS_encrypt_update_avx512_last_15_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r10), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + cmpl $13, %r9d + vmovdqu 192(%r10), %xmm5 + jl L_AES_XTS_encrypt_update_avx512_last_15_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r10), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + vmovdqu
224(%r10), %xmm5 +L_AES_XTS_encrypt_update_avx512_last_15_aes_enc_block_last: + vaesenclast %xmm5, %xmm0, %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + leaq (%rsi,%r12,1), %rcx + vmovdqu %xmm0, (%rcx) /* the re-encrypted block replaces the earlier full output block */ +L_AES_XTS_encrypt_update_avx512_done_enc: + vmovdqu %xmm8, (%r8) /* write the tweak held in xmm8 back through r8 */ + addq $0x40, %rsp + popq %r12 + repz retq /* NOTE(review): no vzeroupper is issued although zmm0..zmm15 were written; confirm whether callers running legacy SSE code afterwards are affected */ +#ifndef __APPLE__ +.size AES_XTS_encrypt_update_avx512,.-AES_XTS_encrypt_update_avx512 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_XTS_decrypt_avx512 +.type AES_XTS_decrypt_avx512,@function +.align 16 +AES_XTS_decrypt_avx512: +#else +.section __TEXT,__text +.globl _AES_XTS_decrypt_avx512 +.p2align 4 +_AES_XTS_decrypt_avx512: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + movq %rdx, %rax + movq %rcx, %r12 + movl 24(%rsp), %r10d + subq $0x40, %rsp + vmovdqu L_avx512_aes_xts_gc_xts(%rip), %xmm12 + vbroadcasti32x4 L_avx512_aes_xts_poly(%rip), %zmm13 + vmovdqu64 L_avx512_aes_xts_shl(%rip), %zmm14 + vmovdqu64 L_avx512_aes_xts_shr(%rip), %zmm15 + vmovdqu (%r12), %xmm8 + # aes_enc_block + vpxor (%r9), %xmm8, %xmm8 + vmovdqu 16(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 32(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 48(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 64(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 80(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 96(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 112(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 128(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 144(%r9), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + cmpl $11, %r10d + vmovdqu 160(%r9), %xmm5 + jl L_AES_XTS_decrypt_avx512_tweak_aes_enc_block_last + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 176(%r9), %xmm6 + vaesenc %xmm6, %xmm8, %xmm8 + cmpl $13, %r10d + vmovdqu 192(%r9), %xmm5 + jl L_AES_XTS_decrypt_avx512_tweak_aes_enc_block_last + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 208(%r9), %xmm6 + vaesenc %xmm6, %xmm8, %xmm8 + vmovdqu 224(%r9), %xmm5 +L_AES_XTS_decrypt_avx512_tweak_aes_enc_block_last: + vaesenclast
%xmm5, %xmm8, %xmm8 + xorl %r13d, %r13d + movl %eax, %r11d + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_avx512_mul16_256 + subl $16, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_avx512_last_31_start +L_AES_XTS_decrypt_avx512_mul16_256: + cmpl $32, %r11d + jl L_AES_XTS_decrypt_avx512_done_128 + vbroadcasti32x4 (%r8), %zmm16 + vbroadcasti32x4 16(%r8), %zmm17 + vbroadcasti32x4 32(%r8), %zmm18 + vbroadcasti32x4 48(%r8), %zmm19 + vbroadcasti32x4 64(%r8), %zmm20 + vbroadcasti32x4 80(%r8), %zmm21 + vbroadcasti32x4 96(%r8), %zmm22 + vbroadcasti32x4 112(%r8), %zmm23 + vbroadcasti32x4 128(%r8), %zmm24 + vbroadcasti32x4 144(%r8), %zmm25 + vbroadcasti32x4 160(%r8), %zmm26 + cmpl $11, %r10d + jl L_AES_XTS_decrypt_avx512_key_cached + vbroadcasti32x4 176(%r8), %zmm27 + vbroadcasti32x4 192(%r8), %zmm28 + cmpl $13, %r10d + jl L_AES_XTS_decrypt_avx512_key_cached + vbroadcasti32x4 208(%r8), %zmm29 + vbroadcasti32x4 224(%r8), %zmm30 +L_AES_XTS_decrypt_avx512_key_cached: + cmpl $0x100, %r11d + jl L_AES_XTS_decrypt_avx512_done_256 + andl $0xffffff00, %r11d + vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 + vpsrlvq %zmm15, %zmm5, %zmm6 + vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7 + vpslldq $8, %zmm6, %zmm6 + vpsllvq %zmm14, %zmm5, %zmm4 + vpternlogq $0x96, %zmm6, %zmm7, %zmm4 + vpsrlq $60, %zmm4, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $4, %zmm4, %zmm5 + vpternlogq $0x96, %zmm9, %zmm10, %zmm5 + vpsrlq $60, %zmm5, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $4, %zmm5, %zmm6 + vpternlogq $0x96, %zmm9, %zmm10, %zmm6 + vpsrlq $60, %zmm6, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $4, %zmm6, %zmm7 + vpternlogq $0x96, %zmm9, %zmm10, %zmm7 +L_AES_XTS_decrypt_avx512_dec_256: + # 256 bytes of input + # aes_dec_256 + leaq (%rdi,%r13,1), %rcx + leaq (%rsi,%r13,1), %rdx + vmovdqu64 (%rcx), %zmm0 + vmovdqu64 64(%rcx), %zmm1 + vmovdqu64 128(%rcx), %zmm2 + vmovdqu64 
192(%rcx), %zmm3 + # aes_dec_block + vpternlogq $0x96, %zmm4, %zmm16, %zmm0 + vpternlogq $0x96, %zmm5, %zmm16, %zmm1 + vpternlogq $0x96, %zmm6, %zmm16, %zmm2 + vpternlogq $0x96, %zmm7, %zmm16, %zmm3 + vaesdec %zmm17, %zmm0, %zmm0 + vaesdec %zmm17, %zmm1, %zmm1 + vaesdec %zmm17, %zmm2, %zmm2 + vaesdec %zmm17, %zmm3, %zmm3 + vaesdec %zmm18, %zmm0, %zmm0 + vaesdec %zmm18, %zmm1, %zmm1 + vaesdec %zmm18, %zmm2, %zmm2 + vaesdec %zmm18, %zmm3, %zmm3 + vaesdec %zmm19, %zmm0, %zmm0 + vaesdec %zmm19, %zmm1, %zmm1 + vaesdec %zmm19, %zmm2, %zmm2 + vaesdec %zmm19, %zmm3, %zmm3 + vaesdec %zmm20, %zmm0, %zmm0 + vaesdec %zmm20, %zmm1, %zmm1 + vaesdec %zmm20, %zmm2, %zmm2 + vaesdec %zmm20, %zmm3, %zmm3 + vaesdec %zmm21, %zmm0, %zmm0 + vaesdec %zmm21, %zmm1, %zmm1 + vaesdec %zmm21, %zmm2, %zmm2 + vaesdec %zmm21, %zmm3, %zmm3 + vaesdec %zmm22, %zmm0, %zmm0 + vaesdec %zmm22, %zmm1, %zmm1 + vaesdec %zmm22, %zmm2, %zmm2 + vaesdec %zmm22, %zmm3, %zmm3 + vaesdec %zmm23, %zmm0, %zmm0 + vaesdec %zmm23, %zmm1, %zmm1 + vaesdec %zmm23, %zmm2, %zmm2 + vaesdec %zmm23, %zmm3, %zmm3 + vaesdec %zmm24, %zmm0, %zmm0 + vaesdec %zmm24, %zmm1, %zmm1 + vaesdec %zmm24, %zmm2, %zmm2 + vaesdec %zmm24, %zmm3, %zmm3 + vaesdec %zmm25, %zmm0, %zmm0 + vaesdec %zmm25, %zmm1, %zmm1 + vaesdec %zmm25, %zmm2, %zmm2 + vaesdec %zmm25, %zmm3, %zmm3 + cmpl $11, %r10d + vmovdqa64 %zmm26, %zmm9 + jl L_AES_XTS_decrypt_avx512_aes_dec_256_aes_dec_block_last + vaesdec %zmm26, %zmm0, %zmm0 + vaesdec %zmm26, %zmm1, %zmm1 + vaesdec %zmm26, %zmm2, %zmm2 + vaesdec %zmm26, %zmm3, %zmm3 + vaesdec %zmm27, %zmm0, %zmm0 + vaesdec %zmm27, %zmm1, %zmm1 + vaesdec %zmm27, %zmm2, %zmm2 + vaesdec %zmm27, %zmm3, %zmm3 + cmpl $13, %r10d + vmovdqa64 %zmm28, %zmm9 + jl L_AES_XTS_decrypt_avx512_aes_dec_256_aes_dec_block_last + vaesdec %zmm28, %zmm0, %zmm0 + vaesdec %zmm28, %zmm1, %zmm1 + vaesdec %zmm28, %zmm2, %zmm2 + vaesdec %zmm28, %zmm3, %zmm3 + vaesdec %zmm29, %zmm0, %zmm0 + vaesdec %zmm29, %zmm1, %zmm1 + vaesdec %zmm29, %zmm2, %zmm2 + vaesdec 
%zmm29, %zmm3, %zmm3 + vmovdqa64 %zmm30, %zmm9 +L_AES_XTS_decrypt_avx512_aes_dec_256_aes_dec_block_last: + vaesdeclast %zmm9, %zmm0, %zmm0 + vaesdeclast %zmm9, %zmm1, %zmm1 + vaesdeclast %zmm9, %zmm2, %zmm2 + vaesdeclast %zmm9, %zmm3, %zmm3 + vpxorq %zmm4, %zmm0, %zmm0 + vmovdqu64 %zmm0, (%rdx) + vpsrlq $48, %zmm4, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $16, %zmm4, %zmm4 + vpternlogq $0x96, %zmm9, %zmm10, %zmm4 + vpxorq %zmm5, %zmm1, %zmm1 + vmovdqu64 %zmm1, 64(%rdx) + vpsrlq $48, %zmm5, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $16, %zmm5, %zmm5 + vpternlogq $0x96, %zmm9, %zmm10, %zmm5 + vpxorq %zmm6, %zmm2, %zmm2 + vmovdqu64 %zmm2, 128(%rdx) + vpsrlq $48, %zmm6, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $16, %zmm6, %zmm6 + vpternlogq $0x96, %zmm9, %zmm10, %zmm6 + vpxorq %zmm7, %zmm3, %zmm3 + vmovdqu64 %zmm3, 192(%rdx) + vpsrlq $48, %zmm7, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $16, %zmm7, %zmm7 + vpternlogq $0x96, %zmm9, %zmm10, %zmm7 + addl $0x100, %r13d + cmpl %r11d, %r13d + jl L_AES_XTS_decrypt_avx512_dec_256 + vextracti32x4 $0x00, %zmm4, %xmm8 +L_AES_XTS_decrypt_avx512_done_256: + cmpl %eax, %r13d + movl %eax, %r11d + je L_AES_XTS_decrypt_avx512_done_dec + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_avx512_mul16_128 + subl $16, %r11d + subl %r13d, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_avx512_last_31_start + addl %r13d, %r11d +L_AES_XTS_decrypt_avx512_mul16_128: + andl $0xffffff80, %r11d + cmpl %r11d, %r13d + je L_AES_XTS_decrypt_avx512_done_128 + # 128 bytes of input + # aes_dec_128 + leaq (%rdi,%r13,1), %rcx + leaq (%rsi,%r13,1), %rdx + vmovdqu64 (%rcx), %zmm0 + vmovdqu64 64(%rcx), %zmm1 + vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 + vpsrlvq %zmm15, %zmm5, %zmm6 + vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7 + vpslldq $8, %zmm6, %zmm6 + vpsllvq %zmm14, %zmm5, %zmm4 + 
vpternlogq $0x96, %zmm6, %zmm7, %zmm4 + vpsrlq $60, %zmm4, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $4, %zmm4, %zmm5 + vpternlogq $0x96, %zmm9, %zmm10, %zmm5 + # aes_dec_block + vpternlogq $0x96, %zmm4, %zmm16, %zmm0 + vpternlogq $0x96, %zmm5, %zmm16, %zmm1 + vaesdec %zmm17, %zmm0, %zmm0 + vaesdec %zmm17, %zmm1, %zmm1 + vaesdec %zmm18, %zmm0, %zmm0 + vaesdec %zmm18, %zmm1, %zmm1 + vaesdec %zmm19, %zmm0, %zmm0 + vaesdec %zmm19, %zmm1, %zmm1 + vaesdec %zmm20, %zmm0, %zmm0 + vaesdec %zmm20, %zmm1, %zmm1 + vaesdec %zmm21, %zmm0, %zmm0 + vaesdec %zmm21, %zmm1, %zmm1 + vaesdec %zmm22, %zmm0, %zmm0 + vaesdec %zmm22, %zmm1, %zmm1 + vaesdec %zmm23, %zmm0, %zmm0 + vaesdec %zmm23, %zmm1, %zmm1 + vaesdec %zmm24, %zmm0, %zmm0 + vaesdec %zmm24, %zmm1, %zmm1 + vaesdec %zmm25, %zmm0, %zmm0 + vaesdec %zmm25, %zmm1, %zmm1 + cmpl $11, %r10d + vmovdqa64 %zmm26, %zmm9 + jl L_AES_XTS_decrypt_avx512_aes_dec_128_aes_dec_block_last + vaesdec %zmm26, %zmm0, %zmm0 + vaesdec %zmm26, %zmm1, %zmm1 + vaesdec %zmm27, %zmm0, %zmm0 + vaesdec %zmm27, %zmm1, %zmm1 + cmpl $13, %r10d + vmovdqa64 %zmm28, %zmm9 + jl L_AES_XTS_decrypt_avx512_aes_dec_128_aes_dec_block_last + vaesdec %zmm28, %zmm0, %zmm0 + vaesdec %zmm28, %zmm1, %zmm1 + vaesdec %zmm29, %zmm0, %zmm0 + vaesdec %zmm29, %zmm1, %zmm1 + vmovdqa64 %zmm30, %zmm9 +L_AES_XTS_decrypt_avx512_aes_dec_128_aes_dec_block_last: + vaesdeclast %zmm9, %zmm0, %zmm0 + vaesdeclast %zmm9, %zmm1, %zmm1 + vpxorq %zmm4, %zmm0, %zmm0 + vmovdqu64 %zmm0, (%rdx) + vpxorq %zmm5, %zmm1, %zmm1 + vmovdqu64 %zmm1, 64(%rdx) + vextracti32x4 $3, %zmm5, %xmm8 + vpshufd $19, %xmm8, %xmm9 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm9, %xmm9 + vpternlogd $0x78, %xmm12, %xmm9, %xmm8 + addl $0x80, %r13d +L_AES_XTS_decrypt_avx512_done_128: + cmpl %eax, %r13d + movl %eax, %r11d + je L_AES_XTS_decrypt_avx512_done_dec + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_avx512_mul16_64 + subl $16, %r11d + subl %r13d, %r11d + 
cmpl $16, %r11d + jl L_AES_XTS_decrypt_avx512_last_31_start + addl %r13d, %r11d +L_AES_XTS_decrypt_avx512_mul16_64: + andl $0xffffffc0, %r11d + cmpl %r11d, %r13d + je L_AES_XTS_decrypt_avx512_done_64 + # 64 bytes of input + # aes_dec_64 + leaq (%rdi,%r13,1), %rcx + leaq (%rsi,%r13,1), %rdx + vmovdqu64 (%rcx), %zmm0 + vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 + vpsrlvq %zmm15, %zmm5, %zmm6 + vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7 + vpslldq $8, %zmm6, %zmm6 + vpsllvq %zmm14, %zmm5, %zmm4 + vpternlogq $0x96, %zmm6, %zmm7, %zmm4 + # aes_dec_block + vpternlogq $0x96, %zmm4, %zmm16, %zmm0 + vaesdec %zmm17, %zmm0, %zmm0 + vaesdec %zmm18, %zmm0, %zmm0 + vaesdec %zmm19, %zmm0, %zmm0 + vaesdec %zmm20, %zmm0, %zmm0 + vaesdec %zmm21, %zmm0, %zmm0 + vaesdec %zmm22, %zmm0, %zmm0 + vaesdec %zmm23, %zmm0, %zmm0 + vaesdec %zmm24, %zmm0, %zmm0 + vaesdec %zmm25, %zmm0, %zmm0 + cmpl $11, %r10d + vmovdqa64 %zmm26, %zmm9 + jl L_AES_XTS_decrypt_avx512_aes_dec_64_aes_dec_block_last + vaesdec %zmm26, %zmm0, %zmm0 + vaesdec %zmm27, %zmm0, %zmm0 + cmpl $13, %r10d + vmovdqa64 %zmm28, %zmm9 + jl L_AES_XTS_decrypt_avx512_aes_dec_64_aes_dec_block_last + vaesdec %zmm28, %zmm0, %zmm0 + vaesdec %zmm29, %zmm0, %zmm0 + vmovdqa64 %zmm30, %zmm9 +L_AES_XTS_decrypt_avx512_aes_dec_64_aes_dec_block_last: + vaesdeclast %zmm9, %zmm0, %zmm0 + vpxorq %zmm4, %zmm0, %zmm0 + vmovdqu64 %zmm0, (%rdx) + vextracti32x4 $3, %zmm4, %xmm8 + vpshufd $19, %xmm8, %xmm9 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm9, %xmm9 + vpternlogd $0x78, %xmm12, %xmm9, %xmm8 + addl $0x40, %r13d +L_AES_XTS_decrypt_avx512_done_64: + cmpl %eax, %r13d + movl %eax, %r11d + je L_AES_XTS_decrypt_avx512_done_dec + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_avx512_mul16_32 + subl $16, %r11d + subl %r13d, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_avx512_last_31_start + addl %r13d, %r11d +L_AES_XTS_decrypt_avx512_mul16_32: + andl $0xffffffe0, %r11d + cmpl %r11d, %r13d + je L_AES_XTS_decrypt_avx512_done_32 + # 32 bytes of 
input + # aes_dec_32 + leaq (%rdi,%r13,1), %rcx + leaq (%rsi,%r13,1), %rdx + vmovdqu64 (%rcx), %ymm0 + vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 + vpsrlvq %zmm15, %zmm5, %zmm6 + vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7 + vpslldq $8, %zmm6, %zmm6 + vpsllvq %zmm14, %zmm5, %zmm4 + vpternlogq $0x96, %zmm6, %zmm7, %zmm4 + # aes_dec_block + vpternlogq $0x96, %ymm4, %ymm16, %ymm0 + vaesdec %ymm17, %ymm0, %ymm0 + vaesdec %ymm18, %ymm0, %ymm0 + vaesdec %ymm19, %ymm0, %ymm0 + vaesdec %ymm20, %ymm0, %ymm0 + vaesdec %ymm21, %ymm0, %ymm0 + vaesdec %ymm22, %ymm0, %ymm0 + vaesdec %ymm23, %ymm0, %ymm0 + vaesdec %ymm24, %ymm0, %ymm0 + vaesdec %ymm25, %ymm0, %ymm0 + cmpl $11, %r10d + vmovdqa64 %ymm26, %ymm9 + jl L_AES_XTS_decrypt_avx512_aes_dec_32_aes_dec_block_last + vaesdec %ymm26, %ymm0, %ymm0 + vaesdec %ymm27, %ymm0, %ymm0 + cmpl $13, %r10d + vmovdqa64 %ymm28, %ymm9 + jl L_AES_XTS_decrypt_avx512_aes_dec_32_aes_dec_block_last + vaesdec %ymm28, %ymm0, %ymm0 + vaesdec %ymm29, %ymm0, %ymm0 + vmovdqa64 %ymm30, %ymm9 +L_AES_XTS_decrypt_avx512_aes_dec_32_aes_dec_block_last: + vaesdeclast %ymm9, %ymm0, %ymm0 + vpxorq %ymm4, %ymm0, %ymm0 + vmovdqu64 %ymm0, (%rdx) + vextracti32x4 $2, %zmm4, %xmm8 + addl $32, %r13d +L_AES_XTS_decrypt_avx512_done_32: + cmpl %eax, %r13d + movl %eax, %r11d + je L_AES_XTS_decrypt_avx512_done_dec + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_avx512_mul16 + subl $16, %r11d + subl %r13d, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_avx512_last_31_start + addl %r13d, %r11d +L_AES_XTS_decrypt_avx512_mul16: +L_AES_XTS_decrypt_avx512_dec_16: + # 16 bytes of input + leaq (%rdi,%r13,1), %rcx + vmovdqu (%rcx), %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + # aes_dec_block + vpxor (%r8), %xmm0, %xmm0 + vmovdqu 16(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r8), %xmm5 + vaesdec %xmm5, %xmm0, 
%xmm0 + vmovdqu 96(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + cmpl $11, %r10d + vmovdqu 160(%r8), %xmm5 + jl L_AES_XTS_decrypt_avx512_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + cmpl $13, %r10d + vmovdqu 192(%r8), %xmm5 + jl L_AES_XTS_decrypt_avx512_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r8), %xmm5 +L_AES_XTS_decrypt_avx512_aes_dec_block_last: + vaesdeclast %xmm5, %xmm0, %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + leaq (%rsi,%r13,1), %rcx + vmovdqu %xmm0, (%rcx) + vpshufd $19, %xmm8, %xmm4 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm4, %xmm4 + vpternlogd $0x78, %xmm12, %xmm4, %xmm8 + addl $16, %r13d + cmpl %r11d, %r13d + jl L_AES_XTS_decrypt_avx512_dec_16 + cmpl %eax, %r13d + je L_AES_XTS_decrypt_avx512_done_dec +L_AES_XTS_decrypt_avx512_last_31_start: + vpshufd $19, %xmm8, %xmm4 + vpaddq %xmm8, %xmm8, %xmm7 + vpsrad $31, %xmm4, %xmm4 + vpternlogd $0x78, %xmm12, %xmm4, %xmm7 + leaq (%rdi,%r13,1), %rcx + vmovdqu (%rcx), %xmm0 + vpxor %xmm7, %xmm0, %xmm0 + # aes_dec_block + vpxor (%r8), %xmm0, %xmm0 + vmovdqu 16(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + cmpl $11, %r10d + vmovdqu 160(%r8), %xmm5 + jl L_AES_XTS_decrypt_avx512_last_31_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r8), %xmm6 + vaesdec %xmm6, %xmm0, 
%xmm0 + cmpl $13, %r10d + vmovdqu 192(%r8), %xmm5 + jl L_AES_XTS_decrypt_avx512_last_31_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r8), %xmm5 +L_AES_XTS_decrypt_avx512_last_31_aes_dec_block_last: + vaesdeclast %xmm5, %xmm0, %xmm0 + vpxor %xmm7, %xmm0, %xmm0 + vmovdqu %xmm0, (%rsp) + addq $16, %r13 + xorq %rdx, %rdx +L_AES_XTS_decrypt_avx512_last_31_byte_loop: + movb (%rsp,%rdx,1), %r11b + movb (%rdi,%r13,1), %cl + movb %r11b, (%rsi,%r13,1) + movb %cl, (%rsp,%rdx,1) + incl %r13d + incl %edx + cmpl %eax, %r13d + jl L_AES_XTS_decrypt_avx512_last_31_byte_loop + subq %rdx, %r13 + vmovdqu (%rsp), %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + # aes_dec_block + vpxor (%r8), %xmm0, %xmm0 + vmovdqu 16(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r8), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + cmpl $11, %r10d + vmovdqu 160(%r8), %xmm5 + jl L_AES_XTS_decrypt_avx512_last_31_2_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + cmpl $13, %r10d + vmovdqu 192(%r8), %xmm5 + jl L_AES_XTS_decrypt_avx512_last_31_2_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r8), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r8), %xmm5 +L_AES_XTS_decrypt_avx512_last_31_2_aes_dec_block_last: + vaesdeclast %xmm5, %xmm0, %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + subq $16, %r13 + leaq (%rsi,%r13,1), %rcx + vmovdqu %xmm0, (%rcx) +L_AES_XTS_decrypt_avx512_done_dec: + addq $0x40, %rsp + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size 
AES_XTS_decrypt_avx512,.-AES_XTS_decrypt_avx512 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_XTS_decrypt_update_avx512 +.type AES_XTS_decrypt_update_avx512,@function +.align 16 +AES_XTS_decrypt_update_avx512: +#else +.section __TEXT,__text +.globl _AES_XTS_decrypt_update_avx512 +.p2align 4 +_AES_XTS_decrypt_update_avx512: +#endif /* __APPLE__ */ + pushq %r12 /* AES-XTS decrypt (update form) using VAES on 512-bit registers. Registers as used below: rdi = input, rsi = output, edx = byte count, rcx = round keys, r8 = 16-byte tweak buffer (read on entry, written back on exit), r9d = round count. r12 is saved here because it serves as the running byte offset. */ + movq %rdx, %rax /* eax = byte count; rdx is reused as the output pointer */ + movq %rcx, %r10 /* r10 = round keys; rcx is reused as a scratch pointer */ + subq $0x40, %rsp /* 64 bytes of stack scratch; (%rsp) holds one block during the partial-block swap */ + vmovdqu L_avx512_aes_xts_gc_xts(%rip), %xmm12 + vbroadcasti32x4 L_avx512_aes_xts_poly(%rip), %zmm13 + vmovdqu64 L_avx512_aes_xts_shl(%rip), %zmm14 + vmovdqu64 L_avx512_aes_xts_shr(%rip), %zmm15 + vmovdqu (%r8), %xmm8 /* xmm8 = current tweak */ + xorl %r12d, %r12d /* byte offset = 0 */ + movl %eax, %r11d + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_update_avx512_mul16_256 + subl $16, %r11d /* count is not a multiple of 16: hold the last full block back for the tail code */ + cmpl $16, %r11d + jl L_AES_XTS_decrypt_update_avx512_last_31_start +L_AES_XTS_decrypt_update_avx512_mul16_256: + cmpl $32, %r11d + jl L_AES_XTS_decrypt_update_avx512_done_128 + vbroadcasti32x4 (%r10), %zmm16 /* round key k is broadcast to all four 128-bit lanes of zmm(16+k) */ + vbroadcasti32x4 16(%r10), %zmm17 + vbroadcasti32x4 32(%r10), %zmm18 + vbroadcasti32x4 48(%r10), %zmm19 + vbroadcasti32x4 64(%r10), %zmm20 + vbroadcasti32x4 80(%r10), %zmm21 + vbroadcasti32x4 96(%r10), %zmm22 + vbroadcasti32x4 112(%r10), %zmm23 + vbroadcasti32x4 128(%r10), %zmm24 + vbroadcasti32x4 144(%r10), %zmm25 + vbroadcasti32x4 160(%r10), %zmm26 + cmpl $11, %r9d /* round count below 11 needs keys 0..10 only; below 13 needs keys 0..12; otherwise keys 0..14 */ + jl L_AES_XTS_decrypt_update_avx512_key_cached + vbroadcasti32x4 176(%r10), %zmm27 + vbroadcasti32x4 192(%r10), %zmm28 + cmpl $13, %r9d + jl L_AES_XTS_decrypt_update_avx512_key_cached + vbroadcasti32x4 208(%r10), %zmm29 + vbroadcasti32x4 224(%r10), %zmm30 +L_AES_XTS_decrypt_update_avx512_key_cached: + cmpl $0x100, %r11d + jl L_AES_XTS_decrypt_update_avx512_done_256 + andl $0xffffff00, %r11d + vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 /* broadcast the tweak to every lane; zmm4 is then built with the per-lane shift tables in zmm14/zmm15 (table contents are not visible here; presumably lane i receives tweak * x^i) */ + vpsrlvq %zmm15, %zmm5, %zmm6 + vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7 + vpslldq $8, %zmm6, %zmm6 + vpsllvq %zmm14, %zmm5, %zmm4 + vpternlogq $0x96, %zmm6, %zmm7, %zmm4 + vpsrlq
$60, %zmm4, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $4, %zmm4, %zmm5 + vpternlogq $0x96, %zmm9, %zmm10, %zmm5 /* zmm5 = zmm4 * x^4, reduced with the polynomial in zmm13 (0x96 = three-way XOR); zmm6 and zmm7 are derived the same way */ + vpsrlq $60, %zmm5, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $4, %zmm5, %zmm6 + vpternlogq $0x96, %zmm9, %zmm10, %zmm6 + vpsrlq $60, %zmm6, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $4, %zmm6, %zmm7 + vpternlogq $0x96, %zmm9, %zmm10, %zmm7 +L_AES_XTS_decrypt_update_avx512_dec_256: + # 256 bytes of input + # aes_dec_256 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu64 (%rcx), %zmm0 + vmovdqu64 64(%rcx), %zmm1 + vmovdqu64 128(%rcx), %zmm2 + vmovdqu64 192(%rcx), %zmm3 + # aes_dec_block + vpternlogq $0x96, %zmm4, %zmm16, %zmm0 /* data ^= tweak ^ round key 0 */ + vpternlogq $0x96, %zmm5, %zmm16, %zmm1 + vpternlogq $0x96, %zmm6, %zmm16, %zmm2 + vpternlogq $0x96, %zmm7, %zmm16, %zmm3 + vaesdec %zmm17, %zmm0, %zmm0 + vaesdec %zmm17, %zmm1, %zmm1 + vaesdec %zmm17, %zmm2, %zmm2 + vaesdec %zmm17, %zmm3, %zmm3 + vaesdec %zmm18, %zmm0, %zmm0 + vaesdec %zmm18, %zmm1, %zmm1 + vaesdec %zmm18, %zmm2, %zmm2 + vaesdec %zmm18, %zmm3, %zmm3 + vaesdec %zmm19, %zmm0, %zmm0 + vaesdec %zmm19, %zmm1, %zmm1 + vaesdec %zmm19, %zmm2, %zmm2 + vaesdec %zmm19, %zmm3, %zmm3 + vaesdec %zmm20, %zmm0, %zmm0 + vaesdec %zmm20, %zmm1, %zmm1 + vaesdec %zmm20, %zmm2, %zmm2 + vaesdec %zmm20, %zmm3, %zmm3 + vaesdec %zmm21, %zmm0, %zmm0 + vaesdec %zmm21, %zmm1, %zmm1 + vaesdec %zmm21, %zmm2, %zmm2 + vaesdec %zmm21, %zmm3, %zmm3 + vaesdec %zmm22, %zmm0, %zmm0 + vaesdec %zmm22, %zmm1, %zmm1 + vaesdec %zmm22, %zmm2, %zmm2 + vaesdec %zmm22, %zmm3, %zmm3 + vaesdec %zmm23, %zmm0, %zmm0 + vaesdec %zmm23, %zmm1, %zmm1 + vaesdec %zmm23, %zmm2, %zmm2 + vaesdec %zmm23, %zmm3, %zmm3 + vaesdec %zmm24, %zmm0, %zmm0 + vaesdec %zmm24, %zmm1, %zmm1 + vaesdec %zmm24, %zmm2, %zmm2 + vaesdec %zmm24, %zmm3, %zmm3 + vaesdec %zmm25, %zmm0, %zmm0 + vaesdec %zmm25, %zmm1, %zmm1 + vaesdec %zmm25, %zmm2, %zmm2 +
vaesdec %zmm25, %zmm3, %zmm3 + cmpl $11, %r9d + vmovdqa64 %zmm26, %zmm9 /* zmm9 always carries the key consumed by vaesdeclast on whichever path is taken */ + jl L_AES_XTS_decrypt_update_avx512_aes_dec_256_aes_dec_block_last + vaesdec %zmm26, %zmm0, %zmm0 + vaesdec %zmm26, %zmm1, %zmm1 + vaesdec %zmm26, %zmm2, %zmm2 + vaesdec %zmm26, %zmm3, %zmm3 + vaesdec %zmm27, %zmm0, %zmm0 + vaesdec %zmm27, %zmm1, %zmm1 + vaesdec %zmm27, %zmm2, %zmm2 + vaesdec %zmm27, %zmm3, %zmm3 + cmpl $13, %r9d + vmovdqa64 %zmm28, %zmm9 + jl L_AES_XTS_decrypt_update_avx512_aes_dec_256_aes_dec_block_last + vaesdec %zmm28, %zmm0, %zmm0 + vaesdec %zmm28, %zmm1, %zmm1 + vaesdec %zmm28, %zmm2, %zmm2 + vaesdec %zmm28, %zmm3, %zmm3 + vaesdec %zmm29, %zmm0, %zmm0 + vaesdec %zmm29, %zmm1, %zmm1 + vaesdec %zmm29, %zmm2, %zmm2 + vaesdec %zmm29, %zmm3, %zmm3 + vmovdqa64 %zmm30, %zmm9 +L_AES_XTS_decrypt_update_avx512_aes_dec_256_aes_dec_block_last: + vaesdeclast %zmm9, %zmm0, %zmm0 + vaesdeclast %zmm9, %zmm1, %zmm1 + vaesdeclast %zmm9, %zmm2, %zmm2 + vaesdeclast %zmm9, %zmm3, %zmm3 + vpxorq %zmm4, %zmm0, %zmm0 + vmovdqu64 %zmm0, (%rdx) + vpsrlq $48, %zmm4, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $16, %zmm4, %zmm4 + vpternlogq $0x96, %zmm9, %zmm10, %zmm4 /* zmm4 *= x^16: the tweaks for the same lanes 16 blocks (256 bytes) further on; zmm5..zmm7 are advanced likewise below */ + vpxorq %zmm5, %zmm1, %zmm1 + vmovdqu64 %zmm1, 64(%rdx) + vpsrlq $48, %zmm5, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $16, %zmm5, %zmm5 + vpternlogq $0x96, %zmm9, %zmm10, %zmm5 + vpxorq %zmm6, %zmm2, %zmm2 + vmovdqu64 %zmm2, 128(%rdx) + vpsrlq $48, %zmm6, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $16, %zmm6, %zmm6 + vpternlogq $0x96, %zmm9, %zmm10, %zmm6 + vpxorq %zmm7, %zmm3, %zmm3 + vmovdqu64 %zmm3, 192(%rdx) + vpsrlq $48, %zmm7, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $16, %zmm7, %zmm7 + vpternlogq $0x96, %zmm9, %zmm10, %zmm7 + addl $0x100, %r12d + cmpl %r11d, %r12d + jl L_AES_XTS_decrypt_update_avx512_dec_256 + vextracti32x4 $0x00, %zmm4, %xmm8 /* lane 0 of zmm4 is the tweak of the next unprocessed block */
+L_AES_XTS_decrypt_update_avx512_done_256: + cmpl %eax, %r12d + movl %eax, %r11d + je L_AES_XTS_decrypt_update_avx512_done_dec + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_update_avx512_mul16_128 + subl $16, %r11d + subl %r12d, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_update_avx512_last_31_start + addl %r12d, %r11d +L_AES_XTS_decrypt_update_avx512_mul16_128: + andl $0xffffff80, %r11d /* bulk end rounded down to 128: if it equals the offset, no 128-byte chunk remains */ + cmpl %r11d, %r12d + je L_AES_XTS_decrypt_update_avx512_done_128 + # 128 bytes of input + # aes_dec_128 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu64 (%rcx), %zmm0 + vmovdqu64 64(%rcx), %zmm1 + vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 + vpsrlvq %zmm15, %zmm5, %zmm6 + vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7 + vpslldq $8, %zmm6, %zmm6 + vpsllvq %zmm14, %zmm5, %zmm4 + vpternlogq $0x96, %zmm6, %zmm7, %zmm4 + vpsrlq $60, %zmm4, %zmm9 + vpclmulqdq $0x01, %zmm13, %zmm9, %zmm10 + vpslldq $8, %zmm9, %zmm9 + vpsllq $4, %zmm4, %zmm5 + vpternlogq $0x96, %zmm9, %zmm10, %zmm5 + # aes_dec_block + vpternlogq $0x96, %zmm4, %zmm16, %zmm0 + vpternlogq $0x96, %zmm5, %zmm16, %zmm1 + vaesdec %zmm17, %zmm0, %zmm0 + vaesdec %zmm17, %zmm1, %zmm1 + vaesdec %zmm18, %zmm0, %zmm0 + vaesdec %zmm18, %zmm1, %zmm1 + vaesdec %zmm19, %zmm0, %zmm0 + vaesdec %zmm19, %zmm1, %zmm1 + vaesdec %zmm20, %zmm0, %zmm0 + vaesdec %zmm20, %zmm1, %zmm1 + vaesdec %zmm21, %zmm0, %zmm0 + vaesdec %zmm21, %zmm1, %zmm1 + vaesdec %zmm22, %zmm0, %zmm0 + vaesdec %zmm22, %zmm1, %zmm1 + vaesdec %zmm23, %zmm0, %zmm0 + vaesdec %zmm23, %zmm1, %zmm1 + vaesdec %zmm24, %zmm0, %zmm0 + vaesdec %zmm24, %zmm1, %zmm1 + vaesdec %zmm25, %zmm0, %zmm0 + vaesdec %zmm25, %zmm1, %zmm1 + cmpl $11, %r9d + vmovdqa64 %zmm26, %zmm9 + jl L_AES_XTS_decrypt_update_avx512_aes_dec_128_aes_dec_block_last + vaesdec %zmm26, %zmm0, %zmm0 + vaesdec %zmm26, %zmm1, %zmm1 + vaesdec %zmm27, %zmm0, %zmm0 + vaesdec %zmm27, %zmm1, %zmm1 + cmpl $13, %r9d + vmovdqa64 %zmm28, %zmm9 + jl
L_AES_XTS_decrypt_update_avx512_aes_dec_128_aes_dec_block_last + vaesdec %zmm28, %zmm0, %zmm0 + vaesdec %zmm28, %zmm1, %zmm1 + vaesdec %zmm29, %zmm0, %zmm0 + vaesdec %zmm29, %zmm1, %zmm1 + vmovdqa64 %zmm30, %zmm9 +L_AES_XTS_decrypt_update_avx512_aes_dec_128_aes_dec_block_last: + vaesdeclast %zmm9, %zmm0, %zmm0 + vaesdeclast %zmm9, %zmm1, %zmm1 + vpxorq %zmm4, %zmm0, %zmm0 + vmovdqu64 %zmm0, (%rdx) + vpxorq %zmm5, %zmm1, %zmm1 + vmovdqu64 %zmm1, 64(%rdx) + vextracti32x4 $3, %zmm5, %xmm8 /* take the last tweak used by this chunk; the next four instructions multiply it by x to get the next tweak */ + vpshufd $19, %xmm8, %xmm9 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm9, %xmm9 + vpternlogd $0x78, %xmm12, %xmm9, %xmm8 /* xmm8 ^= (xmm9 & xmm12); 0x78 encodes A ^ (B & C) */ + addl $0x80, %r12d +L_AES_XTS_decrypt_update_avx512_done_128: + cmpl %eax, %r12d + movl %eax, %r11d + je L_AES_XTS_decrypt_update_avx512_done_dec + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_update_avx512_mul16_64 + subl $16, %r11d + subl %r12d, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_update_avx512_last_31_start + addl %r12d, %r11d +L_AES_XTS_decrypt_update_avx512_mul16_64: + andl $0xffffffc0, %r11d + cmpl %r11d, %r12d + je L_AES_XTS_decrypt_update_avx512_done_64 + # 64 bytes of input + # aes_dec_64 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu64 (%rcx), %zmm0 + vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 + vpsrlvq %zmm15, %zmm5, %zmm6 + vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7 + vpslldq $8, %zmm6, %zmm6 + vpsllvq %zmm14, %zmm5, %zmm4 + vpternlogq $0x96, %zmm6, %zmm7, %zmm4 + # aes_dec_block + vpternlogq $0x96, %zmm4, %zmm16, %zmm0 + vaesdec %zmm17, %zmm0, %zmm0 + vaesdec %zmm18, %zmm0, %zmm0 + vaesdec %zmm19, %zmm0, %zmm0 + vaesdec %zmm20, %zmm0, %zmm0 + vaesdec %zmm21, %zmm0, %zmm0 + vaesdec %zmm22, %zmm0, %zmm0 + vaesdec %zmm23, %zmm0, %zmm0 + vaesdec %zmm24, %zmm0, %zmm0 + vaesdec %zmm25, %zmm0, %zmm0 + cmpl $11, %r9d + vmovdqa64 %zmm26, %zmm9 + jl L_AES_XTS_decrypt_update_avx512_aes_dec_64_aes_dec_block_last + vaesdec %zmm26, %zmm0, %zmm0 + vaesdec %zmm27, %zmm0, %zmm0 + cmpl $13, %r9d + vmovdqa64 %zmm28, %zmm9 + jl
L_AES_XTS_decrypt_update_avx512_aes_dec_64_aes_dec_block_last + vaesdec %zmm28, %zmm0, %zmm0 + vaesdec %zmm29, %zmm0, %zmm0 + vmovdqa64 %zmm30, %zmm9 +L_AES_XTS_decrypt_update_avx512_aes_dec_64_aes_dec_block_last: + vaesdeclast %zmm9, %zmm0, %zmm0 + vpxorq %zmm4, %zmm0, %zmm0 + vmovdqu64 %zmm0, (%rdx) + vextracti32x4 $3, %zmm4, %xmm8 + vpshufd $19, %xmm8, %xmm9 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm9, %xmm9 + vpternlogd $0x78, %xmm12, %xmm9, %xmm8 + addl $0x40, %r12d +L_AES_XTS_decrypt_update_avx512_done_64: + cmpl %eax, %r12d + movl %eax, %r11d + je L_AES_XTS_decrypt_update_avx512_done_dec + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_update_avx512_mul16_32 + subl $16, %r11d + subl %r12d, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_update_avx512_last_31_start + addl %r12d, %r11d +L_AES_XTS_decrypt_update_avx512_mul16_32: + andl $0xffffffe0, %r11d + cmpl %r11d, %r12d + je L_AES_XTS_decrypt_update_avx512_done_32 + # 32 bytes of input + # aes_dec_32 + leaq (%rdi,%r12,1), %rcx + leaq (%rsi,%r12,1), %rdx + vmovdqu64 (%rcx), %ymm0 + vshufi64x2 $0x00, %zmm8, %zmm8, %zmm5 + vpsrlvq %zmm15, %zmm5, %zmm6 + vpclmulqdq $0x01, %zmm13, %zmm6, %zmm7 + vpslldq $8, %zmm6, %zmm6 + vpsllvq %zmm14, %zmm5, %zmm4 + vpternlogq $0x96, %zmm6, %zmm7, %zmm4 + # aes_dec_block + vpternlogq $0x96, %ymm4, %ymm16, %ymm0 + vaesdec %ymm17, %ymm0, %ymm0 + vaesdec %ymm18, %ymm0, %ymm0 + vaesdec %ymm19, %ymm0, %ymm0 + vaesdec %ymm20, %ymm0, %ymm0 + vaesdec %ymm21, %ymm0, %ymm0 + vaesdec %ymm22, %ymm0, %ymm0 + vaesdec %ymm23, %ymm0, %ymm0 + vaesdec %ymm24, %ymm0, %ymm0 + vaesdec %ymm25, %ymm0, %ymm0 + cmpl $11, %r9d + vmovdqa64 %ymm26, %ymm9 + jl L_AES_XTS_decrypt_update_avx512_aes_dec_32_aes_dec_block_last + vaesdec %ymm26, %ymm0, %ymm0 + vaesdec %ymm27, %ymm0, %ymm0 + cmpl $13, %r9d + vmovdqa64 %ymm28, %ymm9 + jl L_AES_XTS_decrypt_update_avx512_aes_dec_32_aes_dec_block_last + vaesdec %ymm28, %ymm0, %ymm0 + vaesdec %ymm29, %ymm0, %ymm0 + vmovdqa64 %ymm30, %ymm9
+L_AES_XTS_decrypt_update_avx512_aes_dec_32_aes_dec_block_last: + vaesdeclast %ymm9, %ymm0, %ymm0 + vpxorq %ymm4, %ymm0, %ymm0 + vmovdqu64 %ymm0, (%rdx) + vextracti32x4 $2, %zmm4, %xmm8 /* only lanes 0 and 1 were consumed, so lane 2 is taken as the next tweak without a further multiply */ + addl $32, %r12d +L_AES_XTS_decrypt_update_avx512_done_32: + cmpl %eax, %r12d + movl %eax, %r11d + je L_AES_XTS_decrypt_update_avx512_done_dec + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_update_avx512_mul16 + subl $16, %r11d + subl %r12d, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_update_avx512_last_31_start + addl %r12d, %r11d +L_AES_XTS_decrypt_update_avx512_mul16: +L_AES_XTS_decrypt_update_avx512_dec_16: + # 16 bytes of input + leaq (%rdi,%r12,1), %rcx + vmovdqu (%rcx), %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + # aes_dec_block + vpxor (%r10), %xmm0, %xmm0 + vmovdqu 16(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + cmpl $11, %r9d + vmovdqu 160(%r10), %xmm5 + jl L_AES_XTS_decrypt_update_avx512_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r10), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + cmpl $13, %r9d + vmovdqu 192(%r10), %xmm5 + jl L_AES_XTS_decrypt_update_avx512_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r10), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r10), %xmm5 +L_AES_XTS_decrypt_update_avx512_aes_dec_block_last: + vaesdeclast %xmm5, %xmm0, %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + leaq (%rsi,%r12,1), %rcx + vmovdqu %xmm0, (%rcx) + vpshufd $19, %xmm8, %xmm4 + vpaddq %xmm8, %xmm8, %xmm8 + vpsrad $31, %xmm4, %xmm4 + vpternlogd $0x78, %xmm12, %xmm4, %xmm8 + addl $16, %r12d + cmpl
%r11d, %r12d + jl L_AES_XTS_decrypt_update_avx512_dec_16 + cmpl %eax, %r12d + je L_AES_XTS_decrypt_update_avx512_done_dec +L_AES_XTS_decrypt_update_avx512_last_31_start: + vpshufd $19, %xmm8, %xmm4 + vpaddq %xmm8, %xmm8, %xmm7 + vpsrad $31, %xmm4, %xmm4 + vpternlogd $0x78, %xmm12, %xmm4, %xmm7 /* partial final block: xmm7 = the tweak after xmm8 (xmm8 itself is kept); the last full block is decrypted with xmm7 first */ + leaq (%rdi,%r12,1), %rcx + vmovdqu (%rcx), %xmm0 + vpxor %xmm7, %xmm0, %xmm0 + # aes_dec_block + vpxor (%r10), %xmm0, %xmm0 + vmovdqu 16(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + cmpl $11, %r9d + vmovdqu 160(%r10), %xmm5 + jl L_AES_XTS_decrypt_update_avx512_last_31_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r10), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + cmpl $13, %r9d + vmovdqu 192(%r10), %xmm5 + jl L_AES_XTS_decrypt_update_avx512_last_31_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r10), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r10), %xmm5 +L_AES_XTS_decrypt_update_avx512_last_31_aes_dec_block_last: + vaesdeclast %xmm5, %xmm0, %xmm0 + vpxor %xmm7, %xmm0, %xmm0 + vmovdqu %xmm0, (%rsp) + addq $16, %r12 + xorq %rdx, %rdx +L_AES_XTS_decrypt_update_avx512_last_31_byte_loop: + movb (%rsp,%rdx,1), %r11b + movb (%rdi,%r12,1), %cl + movb %r11b, (%rsi,%r12,1) + movb %cl, (%rsp,%rdx,1) /* swap: leading bytes of the intermediate block become the final partial output, and the trailing input bytes take their place in the stack block */ + incl %r12d + incl %edx + cmpl %eax, %r12d + jl L_AES_XTS_decrypt_update_avx512_last_31_byte_loop + subq %rdx, %r12 + vmovdqu (%rsp), %xmm0 + vpxor %xmm8, %xmm0, %xmm0 /* the merged block is decrypted with the earlier tweak xmm8 */ + # aes_dec_block + vpxor (%r10), %xmm0, %xmm0 + vmovdqu 16(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r10), %xmm5 + vaesdec
%xmm5, %xmm0, %xmm0 + vmovdqu 48(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r10), %xmm5 + vaesdec %xmm5, %xmm0, %xmm0 + cmpl $11, %r9d + vmovdqu 160(%r10), %xmm5 + jl L_AES_XTS_decrypt_update_avx512_last_31_2_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r10), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + cmpl $13, %r9d + vmovdqu 192(%r10), %xmm5 + jl L_AES_XTS_decrypt_update_avx512_last_31_2_aes_dec_block_last + vaesdec %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r10), %xmm6 + vaesdec %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r10), %xmm5 +L_AES_XTS_decrypt_update_avx512_last_31_2_aes_dec_block_last: + vaesdeclast %xmm5, %xmm0, %xmm0 + vpxor %xmm8, %xmm0, %xmm0 + subq $16, %r12 /* step back to the last full block position; the result is written there */ + leaq (%rsi,%r12,1), %rcx + vmovdqu %xmm0, (%rcx) +L_AES_XTS_decrypt_update_avx512_done_dec: + vmovdqu %xmm8, (%r8) /* store xmm8 back to the tweak buffer; every exit path passes through here */ + addq $0x40, %rsp + popq %r12 + repz retq +#ifndef __APPLE__ +.size AES_XTS_decrypt_update_avx512,.-AES_XTS_decrypt_update_avx512 +#endif /* __APPLE__ */ +#endif /* HAVE_INTEL_AVX512 */ #endif /* WOLFSSL_X86_64_BUILD */ #endif /* WOLFSSL_AES_XTS */ diff --git a/wolfcrypt/src/aes_xts_asm.asm b/wolfcrypt/src/aes_xts_asm.asm index b0e5cebf316..a904ffa4ce7 100644 --- a/wolfcrypt/src/aes_xts_asm.asm +++ b/wolfcrypt/src/aes_xts_asm.asm @@ -2831,4 +2831,4472 @@ L_AES_XTS_decrypt_update_avx1_done_dec: AES_XTS_decrypt_update_avx1 ENDP _TEXT ENDS ENDIF +IFDEF HAVE_INTEL_VAES +_TEXT SEGMENT READONLY PARA +AES_XTS_init_vaes PROC + vmovdqu xmm0, OWORD PTR [rcx] + ; aes_enc_block + vpxor xmm0, xmm0, [rdx] + vmovdqu xmm2, OWORD PTR [rdx+16] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rdx+32] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rdx+48] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2,
OWORD PTR [rdx+64] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rdx+80] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rdx+96] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rdx+112] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rdx+128] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rdx+144] + vaesenc xmm0, xmm0, xmm2 + cmp r8d, 11 + vmovdqu xmm2, OWORD PTR [rdx+160] + jl L_AES_XTS_init_vaes_tweak_aes_enc_block_last + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm3, OWORD PTR [rdx+176] + vaesenc xmm0, xmm0, xmm3 + cmp r8d, 13 + vmovdqu xmm2, OWORD PTR [rdx+192] + jl L_AES_XTS_init_vaes_tweak_aes_enc_block_last + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm3, OWORD PTR [rdx+208] + vaesenc xmm0, xmm0, xmm3 + vmovdqu xmm2, OWORD PTR [rdx+224] +L_AES_XTS_init_vaes_tweak_aes_enc_block_last: + vaesenclast xmm0, xmm0, xmm2 + vmovdqu OWORD PTR [rcx], xmm0 + ret +AES_XTS_init_vaes ENDP +_TEXT ENDS +_DATA SEGMENT +ALIGN 16 +L_vaes_aes_xts_gc_xts DWORD \ + 00000087h, 00000000h, 00000001h, 00000000h +ptr_L_vaes_aes_xts_gc_xts QWORD L_vaes_aes_xts_gc_xts +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_vaes_aes_xts_poly DWORD \ + 00000087h, 00000000h, 00000000h, 00000000h +ptr_L_vaes_aes_xts_poly QWORD L_vaes_aes_xts_poly +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_vaes_aes_xts_shl DWORD \ + 00000000h, 00000000h, 00000000h, 00000000h, + 00000001h, 00000000h, 00000001h, 00000000h +ptr_L_vaes_aes_xts_shl QWORD L_vaes_aes_xts_shl +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_vaes_aes_xts_shr DWORD \ + 00000040h, 00000000h, 00000040h, 00000000h, + 0000003fh, 00000000h, 0000003fh, 00000000h +ptr_L_vaes_aes_xts_shr QWORD L_vaes_aes_xts_shr +_DATA ENDS +_TEXT SEGMENT READONLY PARA +AES_XTS_encrypt_vaes PROC + push rdi + push rsi + push r12 + push r13 + mov rdi, rcx + mov rsi, rdx + mov rax, r8 + mov r12, r9 + mov r8, QWORD PTR [rsp+72] + mov r9, QWORD PTR [rsp+80] + mov r10d, DWORD PTR [rsp+88] + sub rsp, 224 + vmovdqu OWORD PTR [rsp+64], xmm6 + vmovdqu OWORD PTR [rsp+80], xmm7 
+ vmovdqu OWORD PTR [rsp+96], xmm8 + vmovdqu OWORD PTR [rsp+112], xmm9 + vmovdqu OWORD PTR [rsp+128], xmm10 + vmovdqu OWORD PTR [rsp+144], xmm11 + vmovdqu OWORD PTR [rsp+160], xmm12 + vmovdqu OWORD PTR [rsp+176], xmm13 + vmovdqu OWORD PTR [rsp+192], xmm14 + vmovdqu OWORD PTR [rsp+208], xmm15 + vmovdqu xmm12, OWORD PTR L_vaes_aes_xts_gc_xts + vbroadcasti128 ymm13, ptr_L_vaes_aes_xts_poly + vmovdqu ymm14, YMMWORD PTR L_vaes_aes_xts_shl + vmovdqu ymm15, YMMWORD PTR L_vaes_aes_xts_shr + vmovdqu xmm8, OWORD PTR [r12] + ; aes_enc_block + vpxor xmm8, xmm8, [r9] + vmovdqu xmm5, OWORD PTR [r9+16] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+32] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+48] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+64] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+80] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+96] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+112] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+128] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+144] + vaesenc xmm8, xmm8, xmm5 + cmp r10d, 11 + vmovdqu xmm5, OWORD PTR [r9+160] + jl L_AES_XTS_encrypt_vaes_tweak_aes_enc_block_last + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm6, OWORD PTR [r9+176] + vaesenc xmm8, xmm8, xmm6 + cmp r10d, 13 + vmovdqu xmm5, OWORD PTR [r9+192] + jl L_AES_XTS_encrypt_vaes_tweak_aes_enc_block_last + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm6, OWORD PTR [r9+208] + vaesenc xmm8, xmm8, xmm6 + vmovdqu xmm5, OWORD PTR [r9+224] +L_AES_XTS_encrypt_vaes_tweak_aes_enc_block_last: + vaesenclast xmm8, xmm8, xmm5 + xor r13d, r13d + cmp eax, 32 + jl L_AES_XTS_encrypt_vaes_done_128 + cmp eax, 128 + mov r11d, eax + jl L_AES_XTS_encrypt_vaes_done_128 + and r11d, 4294967168 + vperm2i128 ymm5, ymm8, ymm8, 0 + vpsrlvq ymm6, ymm5, ymm15 + vpclmulqdq ymm7, ymm6, ymm13, 1 + vpslldq ymm6, ymm6, 8 + vpsllvq ymm4, ymm5, ymm14 + vpxor ymm4, ymm4, ymm7 + vpxor ymm4, ymm4, ymm6 + vpsrlq ymm9, ymm4, 62 + 
vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm5, ymm4, 2 + vpxor ymm5, ymm5, ymm10 + vpxor ymm5, ymm5, ymm9 + vpsrlq ymm9, ymm5, 62 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm6, ymm5, 2 + vpxor ymm6, ymm6, ymm10 + vpxor ymm6, ymm6, ymm9 + vpsrlq ymm9, ymm6, 62 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm7, ymm6, 2 + vpxor ymm7, ymm7, ymm10 + vpxor ymm7, ymm7, ymm9 +L_AES_XTS_encrypt_vaes_enc_128: + ; 128 bytes of input + ; aes_enc_128 + lea rcx, QWORD PTR [rdi+r13] + lea rdx, QWORD PTR [rsi+r13] + vmovdqu ymm0, YMMWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+32] + vmovdqu ymm2, YMMWORD PTR [rcx+64] + vmovdqu ymm3, YMMWORD PTR [rcx+96] + ; aes_enc_block + vbroadcasti128 ymm9, [r8] + vpxor ymm0, ymm0, ymm4 + vpxor ymm0, ymm0, ymm9 + vpxor ymm1, ymm1, ymm5 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm6 + vpxor ymm2, ymm2, ymm9 + vpxor ymm3, ymm3, ymm7 + vpxor ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+16] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vaesenc ymm2, ymm2, ymm9 + vaesenc ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+32] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vaesenc ymm2, ymm2, ymm9 + vaesenc ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+48] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vaesenc ymm2, ymm2, ymm9 + vaesenc ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+64] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vaesenc ymm2, ymm2, ymm9 + vaesenc ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+80] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vaesenc ymm2, ymm2, ymm9 + vaesenc ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+96] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vaesenc ymm2, ymm2, ymm9 + vaesenc ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+112] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vaesenc ymm2, ymm2, ymm9 + vaesenc ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+128] + vaesenc ymm0, ymm0, 
ymm9 + vaesenc ymm1, ymm1, ymm9 + vaesenc ymm2, ymm2, ymm9 + vaesenc ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+144] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vaesenc ymm2, ymm2, ymm9 + vaesenc ymm3, ymm3, ymm9 + cmp r10d, 11 + vbroadcasti128 ymm9, [r8+160] + jl L_AES_XTS_encrypt_vaes_aes_enc_128_aes_enc_block_last + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vaesenc ymm2, ymm2, ymm9 + vaesenc ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+176] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vaesenc ymm2, ymm2, ymm9 + vaesenc ymm3, ymm3, ymm9 + cmp r10d, 13 + vbroadcasti128 ymm9, [r8+192] + jl L_AES_XTS_encrypt_vaes_aes_enc_128_aes_enc_block_last + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vaesenc ymm2, ymm2, ymm9 + vaesenc ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+208] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vaesenc ymm2, ymm2, ymm9 + vaesenc ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+224] +L_AES_XTS_encrypt_vaes_aes_enc_128_aes_enc_block_last: + vaesenclast ymm0, ymm0, ymm9 + vaesenclast ymm1, ymm1, ymm9 + vaesenclast ymm2, ymm2, ymm9 + vaesenclast ymm3, ymm3, ymm9 + vpxor ymm0, ymm0, ymm4 + vmovdqu YMMWORD PTR [rdx], ymm0 + vpsrlq ymm9, ymm4, 56 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm4, ymm4, 8 + vpxor ymm4, ymm4, ymm10 + vpxor ymm4, ymm4, ymm9 + vpxor ymm1, ymm1, ymm5 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vpsrlq ymm9, ymm5, 56 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm5, ymm5, 8 + vpxor ymm5, ymm5, ymm10 + vpxor ymm5, ymm5, ymm9 + vpxor ymm2, ymm2, ymm6 + vmovdqu YMMWORD PTR [rdx+64], ymm2 + vpsrlq ymm9, ymm6, 56 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm6, ymm6, 8 + vpxor ymm6, ymm6, ymm10 + vpxor ymm6, ymm6, ymm9 + vpxor ymm3, ymm3, ymm7 + vmovdqu YMMWORD PTR [rdx+96], ymm3 + vpsrlq ymm9, ymm7, 56 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm7, ymm7, 8 + vpxor ymm7, ymm7, ymm10 + 
vpxor ymm7, ymm7, ymm9 + add r13d, 128 + cmp r13d, r11d + jl L_AES_XTS_encrypt_vaes_enc_128 + vextracti128 xmm8, ymm4, 0 +L_AES_XTS_encrypt_vaes_done_128: + mov r11d, eax + and r11d, 4294967232 + cmp r13d, r11d + je L_AES_XTS_encrypt_vaes_done_64 + ; 64 bytes of input + ; aes_enc_64 + lea rcx, QWORD PTR [rdi+r13] + lea rdx, QWORD PTR [rsi+r13] + vmovdqu ymm0, YMMWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+32] + vperm2i128 ymm5, ymm8, ymm8, 0 + vpsrlvq ymm6, ymm5, ymm15 + vpclmulqdq ymm7, ymm6, ymm13, 1 + vpslldq ymm6, ymm6, 8 + vpsllvq ymm4, ymm5, ymm14 + vpxor ymm4, ymm4, ymm7 + vpxor ymm4, ymm4, ymm6 + vpsrlq ymm9, ymm4, 62 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm5, ymm4, 2 + vpxor ymm5, ymm5, ymm10 + vpxor ymm5, ymm5, ymm9 + ; aes_enc_block + vbroadcasti128 ymm9, [r8] + vpxor ymm0, ymm0, ymm4 + vpxor ymm0, ymm0, ymm9 + vpxor ymm1, ymm1, ymm5 + vpxor ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+16] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+32] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+48] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+64] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+80] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+96] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+112] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+128] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+144] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + cmp r10d, 11 + vbroadcasti128 ymm9, [r8+160] + jl L_AES_XTS_encrypt_vaes_aes_enc_64_aes_enc_block_last + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+176] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + cmp r10d, 13 + vbroadcasti128 ymm9, [r8+192] + jl 
L_AES_XTS_encrypt_vaes_aes_enc_64_aes_enc_block_last + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+208] + vaesenc ymm0, ymm0, ymm9 + vaesenc ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+224] +L_AES_XTS_encrypt_vaes_aes_enc_64_aes_enc_block_last: + vaesenclast ymm0, ymm0, ymm9 + vaesenclast ymm1, ymm1, ymm9 + vpxor ymm0, ymm0, ymm4 + vmovdqu YMMWORD PTR [rdx], ymm0 + vpxor ymm1, ymm1, ymm5 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vextracti128 xmm8, ymm5, 1 + vpshufd xmm9, xmm8, 19 + vpaddq xmm8, xmm8, xmm8 + vpsrad xmm9, xmm9, 31 + vpand xmm9, xmm9, xmm12 + vpxor xmm8, xmm8, xmm9 + add r13d, 64 +L_AES_XTS_encrypt_vaes_done_64: + mov r11d, eax + and r11d, 4294967264 + cmp r13d, r11d + je L_AES_XTS_encrypt_vaes_done_32 + ; 32 bytes of input + ; aes_enc_32 + lea rcx, QWORD PTR [rdi+r13] + lea rdx, QWORD PTR [rsi+r13] + vmovdqu ymm0, YMMWORD PTR [rcx] + vperm2i128 ymm5, ymm8, ymm8, 0 + vpsrlvq ymm6, ymm5, ymm15 + vpclmulqdq ymm7, ymm6, ymm13, 1 + vpslldq ymm6, ymm6, 8 + vpsllvq ymm4, ymm5, ymm14 + vpxor ymm4, ymm4, ymm7 + vpxor ymm4, ymm4, ymm6 + ; aes_enc_block + vbroadcasti128 ymm9, [r8] + vpxor ymm0, ymm0, ymm4 + vpxor ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+16] + vaesenc ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+32] + vaesenc ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+48] + vaesenc ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+64] + vaesenc ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+80] + vaesenc ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+96] + vaesenc ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+112] + vaesenc ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+128] + vaesenc ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+144] + vaesenc ymm0, ymm0, ymm9 + cmp r10d, 11 + vbroadcasti128 ymm9, [r8+160] + jl L_AES_XTS_encrypt_vaes_aes_enc_32_aes_enc_block_last + vaesenc ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+176] + vaesenc ymm0, ymm0, ymm9 + cmp r10d, 13 + vbroadcasti128 ymm9, [r8+192] + jl 
L_AES_XTS_encrypt_vaes_aes_enc_32_aes_enc_block_last + vaesenc ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+208] + vaesenc ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+224] +L_AES_XTS_encrypt_vaes_aes_enc_32_aes_enc_block_last: + vaesenclast ymm0, ymm0, ymm9 + vpxor ymm0, ymm0, ymm4 + vmovdqu YMMWORD PTR [rdx], ymm0 + vextracti128 xmm8, ymm4, 1 + vpshufd xmm9, xmm8, 19 + vpaddq xmm8, xmm8, xmm8 + vpsrad xmm9, xmm9, 31 + vpand xmm9, xmm9, xmm12 + vpxor xmm8, xmm8, xmm9 + add r13d, 32 +L_AES_XTS_encrypt_vaes_done_32: + cmp r13d, eax + mov r11d, eax + je L_AES_XTS_encrypt_vaes_done_enc + sub r11d, r13d + cmp r11d, 16 + mov r11d, eax + jl L_AES_XTS_encrypt_vaes_last_15 + and r11d, 4294967280 + ; 16 bytes of input +L_AES_XTS_encrypt_vaes_enc_16: + lea rcx, QWORD PTR [rdi+r13] + vmovdqu xmm0, OWORD PTR [rcx] + vpxor xmm0, xmm0, xmm8 + ; aes_enc_block + vpxor xmm0, xmm0, [r8] + vmovdqu xmm5, OWORD PTR [r8+16] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+32] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+48] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+64] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+80] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+96] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+112] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+128] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+144] + vaesenc xmm0, xmm0, xmm5 + cmp r10d, 11 + vmovdqu xmm5, OWORD PTR [r8+160] + jl L_AES_XTS_encrypt_vaes_aes_enc_block_last + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r8+176] + vaesenc xmm0, xmm0, xmm6 + cmp r10d, 13 + vmovdqu xmm5, OWORD PTR [r8+192] + jl L_AES_XTS_encrypt_vaes_aes_enc_block_last + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r8+208] + vaesenc xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r8+224] +L_AES_XTS_encrypt_vaes_aes_enc_block_last: + vaesenclast xmm0, xmm0, xmm5 + vpxor xmm0, xmm0, xmm8 + lea rcx, QWORD PTR [rsi+r13] + vmovdqu OWORD PTR [rcx], 
xmm0 + vpshufd xmm4, xmm8, 19 + vpaddq xmm8, xmm8, xmm8 + vpsrad xmm4, xmm4, 31 + vpand xmm4, xmm4, xmm12 + vpxor xmm8, xmm8, xmm4 + add r13d, 16 + cmp r13d, r11d + jl L_AES_XTS_encrypt_vaes_enc_16 + cmp r13d, eax + je L_AES_XTS_encrypt_vaes_done_enc +L_AES_XTS_encrypt_vaes_last_15: + sub r13, 16 + lea rcx, QWORD PTR [rsi+r13] + vmovdqu xmm0, OWORD PTR [rcx] + add r13, 16 + vmovdqu OWORD PTR [rsp], xmm0 + xor rdx, rdx +L_AES_XTS_encrypt_vaes_last_15_byte_loop: + mov r11b, BYTE PTR [rsp+rdx] + mov cl, BYTE PTR [rdi+r13] + mov BYTE PTR [rsi+r13], r11b + mov BYTE PTR [rsp+rdx], cl + inc r13d + inc edx + cmp r13d, eax + jl L_AES_XTS_encrypt_vaes_last_15_byte_loop + sub r13, rdx + vmovdqu xmm0, OWORD PTR [rsp] + sub r13, 16 + vpxor xmm0, xmm0, xmm8 + ; aes_enc_block + vpxor xmm0, xmm0, [r8] + vmovdqu xmm5, OWORD PTR [r8+16] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+32] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+48] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+64] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+80] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+96] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+112] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+128] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+144] + vaesenc xmm0, xmm0, xmm5 + cmp r10d, 11 + vmovdqu xmm5, OWORD PTR [r8+160] + jl L_AES_XTS_encrypt_vaes_last_15_aes_enc_block_last + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r8+176] + vaesenc xmm0, xmm0, xmm6 + cmp r10d, 13 + vmovdqu xmm5, OWORD PTR [r8+192] + jl L_AES_XTS_encrypt_vaes_last_15_aes_enc_block_last + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r8+208] + vaesenc xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r8+224] +L_AES_XTS_encrypt_vaes_last_15_aes_enc_block_last: + vaesenclast xmm0, xmm0, xmm5 + vpxor xmm0, xmm0, xmm8 + lea rcx, QWORD PTR [rsi+r13] + vmovdqu OWORD PTR [rcx], xmm0 +L_AES_XTS_encrypt_vaes_done_enc: + vmovdqu xmm6, OWORD 
PTR [rsp+64]
+        vmovdqu xmm7, OWORD PTR [rsp+80]
+        vmovdqu xmm8, OWORD PTR [rsp+96]
+        vmovdqu xmm9, OWORD PTR [rsp+112]
+        vmovdqu xmm10, OWORD PTR [rsp+128]
+        vmovdqu xmm11, OWORD PTR [rsp+144]
+        vmovdqu xmm12, OWORD PTR [rsp+160]
+        vmovdqu xmm13, OWORD PTR [rsp+176]
+        vmovdqu xmm14, OWORD PTR [rsp+192]
+        vmovdqu xmm15, OWORD PTR [rsp+208]
+        add rsp, 224
+        pop r13
+        pop r12
+        pop rsi
+        pop rdi
+        ret
+AES_XTS_encrypt_vaes ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+; AES_XTS_encrypt_update_vaes
+;
+; Encrypts a byte buffer with AES round keys applied through 256-bit
+; vaesenc/vaesenclast, XORing a per-block 16-byte mask (presumably the
+; XTS tweak) before and after the cipher. The mask is read from memory
+; on entry and the advanced mask is written back on exit.
+;
+; Register roles after the prologue, as established by the code below:
+;   rdi <- rcx       base address the data is loaded from
+;   rsi <- rdx       base address the results are stored to
+;   eax <- r8        byte count (compared as a signed 32-bit value)
+;   r10 <- r9        base of the round keys, read at 16-byte steps
+;   r8  <- [rsp+64]  pointer to the 16-byte mask: loaded into xmm8 on
+;                    entry, xmm8 stored back through it at done_enc
+;   r9d <- [rsp+72]  round count; tested against 11 and 13 to choose
+;                    whether the last round key is at +160, +192 or +224
+;   r12              running byte offset into both buffers
+;
+; NOTE(review): the argument registers, the stack offsets (8-byte return
+; address + 32 bytes + 24 bytes pushed = 64) and the saved set
+; rdi/rsi/r12/xmm6-xmm15 look like the Microsoft x64 calling convention
+; - confirm against the C prototype, which is not visible here.
+; NOTE(review): ymm registers are written in this routine and no
+; vzeroupper is executed before ret - confirm that this is intentional.
+; NOTE(review): with 0 < eax < 16 the code reaches the last_15 path with
+; r12 = 0 and reads 16 bytes at [rsi-16]; presumably callers always pass
+; at least 16 bytes - verify against the caller.
+AES_XTS_encrypt_update_vaes PROC
+        push rdi
+        push rsi
+        push r12
+        mov rdi, rcx
+        mov rsi, rdx
+        mov rax, r8
+        mov r10, r9
+        mov r8, QWORD PTR [rsp+64]
+        mov r9d, DWORD PTR [rsp+72]
+        ; 224-byte frame: [rsp+64 .. rsp+223] holds xmm6-xmm15; the first
+        ; 16 bytes at [rsp] are used as scratch by the last_15 path.
+        sub rsp, 224
+        vmovdqu OWORD PTR [rsp+64], xmm6
+        vmovdqu OWORD PTR [rsp+80], xmm7
+        vmovdqu OWORD PTR [rsp+96], xmm8
+        vmovdqu OWORD PTR [rsp+112], xmm9
+        vmovdqu OWORD PTR [rsp+128], xmm10
+        vmovdqu OWORD PTR [rsp+144], xmm11
+        vmovdqu OWORD PTR [rsp+160], xmm12
+        vmovdqu OWORD PTR [rsp+176], xmm13
+        vmovdqu OWORD PTR [rsp+192], xmm14
+        vmovdqu OWORD PTR [rsp+208], xmm15
+        ; Constants defined outside this view: xmm12 is the mask used by
+        ; the 128-bit doubling step, ymm13 the vpclmulqdq operand, and
+        ; ymm14/ymm15 the per-qword shift counts for vpsllvq/vpsrlvq.
+        vmovdqu xmm12, OWORD PTR L_vaes_aes_xts_gc_xts
+        vbroadcasti128 ymm13, ptr_L_vaes_aes_xts_poly
+        vmovdqu ymm14, YMMWORD PTR L_vaes_aes_xts_shl
+        vmovdqu ymm15, YMMWORD PTR L_vaes_aes_xts_shr
+        vmovdqu xmm8, OWORD PTR [r8]
+        xor r12d, r12d
+        ; Fewer than 128 bytes: skip the 128-byte loop entirely.
+        cmp eax, 32
+        jl L_AES_XTS_encrypt_update_vaes_done_128
+        cmp eax, 128
+        mov r11d, eax
+        jl L_AES_XTS_encrypt_update_vaes_done_128
+        ; r11d = eax with the low 7 bits cleared (4294967168 = 0FFFFFF80h):
+        ; end offset of the 128-byte loop.
+        and r11d, 4294967168
+        ; Build four ymm masks (ymm4..ymm7, two 128-bit lanes each) from
+        ; xmm8. vperm2i128 with immediate 0 copies the low lane of ymm8
+        ; into both lanes; the shift/vpclmulqdq/xor groups then derive the
+        ; following masks. Presumably this yields the eight consecutive
+        ; tweaks for eight blocks - depends on the constants loaded above.
+        vperm2i128 ymm5, ymm8, ymm8, 0
+        vpsrlvq ymm6, ymm5, ymm15
+        vpclmulqdq ymm7, ymm6, ymm13, 1
+        vpslldq ymm6, ymm6, 8
+        vpsllvq ymm4, ymm5, ymm14
+        vpxor ymm4, ymm4, ymm7
+        vpxor ymm4, ymm4, ymm6
+        vpsrlq ymm9, ymm4, 62
+        vpclmulqdq ymm10, ymm9, ymm13, 1
+        vpslldq ymm9, ymm9, 8
+        vpsllq ymm5, ymm4, 2
+        vpxor ymm5, ymm5, ymm10
+        vpxor ymm5, ymm5, ymm9
+        vpsrlq ymm9, ymm5, 62
+        vpclmulqdq ymm10, ymm9, ymm13, 1
+        vpslldq ymm9, ymm9, 8
+        vpsllq ymm6, ymm5, 2
+        vpxor ymm6, ymm6, ymm10
+        vpxor ymm6, ymm6, ymm9
+        vpsrlq ymm9, ymm6, 62
+        vpclmulqdq ymm10, ymm9, ymm13, 1
+        vpslldq ymm9, ymm9, 8
+        vpsllq ymm7, ymm6, 2
+        vpxor ymm7, ymm7, ymm10
+        vpxor ymm7, ymm7, ymm9
+L_AES_XTS_encrypt_update_vaes_enc_128:
+        ; 128 bytes of input
+        ; aes_enc_128
+        lea rcx, QWORD PTR [rdi+r12]
+        lea rdx, QWORD PTR [rsi+r12]
+        vmovdqu ymm0, YMMWORD PTR [rcx]
+        vmovdqu ymm1, YMMWORD PTR [rcx+32]
+        vmovdqu ymm2, YMMWORD PTR [rcx+64]
+        vmovdqu ymm3, YMMWORD PTR [rcx+96]
+        ; aes_enc_block
+        ; Each data register is XORed with its mask and with round key 0;
+        ; every round key is broadcast to both lanes of ymm9.
+        vbroadcasti128 ymm9, [r10]
+        vpxor ymm0, ymm0, ymm4
+        vpxor ymm0, ymm0, ymm9
+        vpxor ymm1, ymm1, ymm5
+        vpxor ymm1, ymm1, ymm9
+        vpxor ymm2, ymm2, ymm6
+        vpxor ymm2, ymm2, ymm9
+        vpxor ymm3, ymm3, ymm7
+        vpxor ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [r10+16]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vaesenc ymm2, ymm2, ymm9
+        vaesenc ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [r10+32]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vaesenc ymm2, ymm2, ymm9
+        vaesenc ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [r10+48]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vaesenc ymm2, ymm2, ymm9
+        vaesenc ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [r10+64]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vaesenc ymm2, ymm2, ymm9
+        vaesenc ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [r10+80]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vaesenc ymm2, ymm2, ymm9
+        vaesenc ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [r10+96]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vaesenc ymm2, ymm2, ymm9
+        vaesenc ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [r10+112]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vaesenc ymm2, ymm2, ymm9
+        vaesenc ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [r10+128]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vaesenc ymm2, ymm2, ymm9
+        vaesenc ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [r10+144]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vaesenc ymm2, ymm2, ymm9
+        vaesenc ymm3, ymm3, ymm9
+        ; Round-count dispatch: the flags from cmp survive the broadcast,
+        ; so r9d < 11 finishes with the key at +160, r9d < 13 with the key
+        ; at +192, otherwise with the key at +224. The same pattern is
+        ; repeated in every encrypt sequence of this routine.
+        cmp r9d, 11
+        vbroadcasti128 ymm9, [r10+160]
+        jl L_AES_XTS_encrypt_update_vaes_aes_enc_128_aes_enc_block_last
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vaesenc ymm2, ymm2, ymm9
+        vaesenc ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [r10+176]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vaesenc ymm2, ymm2, ymm9
+        vaesenc ymm3, ymm3, ymm9
+        cmp r9d, 13
+        vbroadcasti128 ymm9, [r10+192]
+        jl L_AES_XTS_encrypt_update_vaes_aes_enc_128_aes_enc_block_last
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vaesenc ymm2, ymm2, ymm9
+        vaesenc ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [r10+208]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vaesenc ymm2, ymm2, ymm9
+        vaesenc ymm3, ymm3, ymm9
+        vbroadcasti128 ymm9, [r10+224]
+L_AES_XTS_encrypt_update_vaes_aes_enc_128_aes_enc_block_last:
+        vaesenclast ymm0, ymm0, ymm9
+        vaesenclast ymm1, ymm1, ymm9
+        vaesenclast ymm2, ymm2, ymm9
+        vaesenclast ymm3, ymm3, ymm9
+        ; For each of the four registers: XOR the mask back in, store the
+        ; 32 result bytes, then advance that mask for the next iteration
+        ; with a shift by 8 bits plus a vpclmulqdq fold of the bits shifted
+        ; out (presumably multiplication by x^8, i.e. eight blocks ahead).
+        vpxor ymm0, ymm0, ymm4
+        vmovdqu YMMWORD PTR [rdx], ymm0
+        vpsrlq ymm9, ymm4, 56
+        vpclmulqdq ymm10, ymm9, ymm13, 1
+        vpslldq ymm9, ymm9, 8
+        vpsllq ymm4, ymm4, 8
+        vpxor ymm4, ymm4, ymm10
+        vpxor ymm4, ymm4, ymm9
+        vpxor ymm1, ymm1, ymm5
+        vmovdqu YMMWORD PTR [rdx+32], ymm1
+        vpsrlq ymm9, ymm5, 56
+        vpclmulqdq ymm10, ymm9, ymm13, 1
+        vpslldq ymm9, ymm9, 8
+        vpsllq ymm5, ymm5, 8
+        vpxor ymm5, ymm5, ymm10
+        vpxor ymm5, ymm5, ymm9
+        vpxor ymm2, ymm2, ymm6
+        vmovdqu YMMWORD PTR [rdx+64], ymm2
+        vpsrlq ymm9, ymm6, 56
+        vpclmulqdq ymm10, ymm9, ymm13, 1
+        vpslldq ymm9, ymm9, 8
+        vpsllq ymm6, ymm6, 8
+        vpxor ymm6, ymm6, ymm10
+        vpxor ymm6, ymm6, ymm9
+        vpxor ymm3, ymm3, ymm7
+        vmovdqu YMMWORD PTR [rdx+96], ymm3
+        vpsrlq ymm9, ymm7, 56
+        vpclmulqdq ymm10, ymm9, ymm13, 1
+        vpslldq ymm9, ymm9, 8
+        vpsllq ymm7, ymm7, 8
+        vpxor ymm7, ymm7, ymm10
+        vpxor ymm7, ymm7, ymm9
+        add r12d, 128
+        cmp r12d, r11d
+        jl L_AES_XTS_encrypt_update_vaes_enc_128
+        ; The low lane of the already-advanced ymm4 becomes the mask for
+        ; the next block.
+        vextracti128 xmm8, ymm4, 0
+L_AES_XTS_encrypt_update_vaes_done_128:
+        ; r11d = eax with the low 6 bits cleared (4294967232 = 0FFFFFFC0h);
+        ; if the offset is already there, no 64-byte chunk remains.
+        mov r11d, eax
+        and r11d, 4294967232
+        cmp r12d, r11d
+        je L_AES_XTS_encrypt_update_vaes_done_64
+        ; 64 bytes of input
+        ; aes_enc_64
+        lea rcx, QWORD PTR [rdi+r12]
+        lea rdx, QWORD PTR [rsi+r12]
+        vmovdqu ymm0, YMMWORD PTR [rcx]
+        vmovdqu ymm1, YMMWORD PTR [rcx+32]
+        ; Derive the two ymm masks (ymm4, ymm5) for this chunk from xmm8,
+        ; using the same sequence as before the 128-byte loop.
+        vperm2i128 ymm5, ymm8, ymm8, 0
+        vpsrlvq ymm6, ymm5, ymm15
+        vpclmulqdq ymm7, ymm6, ymm13, 1
+        vpslldq ymm6, ymm6, 8
+        vpsllvq ymm4, ymm5, ymm14
+        vpxor ymm4, ymm4, ymm7
+        vpxor ymm4, ymm4, ymm6
+        vpsrlq ymm9, ymm4, 62
+        vpclmulqdq ymm10, ymm9, ymm13, 1
+        vpslldq ymm9, ymm9, 8
+        vpsllq ymm5, ymm4, 2
+        vpxor ymm5, ymm5, ymm10
+        vpxor ymm5, ymm5, ymm9
+        ; aes_enc_block
+        vbroadcasti128 ymm9, [r10]
+        vpxor ymm0, ymm0, ymm4
+        vpxor ymm0, ymm0, ymm9
+        vpxor ymm1, ymm1, ymm5
+        vpxor ymm1, ymm1, ymm9
+        vbroadcasti128 ymm9, [r10+16]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vbroadcasti128 ymm9, [r10+32]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vbroadcasti128 ymm9, [r10+48]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vbroadcasti128 ymm9, [r10+64]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vbroadcasti128 ymm9, [r10+80]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vbroadcasti128 ymm9, [r10+96]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vbroadcasti128 ymm9, [r10+112]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vbroadcasti128 ymm9, [r10+128]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vbroadcasti128 ymm9, [r10+144]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        cmp r9d, 11
+        vbroadcasti128 ymm9, [r10+160]
+        jl L_AES_XTS_encrypt_update_vaes_aes_enc_64_aes_enc_block_last
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vbroadcasti128 ymm9, [r10+176]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        cmp r9d, 13
+        vbroadcasti128 ymm9, [r10+192]
+        jl L_AES_XTS_encrypt_update_vaes_aes_enc_64_aes_enc_block_last
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vbroadcasti128 ymm9, [r10+208]
+        vaesenc ymm0, ymm0, ymm9
+        vaesenc ymm1, ymm1, ymm9
+        vbroadcasti128 ymm9, [r10+224]
+L_AES_XTS_encrypt_update_vaes_aes_enc_64_aes_enc_block_last:
+        vaesenclast ymm0, ymm0, ymm9
+        vaesenclast ymm1, ymm1, ymm9
+        vpxor ymm0, ymm0, ymm4
+        vmovdqu YMMWORD PTR [rdx], ymm0
+        vpxor ymm1, ymm1, ymm5
+        vmovdqu YMMWORD PTR [rdx+32], ymm1
+        ; Take the mask of the last block just processed (high lane of
+        ; ymm5) and apply the 128-bit doubling step: each qword is doubled
+        ; and the sign bits selected by vpshufd/vpsrad are masked with
+        ; xmm12 and XORed in (presumably the XTS multiply-by-x).
+        vextracti128 xmm8, ymm5, 1
+        vpshufd xmm9, xmm8, 19
+        vpaddq xmm8, xmm8, xmm8
+        vpsrad xmm9, xmm9, 31
+        vpand xmm9, xmm9, xmm12
+        vpxor xmm8, xmm8, xmm9
+        add r12d, 64
+L_AES_XTS_encrypt_update_vaes_done_64:
+        ; r11d = eax with the low 5 bits cleared (4294967264 = 0FFFFFFE0h);
+        ; if the offset is already there, no 32-byte chunk remains.
+        mov r11d, eax
+        and r11d, 4294967264
+        cmp r12d, r11d
+        je L_AES_XTS_encrypt_update_vaes_done_32
+        ; 32 bytes of input
+        ; aes_enc_32
+        lea rcx, QWORD PTR [rdi+r12]
+        lea rdx, QWORD PTR [rsi+r12]
+        vmovdqu ymm0, YMMWORD PTR [rcx]
+        vperm2i128 ymm5, ymm8, ymm8, 0
+        vpsrlvq ymm6, ymm5, ymm15
+        vpclmulqdq ymm7, ymm6, ymm13, 1
+        vpslldq ymm6, ymm6, 8
+        vpsllvq ymm4, ymm5, ymm14
+        vpxor ymm4, ymm4, ymm7
+        vpxor ymm4, ymm4, ymm6
+        ; aes_enc_block
+        vbroadcasti128 ymm9, [r10]
+        vpxor ymm0, ymm0, ymm4
+        vpxor ymm0, ymm0, ymm9
+        vbroadcasti128 ymm9, [r10+16]
+        vaesenc ymm0, ymm0, ymm9
+        vbroadcasti128 ymm9, [r10+32]
+        vaesenc ymm0, ymm0, ymm9
+        vbroadcasti128 ymm9, [r10+48]
+        vaesenc ymm0, ymm0, ymm9
+        vbroadcasti128 ymm9, [r10+64]
+        vaesenc ymm0, ymm0, ymm9
+        vbroadcasti128 ymm9, [r10+80]
+        vaesenc ymm0, ymm0, ymm9
+        vbroadcasti128 ymm9, [r10+96]
+        vaesenc ymm0, ymm0, ymm9
+        vbroadcasti128 ymm9, [r10+112]
+        vaesenc ymm0, ymm0, ymm9
+        vbroadcasti128 ymm9, [r10+128]
+        vaesenc ymm0, ymm0, ymm9
+        vbroadcasti128 ymm9, [r10+144]
+        vaesenc ymm0, ymm0, ymm9
+        cmp r9d, 11
+        vbroadcasti128 ymm9, [r10+160]
+        jl L_AES_XTS_encrypt_update_vaes_aes_enc_32_aes_enc_block_last
+        vaesenc ymm0, ymm0, ymm9
+        vbroadcasti128 ymm9, [r10+176]
+        vaesenc ymm0, ymm0, ymm9
+        cmp r9d, 13
+        vbroadcasti128 ymm9, [r10+192]
+        jl L_AES_XTS_encrypt_update_vaes_aes_enc_32_aes_enc_block_last
+        vaesenc ymm0, ymm0, ymm9
+        vbroadcasti128 ymm9, [r10+208]
+        vaesenc ymm0, ymm0, ymm9
+        vbroadcasti128 ymm9, [r10+224]
+L_AES_XTS_encrypt_update_vaes_aes_enc_32_aes_enc_block_last:
+        vaesenclast ymm0, ymm0, ymm9
+        vpxor ymm0, ymm0, ymm4
+        vmovdqu YMMWORD PTR [rdx], ymm0
+        ; High lane of ymm4 is the mask of the second block; double it
+        ; once to obtain the mask for the next block.
+        vextracti128 xmm8, ymm4, 1
+        vpshufd xmm9, xmm8, 19
+        vpaddq xmm8, xmm8, xmm8
+        vpsrad xmm9, xmm9, 31
+        vpand xmm9, xmm9, xmm12
+        vpxor xmm8, xmm8, xmm9
+        add r12d, 32
+L_AES_XTS_encrypt_update_vaes_done_32:
+        ; Nothing left: finish. Otherwise, fewer than 16 bytes left goes
+        ; straight to the partial-block path. The mov between cmp and jl
+        ; does not modify flags, so jl still tests (eax - r12d) < 16.
+        cmp r12d, eax
+        mov r11d, eax
+        je L_AES_XTS_encrypt_update_vaes_done_enc
+        sub r11d, r12d
+        cmp r11d, 16
+        mov r11d, eax
+        jl L_AES_XTS_encrypt_update_vaes_last_15
+        ; r11d = eax with the low 4 bits cleared (4294967280 = 0FFFFFFF0h):
+        ; end offset of the whole 16-byte blocks.
+        and r11d, 4294967280
+        ; 16 bytes of input
+L_AES_XTS_encrypt_update_vaes_enc_16:
+        lea rcx, QWORD PTR [rdi+r12]
+        vmovdqu xmm0, OWORD PTR [rcx]
+        vpxor xmm0, xmm0, xmm8
+        ; aes_enc_block
+        vpxor xmm0, xmm0, [r10]
+        vmovdqu xmm5, OWORD PTR [r10+16]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+32]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+48]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+64]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+80]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+96]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+112]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+128]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+144]
+        vaesenc xmm0, xmm0, xmm5
+        cmp r9d, 11
+        vmovdqu xmm5, OWORD PTR [r10+160]
+        jl L_AES_XTS_encrypt_update_vaes_aes_enc_block_last
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [r10+176]
+        vaesenc xmm0, xmm0, xmm6
+        cmp r9d, 13
+        vmovdqu xmm5, OWORD PTR [r10+192]
+        jl L_AES_XTS_encrypt_update_vaes_aes_enc_block_last
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [r10+208]
+        vaesenc xmm0, xmm0, xmm6
+        vmovdqu xmm5, OWORD PTR [r10+224]
+L_AES_XTS_encrypt_update_vaes_aes_enc_block_last:
+        vaesenclast xmm0, xmm0, xmm5
+        vpxor xmm0, xmm0, xmm8
+        lea rcx, QWORD PTR [rsi+r12]
+        vmovdqu OWORD PTR [rcx], xmm0
+        ; Doubling step on xmm8 for the next block.
+        vpshufd xmm4, xmm8, 19
+        vpaddq xmm8, xmm8, xmm8
+        vpsrad xmm4, xmm4, 31
+        vpand xmm4, xmm4, xmm12
+        vpxor xmm8, xmm8, xmm4
+        add r12d, 16
+        cmp r12d, r11d
+        jl L_AES_XTS_encrypt_update_vaes_enc_16
+        cmp r12d, eax
+        je L_AES_XTS_encrypt_update_vaes_done_enc
+L_AES_XTS_encrypt_update_vaes_last_15:
+        ; Trailing bytes (fewer than 16). The 16 bytes previously stored
+        ; at [rsi+r12-16] are copied to the scratch buffer at [rsp].
+        ; Presumably this is XTS ciphertext stealing.
+        sub r12, 16
+        lea rcx, QWORD PTR [rsi+r12]
+        vmovdqu xmm0, OWORD PTR [rcx]
+        add r12, 16
+        vmovdqu OWORD PTR [rsp], xmm0
+        xor rdx, rdx
+L_AES_XTS_encrypt_update_vaes_last_15_byte_loop:
+        ; Per byte: the scratch byte goes to the output tail and the input
+        ; tail byte replaces it in scratch. The input byte is read before
+        ; the output byte at the same offset is written.
+        mov r11b, BYTE PTR [rsp+rdx]
+        mov cl, BYTE PTR [rdi+r12]
+        mov BYTE PTR [rsi+r12], r11b
+        mov BYTE PTR [rsp+rdx], cl
+        inc r12d
+        inc edx
+        cmp r12d, eax
+        jl L_AES_XTS_encrypt_update_vaes_last_15_byte_loop
+        ; Rewind r12 to the start of the previous 16-byte block and
+        ; encrypt the merged scratch block with the current mask in xmm8.
+        sub r12, rdx
+        vmovdqu xmm0, OWORD PTR [rsp]
+        sub r12, 16
+        vpxor xmm0, xmm0, xmm8
+        ; aes_enc_block
+        vpxor xmm0, xmm0, [r10]
+        vmovdqu xmm5, OWORD PTR [r10+16]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+32]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+48]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+64]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+80]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+96]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+112]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+128]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+144]
+        vaesenc xmm0, xmm0, xmm5
+        cmp r9d, 11
+        vmovdqu xmm5, OWORD PTR [r10+160]
+        jl L_AES_XTS_encrypt_update_vaes_last_15_aes_enc_block_last
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [r10+176]
+        vaesenc xmm0, xmm0, xmm6
+        cmp r9d, 13
+        vmovdqu xmm5, OWORD PTR [r10+192]
+        jl L_AES_XTS_encrypt_update_vaes_last_15_aes_enc_block_last
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [r10+208]
+        vaesenc xmm0, xmm0, xmm6
+        vmovdqu xmm5, OWORD PTR [r10+224]
+L_AES_XTS_encrypt_update_vaes_last_15_aes_enc_block_last:
+        vaesenclast xmm0, xmm0, xmm5
+        vpxor xmm0, xmm0, xmm8
+        lea rcx, QWORD PTR [rsi+r12]
+        vmovdqu OWORD PTR [rcx], xmm0
+L_AES_XTS_encrypt_update_vaes_done_enc:
+        ; Write the current mask back through r8 before xmm8 is restored;
+        ; then restore xmm6-xmm15 and the pushed registers, and return.
+        vmovdqu OWORD PTR [r8], xmm8
+        vmovdqu xmm6, OWORD PTR [rsp+64]
+        vmovdqu xmm7, OWORD PTR [rsp+80]
+        vmovdqu xmm8, OWORD PTR [rsp+96]
+        vmovdqu xmm9, OWORD PTR [rsp+112]
+        vmovdqu xmm10, OWORD PTR [rsp+128]
+        vmovdqu xmm11, OWORD PTR [rsp+144]
+        vmovdqu xmm12, OWORD PTR [rsp+160]
+        vmovdqu xmm13, OWORD PTR [rsp+176]
+        vmovdqu xmm14, OWORD PTR [rsp+192]
+        vmovdqu xmm15, OWORD PTR [rsp+208]
+        add rsp, 224
+        pop r12
+        pop rsi
+        pop rdi
+        ret
+AES_XTS_encrypt_update_vaes ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_XTS_decrypt_vaes PROC
+        push rdi
+        push rsi
+        push r12
+        push r13
+        mov rdi, rcx
+        mov rsi, rdx
+        mov rax, r8
+        mov r12, r9
+        mov r8, QWORD PTR [rsp+72]
+        mov r9, QWORD PTR [rsp+80]
+        mov r10d, DWORD PTR [rsp+88]
+        sub rsp, 224
+        vmovdqu OWORD PTR [rsp+64], xmm6
+        vmovdqu OWORD PTR [rsp+80], xmm7
+        vmovdqu OWORD PTR [rsp+96], xmm8
+        vmovdqu OWORD PTR [rsp+112], xmm9
+        vmovdqu OWORD PTR [rsp+128], xmm10
+        vmovdqu OWORD PTR [rsp+144], xmm11
+        vmovdqu OWORD PTR [rsp+160], xmm12
+        vmovdqu OWORD PTR [rsp+176], xmm13
+        vmovdqu OWORD PTR [rsp+192], xmm14
+        vmovdqu OWORD PTR [rsp+208], xmm15
+        vmovdqu xmm12, OWORD PTR L_vaes_aes_xts_gc_xts
+        vbroadcasti128 ymm13, ptr_L_vaes_aes_xts_poly
+        vmovdqu ymm14, YMMWORD PTR L_vaes_aes_xts_shl
+        vmovdqu ymm15, YMMWORD PTR L_vaes_aes_xts_shr
+        vmovdqu xmm8, OWORD PTR [r12]
+        ; aes_enc_block
+        vpxor xmm8, xmm8, [r9]
+        vmovdqu xmm5, OWORD PTR [r9+16]
+        vaesenc xmm8, xmm8, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+32]
+        vaesenc xmm8, xmm8, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+48]
+        vaesenc xmm8, xmm8, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+64]
+        vaesenc xmm8, xmm8, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+80]
+        vaesenc xmm8, xmm8, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+96]
+        vaesenc xmm8, xmm8, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+112]
+        vaesenc xmm8, xmm8, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+128]
+        vaesenc xmm8, xmm8, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+144]
+        vaesenc xmm8, xmm8, xmm5
+        cmp r10d, 11
+        vmovdqu xmm5, OWORD PTR [r9+160]
+        jl L_AES_XTS_decrypt_vaes_tweak_aes_enc_block_last
+
vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm6, OWORD PTR [r9+176] + vaesenc xmm8, xmm8, xmm6 + cmp r10d, 13 + vmovdqu xmm5, OWORD PTR [r9+192] + jl L_AES_XTS_decrypt_vaes_tweak_aes_enc_block_last + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm6, OWORD PTR [r9+208] + vaesenc xmm8, xmm8, xmm6 + vmovdqu xmm5, OWORD PTR [r9+224] +L_AES_XTS_decrypt_vaes_tweak_aes_enc_block_last: + vaesenclast xmm8, xmm8, xmm5 + xor r13d, r13d + mov r11d, eax + and r11d, 4294967280 + cmp r11d, eax + je L_AES_XTS_decrypt_vaes_mul16_128 + sub r11d, 16 + cmp r11d, 16 + jl L_AES_XTS_decrypt_vaes_last_31_start +L_AES_XTS_decrypt_vaes_mul16_128: + cmp r11d, 32 + jl L_AES_XTS_decrypt_vaes_done_128 + cmp r11d, 128 + jl L_AES_XTS_decrypt_vaes_done_128 + and r11d, 4294967168 + vperm2i128 ymm5, ymm8, ymm8, 0 + vpsrlvq ymm6, ymm5, ymm15 + vpclmulqdq ymm7, ymm6, ymm13, 1 + vpslldq ymm6, ymm6, 8 + vpsllvq ymm4, ymm5, ymm14 + vpxor ymm4, ymm4, ymm7 + vpxor ymm4, ymm4, ymm6 + vpsrlq ymm9, ymm4, 62 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm5, ymm4, 2 + vpxor ymm5, ymm5, ymm10 + vpxor ymm5, ymm5, ymm9 + vpsrlq ymm9, ymm5, 62 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm6, ymm5, 2 + vpxor ymm6, ymm6, ymm10 + vpxor ymm6, ymm6, ymm9 + vpsrlq ymm9, ymm6, 62 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm7, ymm6, 2 + vpxor ymm7, ymm7, ymm10 + vpxor ymm7, ymm7, ymm9 +L_AES_XTS_decrypt_vaes_dec_128: + ; 128 bytes of input + ; aes_dec_128 + lea rcx, QWORD PTR [rdi+r13] + lea rdx, QWORD PTR [rsi+r13] + vmovdqu ymm0, YMMWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+32] + vmovdqu ymm2, YMMWORD PTR [rcx+64] + vmovdqu ymm3, YMMWORD PTR [rcx+96] + ; aes_dec_block + vbroadcasti128 ymm9, [r8] + vpxor ymm0, ymm0, ymm4 + vpxor ymm0, ymm0, ymm9 + vpxor ymm1, ymm1, ymm5 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm6 + vpxor ymm2, ymm2, ymm9 + vpxor ymm3, ymm3, ymm7 + vpxor ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+16] + vaesdec ymm0, ymm0, ymm9 + 
vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+32] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+48] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+64] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+80] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+96] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+112] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+128] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+144] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + cmp r10d, 11 + vbroadcasti128 ymm9, [r8+160] + jl L_AES_XTS_decrypt_vaes_aes_dec_128_aes_dec_block_last + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+176] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + cmp r10d, 13 + vbroadcasti128 ymm9, [r8+192] + jl L_AES_XTS_decrypt_vaes_aes_dec_128_aes_dec_block_last + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+208] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r8+224] +L_AES_XTS_decrypt_vaes_aes_dec_128_aes_dec_block_last: + 
vaesdeclast ymm0, ymm0, ymm9 + vaesdeclast ymm1, ymm1, ymm9 + vaesdeclast ymm2, ymm2, ymm9 + vaesdeclast ymm3, ymm3, ymm9 + vpxor ymm0, ymm0, ymm4 + vmovdqu YMMWORD PTR [rdx], ymm0 + vpsrlq ymm9, ymm4, 56 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm4, ymm4, 8 + vpxor ymm4, ymm4, ymm10 + vpxor ymm4, ymm4, ymm9 + vpxor ymm1, ymm1, ymm5 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vpsrlq ymm9, ymm5, 56 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm5, ymm5, 8 + vpxor ymm5, ymm5, ymm10 + vpxor ymm5, ymm5, ymm9 + vpxor ymm2, ymm2, ymm6 + vmovdqu YMMWORD PTR [rdx+64], ymm2 + vpsrlq ymm9, ymm6, 56 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm6, ymm6, 8 + vpxor ymm6, ymm6, ymm10 + vpxor ymm6, ymm6, ymm9 + vpxor ymm3, ymm3, ymm7 + vmovdqu YMMWORD PTR [rdx+96], ymm3 + vpsrlq ymm9, ymm7, 56 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm7, ymm7, 8 + vpxor ymm7, ymm7, ymm10 + vpxor ymm7, ymm7, ymm9 + add r13d, 128 + cmp r13d, r11d + jl L_AES_XTS_decrypt_vaes_dec_128 + vextracti128 xmm8, ymm4, 0 +L_AES_XTS_decrypt_vaes_done_128: + cmp r13d, eax + mov r11d, eax + je L_AES_XTS_decrypt_vaes_done_dec + and r11d, 4294967280 + cmp r11d, eax + je L_AES_XTS_decrypt_vaes_mul16_64 + sub r11d, 16 + sub r11d, r13d + cmp r11d, 16 + jl L_AES_XTS_decrypt_vaes_last_31_start + add r11d, r13d +L_AES_XTS_decrypt_vaes_mul16_64: + and r11d, 4294967232 + cmp r13d, r11d + je L_AES_XTS_decrypt_vaes_done_64 + ; 64 bytes of input + ; aes_dec_64 + lea rcx, QWORD PTR [rdi+r13] + lea rdx, QWORD PTR [rsi+r13] + vmovdqu ymm0, YMMWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+32] + vperm2i128 ymm5, ymm8, ymm8, 0 + vpsrlvq ymm6, ymm5, ymm15 + vpclmulqdq ymm7, ymm6, ymm13, 1 + vpslldq ymm6, ymm6, 8 + vpsllvq ymm4, ymm5, ymm14 + vpxor ymm4, ymm4, ymm7 + vpxor ymm4, ymm4, ymm6 + vpsrlq ymm9, ymm4, 62 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm5, ymm4, 2 + vpxor ymm5, ymm5, ymm10 + vpxor 
ymm5, ymm5, ymm9 + ; aes_dec_block + vbroadcasti128 ymm9, [r8] + vpxor ymm0, ymm0, ymm4 + vpxor ymm0, ymm0, ymm9 + vpxor ymm1, ymm1, ymm5 + vpxor ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+16] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+32] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+48] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+64] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+80] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+96] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+112] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+128] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+144] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + cmp r10d, 11 + vbroadcasti128 ymm9, [r8+160] + jl L_AES_XTS_decrypt_vaes_aes_dec_64_aes_dec_block_last + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+176] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + cmp r10d, 13 + vbroadcasti128 ymm9, [r8+192] + jl L_AES_XTS_decrypt_vaes_aes_dec_64_aes_dec_block_last + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+208] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r8+224] +L_AES_XTS_decrypt_vaes_aes_dec_64_aes_dec_block_last: + vaesdeclast ymm0, ymm0, ymm9 + vaesdeclast ymm1, ymm1, ymm9 + vpxor ymm0, ymm0, ymm4 + vmovdqu YMMWORD PTR [rdx], ymm0 + vpxor ymm1, ymm1, ymm5 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vextracti128 xmm8, ymm5, 1 + vpshufd xmm9, xmm8, 19 + vpaddq xmm8, xmm8, xmm8 + vpsrad xmm9, xmm9, 31 + vpand xmm9, xmm9, xmm12 + vpxor xmm8, xmm8, xmm9 + add r13d, 64 +L_AES_XTS_decrypt_vaes_done_64: + cmp r13d, eax + mov r11d, eax + je L_AES_XTS_decrypt_vaes_done_dec + and r11d, 4294967280 + cmp r11d, 
eax + je L_AES_XTS_decrypt_vaes_mul16_32 + sub r11d, 16 + sub r11d, r13d + cmp r11d, 16 + jl L_AES_XTS_decrypt_vaes_last_31_start + add r11d, r13d +L_AES_XTS_decrypt_vaes_mul16_32: + and r11d, 4294967264 + cmp r13d, r11d + je L_AES_XTS_decrypt_vaes_done_32 + ; 32 bytes of input + ; aes_dec_32 + lea rcx, QWORD PTR [rdi+r13] + lea rdx, QWORD PTR [rsi+r13] + vmovdqu ymm0, YMMWORD PTR [rcx] + vperm2i128 ymm5, ymm8, ymm8, 0 + vpsrlvq ymm6, ymm5, ymm15 + vpclmulqdq ymm7, ymm6, ymm13, 1 + vpslldq ymm6, ymm6, 8 + vpsllvq ymm4, ymm5, ymm14 + vpxor ymm4, ymm4, ymm7 + vpxor ymm4, ymm4, ymm6 + ; aes_dec_block + vbroadcasti128 ymm9, [r8] + vpxor ymm0, ymm0, ymm4 + vpxor ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+16] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+32] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+48] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+64] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+80] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+96] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+112] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+128] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+144] + vaesdec ymm0, ymm0, ymm9 + cmp r10d, 11 + vbroadcasti128 ymm9, [r8+160] + jl L_AES_XTS_decrypt_vaes_aes_dec_32_aes_dec_block_last + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+176] + vaesdec ymm0, ymm0, ymm9 + cmp r10d, 13 + vbroadcasti128 ymm9, [r8+192] + jl L_AES_XTS_decrypt_vaes_aes_dec_32_aes_dec_block_last + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+208] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r8+224] +L_AES_XTS_decrypt_vaes_aes_dec_32_aes_dec_block_last: + vaesdeclast ymm0, ymm0, ymm9 + vpxor ymm0, ymm0, ymm4 + vmovdqu YMMWORD PTR [rdx], ymm0 + vextracti128 xmm8, ymm4, 1 + vpshufd xmm9, xmm8, 19 + vpaddq xmm8, xmm8, xmm8 + vpsrad xmm9, xmm9, 31 + vpand xmm9, xmm9, xmm12 + vpxor xmm8, xmm8, xmm9 + add r13d, 32 +L_AES_XTS_decrypt_vaes_done_32: + cmp r13d, eax + mov r11d, 
eax + je L_AES_XTS_decrypt_vaes_done_dec + and r11d, 4294967280 + cmp r11d, eax + je L_AES_XTS_decrypt_vaes_mul16 + sub r11d, 16 + sub r11d, r13d + cmp r11d, 16 + jl L_AES_XTS_decrypt_vaes_last_31_start + add r11d, r13d +L_AES_XTS_decrypt_vaes_mul16: +L_AES_XTS_decrypt_vaes_dec_16: + ; 16 bytes of input + lea rcx, QWORD PTR [rdi+r13] + vmovdqu xmm0, OWORD PTR [rcx] + vpxor xmm0, xmm0, xmm8 + ; aes_dec_block + vpxor xmm0, xmm0, [r8] + vmovdqu xmm5, OWORD PTR [r8+16] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+32] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+48] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+64] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+80] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+96] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+112] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+128] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+144] + vaesdec xmm0, xmm0, xmm5 + cmp r10d, 11 + vmovdqu xmm5, OWORD PTR [r8+160] + jl L_AES_XTS_decrypt_vaes_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r8+176] + vaesdec xmm0, xmm0, xmm6 + cmp r10d, 13 + vmovdqu xmm5, OWORD PTR [r8+192] + jl L_AES_XTS_decrypt_vaes_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r8+208] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r8+224] +L_AES_XTS_decrypt_vaes_aes_dec_block_last: + vaesdeclast xmm0, xmm0, xmm5 + vpxor xmm0, xmm0, xmm8 + lea rcx, QWORD PTR [rsi+r13] + vmovdqu OWORD PTR [rcx], xmm0 + vpshufd xmm4, xmm8, 19 + vpaddq xmm8, xmm8, xmm8 + vpsrad xmm4, xmm4, 31 + vpand xmm4, xmm4, xmm12 + vpxor xmm8, xmm8, xmm4 + add r13d, 16 + cmp r13d, r11d + jl L_AES_XTS_decrypt_vaes_dec_16 + cmp r13d, eax + je L_AES_XTS_decrypt_vaes_done_dec +L_AES_XTS_decrypt_vaes_last_31_start: + vpshufd xmm4, xmm8, 19 + vpaddq xmm7, xmm8, xmm8 + vpsrad xmm4, xmm4, 31 + vpand xmm4, xmm4, xmm12 + vpxor xmm7, xmm7, xmm4 + lea rcx, QWORD PTR [rdi+r13] 
+ vmovdqu xmm0, OWORD PTR [rcx] + vpxor xmm0, xmm0, xmm7 + ; aes_dec_block + vpxor xmm0, xmm0, [r8] + vmovdqu xmm5, OWORD PTR [r8+16] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+32] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+48] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+64] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+80] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+96] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+112] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+128] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+144] + vaesdec xmm0, xmm0, xmm5 + cmp r10d, 11 + vmovdqu xmm5, OWORD PTR [r8+160] + jl L_AES_XTS_decrypt_vaes_last_31_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r8+176] + vaesdec xmm0, xmm0, xmm6 + cmp r10d, 13 + vmovdqu xmm5, OWORD PTR [r8+192] + jl L_AES_XTS_decrypt_vaes_last_31_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r8+208] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r8+224] +L_AES_XTS_decrypt_vaes_last_31_aes_dec_block_last: + vaesdeclast xmm0, xmm0, xmm5 + vpxor xmm0, xmm0, xmm7 + vmovdqu OWORD PTR [rsp], xmm0 + add r13, 16 + xor rdx, rdx +L_AES_XTS_decrypt_vaes_last_31_byte_loop: + mov r11b, BYTE PTR [rsp+rdx] + mov cl, BYTE PTR [rdi+r13] + mov BYTE PTR [rsi+r13], r11b + mov BYTE PTR [rsp+rdx], cl + inc r13d + inc edx + cmp r13d, eax + jl L_AES_XTS_decrypt_vaes_last_31_byte_loop + sub r13, rdx + vmovdqu xmm0, OWORD PTR [rsp] + vpxor xmm0, xmm0, xmm8 + ; aes_dec_block + vpxor xmm0, xmm0, [r8] + vmovdqu xmm5, OWORD PTR [r8+16] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+32] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+48] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+64] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+80] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+96] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR 
[r8+112] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+128] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+144] + vaesdec xmm0, xmm0, xmm5 + cmp r10d, 11 + vmovdqu xmm5, OWORD PTR [r8+160] + jl L_AES_XTS_decrypt_vaes_last_31_2_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r8+176] + vaesdec xmm0, xmm0, xmm6 + cmp r10d, 13 + vmovdqu xmm5, OWORD PTR [r8+192] + jl L_AES_XTS_decrypt_vaes_last_31_2_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r8+208] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r8+224] +L_AES_XTS_decrypt_vaes_last_31_2_aes_dec_block_last: + vaesdeclast xmm0, xmm0, xmm5 + vpxor xmm0, xmm0, xmm8 + sub r13, 16 + lea rcx, QWORD PTR [rsi+r13] + vmovdqu OWORD PTR [rcx], xmm0 +L_AES_XTS_decrypt_vaes_done_dec: + vmovdqu xmm6, OWORD PTR [rsp+64] + vmovdqu xmm7, OWORD PTR [rsp+80] + vmovdqu xmm8, OWORD PTR [rsp+96] + vmovdqu xmm9, OWORD PTR [rsp+112] + vmovdqu xmm10, OWORD PTR [rsp+128] + vmovdqu xmm11, OWORD PTR [rsp+144] + vmovdqu xmm12, OWORD PTR [rsp+160] + vmovdqu xmm13, OWORD PTR [rsp+176] + vmovdqu xmm14, OWORD PTR [rsp+192] + vmovdqu xmm15, OWORD PTR [rsp+208] + add rsp, 224 + pop r13 + pop r12 + pop rsi + pop rdi + ret +AES_XTS_decrypt_vaes ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA + ; AES_XTS_decrypt_update_vaes: AES-XTS decryption using 256-bit VAES; the tweak is + ; read from, and written back to, a caller-supplied 16-byte buffer. + ; Arguments as consumed by the prologue below: + ; rcx -> rdi input buffer (blocks are loaded from [rdi+offset]) + ; rdx -> rsi output buffer (blocks are stored to [rsi+offset]) + ; r8 -> rax length in bytes (used as eax) + ; r9 -> r10 round keys, 16 bytes apart + ; 5th -> r8 pointer to the 16-byte tweak (loaded into xmm8, stored back on exit) + ; 6th -> r9d round count; compared with 11 and 13 to choose the final round key + ; xmm6-xmm15 are saved at [rsp+64..] and restored before ret; [rsp..rsp+15] is scratch. +AES_XTS_decrypt_update_vaes PROC + push rdi + push rsi + push r12 + mov rdi, rcx + mov rsi, rdx + mov rax, r8 + mov r10, r9 + mov r8, QWORD PTR [rsp+64] + mov r9d, DWORD PTR [rsp+72] + sub rsp, 224 + vmovdqu OWORD PTR [rsp+64], xmm6 + vmovdqu OWORD PTR [rsp+80], xmm7 + vmovdqu OWORD PTR [rsp+96], xmm8 + vmovdqu OWORD PTR [rsp+112], xmm9 + vmovdqu OWORD PTR [rsp+128], xmm10 + vmovdqu OWORD PTR [rsp+144], xmm11 + vmovdqu OWORD PTR [rsp+160], xmm12 + vmovdqu OWORD PTR [rsp+176], xmm13 + vmovdqu OWORD PTR [rsp+192], xmm14 + vmovdqu OWORD PTR [rsp+208], xmm15 + vmovdqu xmm12, OWORD PTR L_vaes_aes_xts_gc_xts + ; Broadcast the 16-byte polynomial constant itself (same addressing as the gc_xts and + ; shl loads), not the ptr_ QWORD that only holds the constant's address. + vbroadcasti128 ymm13, OWORD PTR L_vaes_aes_xts_poly + vmovdqu ymm14, YMMWORD PTR
L_vaes_aes_xts_shl + vmovdqu ymm15, YMMWORD PTR L_vaes_aes_xts_shr + vmovdqu xmm8, OWORD PTR [r8] + xor r12d, r12d + mov r11d, eax + and r11d, 4294967280 + cmp r11d, eax + je L_AES_XTS_decrypt_update_vaes_mul16_128 + sub r11d, 16 + cmp r11d, 16 + jl L_AES_XTS_decrypt_update_vaes_last_31_start +L_AES_XTS_decrypt_update_vaes_mul16_128: + cmp r11d, 32 + jl L_AES_XTS_decrypt_update_vaes_done_128 + cmp r11d, 128 + jl L_AES_XTS_decrypt_update_vaes_done_128 + and r11d, 4294967168 + vperm2i128 ymm5, ymm8, ymm8, 0 + vpsrlvq ymm6, ymm5, ymm15 + vpclmulqdq ymm7, ymm6, ymm13, 1 + vpslldq ymm6, ymm6, 8 + vpsllvq ymm4, ymm5, ymm14 + vpxor ymm4, ymm4, ymm7 + vpxor ymm4, ymm4, ymm6 + vpsrlq ymm9, ymm4, 62 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm5, ymm4, 2 + vpxor ymm5, ymm5, ymm10 + vpxor ymm5, ymm5, ymm9 + vpsrlq ymm9, ymm5, 62 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm6, ymm5, 2 + vpxor ymm6, ymm6, ymm10 + vpxor ymm6, ymm6, ymm9 + vpsrlq ymm9, ymm6, 62 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm7, ymm6, 2 + vpxor ymm7, ymm7, ymm10 + vpxor ymm7, ymm7, ymm9 +L_AES_XTS_decrypt_update_vaes_dec_128: + ; 128 bytes of input + ; aes_dec_128 + lea rcx, QWORD PTR [rdi+r12] + lea rdx, QWORD PTR [rsi+r12] + vmovdqu ymm0, YMMWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+32] + vmovdqu ymm2, YMMWORD PTR [rcx+64] + vmovdqu ymm3, YMMWORD PTR [rcx+96] + ; aes_dec_block + vbroadcasti128 ymm9, [r10] + vpxor ymm0, ymm0, ymm4 + vpxor ymm0, ymm0, ymm9 + vpxor ymm1, ymm1, ymm5 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm6 + vpxor ymm2, ymm2, ymm9 + vpxor ymm3, ymm3, ymm7 + vpxor ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r10+16] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r10+32] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r10+48] + 
vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r10+64] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r10+80] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r10+96] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r10+112] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r10+128] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r10+144] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + cmp r9d, 11 + vbroadcasti128 ymm9, [r10+160] + jl L_AES_XTS_decrypt_update_vaes_aes_dec_128_aes_dec_block_last + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r10+176] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + cmp r9d, 13 + vbroadcasti128 ymm9, [r10+192] + jl L_AES_XTS_decrypt_update_vaes_aes_dec_128_aes_dec_block_last + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r10+208] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vaesdec ymm2, ymm2, ymm9 + vaesdec ymm3, ymm3, ymm9 + vbroadcasti128 ymm9, [r10+224] +L_AES_XTS_decrypt_update_vaes_aes_dec_128_aes_dec_block_last: + vaesdeclast ymm0, ymm0, ymm9 + vaesdeclast ymm1, ymm1, ymm9 + vaesdeclast ymm2, ymm2, ymm9 + vaesdeclast ymm3, ymm3, ymm9 + vpxor ymm0, ymm0, ymm4 + vmovdqu YMMWORD PTR [rdx], ymm0 + vpsrlq ymm9, ymm4, 56 + vpclmulqdq ymm10, 
ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm4, ymm4, 8 + vpxor ymm4, ymm4, ymm10 + vpxor ymm4, ymm4, ymm9 + vpxor ymm1, ymm1, ymm5 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vpsrlq ymm9, ymm5, 56 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm5, ymm5, 8 + vpxor ymm5, ymm5, ymm10 + vpxor ymm5, ymm5, ymm9 + vpxor ymm2, ymm2, ymm6 + vmovdqu YMMWORD PTR [rdx+64], ymm2 + vpsrlq ymm9, ymm6, 56 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm6, ymm6, 8 + vpxor ymm6, ymm6, ymm10 + vpxor ymm6, ymm6, ymm9 + vpxor ymm3, ymm3, ymm7 + vmovdqu YMMWORD PTR [rdx+96], ymm3 + vpsrlq ymm9, ymm7, 56 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm7, ymm7, 8 + vpxor ymm7, ymm7, ymm10 + vpxor ymm7, ymm7, ymm9 + add r12d, 128 + cmp r12d, r11d + jl L_AES_XTS_decrypt_update_vaes_dec_128 + vextracti128 xmm8, ymm4, 0 +L_AES_XTS_decrypt_update_vaes_done_128: + cmp r12d, eax + mov r11d, eax + je L_AES_XTS_decrypt_update_vaes_done_dec + and r11d, 4294967280 + cmp r11d, eax + je L_AES_XTS_decrypt_update_vaes_mul16_64 + sub r11d, 16 + sub r11d, r12d + cmp r11d, 16 + jl L_AES_XTS_decrypt_update_vaes_last_31_start + add r11d, r12d +L_AES_XTS_decrypt_update_vaes_mul16_64: + and r11d, 4294967232 + cmp r12d, r11d + je L_AES_XTS_decrypt_update_vaes_done_64 + ; 64 bytes of input + ; aes_dec_64 + lea rcx, QWORD PTR [rdi+r12] + lea rdx, QWORD PTR [rsi+r12] + vmovdqu ymm0, YMMWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+32] + vperm2i128 ymm5, ymm8, ymm8, 0 + vpsrlvq ymm6, ymm5, ymm15 + vpclmulqdq ymm7, ymm6, ymm13, 1 + vpslldq ymm6, ymm6, 8 + vpsllvq ymm4, ymm5, ymm14 + vpxor ymm4, ymm4, ymm7 + vpxor ymm4, ymm4, ymm6 + vpsrlq ymm9, ymm4, 62 + vpclmulqdq ymm10, ymm9, ymm13, 1 + vpslldq ymm9, ymm9, 8 + vpsllq ymm5, ymm4, 2 + vpxor ymm5, ymm5, ymm10 + vpxor ymm5, ymm5, ymm9 + ; aes_dec_block + vbroadcasti128 ymm9, [r10] + vpxor ymm0, ymm0, ymm4 + vpxor ymm0, ymm0, ymm9 + vpxor ymm1, ymm1, ymm5 + vpxor ymm1, ymm1, ymm9 + vbroadcasti128 
ymm9, [r10+16] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r10+32] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r10+48] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r10+64] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r10+80] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r10+96] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r10+112] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r10+128] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r10+144] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + cmp r9d, 11 + vbroadcasti128 ymm9, [r10+160] + jl L_AES_XTS_decrypt_update_vaes_aes_dec_64_aes_dec_block_last + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r10+176] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + cmp r9d, 13 + vbroadcasti128 ymm9, [r10+192] + jl L_AES_XTS_decrypt_update_vaes_aes_dec_64_aes_dec_block_last + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r10+208] + vaesdec ymm0, ymm0, ymm9 + vaesdec ymm1, ymm1, ymm9 + vbroadcasti128 ymm9, [r10+224] +L_AES_XTS_decrypt_update_vaes_aes_dec_64_aes_dec_block_last: + vaesdeclast ymm0, ymm0, ymm9 + vaesdeclast ymm1, ymm1, ymm9 + vpxor ymm0, ymm0, ymm4 + vmovdqu YMMWORD PTR [rdx], ymm0 + vpxor ymm1, ymm1, ymm5 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vextracti128 xmm8, ymm5, 1 + vpshufd xmm9, xmm8, 19 + vpaddq xmm8, xmm8, xmm8 + vpsrad xmm9, xmm9, 31 + vpand xmm9, xmm9, xmm12 + vpxor xmm8, xmm8, xmm9 + add r12d, 64 +L_AES_XTS_decrypt_update_vaes_done_64: + cmp r12d, eax + mov r11d, eax + je L_AES_XTS_decrypt_update_vaes_done_dec + and r11d, 4294967280 + cmp r11d, eax + je L_AES_XTS_decrypt_update_vaes_mul16_32 + sub r11d, 16 + sub r11d, r12d + cmp r11d, 16 + jl 
L_AES_XTS_decrypt_update_vaes_last_31_start + add r11d, r12d +L_AES_XTS_decrypt_update_vaes_mul16_32: + and r11d, 4294967264 + cmp r12d, r11d + je L_AES_XTS_decrypt_update_vaes_done_32 + ; 32 bytes of input + ; aes_dec_32 + lea rcx, QWORD PTR [rdi+r12] + lea rdx, QWORD PTR [rsi+r12] + vmovdqu ymm0, YMMWORD PTR [rcx] + vperm2i128 ymm5, ymm8, ymm8, 0 + vpsrlvq ymm6, ymm5, ymm15 + vpclmulqdq ymm7, ymm6, ymm13, 1 + vpslldq ymm6, ymm6, 8 + vpsllvq ymm4, ymm5, ymm14 + vpxor ymm4, ymm4, ymm7 + vpxor ymm4, ymm4, ymm6 + ; aes_dec_block + vbroadcasti128 ymm9, [r10] + vpxor ymm0, ymm0, ymm4 + vpxor ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r10+16] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r10+32] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r10+48] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r10+64] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r10+80] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r10+96] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r10+112] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r10+128] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r10+144] + vaesdec ymm0, ymm0, ymm9 + cmp r9d, 11 + vbroadcasti128 ymm9, [r10+160] + jl L_AES_XTS_decrypt_update_vaes_aes_dec_32_aes_dec_block_last + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r10+176] + vaesdec ymm0, ymm0, ymm9 + cmp r9d, 13 + vbroadcasti128 ymm9, [r10+192] + jl L_AES_XTS_decrypt_update_vaes_aes_dec_32_aes_dec_block_last + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r10+208] + vaesdec ymm0, ymm0, ymm9 + vbroadcasti128 ymm9, [r10+224] +L_AES_XTS_decrypt_update_vaes_aes_dec_32_aes_dec_block_last: + vaesdeclast ymm0, ymm0, ymm9 + vpxor ymm0, ymm0, ymm4 + vmovdqu YMMWORD PTR [rdx], ymm0 + vextracti128 xmm8, ymm4, 1 + vpshufd xmm9, xmm8, 19 + vpaddq xmm8, xmm8, xmm8 + vpsrad xmm9, xmm9, 31 + vpand xmm9, xmm9, xmm12 + vpxor xmm8, xmm8, xmm9 + add r12d, 32 +L_AES_XTS_decrypt_update_vaes_done_32: + cmp r12d, eax + mov r11d, eax + je 
L_AES_XTS_decrypt_update_vaes_done_dec + and r11d, 4294967280 + cmp r11d, eax + je L_AES_XTS_decrypt_update_vaes_mul16 + sub r11d, 16 + sub r11d, r12d + cmp r11d, 16 + jl L_AES_XTS_decrypt_update_vaes_last_31_start + add r11d, r12d +L_AES_XTS_decrypt_update_vaes_mul16: +L_AES_XTS_decrypt_update_vaes_dec_16: + ; 16 bytes of input + lea rcx, QWORD PTR [rdi+r12] + vmovdqu xmm0, OWORD PTR [rcx] + vpxor xmm0, xmm0, xmm8 + ; aes_dec_block + vpxor xmm0, xmm0, [r10] + vmovdqu xmm5, OWORD PTR [r10+16] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+32] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+48] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+64] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+80] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+96] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+112] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+128] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+144] + vaesdec xmm0, xmm0, xmm5 + cmp r9d, 11 + vmovdqu xmm5, OWORD PTR [r10+160] + jl L_AES_XTS_decrypt_update_vaes_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r10+176] + vaesdec xmm0, xmm0, xmm6 + cmp r9d, 13 + vmovdqu xmm5, OWORD PTR [r10+192] + jl L_AES_XTS_decrypt_update_vaes_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r10+208] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r10+224] +L_AES_XTS_decrypt_update_vaes_aes_dec_block_last: + vaesdeclast xmm0, xmm0, xmm5 + vpxor xmm0, xmm0, xmm8 + lea rcx, QWORD PTR [rsi+r12] + vmovdqu OWORD PTR [rcx], xmm0 + vpshufd xmm4, xmm8, 19 + vpaddq xmm8, xmm8, xmm8 + vpsrad xmm4, xmm4, 31 + vpand xmm4, xmm4, xmm12 + vpxor xmm8, xmm8, xmm4 + add r12d, 16 + cmp r12d, r11d + jl L_AES_XTS_decrypt_update_vaes_dec_16 + cmp r12d, eax + je L_AES_XTS_decrypt_update_vaes_done_dec +L_AES_XTS_decrypt_update_vaes_last_31_start: + vpshufd xmm4, xmm8, 19 + vpaddq xmm7, xmm8, xmm8 + vpsrad xmm4, xmm4, 31 
+ vpand xmm4, xmm4, xmm12 + vpxor xmm7, xmm7, xmm4 + lea rcx, QWORD PTR [rdi+r12] + vmovdqu xmm0, OWORD PTR [rcx] + vpxor xmm0, xmm0, xmm7 + ; aes_dec_block + vpxor xmm0, xmm0, [r10] + vmovdqu xmm5, OWORD PTR [r10+16] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+32] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+48] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+64] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+80] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+96] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+112] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+128] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+144] + vaesdec xmm0, xmm0, xmm5 + cmp r9d, 11 + vmovdqu xmm5, OWORD PTR [r10+160] + jl L_AES_XTS_decrypt_update_vaes_last_31_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r10+176] + vaesdec xmm0, xmm0, xmm6 + cmp r9d, 13 + vmovdqu xmm5, OWORD PTR [r10+192] + jl L_AES_XTS_decrypt_update_vaes_last_31_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r10+208] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r10+224] +L_AES_XTS_decrypt_update_vaes_last_31_aes_dec_block_last: + vaesdeclast xmm0, xmm0, xmm5 + vpxor xmm0, xmm0, xmm7 + vmovdqu OWORD PTR [rsp], xmm0 + add r12, 16 + xor rdx, rdx +L_AES_XTS_decrypt_update_vaes_last_31_byte_loop: + mov r11b, BYTE PTR [rsp+rdx] + mov cl, BYTE PTR [rdi+r12] + mov BYTE PTR [rsi+r12], r11b + mov BYTE PTR [rsp+rdx], cl + inc r12d + inc edx + cmp r12d, eax + jl L_AES_XTS_decrypt_update_vaes_last_31_byte_loop + sub r12, rdx + vmovdqu xmm0, OWORD PTR [rsp] + vpxor xmm0, xmm0, xmm8 + ; aes_dec_block + vpxor xmm0, xmm0, [r10] + vmovdqu xmm5, OWORD PTR [r10+16] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+32] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+48] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+64] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, 
OWORD PTR [r10+80] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+96] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+112] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+128] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+144] + vaesdec xmm0, xmm0, xmm5 + cmp r9d, 11 + vmovdqu xmm5, OWORD PTR [r10+160] + jl L_AES_XTS_decrypt_update_vaes_last_31_2_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r10+176] + vaesdec xmm0, xmm0, xmm6 + cmp r9d, 13 + vmovdqu xmm5, OWORD PTR [r10+192] + jl L_AES_XTS_decrypt_update_vaes_last_31_2_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r10+208] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r10+224] +L_AES_XTS_decrypt_update_vaes_last_31_2_aes_dec_block_last: + vaesdeclast xmm0, xmm0, xmm5 + vpxor xmm0, xmm0, xmm8 + sub r12, 16 + lea rcx, QWORD PTR [rsi+r12] + vmovdqu OWORD PTR [rcx], xmm0 +L_AES_XTS_decrypt_update_vaes_done_dec: + ; store xmm8 through r8 before the saved xmm8 is restored below + vmovdqu OWORD PTR [r8], xmm8 + vmovdqu xmm6, OWORD PTR [rsp+64] + vmovdqu xmm7, OWORD PTR [rsp+80] + vmovdqu xmm8, OWORD PTR [rsp+96] + vmovdqu xmm9, OWORD PTR [rsp+112] + vmovdqu xmm10, OWORD PTR [rsp+128] + vmovdqu xmm11, OWORD PTR [rsp+144] + vmovdqu xmm12, OWORD PTR [rsp+160] + vmovdqu xmm13, OWORD PTR [rsp+176] + vmovdqu xmm14, OWORD PTR [rsp+192] + vmovdqu xmm15, OWORD PTR [rsp+208] + add rsp, 224 + pop r12 + pop rsi + pop rdi + ret +AES_XTS_decrypt_update_vaes ENDP +_TEXT ENDS +ENDIF +IFDEF HAVE_INTEL_AVX512 +_TEXT SEGMENT READONLY PARA + ; AES_XTS_init_avx512: AES-encrypts the 16-byte block at [rcx] in place (the local + ; label calls it the tweak). rdx points at the round keys, 16 bytes apart. + ; r8d is the round count: below 11 the final key is at rdx+160, below 13 at rdx+192, + ; otherwise at rdx+224. Only xmm0, xmm2 and xmm3 are written; no stack is used. +AES_XTS_init_avx512 PROC + vmovdqu xmm0, OWORD PTR [rcx] + ; aes_enc_block + vpxor xmm0, xmm0, [rdx] + vmovdqu xmm2, OWORD PTR [rdx+16] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rdx+32] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rdx+48] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rdx+64] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rdx+80] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rdx+96] + vaesenc xmm0, xmm0,
xmm2 + vmovdqu xmm2, OWORD PTR [rdx+112] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rdx+128] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rdx+144] + vaesenc xmm0, xmm0, xmm2 + cmp r8d, 11 + vmovdqu xmm2, OWORD PTR [rdx+160] + jl L_AES_XTS_init_avx512_tweak_aes_enc_block_last + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm3, OWORD PTR [rdx+176] + vaesenc xmm0, xmm0, xmm3 + cmp r8d, 13 + vmovdqu xmm2, OWORD PTR [rdx+192] + jl L_AES_XTS_init_avx512_tweak_aes_enc_block_last + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm3, OWORD PTR [rdx+208] + vaesenc xmm0, xmm0, xmm3 + vmovdqu xmm2, OWORD PTR [rdx+224] +L_AES_XTS_init_avx512_tweak_aes_enc_block_last: + ; xmm2 holds whichever round key was loaded last before reaching this label + vaesenclast xmm0, xmm0, xmm2 + vmovdqu OWORD PTR [rcx], xmm0 + ret +AES_XTS_init_avx512 ENDP +_TEXT ENDS +_DATA SEGMENT + ; Mask for doubling a single 128-bit tweak: AES_XTS_encrypt_avx512 loads it into xmm12 + ; and ANDs it with the sign-spread top bits (87h in dword 0, 1 in dword 2). +ALIGN 16 +L_avx512_aes_xts_gc_xts DWORD \ + 00000087h, 00000000h, 00000001h, 00000000h +ptr_L_avx512_aes_xts_gc_xts QWORD L_avx512_aes_xts_gc_xts +_DATA ENDS +_DATA SEGMENT + ; 87h in the low qword: the vpclmulqdq multiplier (zmm13) applied to bits shifted out + ; of a tweak. + ; NOTE(review): AES_XTS_encrypt_avx512 names ptr_L_avx512_aes_xts_poly, _shl and _shr + ; (the QWORD address cells) directly as vector load operands - confirm the L_ tables + ; themselves were intended. +ALIGN 16 +L_avx512_aes_xts_poly DWORD \ + 00000087h, 00000000h, 00000000h, 00000000h +ptr_L_avx512_aes_xts_poly QWORD L_avx512_aes_xts_poly +_DATA ENDS +_DATA SEGMENT + ; Per-128-bit-lane left shift counts 0, 1, 2, 3 (one count per qword), used with vpsllvq. +ALIGN 16 +L_avx512_aes_xts_shl DWORD \ + 00000000h, 00000000h, 00000000h, 00000000h, + 00000001h, 00000000h, 00000001h, 00000000h, + 00000002h, 00000000h, 00000002h, 00000000h, + 00000003h, 00000000h, 00000003h, 00000000h +ptr_L_avx512_aes_xts_shl QWORD L_avx512_aes_xts_shl +_DATA ENDS +_DATA SEGMENT + ; Matching right shift counts 64, 63, 62, 61 (64 minus the left shift), used with vpsrlvq. +ALIGN 16 +L_avx512_aes_xts_shr DWORD \ + 00000040h, 00000000h, 00000040h, 00000000h, + 0000003fh, 00000000h, 0000003fh, 00000000h, + 0000003eh, 00000000h, 0000003eh, 00000000h, + 0000003dh, 00000000h, 0000003dh, 00000000h +ptr_L_avx512_aes_xts_shr QWORD L_avx512_aes_xts_shr +_DATA ENDS +_TEXT SEGMENT READONLY PARA +AES_XTS_encrypt_avx512 PROC + push rdi + push rsi + push r12 + push r13 + mov rdi, rcx + mov rsi, rdx + mov rax, r8 + mov r12, r9 + mov r8, QWORD PTR [rsp+72] + mov r9, QWORD PTR [rsp+80] + mov r10d, DWORD PTR [rsp+88] + sub rsp, 224 + vmovdqu OWORD
PTR [rsp+64], xmm6 + vmovdqu OWORD PTR [rsp+80], xmm7 + vmovdqu OWORD PTR [rsp+96], xmm8 + vmovdqu OWORD PTR [rsp+112], xmm9 + vmovdqu OWORD PTR [rsp+128], xmm10 + vmovdqu OWORD PTR [rsp+144], xmm11 + vmovdqu OWORD PTR [rsp+160], xmm12 + vmovdqu OWORD PTR [rsp+176], xmm13 + vmovdqu OWORD PTR [rsp+192], xmm14 + vmovdqu OWORD PTR [rsp+208], xmm15 + vmovdqu xmm12, OWORD PTR L_avx512_aes_xts_gc_xts + vbroadcasti32x4 zmm13, ptr_L_avx512_aes_xts_poly + vmovdqu64 zmm14, ptr_L_avx512_aes_xts_shl + vmovdqu64 zmm15, ptr_L_avx512_aes_xts_shr + vmovdqu xmm8, OWORD PTR [r12] + ; aes_enc_block + vpxor xmm8, xmm8, [r9] + vmovdqu xmm5, OWORD PTR [r9+16] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+32] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+48] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+64] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+80] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+96] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+112] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+128] + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm5, OWORD PTR [r9+144] + vaesenc xmm8, xmm8, xmm5 + cmp r10d, 11 + vmovdqu xmm5, OWORD PTR [r9+160] + jl L_AES_XTS_encrypt_avx512_tweak_aes_enc_block_last + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm6, OWORD PTR [r9+176] + vaesenc xmm8, xmm8, xmm6 + cmp r10d, 13 + vmovdqu xmm5, OWORD PTR [r9+192] + jl L_AES_XTS_encrypt_avx512_tweak_aes_enc_block_last + vaesenc xmm8, xmm8, xmm5 + vmovdqu xmm6, OWORD PTR [r9+208] + vaesenc xmm8, xmm8, xmm6 + vmovdqu xmm5, OWORD PTR [r9+224] +L_AES_XTS_encrypt_avx512_tweak_aes_enc_block_last: + vaesenclast xmm8, xmm8, xmm5 + xor r13d, r13d + cmp eax, 32 + jl L_AES_XTS_encrypt_avx512_done_128 + vbroadcasti32x4 zmm16, [r8] + vbroadcasti32x4 zmm17, [r8+16] + vbroadcasti32x4 zmm18, [r8+32] + vbroadcasti32x4 zmm19, [r8+48] + vbroadcasti32x4 zmm20, [r8+64] + vbroadcasti32x4 zmm21, [r8+80] + vbroadcasti32x4 zmm22, [r8+96] + vbroadcasti32x4 zmm23, 
[r8+112] + vbroadcasti32x4 zmm24, [r8+128] + vbroadcasti32x4 zmm25, [r8+144] + vbroadcasti32x4 zmm26, [r8+160] + cmp r10d, 11 + jl L_AES_XTS_encrypt_avx512_key_cached + vbroadcasti32x4 zmm27, [r8+176] + vbroadcasti32x4 zmm28, [r8+192] + cmp r10d, 13 + jl L_AES_XTS_encrypt_avx512_key_cached + vbroadcasti32x4 zmm29, [r8+208] + vbroadcasti32x4 zmm30, [r8+224] +L_AES_XTS_encrypt_avx512_key_cached: + cmp eax, 256 + mov r11d, eax + jl L_AES_XTS_encrypt_avx512_done_256 + and r11d, 4294967040 + vshufi64x2 zmm5, zmm8, zmm8, 0 + vpsrlvq zmm6, zmm5, zmm15 + vpclmulqdq zmm7, zmm6, zmm13, 1 + vpslldq zmm6, zmm6, 8 + vpsllvq zmm4, zmm5, zmm14 + vpternlogq zmm4, zmm7, zmm6, 150 + vpsrlq zmm9, zmm4, 60 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm5, zmm4, 4 + vpternlogq zmm5, zmm10, zmm9, 150 + vpsrlq zmm9, zmm5, 60 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm6, zmm5, 4 + vpternlogq zmm6, zmm10, zmm9, 150 + vpsrlq zmm9, zmm6, 60 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm7, zmm6, 4 + vpternlogq zmm7, zmm10, zmm9, 150 +L_AES_XTS_encrypt_avx512_enc_256: + ; 256 bytes of input + ; aes_enc_256 + lea rcx, QWORD PTR [rdi+r13] + lea rdx, QWORD PTR [rsi+r13] + vmovdqu64 zmm0, [rcx] + vmovdqu64 zmm1, [rcx+64] + vmovdqu64 zmm2, [rcx+128] + vmovdqu64 zmm3, [rcx+192] + ; aes_enc_block + vpternlogq zmm0, zmm16, zmm4, 150 + vpternlogq zmm1, zmm16, zmm5, 150 + vpternlogq zmm2, zmm16, zmm6, 150 + vpternlogq zmm3, zmm16, zmm7, 150 + vaesenc zmm0, zmm0, zmm17 + vaesenc zmm1, zmm1, zmm17 + vaesenc zmm2, zmm2, zmm17 + vaesenc zmm3, zmm3, zmm17 + vaesenc zmm0, zmm0, zmm18 + vaesenc zmm1, zmm1, zmm18 + vaesenc zmm2, zmm2, zmm18 + vaesenc zmm3, zmm3, zmm18 + vaesenc zmm0, zmm0, zmm19 + vaesenc zmm1, zmm1, zmm19 + vaesenc zmm2, zmm2, zmm19 + vaesenc zmm3, zmm3, zmm19 + vaesenc zmm0, zmm0, zmm20 + vaesenc zmm1, zmm1, zmm20 + vaesenc zmm2, zmm2, zmm20 + vaesenc zmm3, zmm3, zmm20 + vaesenc zmm0, zmm0, zmm21 + vaesenc zmm1, 
zmm1, zmm21 + vaesenc zmm2, zmm2, zmm21 + vaesenc zmm3, zmm3, zmm21 + vaesenc zmm0, zmm0, zmm22 + vaesenc zmm1, zmm1, zmm22 + vaesenc zmm2, zmm2, zmm22 + vaesenc zmm3, zmm3, zmm22 + vaesenc zmm0, zmm0, zmm23 + vaesenc zmm1, zmm1, zmm23 + vaesenc zmm2, zmm2, zmm23 + vaesenc zmm3, zmm3, zmm23 + vaesenc zmm0, zmm0, zmm24 + vaesenc zmm1, zmm1, zmm24 + vaesenc zmm2, zmm2, zmm24 + vaesenc zmm3, zmm3, zmm24 + vaesenc zmm0, zmm0, zmm25 + vaesenc zmm1, zmm1, zmm25 + vaesenc zmm2, zmm2, zmm25 + vaesenc zmm3, zmm3, zmm25 + cmp r10d, 11 + vmovdqa64 zmm9, zmm26 + jl L_AES_XTS_encrypt_avx512_aes_enc_256_aes_enc_block_last + vaesenc zmm0, zmm0, zmm26 + vaesenc zmm1, zmm1, zmm26 + vaesenc zmm2, zmm2, zmm26 + vaesenc zmm3, zmm3, zmm26 + vaesenc zmm0, zmm0, zmm27 + vaesenc zmm1, zmm1, zmm27 + vaesenc zmm2, zmm2, zmm27 + vaesenc zmm3, zmm3, zmm27 + cmp r10d, 13 + vmovdqa64 zmm9, zmm28 + jl L_AES_XTS_encrypt_avx512_aes_enc_256_aes_enc_block_last + vaesenc zmm0, zmm0, zmm28 + vaesenc zmm1, zmm1, zmm28 + vaesenc zmm2, zmm2, zmm28 + vaesenc zmm3, zmm3, zmm28 + vaesenc zmm0, zmm0, zmm29 + vaesenc zmm1, zmm1, zmm29 + vaesenc zmm2, zmm2, zmm29 + vaesenc zmm3, zmm3, zmm29 + vmovdqa64 zmm9, zmm30 +L_AES_XTS_encrypt_avx512_aes_enc_256_aes_enc_block_last: + vaesenclast zmm0, zmm0, zmm9 + vaesenclast zmm1, zmm1, zmm9 + vaesenclast zmm2, zmm2, zmm9 + vaesenclast zmm3, zmm3, zmm9 + vpxorq zmm0, zmm0, zmm4 + vmovdqu64 [rdx], zmm0 + vpsrlq zmm9, zmm4, 48 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm4, zmm4, 16 + vpternlogq zmm4, zmm10, zmm9, 150 + vpxorq zmm1, zmm1, zmm5 + vmovdqu64 [rdx+64], zmm1 + vpsrlq zmm9, zmm5, 48 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm5, zmm5, 16 + vpternlogq zmm5, zmm10, zmm9, 150 + vpxorq zmm2, zmm2, zmm6 + vmovdqu64 [rdx+128], zmm2 + vpsrlq zmm9, zmm6, 48 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm6, zmm6, 16 + vpternlogq zmm6, zmm10, zmm9, 150 + vpxorq zmm3, zmm3, zmm7 + vmovdqu64 
[rdx+192], zmm3 + vpsrlq zmm9, zmm7, 48 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm7, zmm7, 16 + vpternlogq zmm7, zmm10, zmm9, 150 + add r13d, 256 + cmp r13d, r11d + jl L_AES_XTS_encrypt_avx512_enc_256 + vextracti32x4 xmm8, zmm4, 0 +L_AES_XTS_encrypt_avx512_done_256: + mov r11d, eax + and r11d, 4294967168 + cmp r13d, r11d + je L_AES_XTS_encrypt_avx512_done_128 + ; 128 bytes of input + ; aes_enc_128 + lea rcx, QWORD PTR [rdi+r13] + lea rdx, QWORD PTR [rsi+r13] + vmovdqu64 zmm0, [rcx] + vmovdqu64 zmm1, [rcx+64] + vshufi64x2 zmm5, zmm8, zmm8, 0 + vpsrlvq zmm6, zmm5, zmm15 + vpclmulqdq zmm7, zmm6, zmm13, 1 + vpslldq zmm6, zmm6, 8 + vpsllvq zmm4, zmm5, zmm14 + vpternlogq zmm4, zmm7, zmm6, 150 + vpsrlq zmm9, zmm4, 60 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm5, zmm4, 4 + vpternlogq zmm5, zmm10, zmm9, 150 + ; aes_enc_block + vpternlogq zmm0, zmm16, zmm4, 150 + vpternlogq zmm1, zmm16, zmm5, 150 + vaesenc zmm0, zmm0, zmm17 + vaesenc zmm1, zmm1, zmm17 + vaesenc zmm0, zmm0, zmm18 + vaesenc zmm1, zmm1, zmm18 + vaesenc zmm0, zmm0, zmm19 + vaesenc zmm1, zmm1, zmm19 + vaesenc zmm0, zmm0, zmm20 + vaesenc zmm1, zmm1, zmm20 + vaesenc zmm0, zmm0, zmm21 + vaesenc zmm1, zmm1, zmm21 + vaesenc zmm0, zmm0, zmm22 + vaesenc zmm1, zmm1, zmm22 + vaesenc zmm0, zmm0, zmm23 + vaesenc zmm1, zmm1, zmm23 + vaesenc zmm0, zmm0, zmm24 + vaesenc zmm1, zmm1, zmm24 + vaesenc zmm0, zmm0, zmm25 + vaesenc zmm1, zmm1, zmm25 + cmp r10d, 11 + vmovdqa64 zmm9, zmm26 + jl L_AES_XTS_encrypt_avx512_aes_enc_128_aes_enc_block_last + vaesenc zmm0, zmm0, zmm26 + vaesenc zmm1, zmm1, zmm26 + vaesenc zmm0, zmm0, zmm27 + vaesenc zmm1, zmm1, zmm27 + cmp r10d, 13 + vmovdqa64 zmm9, zmm28 + jl L_AES_XTS_encrypt_avx512_aes_enc_128_aes_enc_block_last + vaesenc zmm0, zmm0, zmm28 + vaesenc zmm1, zmm1, zmm28 + vaesenc zmm0, zmm0, zmm29 + vaesenc zmm1, zmm1, zmm29 + vmovdqa64 zmm9, zmm30 +L_AES_XTS_encrypt_avx512_aes_enc_128_aes_enc_block_last: + vaesenclast zmm0, zmm0, zmm9 + 
vaesenclast zmm1, zmm1, zmm9 + vpxorq zmm0, zmm0, zmm4 + vmovdqu64 [rdx], zmm0 + vpxorq zmm1, zmm1, zmm5 + vmovdqu64 [rdx+64], zmm1 + vextracti32x4 xmm8, zmm5, 3 + vpshufd xmm9, xmm8, 19 + vpaddq xmm8, xmm8, xmm8 + vpsrad xmm9, xmm9, 31 + vpternlogd xmm8, xmm9, xmm12, 120 + add r13d, 128 +L_AES_XTS_encrypt_avx512_done_128: + mov r11d, eax + and r11d, 4294967232 + cmp r13d, r11d + je L_AES_XTS_encrypt_avx512_done_64 + ; 64 bytes of input + ; aes_enc_64 + lea rcx, QWORD PTR [rdi+r13] + lea rdx, QWORD PTR [rsi+r13] + vmovdqu64 zmm0, [rcx] + vshufi64x2 zmm5, zmm8, zmm8, 0 + vpsrlvq zmm6, zmm5, zmm15 + vpclmulqdq zmm7, zmm6, zmm13, 1 + vpslldq zmm6, zmm6, 8 + vpsllvq zmm4, zmm5, zmm14 + vpternlogq zmm4, zmm7, zmm6, 150 + ; aes_enc_block + vpternlogq zmm0, zmm16, zmm4, 150 + vaesenc zmm0, zmm0, zmm17 + vaesenc zmm0, zmm0, zmm18 + vaesenc zmm0, zmm0, zmm19 + vaesenc zmm0, zmm0, zmm20 + vaesenc zmm0, zmm0, zmm21 + vaesenc zmm0, zmm0, zmm22 + vaesenc zmm0, zmm0, zmm23 + vaesenc zmm0, zmm0, zmm24 + vaesenc zmm0, zmm0, zmm25 + cmp r10d, 11 + vmovdqa64 zmm9, zmm26 + jl L_AES_XTS_encrypt_avx512_aes_enc_64_aes_enc_block_last + vaesenc zmm0, zmm0, zmm26 + vaesenc zmm0, zmm0, zmm27 + cmp r10d, 13 + vmovdqa64 zmm9, zmm28 + jl L_AES_XTS_encrypt_avx512_aes_enc_64_aes_enc_block_last + vaesenc zmm0, zmm0, zmm28 + vaesenc zmm0, zmm0, zmm29 + vmovdqa64 zmm9, zmm30 +L_AES_XTS_encrypt_avx512_aes_enc_64_aes_enc_block_last: + vaesenclast zmm0, zmm0, zmm9 + vpxorq zmm0, zmm0, zmm4 + vmovdqu64 [rdx], zmm0 + vextracti32x4 xmm8, zmm4, 3 + vpshufd xmm9, xmm8, 19 + vpaddq xmm8, xmm8, xmm8 + vpsrad xmm9, xmm9, 31 + vpternlogd xmm8, xmm9, xmm12, 120 + add r13d, 64 +L_AES_XTS_encrypt_avx512_done_64: + mov r11d, eax + and r11d, 4294967264 + cmp r13d, r11d + je L_AES_XTS_encrypt_avx512_done_32 + ; 32 bytes of input + ; aes_enc_32 + lea rcx, QWORD PTR [rdi+r13] + lea rdx, QWORD PTR [rsi+r13] + vmovdqu64 ymm0, [rcx] + vshufi64x2 zmm5, zmm8, zmm8, 0 + vpsrlvq zmm6, zmm5, zmm15 + vpclmulqdq zmm7, zmm6, 
zmm13, 1 + vpslldq zmm6, zmm6, 8 + vpsllvq zmm4, zmm5, zmm14 + vpternlogq zmm4, zmm7, zmm6, 150 + ; aes_enc_block + vpternlogq ymm0, ymm16, ymm4, 150 + vaesenc ymm0, ymm0, ymm17 + vaesenc ymm0, ymm0, ymm18 + vaesenc ymm0, ymm0, ymm19 + vaesenc ymm0, ymm0, ymm20 + vaesenc ymm0, ymm0, ymm21 + vaesenc ymm0, ymm0, ymm22 + vaesenc ymm0, ymm0, ymm23 + vaesenc ymm0, ymm0, ymm24 + vaesenc ymm0, ymm0, ymm25 + cmp r10d, 11 + vmovdqa64 ymm9, ymm26 + jl L_AES_XTS_encrypt_avx512_aes_enc_32_aes_enc_block_last + vaesenc ymm0, ymm0, ymm26 + vaesenc ymm0, ymm0, ymm27 + cmp r10d, 13 + vmovdqa64 ymm9, ymm28 + jl L_AES_XTS_encrypt_avx512_aes_enc_32_aes_enc_block_last + vaesenc ymm0, ymm0, ymm28 + vaesenc ymm0, ymm0, ymm29 + vmovdqa64 ymm9, ymm30 +L_AES_XTS_encrypt_avx512_aes_enc_32_aes_enc_block_last: + vaesenclast ymm0, ymm0, ymm9 + vpxorq ymm0, ymm0, ymm4 + vmovdqu64 [rdx], ymm0 + vextracti32x4 xmm8, zmm4, 2 + add r13d, 32 +L_AES_XTS_encrypt_avx512_done_32: + cmp r13d, eax + mov r11d, eax + je L_AES_XTS_encrypt_avx512_done_enc + sub r11d, r13d + cmp r11d, 16 + mov r11d, eax + jl L_AES_XTS_encrypt_avx512_last_15 + and r11d, 4294967280 + ; 16 bytes of input +L_AES_XTS_encrypt_avx512_enc_16: + lea rcx, QWORD PTR [rdi+r13] + vmovdqu xmm0, OWORD PTR [rcx] + vpxor xmm0, xmm0, xmm8 + ; aes_enc_block + vpxor xmm0, xmm0, [r8] + vmovdqu xmm5, OWORD PTR [r8+16] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+32] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+48] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+64] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+80] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+96] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+112] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+128] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+144] + vaesenc xmm0, xmm0, xmm5 + cmp r10d, 11 + vmovdqu xmm5, OWORD PTR [r8+160] + jl L_AES_XTS_encrypt_avx512_aes_enc_block_last + vaesenc xmm0, xmm0, xmm5 
+ vmovdqu xmm6, OWORD PTR [r8+176] + vaesenc xmm0, xmm0, xmm6 + cmp r10d, 13 + vmovdqu xmm5, OWORD PTR [r8+192] + jl L_AES_XTS_encrypt_avx512_aes_enc_block_last + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r8+208] + vaesenc xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r8+224] +L_AES_XTS_encrypt_avx512_aes_enc_block_last: + vaesenclast xmm0, xmm0, xmm5 + vpxor xmm0, xmm0, xmm8 + lea rcx, QWORD PTR [rsi+r13] + vmovdqu OWORD PTR [rcx], xmm0 + vpshufd xmm4, xmm8, 19 + vpaddq xmm8, xmm8, xmm8 + vpsrad xmm4, xmm4, 31 + vpternlogd xmm8, xmm4, xmm12, 120 + add r13d, 16 + cmp r13d, r11d + jl L_AES_XTS_encrypt_avx512_enc_16 + cmp r13d, eax + je L_AES_XTS_encrypt_avx512_done_enc +L_AES_XTS_encrypt_avx512_last_15: + sub r13, 16 + lea rcx, QWORD PTR [rsi+r13] + vmovdqu xmm0, OWORD PTR [rcx] + add r13, 16 + vmovdqu OWORD PTR [rsp], xmm0 + xor rdx, rdx +L_AES_XTS_encrypt_avx512_last_15_byte_loop: + mov r11b, BYTE PTR [rsp+rdx] + mov cl, BYTE PTR [rdi+r13] + mov BYTE PTR [rsi+r13], r11b + mov BYTE PTR [rsp+rdx], cl + inc r13d + inc edx + cmp r13d, eax + jl L_AES_XTS_encrypt_avx512_last_15_byte_loop + sub r13, rdx + vmovdqu xmm0, OWORD PTR [rsp] + sub r13, 16 + vpxor xmm0, xmm0, xmm8 + ; aes_enc_block + vpxor xmm0, xmm0, [r8] + vmovdqu xmm5, OWORD PTR [r8+16] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+32] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+48] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+64] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+80] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+96] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+112] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+128] + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r8+144] + vaesenc xmm0, xmm0, xmm5 + cmp r10d, 11 + vmovdqu xmm5, OWORD PTR [r8+160] + jl L_AES_XTS_encrypt_avx512_last_15_aes_enc_block_last + vaesenc xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r8+176] + vaesenc xmm0, xmm0, xmm6 + cmp r10d, 13 
+        vmovdqu xmm5, OWORD PTR [r8+192]
+        jl L_AES_XTS_encrypt_avx512_last_15_aes_enc_block_last
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [r8+208]
+        vaesenc xmm0, xmm0, xmm6
+        vmovdqu xmm5, OWORD PTR [r8+224]
+L_AES_XTS_encrypt_avx512_last_15_aes_enc_block_last:
+        vaesenclast xmm0, xmm0, xmm5
+        vpxor xmm0, xmm0, xmm8
+        lea rcx, QWORD PTR [rsi+r13]
+        vmovdqu OWORD PTR [rcx], xmm0
+L_AES_XTS_encrypt_avx512_done_enc:
+        vmovdqu xmm6, OWORD PTR [rsp+64]
+        vmovdqu xmm7, OWORD PTR [rsp+80]
+        vmovdqu xmm8, OWORD PTR [rsp+96]
+        vmovdqu xmm9, OWORD PTR [rsp+112]
+        vmovdqu xmm10, OWORD PTR [rsp+128]
+        vmovdqu xmm11, OWORD PTR [rsp+144]
+        vmovdqu xmm12, OWORD PTR [rsp+160]
+        vmovdqu xmm13, OWORD PTR [rsp+176]
+        vmovdqu xmm14, OWORD PTR [rsp+192]
+        vmovdqu xmm15, OWORD PTR [rsp+208]
+        add rsp, 224
+        pop r13
+        pop r12
+        pop rsi
+        pop rdi
+        ret
+AES_XTS_encrypt_avx512 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_XTS_encrypt_update_avx512 PROC  ; AES-XTS encrypt of eax bytes starting from a caller-held tweak; args arrive in rcx, rdx, r8, r9 and two stack slots
+        push rdi  ; rdi, rsi and r12 are overwritten below and popped again before ret
+        push rsi
+        push r12
+        mov rdi, rcx  ; arg1 -> rdi: source pointer (read as [rdi+r12])
+        mov rsi, rdx  ; arg2 -> rsi: destination pointer (written as [rsi+r12])
+        mov rax, r8  ; arg3 -> rax: byte count (only eax is ever compared)
+        mov r10, r9  ; arg4 -> r10: round-key table, one 16-byte key per 16-byte step
+        mov r8, QWORD PTR [rsp+64]  ; arg5: pointer to the 16-byte tweak; +64 = 3 pushes + return address + 32-byte shadow area
+        mov r9d, DWORD PTR [rsp+72]  ; arg6: round count, compared against 11 and 13 to select the final round key
+        sub rsp, 224  ; [rsp..rsp+15] is scratch for the partial-block path; xmm6-xmm15 are spilled from [rsp+64] up
+        vmovdqu OWORD PTR [rsp+64], xmm6
+        vmovdqu OWORD PTR [rsp+80], xmm7
+        vmovdqu OWORD PTR [rsp+96], xmm8
+        vmovdqu OWORD PTR [rsp+112], xmm9
+        vmovdqu OWORD PTR [rsp+128], xmm10
+        vmovdqu OWORD PTR [rsp+144], xmm11
+        vmovdqu OWORD PTR [rsp+160], xmm12
+        vmovdqu OWORD PTR [rsp+176], xmm13
+        vmovdqu OWORD PTR [rsp+192], xmm14
+        vmovdqu OWORD PTR [rsp+208], xmm15
+        vmovdqu xmm12, OWORD PTR L_avx512_aes_xts_gc_xts  ; mask constant ANDed in by the "vpternlogd ..., xmm12, 120" tweak-doubling steps (table defined elsewhere)
+        vbroadcasti32x4 zmm13, ptr_L_avx512_aes_xts_poly  ; vpclmulqdq multiplier used to fold shifted-out tweak bits (table defined elsewhere)
+        vmovdqu64 zmm14, ptr_L_avx512_aes_xts_shl  ; per-qword left-shift counts for vpsllvq (table defined elsewhere)
+        vmovdqu64 zmm15, ptr_L_avx512_aes_xts_shr  ; per-qword right-shift counts for vpsrlvq (table defined elsewhere)
+        vmovdqu xmm8, OWORD PTR [r8]  ; xmm8 = current tweak
+        xor r12d, r12d  ; r12 = byte offset into source/destination
+        cmp eax, 32
+        jl L_AES_XTS_encrypt_update_avx512_done_128  ; under 32 bytes: skip round-key caching; every wide-path test below then falls through
+        vbroadcasti32x4 zmm16, [r10]  ; cache round keys in zmm16.., each replicated to all four 128-bit lanes
+        vbroadcasti32x4 zmm17, [r10+16]
+        vbroadcasti32x4 zmm18, [r10+32]
+        vbroadcasti32x4 zmm19, [r10+48]
+        vbroadcasti32x4 zmm20, [r10+64]
+        vbroadcasti32x4 zmm21, [r10+80]
+        vbroadcasti32x4 zmm22, [r10+96]
+        vbroadcasti32x4 zmm23, [r10+112]
+        vbroadcasti32x4 zmm24, [r10+128]
+        vbroadcasti32x4 zmm25, [r10+144]
+        vbroadcasti32x4 zmm26, [r10+160]
+        cmp r9d, 11
+        jl L_AES_XTS_encrypt_update_avx512_key_cached  ; round count < 11: keys 0..10 are all that is used
+        vbroadcasti32x4 zmm27, [r10+176]
+        vbroadcasti32x4 zmm28, [r10+192]
+        cmp r9d, 13
+        jl L_AES_XTS_encrypt_update_avx512_key_cached  ; round count < 13: keys 0..12 are all that is used
+        vbroadcasti32x4 zmm29, [r10+208]
+        vbroadcasti32x4 zmm30, [r10+224]
+L_AES_XTS_encrypt_update_avx512_key_cached:
+        cmp eax, 256
+        mov r11d, eax
+        jl L_AES_XTS_encrypt_update_avx512_done_256
+        and r11d, 4294967040  ; r11d = byte count rounded down to a multiple of 256 (loop bound)
+        vshufi64x2 zmm5, zmm8, zmm8, 0  ; replicate the tweak into all four lanes
+        vpsrlvq zmm6, zmm5, zmm15  ; bits that the per-lane left shift below pushes out of each qword
+        vpclmulqdq zmm7, zmm6, zmm13, 1  ; carry-less multiply of the high-qword overflow by the polynomial constant
+        vpslldq zmm6, zmm6, 8  ; move the low-qword overflow up into the high qword
+        vpsllvq zmm4, zmm5, zmm14
+        vpternlogq zmm4, zmm7, zmm6, 150  ; imm 150 = three-way XOR; zmm4 = four tweaks. NOTE(review): presumably lane i is the tweak advanced i blocks - confirm against the shl/shr tables
+        vpsrlq zmm9, zmm4, 60  ; zmm5 = zmm4 shifted left 4 bits per 128-bit lane, overflow folded back (tweaks for the next four blocks)
+        vpclmulqdq zmm10, zmm9, zmm13, 1
+        vpslldq zmm9, zmm9, 8
+        vpsllq zmm5, zmm4, 4
+        vpternlogq zmm5, zmm10, zmm9, 150
+        vpsrlq zmm9, zmm5, 60  ; zmm6 = zmm5 advanced the same way
+        vpclmulqdq zmm10, zmm9, zmm13, 1
+        vpslldq zmm9, zmm9, 8
+        vpsllq zmm6, zmm5, 4
+        vpternlogq zmm6, zmm10, zmm9, 150
+        vpsrlq zmm9, zmm6, 60  ; zmm7 = zmm6 advanced the same way
+        vpclmulqdq zmm10, zmm9, zmm13, 1
+        vpslldq zmm9, zmm9, 8
+        vpsllq zmm7, zmm6, 4
+        vpternlogq zmm7, zmm10, zmm9, 150
+L_AES_XTS_encrypt_update_avx512_enc_256:
+        ; 256 bytes of input
+        ; aes_enc_256
+        lea rcx, QWORD PTR [rdi+r12]
+        lea rdx, QWORD PTR [rsi+r12]
+        vmovdqu64 zmm0, [rcx]
+        vmovdqu64 zmm1, [rcx+64]
+        vmovdqu64 zmm2, [rcx+128]
+        vmovdqu64 zmm3, [rcx+192]
+        ; aes_enc_block
+        vpternlogq zmm0, zmm16, zmm4, 150  ; data XOR round key 0 XOR tweak in a single instruction
+        vpternlogq zmm1, zmm16, zmm5, 150
+        vpternlogq zmm2, zmm16, zmm6, 150
+        vpternlogq zmm3, zmm16, zmm7, 150
+        vaesenc zmm0, zmm0, zmm17
+        vaesenc zmm1, zmm1, zmm17
+        vaesenc zmm2, zmm2, zmm17
+        vaesenc zmm3, zmm3, zmm17
+        vaesenc zmm0, zmm0, zmm18
+        vaesenc zmm1, zmm1, zmm18
+        vaesenc zmm2, zmm2, zmm18
+        vaesenc zmm3, zmm3, zmm18
+        vaesenc zmm0, zmm0, zmm19
+        vaesenc zmm1, zmm1, zmm19
+        vaesenc zmm2, zmm2, zmm19
+        vaesenc zmm3, zmm3, zmm19
+        vaesenc zmm0, zmm0, zmm20
+        vaesenc zmm1, zmm1, zmm20
+        vaesenc zmm2, zmm2, zmm20
+        vaesenc zmm3, zmm3, zmm20
+        vaesenc zmm0, zmm0, zmm21
+        vaesenc zmm1, zmm1, zmm21
+        vaesenc zmm2, zmm2, zmm21
+        vaesenc zmm3, zmm3, zmm21
+        vaesenc zmm0, zmm0, zmm22
+        vaesenc zmm1, zmm1, zmm22
+        vaesenc zmm2, zmm2, zmm22
+        vaesenc zmm3, zmm3, zmm22
+        vaesenc zmm0, zmm0, zmm23
+        vaesenc zmm1, zmm1, zmm23
+        vaesenc zmm2, zmm2, zmm23
+        vaesenc zmm3, zmm3, zmm23
+        vaesenc zmm0, zmm0, zmm24
+        vaesenc zmm1, zmm1, zmm24
+        vaesenc zmm2, zmm2, zmm24
+        vaesenc zmm3, zmm3, zmm24
+        vaesenc zmm0, zmm0, zmm25
+        vaesenc zmm1, zmm1, zmm25
+        vaesenc zmm2, zmm2, zmm25
+        vaesenc zmm3, zmm3, zmm25
+        cmp r9d, 11
+        vmovdqa64 zmm9, zmm26  ; zmm9 always carries the key for vaesenclast; the vector move does not disturb the flags from cmp
+        jl L_AES_XTS_encrypt_update_avx512_aes_enc_256_aes_enc_block_last
+        vaesenc zmm0, zmm0, zmm26
+        vaesenc zmm1, zmm1, zmm26
+        vaesenc zmm2, zmm2, zmm26
+        vaesenc zmm3, zmm3, zmm26
+        vaesenc zmm0, zmm0, zmm27
+        vaesenc zmm1, zmm1, zmm27
+        vaesenc zmm2, zmm2, zmm27
+        vaesenc zmm3, zmm3, zmm27
+        cmp r9d, 13
+        vmovdqa64 zmm9, zmm28
+        jl L_AES_XTS_encrypt_update_avx512_aes_enc_256_aes_enc_block_last
+        vaesenc zmm0, zmm0, zmm28
+        vaesenc zmm1, zmm1, zmm28
+        vaesenc zmm2, zmm2, zmm28
+        vaesenc zmm3, zmm3, zmm28
+        vaesenc zmm0, zmm0, zmm29
+        vaesenc zmm1, zmm1, zmm29
+        vaesenc zmm2, zmm2, zmm29
+        vaesenc zmm3, zmm3, zmm29
+        vmovdqa64 zmm9, zmm30
+L_AES_XTS_encrypt_update_avx512_aes_enc_256_aes_enc_block_last:
+        vaesenclast zmm0, zmm0, zmm9
+        vaesenclast zmm1, zmm1, zmm9
+        vaesenclast zmm2, zmm2, zmm9
+        vaesenclast zmm3, zmm3, zmm9
+        vpxorq zmm0, zmm0, zmm4  ; second tweak XOR, then store
+        vmovdqu64 [rdx], zmm0
+        vpsrlq zmm9, zmm4, 48  ; advance this tweak vector for the next iteration: 16-bit left shift per lane with overflow folded back
+        vpclmulqdq zmm10, zmm9, zmm13, 1
+        vpslldq zmm9, zmm9, 8
+        vpsllq zmm4, zmm4, 16
+        vpternlogq zmm4, zmm10, zmm9, 150
+        vpxorq zmm1, zmm1, zmm5
+        vmovdqu64 [rdx+64], zmm1
+        vpsrlq zmm9, zmm5, 48
+        vpclmulqdq zmm10, zmm9, zmm13, 1
+        vpslldq zmm9, zmm9, 8
+        vpsllq zmm5, zmm5, 16
+        vpternlogq zmm5, zmm10, zmm9, 150
+        vpxorq zmm2, zmm2, zmm6
+        vmovdqu64 [rdx+128], zmm2
+        vpsrlq zmm9, zmm6, 48
+        vpclmulqdq zmm10, zmm9, zmm13, 1
+        vpslldq zmm9, zmm9, 8
+        vpsllq zmm6, zmm6, 16
+        vpternlogq zmm6, zmm10, zmm9, 150
+        vpxorq zmm3, zmm3, zmm7
+        vmovdqu64 [rdx+192], zmm3
+        vpsrlq zmm9, zmm7, 48
+        vpclmulqdq zmm10, zmm9, zmm13, 1
+        vpslldq zmm9, zmm9, 8
+        vpsllq zmm7, zmm7, 16
+        vpternlogq zmm7, zmm10, zmm9, 150
+        add r12d, 256
+        cmp r12d, r11d
+        jl L_AES_XTS_encrypt_update_avx512_enc_256
+        vextracti32x4 xmm8, zmm4, 0  ; lane 0 of the already-advanced vector becomes the running tweak
+L_AES_XTS_encrypt_update_avx512_done_256:
+        mov r11d, eax
+        and r11d, 4294967168  ; byte count rounded down to a multiple of 128
+        cmp r12d, r11d
+        je L_AES_XTS_encrypt_update_avx512_done_128  ; offset already at that bound: no 128-byte chunk to do
+        ; 128 bytes of input
+        ; aes_enc_128
+        lea rcx, QWORD PTR [rdi+r12]
+        lea rdx, QWORD PTR [rsi+r12]
+        vmovdqu64 zmm0, [rcx]
+        vmovdqu64 zmm1, [rcx+64]
+        vshufi64x2 zmm5, zmm8, zmm8, 0  ; rebuild two tweak vectors (zmm4, zmm5) from the running tweak, as before the 256-byte loop
+        vpsrlvq zmm6, zmm5, zmm15
+        vpclmulqdq zmm7, zmm6, zmm13, 1
+        vpslldq zmm6, zmm6, 8
+        vpsllvq zmm4, zmm5, zmm14
+        vpternlogq zmm4, zmm7, zmm6, 150
+        vpsrlq zmm9, zmm4, 60
+        vpclmulqdq zmm10, zmm9, zmm13, 1
+        vpslldq zmm9, zmm9, 8
+        vpsllq zmm5, zmm4, 4
+        vpternlogq zmm5, zmm10, zmm9, 150
+        ; aes_enc_block
+        vpternlogq zmm0, zmm16, zmm4, 150
+        vpternlogq zmm1, zmm16, zmm5, 150
+        vaesenc zmm0, zmm0, zmm17
+        vaesenc zmm1, zmm1, zmm17
+        vaesenc zmm0, zmm0, zmm18
+        vaesenc zmm1, zmm1, zmm18
+        vaesenc zmm0, zmm0, zmm19
+        vaesenc zmm1, zmm1, zmm19
+        vaesenc zmm0, zmm0, zmm20
+        vaesenc zmm1, zmm1, zmm20
+        vaesenc zmm0, zmm0, zmm21
+        vaesenc zmm1, zmm1, zmm21
+        vaesenc zmm0, zmm0, zmm22
+        vaesenc zmm1, zmm1, zmm22
+        vaesenc zmm0, zmm0, zmm23
+        vaesenc zmm1, zmm1, zmm23
+        vaesenc zmm0, zmm0, zmm24
+        vaesenc zmm1, zmm1, zmm24
+        vaesenc zmm0, zmm0, zmm25
+        vaesenc zmm1, zmm1, zmm25
+        cmp r9d, 11
+        vmovdqa64 zmm9, zmm26
+        jl L_AES_XTS_encrypt_update_avx512_aes_enc_128_aes_enc_block_last
+        vaesenc zmm0, zmm0, zmm26
+        vaesenc zmm1, zmm1, zmm26
+        vaesenc zmm0, zmm0, zmm27
+        vaesenc zmm1, zmm1, zmm27
+        cmp r9d, 13
+        vmovdqa64 zmm9, zmm28
+        jl L_AES_XTS_encrypt_update_avx512_aes_enc_128_aes_enc_block_last
+        vaesenc zmm0, zmm0, zmm28
+        vaesenc zmm1, zmm1, zmm28
+        vaesenc zmm0, zmm0, zmm29
+        vaesenc zmm1, zmm1, zmm29
+        vmovdqa64 zmm9, zmm30
+L_AES_XTS_encrypt_update_avx512_aes_enc_128_aes_enc_block_last:
+        vaesenclast zmm0, zmm0, zmm9
+        vaesenclast zmm1, zmm1, zmm9
+        vpxorq zmm0, zmm0, zmm4
+        vmovdqu64 [rdx], zmm0
+        vpxorq zmm1, zmm1, zmm5
+        vmovdqu64 [rdx+64], zmm1
+        vextracti32x4 xmm8, zmm5, 3  ; take the last tweak that was used ...
+        vpshufd xmm9, xmm8, 19  ; ... and step it once: shuffle + vpsrad 31 turn the carry bits into dword masks,
+        vpaddq xmm8, xmm8, xmm8  ; vpaddq doubles each qword,
+        vpsrad xmm9, xmm9, 31
+        vpternlogd xmm8, xmm9, xmm12, 120  ; and imm 120 computes xmm8 XOR (xmm9 AND xmm12)
+        add r12d, 128
+L_AES_XTS_encrypt_update_avx512_done_128:
+        mov r11d, eax
+        and r11d, 4294967232  ; byte count rounded down to a multiple of 64
+        cmp r12d, r11d
+        je L_AES_XTS_encrypt_update_avx512_done_64
+        ; 64 bytes of input
+        ; aes_enc_64
+        lea rcx, QWORD PTR [rdi+r12]
+        lea rdx, QWORD PTR [rsi+r12]
+        vmovdqu64 zmm0, [rcx]
+        vshufi64x2 zmm5, zmm8, zmm8, 0
+        vpsrlvq zmm6, zmm5, zmm15
+        vpclmulqdq zmm7, zmm6, zmm13, 1
+        vpslldq zmm6, zmm6, 8
+        vpsllvq zmm4, zmm5, zmm14
+        vpternlogq zmm4, zmm7, zmm6, 150
+        ; aes_enc_block
+        vpternlogq zmm0, zmm16, zmm4, 150
+        vaesenc zmm0, zmm0, zmm17
+        vaesenc zmm0, zmm0, zmm18
+        vaesenc zmm0, zmm0, zmm19
+        vaesenc zmm0, zmm0, zmm20
+        vaesenc zmm0, zmm0, zmm21
+        vaesenc zmm0, zmm0, zmm22
+        vaesenc zmm0, zmm0, zmm23
+        vaesenc zmm0, zmm0, zmm24
+        vaesenc zmm0, zmm0, zmm25
+        cmp r9d, 11
+        vmovdqa64 zmm9, zmm26
+        jl L_AES_XTS_encrypt_update_avx512_aes_enc_64_aes_enc_block_last
+        vaesenc zmm0, zmm0, zmm26
+        vaesenc zmm0, zmm0, zmm27
+        cmp r9d, 13
+        vmovdqa64 zmm9, zmm28
+        jl L_AES_XTS_encrypt_update_avx512_aes_enc_64_aes_enc_block_last
+        vaesenc zmm0, zmm0, zmm28
+        vaesenc zmm0, zmm0, zmm29
+        vmovdqa64 zmm9, zmm30
+L_AES_XTS_encrypt_update_avx512_aes_enc_64_aes_enc_block_last:
+        vaesenclast zmm0, zmm0, zmm9
+        vpxorq zmm0, zmm0, zmm4
+        vmovdqu64 [rdx], zmm0
+        vextracti32x4 xmm8, zmm4, 3  ; last tweak used, stepped once below
+        vpshufd xmm9, xmm8, 19
+        vpaddq xmm8, xmm8, xmm8
+        vpsrad xmm9, xmm9, 31
+        vpternlogd xmm8, xmm9, xmm12, 120
+        add r12d, 64
+L_AES_XTS_encrypt_update_avx512_done_64:
+        mov r11d, eax
+        and r11d, 4294967264  ; byte count rounded down to a multiple of 32
+        cmp r12d, r11d
+        je L_AES_XTS_encrypt_update_avx512_done_32
+        ; 32 bytes of input
+        ; aes_enc_32
+        lea rcx, QWORD PTR [rdi+r12]
+        lea rdx, QWORD PTR [rsi+r12]
+        vmovdqu64 ymm0, [rcx]
+        vshufi64x2 zmm5, zmm8, zmm8, 0
+        vpsrlvq zmm6, zmm5, zmm15
+        vpclmulqdq zmm7, zmm6, zmm13, 1
+        vpslldq zmm6, zmm6, 8
+        vpsllvq zmm4, zmm5, zmm14
+        vpternlogq zmm4, zmm7, zmm6, 150
+        ; aes_enc_block
+        vpternlogq ymm0, ymm16, ymm4, 150
+        vaesenc ymm0, ymm0, ymm17
+        vaesenc ymm0, ymm0, ymm18
+        vaesenc ymm0, ymm0, ymm19
+        vaesenc ymm0, ymm0, ymm20
+        vaesenc ymm0, ymm0, ymm21
+        vaesenc ymm0, ymm0, ymm22
+        vaesenc ymm0, ymm0, ymm23
+        vaesenc ymm0, ymm0, ymm24
+        vaesenc ymm0, ymm0, ymm25
+        cmp r9d, 11
+        vmovdqa64 ymm9, ymm26
+        jl L_AES_XTS_encrypt_update_avx512_aes_enc_32_aes_enc_block_last
+        vaesenc ymm0, ymm0, ymm26
+        vaesenc ymm0, ymm0, ymm27
+        cmp r9d, 13
+        vmovdqa64 ymm9, ymm28
+        jl L_AES_XTS_encrypt_update_avx512_aes_enc_32_aes_enc_block_last
+        vaesenc ymm0, ymm0, ymm28
+        vaesenc ymm0, ymm0, ymm29
+        vmovdqa64 ymm9, ymm30
+L_AES_XTS_encrypt_update_avx512_aes_enc_32_aes_enc_block_last:
+        vaesenclast ymm0, ymm0, ymm9
+        vpxorq ymm0, ymm0, ymm4
+        vmovdqu64 [rdx], ymm0
+        vextracti32x4 xmm8, zmm4, 2  ; only lanes 0-1 were consumed; lane 2 of the full zmm4 is taken as the running tweak
+        add r12d, 32
+L_AES_XTS_encrypt_update_avx512_done_32:
+        cmp r12d, eax
+        mov r11d, eax
+        je L_AES_XTS_encrypt_update_avx512_done_enc  ; everything consumed
+        sub r11d, r12d
+        cmp r11d, 16
+        mov r11d, eax
+        jl L_AES_XTS_encrypt_update_avx512_last_15  ; fewer than 16 bytes remain: partial-block path
+        and r11d, 4294967280  ; byte count rounded down to a multiple of 16
+        ; 16 bytes of input
+L_AES_XTS_encrypt_update_avx512_enc_16:
+        lea rcx, QWORD PTR [rdi+r12]
+        vmovdqu xmm0, OWORD PTR [rcx]
+        vpxor xmm0, xmm0, xmm8
+        ; aes_enc_block
+        vpxor xmm0, xmm0, [r10]  ; this path reads round keys from memory; the zmm cache may not have been filled
+        vmovdqu xmm5, OWORD PTR [r10+16]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+32]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+48]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+64]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+80]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+96]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+112]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+128]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+144]
+        vaesenc xmm0, xmm0, xmm5
+        cmp r9d, 11
+        vmovdqu xmm5, OWORD PTR [r10+160]
+        jl L_AES_XTS_encrypt_update_avx512_aes_enc_block_last
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [r10+176]
+        vaesenc xmm0, xmm0, xmm6
+        cmp r9d, 13
+        vmovdqu xmm5, OWORD PTR [r10+192]
+        jl L_AES_XTS_encrypt_update_avx512_aes_enc_block_last
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [r10+208]
+        vaesenc xmm0, xmm0, xmm6
+        vmovdqu xmm5, OWORD PTR [r10+224]
+L_AES_XTS_encrypt_update_avx512_aes_enc_block_last:
+        vaesenclast xmm0, xmm0, xmm5
+        vpxor xmm0, xmm0, xmm8
+        lea rcx, QWORD PTR [rsi+r12]
+        vmovdqu OWORD PTR [rcx], xmm0
+        vpshufd xmm4, xmm8, 19  ; step the running tweak once (same doubling sequence as above, xmm4 as the mask temp)
+        vpaddq xmm8, xmm8, xmm8
+        vpsrad xmm4, xmm4, 31
+        vpternlogd xmm8, xmm4, xmm12, 120
+        add r12d, 16
+        cmp r12d, r11d
+        jl L_AES_XTS_encrypt_update_avx512_enc_16
+        cmp r12d, eax
+        je L_AES_XTS_encrypt_update_avx512_done_enc
+L_AES_XTS_encrypt_update_avx512_last_15:
+        sub r12, 16  ; reload the 16 bytes already written at out[r12-16]. NOTE(review): reads below the buffer if the total count is < 16 - presumably callers guarantee >= 16
+        lea rcx, QWORD PTR [rsi+r12]
+        vmovdqu xmm0, OWORD PTR [rcx]
+        add r12, 16
+        vmovdqu OWORD PTR [rsp], xmm0  ; stage that block in stack scratch
+        xor rdx, rdx
+L_AES_XTS_encrypt_update_avx512_last_15_byte_loop:
+        mov r11b, BYTE PTR [rsp+rdx]  ; per trailing byte: scratch byte goes to the output tail,
+        mov cl, BYTE PTR [rdi+r12]
+        mov BYTE PTR [rsi+r12], r11b
+        mov BYTE PTR [rsp+rdx], cl  ; and the trailing input byte replaces it in scratch
+        inc r12d
+        inc edx
+        cmp r12d, eax
+        jl L_AES_XTS_encrypt_update_avx512_last_15_byte_loop
+        sub r12, rdx  ; r12 back to the start of the tail ...
+        vmovdqu xmm0, OWORD PTR [rsp]
+        sub r12, 16  ; ... then to the start of the preceding block, where the re-encrypted scratch block is stored
+        vpxor xmm0, xmm0, xmm8
+        ; aes_enc_block
+        vpxor xmm0, xmm0, [r10]
+        vmovdqu xmm5, OWORD PTR [r10+16]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+32]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+48]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+64]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+80]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+96]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+112]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR
[r10+128]
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r10+144]
+        vaesenc xmm0, xmm0, xmm5
+        cmp r9d, 11
+        vmovdqu xmm5, OWORD PTR [r10+160]
+        jl L_AES_XTS_encrypt_update_avx512_last_15_aes_enc_block_last
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [r10+176]
+        vaesenc xmm0, xmm0, xmm6
+        cmp r9d, 13
+        vmovdqu xmm5, OWORD PTR [r10+192]
+        jl L_AES_XTS_encrypt_update_avx512_last_15_aes_enc_block_last
+        vaesenc xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [r10+208]
+        vaesenc xmm0, xmm0, xmm6
+        vmovdqu xmm5, OWORD PTR [r10+224]
+L_AES_XTS_encrypt_update_avx512_last_15_aes_enc_block_last:
+        vaesenclast xmm0, xmm0, xmm5
+        vpxor xmm0, xmm0, xmm8
+        lea rcx, QWORD PTR [rsi+r12]
+        vmovdqu OWORD PTR [rcx], xmm0
+L_AES_XTS_encrypt_update_avx512_done_enc:
+        vmovdqu OWORD PTR [r8], xmm8  ; write the tweak register back through the pointer it was loaded from at entry
+        vmovdqu xmm6, OWORD PTR [rsp+64]
+        vmovdqu xmm7, OWORD PTR [rsp+80]
+        vmovdqu xmm8, OWORD PTR [rsp+96]
+        vmovdqu xmm9, OWORD PTR [rsp+112]
+        vmovdqu xmm10, OWORD PTR [rsp+128]
+        vmovdqu xmm11, OWORD PTR [rsp+144]
+        vmovdqu xmm12, OWORD PTR [rsp+160]
+        vmovdqu xmm13, OWORD PTR [rsp+176]
+        vmovdqu xmm14, OWORD PTR [rsp+192]
+        vmovdqu xmm15, OWORD PTR [rsp+208]
+        add rsp, 224
+        pop r12
+        pop rsi
+        pop rdi
+        ret
+AES_XTS_encrypt_update_avx512 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_XTS_decrypt_avx512 PROC  ; AES-XTS decrypt of eax bytes; encrypts the 16-byte value at arg4 to form the first tweak; args in rcx, rdx, r8, r9 and three stack slots
+        push rdi  ; rdi, rsi, r12 and r13 are overwritten below and popped again before ret
+        push rsi
+        push r12
+        push r13
+        mov rdi, rcx  ; arg1 -> rdi: source pointer (read as [rdi+r13])
+        mov rsi, rdx  ; arg2 -> rsi: destination pointer (written as [rsi+r13])
+        mov rax, r8  ; arg3 -> rax: byte count (only eax is ever compared)
+        mov r12, r9  ; arg4 -> r12: pointer to the 16 bytes that are encrypted into the initial tweak
+        mov r8, QWORD PTR [rsp+72]  ; arg5: round-key table fed to vaesdec on the data; +72 = 4 pushes + return address + 32-byte shadow area
+        mov r9, QWORD PTR [rsp+80]  ; arg6: round-key table fed to vaesenc on the tweak
+        mov r10d, DWORD PTR [rsp+88]  ; arg7: round count, compared against 11 and 13 to select the final round key
+        sub rsp, 224  ; [rsp..rsp+15] is scratch for the partial-block path; xmm6-xmm15 are spilled from [rsp+64] up
+        vmovdqu OWORD PTR [rsp+64], xmm6
+        vmovdqu OWORD PTR [rsp+80], xmm7
+        vmovdqu OWORD PTR [rsp+96], xmm8
+        vmovdqu OWORD PTR [rsp+112], xmm9
+        vmovdqu OWORD PTR [rsp+128], xmm10
+        vmovdqu OWORD PTR [rsp+144], xmm11
+        vmovdqu OWORD PTR [rsp+160], xmm12
+        vmovdqu OWORD PTR [rsp+176], xmm13
+        vmovdqu OWORD PTR [rsp+192], xmm14
+        vmovdqu OWORD PTR [rsp+208], xmm15
+        vmovdqu xmm12, OWORD PTR L_avx512_aes_xts_gc_xts  ; mask constant ANDed in by the "vpternlogd ..., xmm12, 120" tweak-doubling steps (table defined elsewhere)
+        vbroadcasti32x4 zmm13, ptr_L_avx512_aes_xts_poly  ; vpclmulqdq multiplier used to fold shifted-out tweak bits (table defined elsewhere)
+        vmovdqu64 zmm14, ptr_L_avx512_aes_xts_shl  ; per-qword left-shift counts for vpsllvq (table defined elsewhere)
+        vmovdqu64 zmm15, ptr_L_avx512_aes_xts_shr  ; per-qword right-shift counts for vpsrlvq (table defined elsewhere)
+        vmovdqu xmm8, OWORD PTR [r12]
+        ; aes_enc_block
+        vpxor xmm8, xmm8, [r9]  ; the tweak is produced with the encrypt direction and the arg6 keys, even though the data path decrypts
+        vmovdqu xmm5, OWORD PTR [r9+16]
+        vaesenc xmm8, xmm8, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+32]
+        vaesenc xmm8, xmm8, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+48]
+        vaesenc xmm8, xmm8, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+64]
+        vaesenc xmm8, xmm8, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+80]
+        vaesenc xmm8, xmm8, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+96]
+        vaesenc xmm8, xmm8, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+112]
+        vaesenc xmm8, xmm8, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+128]
+        vaesenc xmm8, xmm8, xmm5
+        vmovdqu xmm5, OWORD PTR [r9+144]
+        vaesenc xmm8, xmm8, xmm5
+        cmp r10d, 11
+        vmovdqu xmm5, OWORD PTR [r9+160]
+        jl L_AES_XTS_decrypt_avx512_tweak_aes_enc_block_last
+        vaesenc xmm8, xmm8, xmm5
+        vmovdqu xmm6, OWORD PTR [r9+176]
+        vaesenc xmm8, xmm8, xmm6
+        cmp r10d, 13
+        vmovdqu xmm5, OWORD PTR [r9+192]
+        jl L_AES_XTS_decrypt_avx512_tweak_aes_enc_block_last
+        vaesenc xmm8, xmm8, xmm5
+        vmovdqu xmm6, OWORD PTR [r9+208]
+        vaesenc xmm8, xmm8, xmm6
+        vmovdqu xmm5, OWORD PTR [r9+224]
+L_AES_XTS_decrypt_avx512_tweak_aes_enc_block_last:
+        vaesenclast xmm8, xmm8, xmm5  ; xmm8 = initial tweak
+        xor r13d, r13d  ; r13 = byte offset into source/destination
+        mov r11d, eax
+        and r11d, 4294967280  ; r11d = byte count rounded down to a multiple of 16
+        cmp r11d, eax
+        je L_AES_XTS_decrypt_avx512_mul16_256  ; exact multiple of 16: every block goes through the bulk paths
+        sub r11d, 16  ; otherwise hold back the last full block for the tail path at last_31_start
+        cmp r11d, 16
+        jl L_AES_XTS_decrypt_avx512_last_31_start  ; nothing left for the bulk paths after holding that block back
+L_AES_XTS_decrypt_avx512_mul16_256:
+        cmp r11d, 32
+        jl L_AES_XTS_decrypt_avx512_done_128  ; under 32 bulk bytes: skip round-key caching
+        vbroadcasti32x4 zmm16, [r8]  ; cache the data-path round keys in zmm16.., each replicated to all four 128-bit lanes
+        vbroadcasti32x4 zmm17, [r8+16]
+        vbroadcasti32x4 zmm18, [r8+32]
+        vbroadcasti32x4 zmm19, [r8+48]
+        vbroadcasti32x4 zmm20, [r8+64]
+        vbroadcasti32x4 zmm21, [r8+80]
+        vbroadcasti32x4 zmm22, [r8+96]
+        vbroadcasti32x4 zmm23, [r8+112]
+        vbroadcasti32x4 zmm24, [r8+128]
+        vbroadcasti32x4 zmm25, [r8+144]
+        vbroadcasti32x4 zmm26, [r8+160]
+        cmp r10d, 11
+        jl L_AES_XTS_decrypt_avx512_key_cached
+        vbroadcasti32x4 zmm27, [r8+176]
+        vbroadcasti32x4 zmm28, [r8+192]
+        cmp r10d, 13
+        jl L_AES_XTS_decrypt_avx512_key_cached
+        vbroadcasti32x4 zmm29, [r8+208]
+        vbroadcasti32x4 zmm30, [r8+224]
+L_AES_XTS_decrypt_avx512_key_cached:
+        cmp r11d, 256
+        jl L_AES_XTS_decrypt_avx512_done_256
+        and r11d, 4294967040  ; bulk byte count rounded down to a multiple of 256 (loop bound)
+        vshufi64x2 zmm5, zmm8, zmm8, 0  ; replicate the tweak into all four lanes
+        vpsrlvq zmm6, zmm5, zmm15  ; bits that the per-lane left shift below pushes out of each qword
+        vpclmulqdq zmm7, zmm6, zmm13, 1  ; carry-less multiply of the high-qword overflow by the polynomial constant
+        vpslldq zmm6, zmm6, 8  ; move the low-qword overflow up into the high qword
+        vpsllvq zmm4, zmm5, zmm14
+        vpternlogq zmm4, zmm7, zmm6, 150  ; imm 150 = three-way XOR; zmm4 = four tweaks. NOTE(review): presumably lane i is the tweak advanced i blocks - confirm against the shl/shr tables
+        vpsrlq zmm9, zmm4, 60  ; zmm5 = zmm4 shifted left 4 bits per 128-bit lane, overflow folded back
+        vpclmulqdq zmm10, zmm9, zmm13, 1
+        vpslldq zmm9, zmm9, 8
+        vpsllq zmm5, zmm4, 4
+        vpternlogq zmm5, zmm10, zmm9, 150
+        vpsrlq zmm9, zmm5, 60  ; zmm6 = zmm5 advanced the same way
+        vpclmulqdq zmm10, zmm9, zmm13, 1
+        vpslldq zmm9, zmm9, 8
+        vpsllq zmm6, zmm5, 4
+        vpternlogq zmm6, zmm10, zmm9, 150
+        vpsrlq zmm9, zmm6, 60  ; zmm7 = zmm6 advanced the same way
+        vpclmulqdq zmm10, zmm9, zmm13, 1
+        vpslldq zmm9, zmm9, 8
+        vpsllq zmm7, zmm6, 4
+        vpternlogq zmm7, zmm10, zmm9, 150
+L_AES_XTS_decrypt_avx512_dec_256:
+        ; 256 bytes of input
+        ; aes_dec_256
+        lea rcx, QWORD PTR [rdi+r13]
+        lea rdx, QWORD PTR [rsi+r13]
+        vmovdqu64 zmm0, [rcx]
+        vmovdqu64 zmm1, [rcx+64]
+        vmovdqu64 zmm2, [rcx+128]
+        vmovdqu64 zmm3, [rcx+192]
+        ; aes_dec_block
+        vpternlogq zmm0, zmm16, zmm4, 150  ; data XOR round key 0 XOR tweak in a single instruction
+        vpternlogq zmm1, zmm16, zmm5, 150
+        vpternlogq zmm2, zmm16, zmm6, 150
+        vpternlogq zmm3, zmm16, zmm7, 150
+        vaesdec zmm0, zmm0, zmm17
+        vaesdec zmm1, zmm1, zmm17
+        vaesdec zmm2, zmm2, zmm17
+        vaesdec zmm3, zmm3, zmm17
+        vaesdec zmm0, zmm0, zmm18
+        vaesdec zmm1, zmm1, zmm18
+        vaesdec zmm2, zmm2, zmm18
+        vaesdec zmm3, zmm3, zmm18
+        vaesdec zmm0, zmm0, zmm19
+        vaesdec zmm1, zmm1, zmm19
+        vaesdec zmm2, zmm2, zmm19
+        vaesdec zmm3, zmm3, zmm19
+        vaesdec zmm0, zmm0, zmm20
+        vaesdec zmm1, zmm1, zmm20
+        vaesdec zmm2, zmm2, zmm20
+        vaesdec zmm3, zmm3, zmm20
+        vaesdec zmm0, zmm0, zmm21
+        vaesdec zmm1, zmm1, zmm21
+        vaesdec zmm2, zmm2, zmm21
+        vaesdec zmm3, zmm3, zmm21
+        vaesdec zmm0, zmm0, zmm22
+        vaesdec zmm1, zmm1, zmm22
+        vaesdec zmm2, zmm2, zmm22
+        vaesdec zmm3, zmm3, zmm22
+        vaesdec zmm0, zmm0, zmm23
+        vaesdec zmm1, zmm1, zmm23
+        vaesdec zmm2, zmm2, zmm23
+        vaesdec zmm3, zmm3, zmm23
+        vaesdec zmm0, zmm0, zmm24
+        vaesdec zmm1, zmm1, zmm24
+        vaesdec zmm2, zmm2, zmm24
+        vaesdec zmm3, zmm3, zmm24
+        vaesdec zmm0, zmm0, zmm25
+        vaesdec zmm1, zmm1, zmm25
+        vaesdec zmm2, zmm2, zmm25
+        vaesdec zmm3, zmm3, zmm25
+        cmp r10d, 11
+        vmovdqa64 zmm9, zmm26  ; zmm9 always carries the key for vaesdeclast; the vector move does not disturb the flags from cmp
+        jl L_AES_XTS_decrypt_avx512_aes_dec_256_aes_dec_block_last
+        vaesdec zmm0, zmm0, zmm26
+        vaesdec zmm1, zmm1, zmm26
+        vaesdec zmm2, zmm2, zmm26
+        vaesdec zmm3, zmm3, zmm26
+        vaesdec zmm0, zmm0, zmm27
+        vaesdec zmm1, zmm1, zmm27
+        vaesdec zmm2, zmm2, zmm27
+        vaesdec zmm3, zmm3, zmm27
+        cmp r10d, 13
+        vmovdqa64 zmm9, zmm28
+        jl L_AES_XTS_decrypt_avx512_aes_dec_256_aes_dec_block_last
+        vaesdec zmm0, zmm0, zmm28
+        vaesdec zmm1, zmm1, zmm28
+        vaesdec zmm2, zmm2, zmm28
+        vaesdec zmm3, zmm3, zmm28
+        vaesdec zmm0, zmm0, zmm29
+        vaesdec zmm1, zmm1, zmm29
+        vaesdec zmm2, zmm2, zmm29
+        vaesdec zmm3, zmm3, zmm29
+        vmovdqa64 zmm9, zmm30
+L_AES_XTS_decrypt_avx512_aes_dec_256_aes_dec_block_last:
+        vaesdeclast zmm0, zmm0, zmm9
+        vaesdeclast zmm1, zmm1, zmm9
+        vaesdeclast zmm2, zmm2, zmm9
+        vaesdeclast zmm3, zmm3, zmm9
+        vpxorq zmm0, zmm0, zmm4  ; second tweak XOR, then store
+        vmovdqu64 [rdx], zmm0
+        vpsrlq zmm9, zmm4, 48  ; advance this tweak vector for the next iteration: 16-bit left shift per lane with overflow folded back
+        vpclmulqdq zmm10, zmm9, zmm13, 1
+        vpslldq zmm9, zmm9, 8
+        vpsllq zmm4, zmm4, 16
+        vpternlogq zmm4, zmm10, zmm9, 150
+        vpxorq zmm1, zmm1, zmm5
+        vmovdqu64 [rdx+64], zmm1
+        vpsrlq zmm9, zmm5, 48
+        vpclmulqdq zmm10, zmm9, zmm13, 1
+        vpslldq zmm9, zmm9, 8
+        vpsllq zmm5, zmm5, 16
+        vpternlogq zmm5, zmm10, zmm9, 150
+        vpxorq zmm2, zmm2, zmm6
+        vmovdqu64 [rdx+128], zmm2
+        vpsrlq zmm9, zmm6, 48
+        vpclmulqdq zmm10, zmm9, zmm13, 1
+        vpslldq zmm9, zmm9, 8
+        vpsllq zmm6, zmm6, 16
+        vpternlogq zmm6, zmm10, zmm9, 150
+        vpxorq zmm3, zmm3, zmm7
+        vmovdqu64 [rdx+192], zmm3
+        vpsrlq zmm9, zmm7, 48
+        vpclmulqdq zmm10, zmm9, zmm13, 1
+        vpslldq zmm9, zmm9, 8
+        vpsllq zmm7, zmm7, 16
+        vpternlogq zmm7, zmm10, zmm9, 150
+        add r13d, 256
+        cmp r13d, r11d
+        jl L_AES_XTS_decrypt_avx512_dec_256
+        vextracti32x4 xmm8, zmm4, 0  ; lane 0 of the already-advanced vector becomes the running tweak
+L_AES_XTS_decrypt_avx512_done_256:
+        cmp r13d, eax
+        mov r11d, eax
+        je L_AES_XTS_decrypt_avx512_done_dec  ; everything consumed
+        and r11d, 4294967280  ; recompute the bulk limit: count rounded down to 16 ...
+        cmp r11d, eax
+        je L_AES_XTS_decrypt_avx512_mul16_128
+        sub r11d, 16  ; ... minus the held-back block when there is a partial tail
+        sub r11d, r13d
+        cmp r11d, 16
+        jl L_AES_XTS_decrypt_avx512_last_31_start  ; no bulk block left before the held-back one
+        add r11d, r13d  ; undo the subtraction so r11d is an absolute offset again
+L_AES_XTS_decrypt_avx512_mul16_128:
+        and r11d, 4294967168  ; bulk limit rounded down to a multiple of 128
+        cmp r13d, r11d
+        je L_AES_XTS_decrypt_avx512_done_128
+        ; 128 bytes of input
+        ; aes_dec_128
+        lea rcx, QWORD PTR [rdi+r13]
+        lea rdx, QWORD PTR [rsi+r13]
+        vmovdqu64 zmm0, [rcx]
+        vmovdqu64 zmm1, [rcx+64]
+        vshufi64x2 zmm5, zmm8, zmm8, 0  ; rebuild two tweak vectors (zmm4, zmm5) from the running tweak
+        vpsrlvq zmm6, zmm5, zmm15
+        vpclmulqdq zmm7, zmm6, zmm13, 1
+        vpslldq zmm6, zmm6, 8
+        vpsllvq zmm4, zmm5, zmm14
+        vpternlogq zmm4, zmm7, zmm6, 150
+        vpsrlq zmm9, zmm4, 60
+        vpclmulqdq zmm10, zmm9, zmm13, 1
+        vpslldq zmm9, zmm9, 8
+        vpsllq zmm5, zmm4, 4
+        vpternlogq zmm5, zmm10, zmm9, 150
+        ; aes_dec_block
+        vpternlogq zmm0, zmm16, zmm4, 150
+        vpternlogq zmm1, zmm16, zmm5, 150
+        vaesdec zmm0, zmm0, zmm17
+        vaesdec zmm1, zmm1, zmm17
+        vaesdec zmm0, zmm0, zmm18
+        vaesdec zmm1, zmm1, zmm18
+        vaesdec zmm0, zmm0, zmm19
+        vaesdec zmm1, zmm1, zmm19
+        vaesdec zmm0, zmm0, zmm20
+        vaesdec zmm1, zmm1, zmm20
+        vaesdec zmm0, zmm0, zmm21
+        vaesdec zmm1, zmm1, zmm21
+        vaesdec zmm0, zmm0, zmm22
+        vaesdec zmm1, zmm1, zmm22
+        vaesdec zmm0, zmm0, zmm23
+        vaesdec zmm1, zmm1, zmm23
+        vaesdec zmm0, zmm0, zmm24
+        vaesdec zmm1, zmm1, zmm24
+        vaesdec zmm0, zmm0, zmm25
+        vaesdec zmm1, zmm1, zmm25
+        cmp r10d, 11
+        vmovdqa64 zmm9, zmm26
+        jl L_AES_XTS_decrypt_avx512_aes_dec_128_aes_dec_block_last
+        vaesdec zmm0, zmm0, zmm26
+        vaesdec zmm1, zmm1, zmm26
+        vaesdec zmm0, zmm0, zmm27
+        vaesdec zmm1, zmm1, zmm27
+        cmp r10d, 13
+        vmovdqa64 zmm9, zmm28
+        jl L_AES_XTS_decrypt_avx512_aes_dec_128_aes_dec_block_last
+        vaesdec zmm0, zmm0, zmm28
+        vaesdec zmm1, zmm1, zmm28
+        vaesdec zmm0, zmm0, zmm29
+        vaesdec zmm1, zmm1, zmm29
+        vmovdqa64 zmm9, zmm30
+L_AES_XTS_decrypt_avx512_aes_dec_128_aes_dec_block_last:
+        vaesdeclast zmm0, zmm0, zmm9
+        vaesdeclast zmm1, zmm1, zmm9
+        vpxorq zmm0, zmm0, zmm4
+        vmovdqu64 [rdx], zmm0
+        vpxorq zmm1, zmm1, zmm5
+        vmovdqu64 [rdx+64], zmm1
+        vextracti32x4 xmm8, zmm5, 3  ; take the last tweak that was used ...
+        vpshufd xmm9, xmm8, 19  ; ... and step it once: shuffle + vpsrad 31 turn the carry bits into dword masks,
+        vpaddq xmm8, xmm8, xmm8  ; vpaddq doubles each qword,
+        vpsrad xmm9, xmm9, 31
+        vpternlogd xmm8, xmm9, xmm12, 120  ; and imm 120 computes xmm8 XOR (xmm9 AND xmm12)
+        add r13d, 128
+L_AES_XTS_decrypt_avx512_done_128:
+        cmp r13d, eax
+        mov r11d, eax
+        je L_AES_XTS_decrypt_avx512_done_dec
+        and r11d, 4294967280  ; same bulk-limit recomputation as at done_256
+        cmp r11d, eax
+        je L_AES_XTS_decrypt_avx512_mul16_64
+        sub r11d, 16
+        sub r11d, r13d
+        cmp r11d, 16
+        jl L_AES_XTS_decrypt_avx512_last_31_start
+        add r11d, r13d
+L_AES_XTS_decrypt_avx512_mul16_64:
+        and r11d, 4294967232  ; bulk limit rounded down to a multiple of 64
+        cmp r13d, r11d
+        je L_AES_XTS_decrypt_avx512_done_64
+        ; 64 bytes of input
+        ; aes_dec_64
+        lea rcx, QWORD PTR [rdi+r13]
+        lea rdx, QWORD PTR [rsi+r13]
+        vmovdqu64 zmm0, [rcx]
+        vshufi64x2 zmm5, zmm8, zmm8, 0
+        vpsrlvq zmm6, zmm5, zmm15
+        vpclmulqdq zmm7, zmm6, zmm13, 1
+        vpslldq zmm6, zmm6, 8
+        vpsllvq zmm4, zmm5, zmm14
+        vpternlogq zmm4, zmm7, zmm6, 150
+        ; aes_dec_block
+        vpternlogq zmm0, zmm16, zmm4, 150
+        vaesdec zmm0, zmm0, zmm17
+        vaesdec zmm0, zmm0, zmm18
+        vaesdec zmm0, zmm0, zmm19
+        vaesdec zmm0, zmm0, zmm20
+        vaesdec zmm0, zmm0, zmm21
+        vaesdec zmm0, zmm0, zmm22
+        vaesdec zmm0, zmm0, zmm23
+        vaesdec zmm0, zmm0, zmm24
+        vaesdec zmm0, zmm0, zmm25
+        cmp r10d, 11
+        vmovdqa64 zmm9, zmm26
+        jl L_AES_XTS_decrypt_avx512_aes_dec_64_aes_dec_block_last
+        vaesdec zmm0, zmm0, zmm26
+        vaesdec zmm0, zmm0, zmm27
+        cmp r10d, 13
+        vmovdqa64 zmm9, zmm28
+        jl L_AES_XTS_decrypt_avx512_aes_dec_64_aes_dec_block_last
+        vaesdec zmm0, zmm0, zmm28
+        vaesdec zmm0, zmm0, zmm29
+        vmovdqa64 zmm9, zmm30
+L_AES_XTS_decrypt_avx512_aes_dec_64_aes_dec_block_last:
+        vaesdeclast zmm0, zmm0, zmm9
+        vpxorq zmm0, zmm0, zmm4
+        vmovdqu64 [rdx], zmm0
+        vextracti32x4 xmm8, zmm4, 3  ; last tweak used, stepped once below
+        vpshufd xmm9, xmm8, 19
+        vpaddq xmm8, xmm8, xmm8
+        vpsrad xmm9, xmm9, 31
+        vpternlogd xmm8, xmm9, xmm12, 120
+        add r13d, 64
+L_AES_XTS_decrypt_avx512_done_64:
+        cmp r13d, eax
+        mov r11d, eax
+        je L_AES_XTS_decrypt_avx512_done_dec
+        and r11d, 4294967280  ; same bulk-limit recomputation as at done_256
+        cmp r11d, eax
+        je L_AES_XTS_decrypt_avx512_mul16_32
+        sub r11d, 16
+        sub r11d, r13d
+        cmp r11d, 16
+        jl L_AES_XTS_decrypt_avx512_last_31_start
+        add r11d, r13d
+L_AES_XTS_decrypt_avx512_mul16_32:
+        and r11d, 4294967264  ; bulk limit rounded down to a multiple of 32
+        cmp r13d, r11d
+        je L_AES_XTS_decrypt_avx512_done_32
+        ; 32 bytes of input
+        ; aes_dec_32
+        lea rcx, QWORD PTR [rdi+r13]
+        lea rdx, QWORD PTR [rsi+r13]
+        vmovdqu64 ymm0, [rcx]
+        vshufi64x2 zmm5, zmm8, zmm8, 0
+        vpsrlvq zmm6, zmm5, zmm15
+        vpclmulqdq zmm7, zmm6, zmm13, 1
+        vpslldq zmm6, zmm6, 8
+        vpsllvq zmm4, zmm5, zmm14
+        vpternlogq zmm4, zmm7, zmm6, 150
+        ; aes_dec_block
+        vpternlogq ymm0, ymm16, ymm4, 150
+        vaesdec ymm0, ymm0, ymm17
+        vaesdec ymm0, ymm0, ymm18
+        vaesdec ymm0, ymm0, ymm19
+        vaesdec ymm0, ymm0, ymm20
+        vaesdec ymm0, ymm0, ymm21
+        vaesdec ymm0, ymm0, ymm22
+        vaesdec ymm0, ymm0, ymm23
+        vaesdec ymm0, ymm0, ymm24
+        vaesdec ymm0, ymm0, ymm25
+        cmp r10d, 11
+        vmovdqa64 ymm9, ymm26
+        jl L_AES_XTS_decrypt_avx512_aes_dec_32_aes_dec_block_last
+        vaesdec ymm0, ymm0, ymm26
+        vaesdec ymm0, ymm0, ymm27
+        cmp r10d, 13
+        vmovdqa64 ymm9, ymm28
+        jl L_AES_XTS_decrypt_avx512_aes_dec_32_aes_dec_block_last
+        vaesdec ymm0, ymm0, ymm28
+        vaesdec ymm0, ymm0, ymm29
+        vmovdqa64 ymm9, ymm30
+L_AES_XTS_decrypt_avx512_aes_dec_32_aes_dec_block_last:
+        vaesdeclast ymm0, ymm0, ymm9
+        vpxorq ymm0, ymm0, ymm4
+        vmovdqu64 [rdx], ymm0
+        vextracti32x4 xmm8, zmm4, 2  ; only lanes 0-1 were consumed; lane 2 of the full zmm4 is taken as the running tweak
+        add r13d, 32
+L_AES_XTS_decrypt_avx512_done_32:
+        cmp r13d, eax
+        mov r11d, eax
+        je L_AES_XTS_decrypt_avx512_done_dec
+        and r11d, 4294967280  ; same bulk-limit recomputation as at done_256; r11d then bounds the 16-byte loop
+        cmp r11d, eax
+        je L_AES_XTS_decrypt_avx512_mul16
+        sub r11d, 16
+        sub r11d, r13d
+        cmp r11d, 16
+        jl L_AES_XTS_decrypt_avx512_last_31_start
+        add r11d, r13d
+L_AES_XTS_decrypt_avx512_mul16:
+L_AES_XTS_decrypt_avx512_dec_16:
+        ; 16 bytes of input
+        lea rcx, QWORD PTR [rdi+r13]
+        vmovdqu xmm0, OWORD PTR [rcx]
+        vpxor xmm0, xmm0, xmm8
+        ; aes_dec_block
+        vpxor xmm0, xmm0, [r8]  ; this path reads round keys from memory; the zmm cache may not have been filled
+        vmovdqu xmm5, OWORD PTR [r8+16]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r8+32]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r8+48]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r8+64]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r8+80]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r8+96]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r8+112]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r8+128]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r8+144]
+        vaesdec xmm0, xmm0, xmm5
+        cmp r10d, 11
+        vmovdqu xmm5, OWORD PTR [r8+160]
+        jl L_AES_XTS_decrypt_avx512_aes_dec_block_last
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [r8+176]
+        vaesdec xmm0, xmm0, xmm6
+        cmp r10d, 13
+        vmovdqu xmm5, OWORD PTR [r8+192]
+        jl L_AES_XTS_decrypt_avx512_aes_dec_block_last
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [r8+208]
+        vaesdec xmm0, xmm0, xmm6
+        vmovdqu xmm5, OWORD PTR [r8+224]
+L_AES_XTS_decrypt_avx512_aes_dec_block_last:
+        vaesdeclast xmm0, xmm0, xmm5
+        vpxor xmm0, xmm0, xmm8
+        lea rcx, QWORD PTR [rsi+r13]
+        vmovdqu OWORD PTR [rcx], xmm0
+        vpshufd xmm4, xmm8, 19  ; step the running tweak once (same doubling sequence, xmm4 as the mask temp)
+        vpaddq xmm8, xmm8, xmm8
+        vpsrad xmm4, xmm4, 31
+        vpternlogd xmm8, xmm4, xmm12, 120
+        add r13d, 16
+        cmp r13d, r11d
+        jl L_AES_XTS_decrypt_avx512_dec_16
+        cmp r13d, eax
+        je L_AES_XTS_decrypt_avx512_done_dec
+L_AES_XTS_decrypt_avx512_last_31_start:
+        vpshufd xmm4, xmm8, 19  ; tail path (one held-back full block + partial bytes): xmm7 = xmm8 stepped once; xmm8 itself is kept
+        vpaddq xmm7, xmm8, xmm8
+        vpsrad xmm4, xmm4, 31
+        vpternlogd xmm7, xmm4, xmm12, 120
+        lea rcx, QWORD PTR [rdi+r13]
+        vmovdqu xmm0, OWORD PTR [rcx]
+        vpxor xmm0, xmm0, xmm7  ; the held-back full block is decrypted with the later tweak (xmm7) first
+        ; aes_dec_block
+        vpxor xmm0, xmm0, [r8]
+        vmovdqu xmm5, OWORD PTR [r8+16]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r8+32]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r8+48]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r8+64]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r8+80]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r8+96]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r8+112]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r8+128]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r8+144]
+        vaesdec xmm0, xmm0, xmm5
+        cmp r10d, 11
+        vmovdqu xmm5, OWORD PTR [r8+160]
+        jl L_AES_XTS_decrypt_avx512_last_31_aes_dec_block_last
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [r8+176]
+        vaesdec xmm0, xmm0, xmm6
+        cmp r10d, 13
+        vmovdqu xmm5, OWORD PTR [r8+192]
+        jl L_AES_XTS_decrypt_avx512_last_31_aes_dec_block_last
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [r8+208]
+        vaesdec xmm0, xmm0, xmm6
+        vmovdqu xmm5, OWORD PTR [r8+224]
+L_AES_XTS_decrypt_avx512_last_31_aes_dec_block_last:
+        vaesdeclast xmm0, xmm0, xmm5
+        vpxor xmm0, xmm0, xmm7
+        vmovdqu OWORD PTR [rsp], xmm0  ; stage the decrypted block in stack scratch
+        add r13, 16  ; r13 -> first byte of the partial tail
+        xor rdx, rdx
+L_AES_XTS_decrypt_avx512_last_31_byte_loop:
+        mov r11b, BYTE PTR [rsp+rdx]  ; per trailing byte: scratch byte goes to the output tail,
+        mov cl, BYTE PTR [rdi+r13]
+        mov BYTE PTR [rsi+r13], r11b
+        mov BYTE PTR [rsp+rdx], cl  ; and the trailing input byte replaces it in scratch
+        inc r13d
+        inc edx
+        cmp r13d, eax
+        jl L_AES_XTS_decrypt_avx512_last_31_byte_loop
+        sub r13, rdx  ; r13 back to the start of the tail
+        vmovdqu xmm0, OWORD PTR [rsp]
+        vpxor xmm0, xmm0, xmm8  ; the merged scratch block is decrypted with the earlier tweak (xmm8)
+        ; aes_dec_block
+        vpxor xmm0, xmm0, [r8]
+        vmovdqu xmm5, OWORD PTR [r8+16]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r8+32]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r8+48]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r8+64]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r8+80]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r8+96]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r8+112]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r8+128]
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm5, OWORD PTR [r8+144]
+        vaesdec xmm0, xmm0, xmm5
+        cmp r10d, 11
+        vmovdqu xmm5, OWORD PTR [r8+160]
+        jl L_AES_XTS_decrypt_avx512_last_31_2_aes_dec_block_last
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [r8+176]
+        vaesdec xmm0, xmm0, xmm6
+        cmp r10d, 13
+        vmovdqu xmm5, OWORD PTR [r8+192]
+        jl L_AES_XTS_decrypt_avx512_last_31_2_aes_dec_block_last
+        vaesdec xmm0, xmm0, xmm5
+        vmovdqu xmm6, OWORD PTR [r8+208]
+        vaesdec xmm0, xmm0, xmm6
+        vmovdqu xmm5, OWORD PTR [r8+224]
+L_AES_XTS_decrypt_avx512_last_31_2_aes_dec_block_last:
+        vaesdeclast xmm0, xmm0, xmm5
+        vpxor xmm0, xmm0, xmm8
+        sub r13, 16  ; result is stored over the held-back block's position in the output
+        lea rcx, QWORD PTR [rsi+r13]
+        vmovdqu OWORD PTR [rcx], xmm0
+L_AES_XTS_decrypt_avx512_done_dec:
+        vmovdqu xmm6, OWORD PTR [rsp+64]  ; restore xmm6-xmm15 from the spill area; unlike the *_update routine above, no tweak is written back
+        vmovdqu xmm7, OWORD PTR [rsp+80]
+        vmovdqu xmm8, OWORD PTR [rsp+96]
+        vmovdqu xmm9, OWORD PTR [rsp+112]
+        vmovdqu xmm10, OWORD PTR [rsp+128]
+        vmovdqu xmm11, OWORD PTR [rsp+144]
+        vmovdqu xmm12, OWORD PTR [rsp+160]
+        vmovdqu xmm13, OWORD PTR [rsp+176]
+        vmovdqu xmm14, OWORD PTR [rsp+192]
+        vmovdqu xmm15, OWORD PTR [rsp+208]
+        add rsp, 224
+        pop r13
+        pop r12
+        pop rsi
+        pop rdi
+        ret
+AES_XTS_decrypt_avx512 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_XTS_decrypt_update_avx512 PROC
+        push rdi
+        push rsi
+        push r12
+        mov rdi, rcx
+        mov rsi, rdx
+        mov rax, r8
+        mov r10, r9
+        mov r8, QWORD PTR [rsp+64]
+        mov r9d, DWORD PTR [rsp+72]
+        sub rsp, 224
+        vmovdqu OWORD PTR [rsp+64], xmm6
+        vmovdqu OWORD PTR [rsp+80], xmm7
+        vmovdqu OWORD PTR [rsp+96], xmm8
+        vmovdqu OWORD PTR [rsp+112], xmm9
+        vmovdqu OWORD PTR [rsp+128], xmm10
+        vmovdqu OWORD PTR [rsp+144], xmm11
+        vmovdqu OWORD PTR [rsp+160], xmm12
+        vmovdqu OWORD PTR [rsp+176], xmm13
+        vmovdqu OWORD PTR [rsp+192], xmm14
+        vmovdqu OWORD PTR [rsp+208], xmm15
+        vmovdqu xmm12, OWORD PTR L_avx512_aes_xts_gc_xts
+        vbroadcasti32x4 zmm13, ptr_L_avx512_aes_xts_poly
+        vmovdqu64 zmm14, ptr_L_avx512_aes_xts_shl
+        vmovdqu64 zmm15, ptr_L_avx512_aes_xts_shr
+        vmovdqu xmm8, OWORD PTR [r8]
+        xor r12d, r12d
+        mov r11d, eax
+        and r11d, 4294967280
+        cmp r11d, eax
+        je L_AES_XTS_decrypt_update_avx512_mul16_256
+        sub r11d, 16
+        cmp r11d, 16
+        jl L_AES_XTS_decrypt_update_avx512_last_31_start
+L_AES_XTS_decrypt_update_avx512_mul16_256:
+        cmp r11d, 32
+        jl L_AES_XTS_decrypt_update_avx512_done_128
+
vbroadcasti32x4 zmm16, [r10] + vbroadcasti32x4 zmm17, [r10+16] + vbroadcasti32x4 zmm18, [r10+32] + vbroadcasti32x4 zmm19, [r10+48] + vbroadcasti32x4 zmm20, [r10+64] + vbroadcasti32x4 zmm21, [r10+80] + vbroadcasti32x4 zmm22, [r10+96] + vbroadcasti32x4 zmm23, [r10+112] + vbroadcasti32x4 zmm24, [r10+128] + vbroadcasti32x4 zmm25, [r10+144] + vbroadcasti32x4 zmm26, [r10+160] + cmp r9d, 11 + jl L_AES_XTS_decrypt_update_avx512_key_cached + vbroadcasti32x4 zmm27, [r10+176] + vbroadcasti32x4 zmm28, [r10+192] + cmp r9d, 13 + jl L_AES_XTS_decrypt_update_avx512_key_cached + vbroadcasti32x4 zmm29, [r10+208] + vbroadcasti32x4 zmm30, [r10+224] +L_AES_XTS_decrypt_update_avx512_key_cached: + cmp r11d, 256 + jl L_AES_XTS_decrypt_update_avx512_done_256 + and r11d, 4294967040 + vshufi64x2 zmm5, zmm8, zmm8, 0 + vpsrlvq zmm6, zmm5, zmm15 + vpclmulqdq zmm7, zmm6, zmm13, 1 + vpslldq zmm6, zmm6, 8 + vpsllvq zmm4, zmm5, zmm14 + vpternlogq zmm4, zmm7, zmm6, 150 + vpsrlq zmm9, zmm4, 60 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm5, zmm4, 4 + vpternlogq zmm5, zmm10, zmm9, 150 + vpsrlq zmm9, zmm5, 60 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm6, zmm5, 4 + vpternlogq zmm6, zmm10, zmm9, 150 + vpsrlq zmm9, zmm6, 60 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm7, zmm6, 4 + vpternlogq zmm7, zmm10, zmm9, 150 +L_AES_XTS_decrypt_update_avx512_dec_256: + ; 256 bytes of input + ; aes_dec_256 + lea rcx, QWORD PTR [rdi+r12] + lea rdx, QWORD PTR [rsi+r12] + vmovdqu64 zmm0, [rcx] + vmovdqu64 zmm1, [rcx+64] + vmovdqu64 zmm2, [rcx+128] + vmovdqu64 zmm3, [rcx+192] + ; aes_dec_block + vpternlogq zmm0, zmm16, zmm4, 150 + vpternlogq zmm1, zmm16, zmm5, 150 + vpternlogq zmm2, zmm16, zmm6, 150 + vpternlogq zmm3, zmm16, zmm7, 150 + vaesdec zmm0, zmm0, zmm17 + vaesdec zmm1, zmm1, zmm17 + vaesdec zmm2, zmm2, zmm17 + vaesdec zmm3, zmm3, zmm17 + vaesdec zmm0, zmm0, zmm18 + vaesdec zmm1, zmm1, zmm18 + vaesdec zmm2, zmm2, zmm18 + vaesdec zmm3, 
zmm3, zmm18 + vaesdec zmm0, zmm0, zmm19 + vaesdec zmm1, zmm1, zmm19 + vaesdec zmm2, zmm2, zmm19 + vaesdec zmm3, zmm3, zmm19 + vaesdec zmm0, zmm0, zmm20 + vaesdec zmm1, zmm1, zmm20 + vaesdec zmm2, zmm2, zmm20 + vaesdec zmm3, zmm3, zmm20 + vaesdec zmm0, zmm0, zmm21 + vaesdec zmm1, zmm1, zmm21 + vaesdec zmm2, zmm2, zmm21 + vaesdec zmm3, zmm3, zmm21 + vaesdec zmm0, zmm0, zmm22 + vaesdec zmm1, zmm1, zmm22 + vaesdec zmm2, zmm2, zmm22 + vaesdec zmm3, zmm3, zmm22 + vaesdec zmm0, zmm0, zmm23 + vaesdec zmm1, zmm1, zmm23 + vaesdec zmm2, zmm2, zmm23 + vaesdec zmm3, zmm3, zmm23 + vaesdec zmm0, zmm0, zmm24 + vaesdec zmm1, zmm1, zmm24 + vaesdec zmm2, zmm2, zmm24 + vaesdec zmm3, zmm3, zmm24 + vaesdec zmm0, zmm0, zmm25 + vaesdec zmm1, zmm1, zmm25 + vaesdec zmm2, zmm2, zmm25 + vaesdec zmm3, zmm3, zmm25 + cmp r9d, 11 + vmovdqa64 zmm9, zmm26 + jl L_AES_XTS_decrypt_update_avx512_aes_dec_256_aes_dec_block_last + vaesdec zmm0, zmm0, zmm26 + vaesdec zmm1, zmm1, zmm26 + vaesdec zmm2, zmm2, zmm26 + vaesdec zmm3, zmm3, zmm26 + vaesdec zmm0, zmm0, zmm27 + vaesdec zmm1, zmm1, zmm27 + vaesdec zmm2, zmm2, zmm27 + vaesdec zmm3, zmm3, zmm27 + cmp r9d, 13 + vmovdqa64 zmm9, zmm28 + jl L_AES_XTS_decrypt_update_avx512_aes_dec_256_aes_dec_block_last + vaesdec zmm0, zmm0, zmm28 + vaesdec zmm1, zmm1, zmm28 + vaesdec zmm2, zmm2, zmm28 + vaesdec zmm3, zmm3, zmm28 + vaesdec zmm0, zmm0, zmm29 + vaesdec zmm1, zmm1, zmm29 + vaesdec zmm2, zmm2, zmm29 + vaesdec zmm3, zmm3, zmm29 + vmovdqa64 zmm9, zmm30 +L_AES_XTS_decrypt_update_avx512_aes_dec_256_aes_dec_block_last: + vaesdeclast zmm0, zmm0, zmm9 + vaesdeclast zmm1, zmm1, zmm9 + vaesdeclast zmm2, zmm2, zmm9 + vaesdeclast zmm3, zmm3, zmm9 + vpxorq zmm0, zmm0, zmm4 + vmovdqu64 [rdx], zmm0 + vpsrlq zmm9, zmm4, 48 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm4, zmm4, 16 + vpternlogq zmm4, zmm10, zmm9, 150 + vpxorq zmm1, zmm1, zmm5 + vmovdqu64 [rdx+64], zmm1 + vpsrlq zmm9, zmm5, 48 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + 
vpsllq zmm5, zmm5, 16 + vpternlogq zmm5, zmm10, zmm9, 150 + vpxorq zmm2, zmm2, zmm6 + vmovdqu64 [rdx+128], zmm2 + vpsrlq zmm9, zmm6, 48 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm6, zmm6, 16 + vpternlogq zmm6, zmm10, zmm9, 150 + vpxorq zmm3, zmm3, zmm7 + vmovdqu64 [rdx+192], zmm3 + vpsrlq zmm9, zmm7, 48 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm7, zmm7, 16 + vpternlogq zmm7, zmm10, zmm9, 150 + add r12d, 256 + cmp r12d, r11d + jl L_AES_XTS_decrypt_update_avx512_dec_256 + vextracti32x4 xmm8, zmm4, 0 +L_AES_XTS_decrypt_update_avx512_done_256: + cmp r12d, eax + mov r11d, eax + je L_AES_XTS_decrypt_update_avx512_done_dec + and r11d, 4294967280 + cmp r11d, eax + je L_AES_XTS_decrypt_update_avx512_mul16_128 + sub r11d, 16 + sub r11d, r12d + cmp r11d, 16 + jl L_AES_XTS_decrypt_update_avx512_last_31_start + add r11d, r12d +L_AES_XTS_decrypt_update_avx512_mul16_128: + and r11d, 4294967168 + cmp r12d, r11d + je L_AES_XTS_decrypt_update_avx512_done_128 + ; 128 bytes of input + ; aes_dec_128 + lea rcx, QWORD PTR [rdi+r12] + lea rdx, QWORD PTR [rsi+r12] + vmovdqu64 zmm0, [rcx] + vmovdqu64 zmm1, [rcx+64] + vshufi64x2 zmm5, zmm8, zmm8, 0 + vpsrlvq zmm6, zmm5, zmm15 + vpclmulqdq zmm7, zmm6, zmm13, 1 + vpslldq zmm6, zmm6, 8 + vpsllvq zmm4, zmm5, zmm14 + vpternlogq zmm4, zmm7, zmm6, 150 + vpsrlq zmm9, zmm4, 60 + vpclmulqdq zmm10, zmm9, zmm13, 1 + vpslldq zmm9, zmm9, 8 + vpsllq zmm5, zmm4, 4 + vpternlogq zmm5, zmm10, zmm9, 150 + ; aes_dec_block + vpternlogq zmm0, zmm16, zmm4, 150 + vpternlogq zmm1, zmm16, zmm5, 150 + vaesdec zmm0, zmm0, zmm17 + vaesdec zmm1, zmm1, zmm17 + vaesdec zmm0, zmm0, zmm18 + vaesdec zmm1, zmm1, zmm18 + vaesdec zmm0, zmm0, zmm19 + vaesdec zmm1, zmm1, zmm19 + vaesdec zmm0, zmm0, zmm20 + vaesdec zmm1, zmm1, zmm20 + vaesdec zmm0, zmm0, zmm21 + vaesdec zmm1, zmm1, zmm21 + vaesdec zmm0, zmm0, zmm22 + vaesdec zmm1, zmm1, zmm22 + vaesdec zmm0, zmm0, zmm23 + vaesdec zmm1, zmm1, zmm23 + vaesdec zmm0, zmm0, zmm24 + 
vaesdec zmm1, zmm1, zmm24 + vaesdec zmm0, zmm0, zmm25 + vaesdec zmm1, zmm1, zmm25 + cmp r9d, 11 + vmovdqa64 zmm9, zmm26 + jl L_AES_XTS_decrypt_update_avx512_aes_dec_128_aes_dec_block_last + vaesdec zmm0, zmm0, zmm26 + vaesdec zmm1, zmm1, zmm26 + vaesdec zmm0, zmm0, zmm27 + vaesdec zmm1, zmm1, zmm27 + cmp r9d, 13 + vmovdqa64 zmm9, zmm28 + jl L_AES_XTS_decrypt_update_avx512_aes_dec_128_aes_dec_block_last + vaesdec zmm0, zmm0, zmm28 + vaesdec zmm1, zmm1, zmm28 + vaesdec zmm0, zmm0, zmm29 + vaesdec zmm1, zmm1, zmm29 + vmovdqa64 zmm9, zmm30 +L_AES_XTS_decrypt_update_avx512_aes_dec_128_aes_dec_block_last: + vaesdeclast zmm0, zmm0, zmm9 + vaesdeclast zmm1, zmm1, zmm9 + vpxorq zmm0, zmm0, zmm4 + vmovdqu64 [rdx], zmm0 + vpxorq zmm1, zmm1, zmm5 + vmovdqu64 [rdx+64], zmm1 + vextracti32x4 xmm8, zmm5, 3 + vpshufd xmm9, xmm8, 19 + vpaddq xmm8, xmm8, xmm8 + vpsrad xmm9, xmm9, 31 + vpternlogd xmm8, xmm9, xmm12, 120 + add r12d, 128 +L_AES_XTS_decrypt_update_avx512_done_128: + cmp r12d, eax + mov r11d, eax + je L_AES_XTS_decrypt_update_avx512_done_dec + and r11d, 4294967280 + cmp r11d, eax + je L_AES_XTS_decrypt_update_avx512_mul16_64 + sub r11d, 16 + sub r11d, r12d + cmp r11d, 16 + jl L_AES_XTS_decrypt_update_avx512_last_31_start + add r11d, r12d +L_AES_XTS_decrypt_update_avx512_mul16_64: + and r11d, 4294967232 + cmp r12d, r11d + je L_AES_XTS_decrypt_update_avx512_done_64 + ; 64 bytes of input + ; aes_dec_64 + lea rcx, QWORD PTR [rdi+r12] + lea rdx, QWORD PTR [rsi+r12] + vmovdqu64 zmm0, [rcx] + vshufi64x2 zmm5, zmm8, zmm8, 0 + vpsrlvq zmm6, zmm5, zmm15 + vpclmulqdq zmm7, zmm6, zmm13, 1 + vpslldq zmm6, zmm6, 8 + vpsllvq zmm4, zmm5, zmm14 + vpternlogq zmm4, zmm7, zmm6, 150 + ; aes_dec_block + vpternlogq zmm0, zmm16, zmm4, 150 + vaesdec zmm0, zmm0, zmm17 + vaesdec zmm0, zmm0, zmm18 + vaesdec zmm0, zmm0, zmm19 + vaesdec zmm0, zmm0, zmm20 + vaesdec zmm0, zmm0, zmm21 + vaesdec zmm0, zmm0, zmm22 + vaesdec zmm0, zmm0, zmm23 + vaesdec zmm0, zmm0, zmm24 + vaesdec zmm0, zmm0, zmm25 + cmp r9d, 
11 + vmovdqa64 zmm9, zmm26 + jl L_AES_XTS_decrypt_update_avx512_aes_dec_64_aes_dec_block_last + vaesdec zmm0, zmm0, zmm26 + vaesdec zmm0, zmm0, zmm27 + cmp r9d, 13 + vmovdqa64 zmm9, zmm28 + jl L_AES_XTS_decrypt_update_avx512_aes_dec_64_aes_dec_block_last + vaesdec zmm0, zmm0, zmm28 + vaesdec zmm0, zmm0, zmm29 + vmovdqa64 zmm9, zmm30 +L_AES_XTS_decrypt_update_avx512_aes_dec_64_aes_dec_block_last: + vaesdeclast zmm0, zmm0, zmm9 + vpxorq zmm0, zmm0, zmm4 + vmovdqu64 [rdx], zmm0 + vextracti32x4 xmm8, zmm4, 3 + vpshufd xmm9, xmm8, 19 + vpaddq xmm8, xmm8, xmm8 + vpsrad xmm9, xmm9, 31 + vpternlogd xmm8, xmm9, xmm12, 120 + add r12d, 64 +L_AES_XTS_decrypt_update_avx512_done_64: + cmp r12d, eax + mov r11d, eax + je L_AES_XTS_decrypt_update_avx512_done_dec + and r11d, 4294967280 + cmp r11d, eax + je L_AES_XTS_decrypt_update_avx512_mul16_32 + sub r11d, 16 + sub r11d, r12d + cmp r11d, 16 + jl L_AES_XTS_decrypt_update_avx512_last_31_start + add r11d, r12d +L_AES_XTS_decrypt_update_avx512_mul16_32: + and r11d, 4294967264 + cmp r12d, r11d + je L_AES_XTS_decrypt_update_avx512_done_32 + ; 32 bytes of input + ; aes_dec_32 + lea rcx, QWORD PTR [rdi+r12] + lea rdx, QWORD PTR [rsi+r12] + vmovdqu64 ymm0, [rcx] + vshufi64x2 zmm5, zmm8, zmm8, 0 + vpsrlvq zmm6, zmm5, zmm15 + vpclmulqdq zmm7, zmm6, zmm13, 1 + vpslldq zmm6, zmm6, 8 + vpsllvq zmm4, zmm5, zmm14 + vpternlogq zmm4, zmm7, zmm6, 150 + ; aes_dec_block + vpternlogq ymm0, ymm16, ymm4, 150 + vaesdec ymm0, ymm0, ymm17 + vaesdec ymm0, ymm0, ymm18 + vaesdec ymm0, ymm0, ymm19 + vaesdec ymm0, ymm0, ymm20 + vaesdec ymm0, ymm0, ymm21 + vaesdec ymm0, ymm0, ymm22 + vaesdec ymm0, ymm0, ymm23 + vaesdec ymm0, ymm0, ymm24 + vaesdec ymm0, ymm0, ymm25 + cmp r9d, 11 + vmovdqa64 ymm9, ymm26 + jl L_AES_XTS_decrypt_update_avx512_aes_dec_32_aes_dec_block_last + vaesdec ymm0, ymm0, ymm26 + vaesdec ymm0, ymm0, ymm27 + cmp r9d, 13 + vmovdqa64 ymm9, ymm28 + jl L_AES_XTS_decrypt_update_avx512_aes_dec_32_aes_dec_block_last + vaesdec ymm0, ymm0, ymm28 + vaesdec 
ymm0, ymm0, ymm29 + vmovdqa64 ymm9, ymm30 +L_AES_XTS_decrypt_update_avx512_aes_dec_32_aes_dec_block_last: + vaesdeclast ymm0, ymm0, ymm9 + vpxorq ymm0, ymm0, ymm4 + vmovdqu64 [rdx], ymm0 + vextracti32x4 xmm8, zmm4, 2 + add r12d, 32 +L_AES_XTS_decrypt_update_avx512_done_32: + cmp r12d, eax + mov r11d, eax + je L_AES_XTS_decrypt_update_avx512_done_dec + and r11d, 4294967280 + cmp r11d, eax + je L_AES_XTS_decrypt_update_avx512_mul16 + sub r11d, 16 + sub r11d, r12d + cmp r11d, 16 + jl L_AES_XTS_decrypt_update_avx512_last_31_start + add r11d, r12d +L_AES_XTS_decrypt_update_avx512_mul16: +L_AES_XTS_decrypt_update_avx512_dec_16: + ; 16 bytes of input + lea rcx, QWORD PTR [rdi+r12] + vmovdqu xmm0, OWORD PTR [rcx] + vpxor xmm0, xmm0, xmm8 + ; aes_dec_block + vpxor xmm0, xmm0, [r10] + vmovdqu xmm5, OWORD PTR [r10+16] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+32] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+48] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+64] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+80] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+96] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+112] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+128] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+144] + vaesdec xmm0, xmm0, xmm5 + cmp r9d, 11 + vmovdqu xmm5, OWORD PTR [r10+160] + jl L_AES_XTS_decrypt_update_avx512_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r10+176] + vaesdec xmm0, xmm0, xmm6 + cmp r9d, 13 + vmovdqu xmm5, OWORD PTR [r10+192] + jl L_AES_XTS_decrypt_update_avx512_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r10+208] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r10+224] +L_AES_XTS_decrypt_update_avx512_aes_dec_block_last: + vaesdeclast xmm0, xmm0, xmm5 + vpxor xmm0, xmm0, xmm8 + lea rcx, QWORD PTR [rsi+r12] + vmovdqu OWORD PTR [rcx], xmm0 + vpshufd xmm4, xmm8, 19 + vpaddq xmm8, xmm8, xmm8 + 
vpsrad xmm4, xmm4, 31 + vpternlogd xmm8, xmm4, xmm12, 120 + add r12d, 16 + cmp r12d, r11d + jl L_AES_XTS_decrypt_update_avx512_dec_16 + cmp r12d, eax + je L_AES_XTS_decrypt_update_avx512_done_dec +L_AES_XTS_decrypt_update_avx512_last_31_start: + vpshufd xmm4, xmm8, 19 + vpaddq xmm7, xmm8, xmm8 + vpsrad xmm4, xmm4, 31 + vpternlogd xmm7, xmm4, xmm12, 120 + lea rcx, QWORD PTR [rdi+r12] + vmovdqu xmm0, OWORD PTR [rcx] + vpxor xmm0, xmm0, xmm7 + ; aes_dec_block + vpxor xmm0, xmm0, [r10] + vmovdqu xmm5, OWORD PTR [r10+16] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+32] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+48] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+64] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+80] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+96] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+112] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+128] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+144] + vaesdec xmm0, xmm0, xmm5 + cmp r9d, 11 + vmovdqu xmm5, OWORD PTR [r10+160] + jl L_AES_XTS_decrypt_update_avx512_last_31_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r10+176] + vaesdec xmm0, xmm0, xmm6 + cmp r9d, 13 + vmovdqu xmm5, OWORD PTR [r10+192] + jl L_AES_XTS_decrypt_update_avx512_last_31_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r10+208] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r10+224] +L_AES_XTS_decrypt_update_avx512_last_31_aes_dec_block_last: + vaesdeclast xmm0, xmm0, xmm5 + vpxor xmm0, xmm0, xmm7 + vmovdqu OWORD PTR [rsp], xmm0 + add r12, 16 + xor rdx, rdx +L_AES_XTS_decrypt_update_avx512_last_31_byte_loop: + mov r11b, BYTE PTR [rsp+rdx] + mov cl, BYTE PTR [rdi+r12] + mov BYTE PTR [rsi+r12], r11b + mov BYTE PTR [rsp+rdx], cl + inc r12d + inc edx + cmp r12d, eax + jl L_AES_XTS_decrypt_update_avx512_last_31_byte_loop + sub r12, rdx + vmovdqu xmm0, OWORD PTR [rsp] + vpxor xmm0, 
xmm0, xmm8 + ; aes_dec_block + vpxor xmm0, xmm0, [r10] + vmovdqu xmm5, OWORD PTR [r10+16] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+32] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+48] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+64] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+80] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+96] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+112] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+128] + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm5, OWORD PTR [r10+144] + vaesdec xmm0, xmm0, xmm5 + cmp r9d, 11 + vmovdqu xmm5, OWORD PTR [r10+160] + jl L_AES_XTS_decrypt_update_avx512_last_31_2_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r10+176] + vaesdec xmm0, xmm0, xmm6 + cmp r9d, 13 + vmovdqu xmm5, OWORD PTR [r10+192] + jl L_AES_XTS_decrypt_update_avx512_last_31_2_aes_dec_block_last + vaesdec xmm0, xmm0, xmm5 + vmovdqu xmm6, OWORD PTR [r10+208] + vaesdec xmm0, xmm0, xmm6 + vmovdqu xmm5, OWORD PTR [r10+224] +L_AES_XTS_decrypt_update_avx512_last_31_2_aes_dec_block_last: + vaesdeclast xmm0, xmm0, xmm5 + vpxor xmm0, xmm0, xmm8 + sub r12, 16 + lea rcx, QWORD PTR [rsi+r12] + vmovdqu OWORD PTR [rcx], xmm0 +L_AES_XTS_decrypt_update_avx512_done_dec: + vmovdqu OWORD PTR [r8], xmm8 + vmovdqu xmm6, OWORD PTR [rsp+64] + vmovdqu xmm7, OWORD PTR [rsp+80] + vmovdqu xmm8, OWORD PTR [rsp+96] + vmovdqu xmm9, OWORD PTR [rsp+112] + vmovdqu xmm10, OWORD PTR [rsp+128] + vmovdqu xmm11, OWORD PTR [rsp+144] + vmovdqu xmm12, OWORD PTR [rsp+160] + vmovdqu xmm13, OWORD PTR [rsp+176] + vmovdqu xmm14, OWORD PTR [rsp+192] + vmovdqu xmm15, OWORD PTR [rsp+208] + add rsp, 224 + pop r12 + pop rsi + pop rdi + ret +AES_XTS_decrypt_update_avx512 ENDP +_TEXT ENDS +ENDIF END diff --git a/wolfcrypt/src/chacha_asm.S b/wolfcrypt/src/chacha_asm.S index 6109e22f603..ba8768bd9c2 100644 --- a/wolfcrypt/src/chacha_asm.S +++ b/wolfcrypt/src/chacha_asm.S @@ -46,6 +46,16 @@ 
#define HAVE_INTEL_AVX2 #endif /* HAVE_INTEL_AVX2 */ #endif /* NO_AVX2_SUPPORT */ +#ifndef NO_VAES_SUPPORT +#ifndef HAVE_INTEL_VAES +#define HAVE_INTEL_VAES +#endif /* HAVE_INTEL_VAES */ +#endif /* NO_VAES_SUPPORT */ +#ifndef NO_AVX512_SUPPORT +#ifndef HAVE_INTEL_AVX512 +#define HAVE_INTEL_AVX512 +#endif /* HAVE_INTEL_AVX512 */ +#endif /* NO_AVX512_SUPPORT */ #ifdef WOLFSSL_X86_64_BUILD #ifndef __APPLE__ diff --git a/wolfcrypt/src/cpuid.c b/wolfcrypt/src/cpuid.c index 2c3670234a6..8963abb49a8 100644 --- a/wolfcrypt/src/cpuid.c +++ b/wolfcrypt/src/cpuid.c @@ -130,6 +130,8 @@ if (cpuid_flag(1, 0, ECX, 22)) { new_cpuid_flags |= CPUID_MOVBE ; } if (cpuid_flag(7, 0, EBX, 3)) { new_cpuid_flags |= CPUID_BMI1 ; } if (cpuid_flag(7, 0, EBX, 29)) { new_cpuid_flags |= CPUID_SHA ; } + if (cpuid_flag(7, 0, ECX, 9)) { new_cpuid_flags |= CPUID_VAES ; } + if (cpuid_flag(7, 0, EBX, 16)) { new_cpuid_flags |= CPUID_AVX512; } (void)wolfSSL_Atomic_Uint_CompareExchange (&cpuid_flags, &old_cpuid_flags, new_cpuid_flags); } diff --git a/wolfcrypt/src/fe_x25519_asm.S b/wolfcrypt/src/fe_x25519_asm.S index 3f0e0dd6a89..7e976fa1f28 100644 --- a/wolfcrypt/src/fe_x25519_asm.S +++ b/wolfcrypt/src/fe_x25519_asm.S @@ -46,6 +46,16 @@ #define HAVE_INTEL_AVX2 #endif /* HAVE_INTEL_AVX2 */ #endif /* NO_AVX2_SUPPORT */ +#ifndef NO_VAES_SUPPORT +#ifndef HAVE_INTEL_VAES +#define HAVE_INTEL_VAES +#endif /* HAVE_INTEL_VAES */ +#endif /* NO_VAES_SUPPORT */ +#ifndef NO_AVX512_SUPPORT +#ifndef HAVE_INTEL_AVX512 +#define HAVE_INTEL_AVX512 +#endif /* HAVE_INTEL_AVX512 */ +#endif /* NO_AVX512_SUPPORT */ #ifndef __APPLE__ .text diff --git a/wolfcrypt/src/include.am b/wolfcrypt/src/include.am index 18d7a339cd5..908c43984cd 100644 --- a/wolfcrypt/src/include.am +++ b/wolfcrypt/src/include.am @@ -16,6 +16,7 @@ EXTRA_DIST += wolfcrypt/src/evp.c EXTRA_DIST += wolfcrypt/src/evp_pk.c EXTRA_DIST += wolfcrypt/src/asm.c EXTRA_DIST += wolfcrypt/src/aes_asm.asm +EXTRA_DIST += wolfcrypt/src/aes_x86_64_asm.asm EXTRA_DIST += 
wolfcrypt/src/aes_gcm_asm.asm EXTRA_DIST += wolfcrypt/src/aes_xts_asm.asm EXTRA_DIST += wolfcrypt/src/chacha_asm.asm diff --git a/wolfcrypt/src/poly1305_asm.S b/wolfcrypt/src/poly1305_asm.S index 7f73e87b67e..f55cce5a079 100644 --- a/wolfcrypt/src/poly1305_asm.S +++ b/wolfcrypt/src/poly1305_asm.S @@ -46,6 +46,16 @@ #define HAVE_INTEL_AVX2 #endif /* HAVE_INTEL_AVX2 */ #endif /* NO_AVX2_SUPPORT */ +#ifndef NO_VAES_SUPPORT +#ifndef HAVE_INTEL_VAES +#define HAVE_INTEL_VAES +#endif /* HAVE_INTEL_VAES */ +#endif /* NO_VAES_SUPPORT */ +#ifndef NO_AVX512_SUPPORT +#ifndef HAVE_INTEL_AVX512 +#define HAVE_INTEL_AVX512 +#endif /* HAVE_INTEL_AVX512 */ +#endif /* NO_AVX512_SUPPORT */ #ifdef WOLFSSL_X86_64_BUILD #ifdef HAVE_INTEL_AVX1 diff --git a/wolfcrypt/src/sha256_asm.S b/wolfcrypt/src/sha256_asm.S index a407b7de1f5..d91a82aff94 100644 --- a/wolfcrypt/src/sha256_asm.S +++ b/wolfcrypt/src/sha256_asm.S @@ -46,6 +46,16 @@ #define HAVE_INTEL_AVX2 #endif /* HAVE_INTEL_AVX2 */ #endif /* NO_AVX2_SUPPORT */ +#ifndef NO_VAES_SUPPORT +#ifndef HAVE_INTEL_VAES +#define HAVE_INTEL_VAES +#endif /* HAVE_INTEL_VAES */ +#endif /* NO_VAES_SUPPORT */ +#ifndef NO_AVX512_SUPPORT +#ifndef HAVE_INTEL_AVX512 +#define HAVE_INTEL_AVX512 +#endif /* HAVE_INTEL_AVX512 */ +#endif /* NO_AVX512_SUPPORT */ #ifdef WOLFSSL_X86_64_BUILD #ifndef __APPLE__ diff --git a/wolfcrypt/src/sha3_asm.S b/wolfcrypt/src/sha3_asm.S index 6abc9d851b1..810a1c67433 100644 --- a/wolfcrypt/src/sha3_asm.S +++ b/wolfcrypt/src/sha3_asm.S @@ -46,6 +46,16 @@ #define HAVE_INTEL_AVX2 #endif /* HAVE_INTEL_AVX2 */ #endif /* NO_AVX2_SUPPORT */ +#ifndef NO_VAES_SUPPORT +#ifndef HAVE_INTEL_VAES +#define HAVE_INTEL_VAES +#endif /* HAVE_INTEL_VAES */ +#endif /* NO_VAES_SUPPORT */ +#ifndef NO_AVX512_SUPPORT +#ifndef HAVE_INTEL_AVX512 +#define HAVE_INTEL_AVX512 +#endif /* HAVE_INTEL_AVX512 */ +#endif /* NO_AVX512_SUPPORT */ #ifndef __APPLE__ .data diff --git a/wolfcrypt/src/sha512_asm.S b/wolfcrypt/src/sha512_asm.S index d0ca1dd4fd4..b3c377deabc 
100644 --- a/wolfcrypt/src/sha512_asm.S +++ b/wolfcrypt/src/sha512_asm.S @@ -46,6 +46,16 @@ #define HAVE_INTEL_AVX2 #endif /* HAVE_INTEL_AVX2 */ #endif /* NO_AVX2_SUPPORT */ +#ifndef NO_VAES_SUPPORT +#ifndef HAVE_INTEL_VAES +#define HAVE_INTEL_VAES +#endif /* HAVE_INTEL_VAES */ +#endif /* NO_VAES_SUPPORT */ +#ifndef NO_AVX512_SUPPORT +#ifndef HAVE_INTEL_AVX512 +#define HAVE_INTEL_AVX512 +#endif /* HAVE_INTEL_AVX512 */ +#endif /* NO_AVX512_SUPPORT */ #ifdef HAVE_INTEL_AVX1 #ifndef __APPLE__ diff --git a/wolfcrypt/src/wc_mldsa_asm.S b/wolfcrypt/src/wc_mldsa_asm.S index 717986e4a5c..e1e77a93783 100644 --- a/wolfcrypt/src/wc_mldsa_asm.S +++ b/wolfcrypt/src/wc_mldsa_asm.S @@ -46,6 +46,16 @@ #define HAVE_INTEL_AVX2 #endif /* HAVE_INTEL_AVX2 */ #endif /* NO_AVX2_SUPPORT */ +#ifndef NO_VAES_SUPPORT +#ifndef HAVE_INTEL_VAES +#define HAVE_INTEL_VAES +#endif /* HAVE_INTEL_VAES */ +#endif /* NO_VAES_SUPPORT */ +#ifndef NO_AVX512_SUPPORT +#ifndef HAVE_INTEL_AVX512 +#define HAVE_INTEL_AVX512 +#endif /* HAVE_INTEL_AVX512 */ +#endif /* NO_AVX512_SUPPORT */ #ifdef WOLFSSL_HAVE_MLDSA #ifdef HAVE_INTEL_AVX2 diff --git a/wolfcrypt/src/wc_mlkem_asm.S b/wolfcrypt/src/wc_mlkem_asm.S index 9b80cf8d432..b399218dfdd 100644 --- a/wolfcrypt/src/wc_mlkem_asm.S +++ b/wolfcrypt/src/wc_mlkem_asm.S @@ -46,6 +46,16 @@ #define HAVE_INTEL_AVX2 #endif /* HAVE_INTEL_AVX2 */ #endif /* NO_AVX2_SUPPORT */ +#ifndef NO_VAES_SUPPORT +#ifndef HAVE_INTEL_VAES +#define HAVE_INTEL_VAES +#endif /* HAVE_INTEL_VAES */ +#endif /* NO_VAES_SUPPORT */ +#ifndef NO_AVX512_SUPPORT +#ifndef HAVE_INTEL_AVX512 +#define HAVE_INTEL_AVX512 +#endif /* HAVE_INTEL_AVX512 */ +#endif /* NO_AVX512_SUPPORT */ #ifdef WOLFSSL_HAVE_MLKEM #ifdef HAVE_INTEL_AVX2 diff --git a/wolfssl-VS2022.vcxproj b/wolfssl-VS2022.vcxproj index a4b8d39b196..81d32758e91 100644 --- a/wolfssl-VS2022.vcxproj +++ b/wolfssl-VS2022.vcxproj @@ -1,577 +1,591 @@ - - - - - Debug - Win32 - - - Debug - x64 - - - Debug - ARM64 - - - DLL Debug - Win32 - - - DLL Debug - 
x64 - - - DLL Debug - ARM64 - - - DLL Release - Win32 - - - DLL Release - x64 - - - DLL Release - ARM64 - - - Release - Win32 - - - Release - x64 - - - Release - ARM64 - - - - {12226DBE-7278-4DFA-A119-5A0294CF0B33} - wolfssl - Win32Proj - wolfssl - - - - StaticLibrary - v143 - Unicode - true - - - DynamicLibrary - v143 - Unicode - true - - - StaticLibrary - v143 - Unicode - true - - - DynamicLibrary - v143 - Unicode - true - - - StaticLibrary - v143 - Unicode - true - - - DynamicLibrary - v143 - Unicode - true - - - StaticLibrary - v143 - Unicode - - - DynamicLibrary - v143 - Unicode - - - StaticLibrary - v143 - Unicode - - - DynamicLibrary - v143 - Unicode - - - StaticLibrary - v143 - Unicode - - - DynamicLibrary - v143 - Unicode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - $(SolutionDir)$(Configuration)\$(Platform)\ - $(Configuration)\$(Platform)\$(ProjectName)_obj\ - - - $(SolutionDir)$(Configuration)\$(Platform)\ - $(Configuration)\$(Platform)\$(ProjectName)_obj\ - - - $(SolutionDir)$(Configuration)\$(Platform)\ - $(Configuration)\$(Platform)\$(ProjectName)_obj\ - - - $(SolutionDir)$(Configuration)\$(Platform)\ - $(Configuration)\$(Platform)\$(ProjectName)_obj\ - - - $(SolutionDir)$(Configuration)\$(Platform)\ - $(Configuration)\$(Platform)\$(ProjectName)_obj\ - - - $(SolutionDir)$(Configuration)\$(Platform)\ - $(Configuration)\$(Platform)\$(ProjectName)_obj\ - - - $(SolutionDir)$(Configuration)\$(Platform)\ - $(Configuration)\$(Platform)\$(ProjectName)_obj\ - - - $(SolutionDir)$(Configuration)\$(Platform)\ - $(Configuration)\$(Platform)\$(ProjectName)_obj\ - - - $(SolutionDir)$(Configuration)\$(Platform)\ - $(Configuration)\$(Platform)\$(ProjectName)_obj\ - - - $(SolutionDir)$(Configuration)\$(Platform)\ - $(Configuration)\$(Platform)\$(ProjectName)_obj\ - - - $(SolutionDir)$(Configuration)\$(Platform)\ - $(Configuration)\$(Platform)\$(ProjectName)_obj\ - - - $(SolutionDir)$(Configuration)\$(Platform)\ - 
$(Configuration)\$(Platform)\$(ProjectName)_obj\ - - - - Disabled - ./;./IDE/WIN;%(AdditionalIncludeDirectories) - WOLFSSL_LIB;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) - true - EnableFastChecks - MultiThreadedDebugDLL - - Level4 - EditAndContinue - 4206;4214;4706;%(DisableSpecificWarnings) - - - - - Disabled - ./;./IDE/WIN;%(AdditionalIncludeDirectories) - WOLFSSL_LIB;BUILDING_WOLFSSL;WOLFSSL_DLL;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) - true - EnableFastChecks - MultiThreadedDebugDLL - - - Level4 - EditAndContinue - 4206;4214;4706;%(DisableSpecificWarnings) - - - ws2_32.lib;%(AdditionalDependencies) - false - true - false - - - - - Disabled - ./;./IDE/WIN;%(AdditionalIncludeDirectories) - WOLFSSL_LIB;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) - EnableFastChecks - MultiThreadedDebugDLL - - - Level4 - ProgramDatabase - 4206;4214;4706;%(DisableSpecificWarnings) - - - - - Disabled - ./;./IDE/WIN;%(AdditionalIncludeDirectories) - WOLFSSL_LIB;BUILDING_WOLFSSL;WOLFSSL_DLL;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) - EnableFastChecks - MultiThreadedDebugDLL - - - Level4 - ProgramDatabase - 4206;4214;4706;%(DisableSpecificWarnings) - - - ws2_32.lib;%(AdditionalDependencies) - false - true - - - - - Disabled - ./;./IDE/WIN;%(AdditionalIncludeDirectories) - WOLFSSL_LIB;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) - EnableFastChecks - MultiThreadedDebugDLL - - - Level4 - ProgramDatabase - 4206;4214;4706;%(DisableSpecificWarnings) - - - - - Disabled - ./;./IDE/WIN;%(AdditionalIncludeDirectories) - WOLFSSL_LIB;BUILDING_WOLFSSL;WOLFSSL_DLL;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) - EnableFastChecks - MultiThreadedDebugDLL - - - Level4 - ProgramDatabase - 4206;4214;4706;%(DisableSpecificWarnings) - - - ws2_32.lib;%(AdditionalDependencies) - false - true - - - - - MaxSpeed - true - ./;./IDE/WIN;%(AdditionalIncludeDirectories) - WOLFSSL_LIB;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) - MultiThreadedDLL - true - - Level3 - 
ProgramDatabase - - - - - MaxSpeed - true - ./;./IDE/WIN;%(AdditionalIncludeDirectories) - WOLFSSL_LIB;BUILDING_WOLFSSL;WOLFSSL_DLL;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) - MultiThreadedDLL - true - - - Level3 - ProgramDatabase - - - ws2_32.lib;%(AdditionalDependencies) - true - - - - - MaxSpeed - true - ./;./IDE/WIN;%(AdditionalIncludeDirectories) - WOLFSSL_LIB;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) - MultiThreadedDLL - true - - - Level3 - ProgramDatabase - - - - - MaxSpeed - true - ./;./IDE/WIN;%(AdditionalIncludeDirectories) - WOLFSSL_LIB;BUILDING_WOLFSSL;WOLFSSL_DLL;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) - MultiThreadedDLL - true - - - Level3 - ProgramDatabase - - - ws2_32.lib;%(AdditionalDependencies) - true - - - - - MaxSpeed - true - ./;./IDE/WIN;%(AdditionalIncludeDirectories) - WOLFSSL_LIB;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) - MultiThreadedDLL - true - - - Level3 - ProgramDatabase - - - - - MaxSpeed - true - ./;./IDE/WIN;%(AdditionalIncludeDirectories) - WOLFSSL_LIB;BUILDING_WOLFSSL;WOLFSSL_DLL;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) - MultiThreadedDLL - true - - - Level3 - ProgramDatabase - - - ws2_32.lib;%(AdditionalDependencies) - true - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - false - false - ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) - ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) - $(OutDir)%(Filename).obj - $(IntDir)%(Filename).obj - false - false - ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) - ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) - $(OutDir)%(Filename).obj - $(IntDir)%(Filename).obj - - - false - false - ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) - ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) - $(OutDir)%(Filename).obj - $(IntDir)%(Filename).obj - false - false - ml64.exe /c /Zi 
/Fo"$(OutDir)%(Filename).obj" %(Identity) - ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) - $(OutDir)%(Filename).obj - $(IntDir)%(Filename).obj - - - false - false - ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) - ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) - $(OutDir)%(Filename).obj - $(IntDir)%(Filename).obj - false - false - ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) - ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) - $(OutDir)%(Filename).obj - $(IntDir)%(Filename).obj - - - false - false - ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) - ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) - $(OutDir)%(Filename).obj - $(IntDir)%(Filename).obj - false - false - ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) - ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) - $(OutDir)%(Filename).obj - $(IntDir)%(Filename).obj - - - false - false - ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) - ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) - $(OutDir)%(Filename).obj - $(IntDir)%(Filename).obj - false - false - ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) - ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) - $(OutDir)%(Filename).obj - $(IntDir)%(Filename).obj - - - false - false - ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) - ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) - $(OutDir)%(Filename).obj - $(IntDir)%(Filename).obj - false - false - ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) - ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) - $(OutDir)%(Filename).obj - $(IntDir)%(Filename).obj - - - - - - true - true - true - true - true - true - - - - - - + + + + + Debug + Win32 + + + Debug + x64 + + + Debug + ARM64 + + + DLL Debug + Win32 + + + DLL Debug + x64 + + + DLL Debug + ARM64 + + + DLL Release + Win32 + + + DLL Release + x64 + + + DLL Release + ARM64 + + + Release + Win32 + + + Release 
+ x64 + + + Release + ARM64 + + + + {12226DBE-7278-4DFA-A119-5A0294CF0B33} + wolfssl + Win32Proj + wolfssl + + + + StaticLibrary + v143 + Unicode + true + + + DynamicLibrary + v143 + Unicode + true + + + StaticLibrary + v143 + Unicode + true + + + DynamicLibrary + v143 + Unicode + true + + + StaticLibrary + v143 + Unicode + true + + + DynamicLibrary + v143 + Unicode + true + + + StaticLibrary + v143 + Unicode + + + DynamicLibrary + v143 + Unicode + + + StaticLibrary + v143 + Unicode + + + DynamicLibrary + v143 + Unicode + + + StaticLibrary + v143 + Unicode + + + DynamicLibrary + v143 + Unicode + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + $(SolutionDir)$(Configuration)\$(Platform)\ + $(Configuration)\$(Platform)\$(ProjectName)_obj\ + + + $(SolutionDir)$(Configuration)\$(Platform)\ + $(Configuration)\$(Platform)\$(ProjectName)_obj\ + + + $(SolutionDir)$(Configuration)\$(Platform)\ + $(Configuration)\$(Platform)\$(ProjectName)_obj\ + + + $(SolutionDir)$(Configuration)\$(Platform)\ + $(Configuration)\$(Platform)\$(ProjectName)_obj\ + + + $(SolutionDir)$(Configuration)\$(Platform)\ + $(Configuration)\$(Platform)\$(ProjectName)_obj\ + + + $(SolutionDir)$(Configuration)\$(Platform)\ + $(Configuration)\$(Platform)\$(ProjectName)_obj\ + + + $(SolutionDir)$(Configuration)\$(Platform)\ + $(Configuration)\$(Platform)\$(ProjectName)_obj\ + + + $(SolutionDir)$(Configuration)\$(Platform)\ + $(Configuration)\$(Platform)\$(ProjectName)_obj\ + + + $(SolutionDir)$(Configuration)\$(Platform)\ + $(Configuration)\$(Platform)\$(ProjectName)_obj\ + + + $(SolutionDir)$(Configuration)\$(Platform)\ + $(Configuration)\$(Platform)\$(ProjectName)_obj\ + + + $(SolutionDir)$(Configuration)\$(Platform)\ + $(Configuration)\$(Platform)\$(ProjectName)_obj\ + + + $(SolutionDir)$(Configuration)\$(Platform)\ + $(Configuration)\$(Platform)\$(ProjectName)_obj\ + + + + Disabled + ./;./IDE/WIN;%(AdditionalIncludeDirectories) + WOLFSSL_LIB;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) + 
true + EnableFastChecks + MultiThreadedDebugDLL + + Level4 + EditAndContinue + 4206;4214;4706;%(DisableSpecificWarnings) + + + + + Disabled + ./;./IDE/WIN;%(AdditionalIncludeDirectories) + WOLFSSL_LIB;BUILDING_WOLFSSL;WOLFSSL_DLL;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebugDLL + + + Level4 + EditAndContinue + 4206;4214;4706;%(DisableSpecificWarnings) + + + ws2_32.lib;%(AdditionalDependencies) + false + true + false + + + + + Disabled + ./;./IDE/WIN;%(AdditionalIncludeDirectories) + WOLFSSL_LIB;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) + EnableFastChecks + MultiThreadedDebugDLL + + + Level4 + ProgramDatabase + 4206;4214;4706;%(DisableSpecificWarnings) + + + + + Disabled + ./;./IDE/WIN;%(AdditionalIncludeDirectories) + WOLFSSL_LIB;BUILDING_WOLFSSL;WOLFSSL_DLL;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) + EnableFastChecks + MultiThreadedDebugDLL + + + Level4 + ProgramDatabase + 4206;4214;4706;%(DisableSpecificWarnings) + + + ws2_32.lib;%(AdditionalDependencies) + false + true + + + + + Disabled + ./;./IDE/WIN;%(AdditionalIncludeDirectories) + WOLFSSL_LIB;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) + EnableFastChecks + MultiThreadedDebugDLL + + + Level4 + ProgramDatabase + 4206;4214;4706;%(DisableSpecificWarnings) + + + + + Disabled + ./;./IDE/WIN;%(AdditionalIncludeDirectories) + WOLFSSL_LIB;BUILDING_WOLFSSL;WOLFSSL_DLL;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) + EnableFastChecks + MultiThreadedDebugDLL + + + Level4 + ProgramDatabase + 4206;4214;4706;%(DisableSpecificWarnings) + + + ws2_32.lib;%(AdditionalDependencies) + false + true + + + + + MaxSpeed + true + ./;./IDE/WIN;%(AdditionalIncludeDirectories) + WOLFSSL_LIB;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) + MultiThreadedDLL + true + + Level3 + ProgramDatabase + + + + + MaxSpeed + true + ./;./IDE/WIN;%(AdditionalIncludeDirectories) + WOLFSSL_LIB;BUILDING_WOLFSSL;WOLFSSL_DLL;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) + 
MultiThreadedDLL + true + + + Level3 + ProgramDatabase + + + ws2_32.lib;%(AdditionalDependencies) + true + + + + + MaxSpeed + true + ./;./IDE/WIN;%(AdditionalIncludeDirectories) + WOLFSSL_LIB;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) + MultiThreadedDLL + true + + + Level3 + ProgramDatabase + + + + + MaxSpeed + true + ./;./IDE/WIN;%(AdditionalIncludeDirectories) + WOLFSSL_LIB;BUILDING_WOLFSSL;WOLFSSL_DLL;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) + MultiThreadedDLL + true + + + Level3 + ProgramDatabase + + + ws2_32.lib;%(AdditionalDependencies) + true + + + + + MaxSpeed + true + ./;./IDE/WIN;%(AdditionalIncludeDirectories) + WOLFSSL_LIB;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) + MultiThreadedDLL + true + + + Level3 + ProgramDatabase + + + + + MaxSpeed + true + ./;./IDE/WIN;%(AdditionalIncludeDirectories) + WOLFSSL_LIB;BUILDING_WOLFSSL;WOLFSSL_DLL;WOLFSSL_USER_SETTINGS;%(PreprocessorDefinitions) + MultiThreadedDLL + true + + + Level3 + ProgramDatabase + + + ws2_32.lib;%(AdditionalDependencies) + true + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + + + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + + + false + false + ml64.exe /c 
/Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + + + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + + + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + + + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + + + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + + + + + + true + true + true + true + true + true + + + + + + diff --git a/wolfssl.vcxproj b/wolfssl.vcxproj 
index c38bc90b99d..44c23ab74ee 100644 --- a/wolfssl.vcxproj +++ b/wolfssl.vcxproj @@ -489,6 +489,20 @@ $(OutDir)%(Filename).obj $(IntDir)%(Filename).obj + + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + false false diff --git a/wolfssl/wolfcrypt/cpuid.h b/wolfssl/wolfcrypt/cpuid.h index bb7e68436b4..aada8801191 100644 --- a/wolfssl/wolfcrypt/cpuid.h +++ b/wolfssl/wolfcrypt/cpuid.h @@ -67,6 +67,8 @@ typedef word32 cpuid_flags_t; #define CPUID_MOVBE 0x0080 /* Move and byte swap */ #define CPUID_BMI1 0x0100 /* ANDN */ #define CPUID_SHA 0x0200 /* SHA-1 and SHA-256 instructions */ + #define CPUID_VAES 0x0400 + #define CPUID_AVX512 0x0800 #define IS_INTEL_AVX1(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_AVX1) #define IS_INTEL_AVX2(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_AVX2) @@ -78,6 +80,8 @@ typedef word32 cpuid_flags_t; #define IS_INTEL_MOVBE(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_MOVBE) #define IS_INTEL_BMI1(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_BMI1) #define IS_INTEL_SHA(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_SHA) + #define IS_INTEL_VAES(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_VAES) + #define IS_INTEL_AVX512(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_AVX512) #elif defined(HAVE_CPUID_AARCH64) diff --git a/wrapper/CSharp/wolfssl.vcxproj b/wrapper/CSharp/wolfssl.vcxproj index 66694f76438..7a963cbd913 100644 --- a/wrapper/CSharp/wolfssl.vcxproj +++ b/wrapper/CSharp/wolfssl.vcxproj @@ -371,6 +371,20 @@ $(OutDir)%(Filename).obj $(IntDir)%(Filename).obj + + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + 
$(IntDir)%(Filename).obj + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + false false