diff --git a/tests/api/test_tls.c b/tests/api/test_tls.c index d7380f7fd7c..8c7882a7bef 100644 --- a/tests/api/test_tls.c +++ b/tests/api/test_tls.c @@ -1829,7 +1829,9 @@ int test_tls12_corrupted_finished(void) } else { ExpectIntGE(finishedSz, finishedLen); - XMEMCPY(finishedMsg, test_ctx.s_buff + finishedOffInMsg, finishedLen); + if (EXPECT_SUCCESS()) { + XMEMCPY(finishedMsg, test_ctx.s_buff + finishedOffInMsg, finishedLen); + } finishedSz = finishedLen; ExpectIntEQ(test_memio_modify_message_len(&test_ctx, 0, finishedMsgPos, finishedOffInMsg), 0); diff --git a/wolfcrypt/src/chacha.c b/wolfcrypt/src/chacha.c index 49c5dcefbeb..55456345136 100644 --- a/wolfcrypt/src/chacha.c +++ b/wolfcrypt/src/chacha.c @@ -109,13 +109,38 @@ Public domain. static cpuid_flags_t cpuidFlags = WC_CPUID_INITIALIZER; #endif +/* The aarch64 ChaCha assembly is NEON-only. When NEON might be absent, also + * build the C implementation: dispatch on ASIMD at runtime when NEON is + * compiled in, or use only the C path when NEON is disabled at build time. */ +#if defined(USE_ARM_CHACHA_SPEEDUP) && defined(__aarch64__) + #ifdef WOLFSSL_ARMASM_NO_NEON + #define WOLFSSL_ARM_CHACHA_C_ONLY + #else + #define WOLFSSL_ARM_CHACHA_NEON_FALLBACK + #endif +#endif +#if defined(WOLFSSL_ARM_CHACHA_NEON_FALLBACK) || \ + defined(WOLFSSL_ARM_CHACHA_C_ONLY) + #define WOLFSSL_ARM_CHACHA_NEED_C +#endif + +#ifdef WOLFSSL_ARM_CHACHA_NEON_FALLBACK + static cpuid_flags_t chacha_cpuid_flags = WC_CPUID_INITIALIZER; + /* Return non-zero when NEON/ASIMD is present and the asm path should run. */ + static WC_INLINE int chacha_use_neon(void) + { + cpuid_get_flags_ex(&chacha_cpuid_flags); + return IS_AARCH64_ASIMD(chacha_cpuid_flags); + } +#endif + /** * Set up iv(nonce). Earlier versions used 64 bits instead of 96, this version * uses the typical AEAD 96 bit nonce and can do record sizes of 256 GB. 
*/ int wc_Chacha_SetIV(ChaCha* ctx, const byte* inIv, word32 counter) { -#if !defined(USE_ARM_CHACHA_SPEEDUP) +#if !defined(USE_ARM_CHACHA_SPEEDUP) || defined(WOLFSSL_ARM_CHACHA_NEED_C) word32 temp[CHACHA_IV_WORDS];/* used for alignment of memory */ #endif @@ -124,24 +149,31 @@ int wc_Chacha_SetIV(ChaCha* ctx, const byte* inIv, word32 counter) ctx->left = 0; /* resets state */ -#if !defined(USE_ARM_CHACHA_SPEEDUP) - XMEMCPY(temp, inIv, CHACHA_IV_BYTES); - /* block counter */ - ctx->X[CHACHA_MATRIX_CNT_IV+0] = counter; - /* fixed variable from nonce */ - ctx->X[CHACHA_MATRIX_CNT_IV+1] = LITTLE32(temp[0]); - /* counter from nonce */ - ctx->X[CHACHA_MATRIX_CNT_IV+2] = LITTLE32(temp[1]); - /* counter from nonce */ - ctx->X[CHACHA_MATRIX_CNT_IV+3] = LITTLE32(temp[2]); -#else +#ifdef WOLFSSL_ARM_CHACHA_NEON_FALLBACK + if (chacha_use_neon()) + wc_chacha_setiv(ctx->X, inIv, counter); + else +#elif defined(USE_ARM_CHACHA_SPEEDUP) && !defined(WOLFSSL_ARM_CHACHA_C_ONLY) wc_chacha_setiv(ctx->X, inIv, counter); #endif +#if !defined(USE_ARM_CHACHA_SPEEDUP) || defined(WOLFSSL_ARM_CHACHA_NEED_C) + { + XMEMCPY(temp, inIv, CHACHA_IV_BYTES); + /* block counter */ + ctx->X[CHACHA_MATRIX_CNT_IV+0] = counter; + /* fixed variable from nonce */ + ctx->X[CHACHA_MATRIX_CNT_IV+1] = LITTLE32(temp[0]); + /* counter from nonce */ + ctx->X[CHACHA_MATRIX_CNT_IV+2] = LITTLE32(temp[1]); + /* counter from nonce */ + ctx->X[CHACHA_MATRIX_CNT_IV+3] = LITTLE32(temp[2]); + } +#endif return 0; } -#if !defined(USE_ARM_CHACHA_SPEEDUP) +#if !defined(USE_ARM_CHACHA_SPEEDUP) || defined(WOLFSSL_ARM_CHACHA_NEED_C) /* "expand 32-byte k" as unsigned 32 byte */ static const word32 sigma[4] = {0x61707865, 0x3320646e, 0x79622d32, 0x6b206574}; /* "expand 16-byte k" as unsigned 16 byte */ @@ -153,7 +185,7 @@ static const word32 tau[4] = {0x61707865, 0x3120646e, 0x79622d36, 0x6b206574}; */ int wc_Chacha_SetKey(ChaCha* ctx, const byte* key, word32 keySz) { -#if !defined(USE_ARM_CHACHA_SPEEDUP) +#if 
!defined(USE_ARM_CHACHA_SPEEDUP) || defined(WOLFSSL_ARM_CHACHA_NEED_C) const word32* constants; const byte* k; #ifdef XSTREAM_ALIGN @@ -167,7 +199,15 @@ int wc_Chacha_SetKey(ChaCha* ctx, const byte* key, word32 keySz) if (keySz != (CHACHA_MAX_KEY_SZ/2) && keySz != CHACHA_MAX_KEY_SZ) return BAD_FUNC_ARG; -#if !defined(USE_ARM_CHACHA_SPEEDUP) +#ifdef WOLFSSL_ARM_CHACHA_NEON_FALLBACK + if (chacha_use_neon()) + wc_chacha_setkey(ctx->X, key, keySz); + else +#elif defined(USE_ARM_CHACHA_SPEEDUP) && !defined(WOLFSSL_ARM_CHACHA_C_ONLY) + wc_chacha_setkey(ctx->X, key, keySz); +#endif +#if !defined(USE_ARM_CHACHA_SPEEDUP) || defined(WOLFSSL_ARM_CHACHA_NEED_C) + { #ifdef XSTREAM_ALIGN if ((wc_ptr_t)key % 4) { WOLFSSL_MSG("wc_ChachaSetKey unaligned key"); @@ -211,8 +251,7 @@ int wc_Chacha_SetKey(ChaCha* ctx, const byte* key, word32 keySz) ctx->X[ 1] = constants[1]; ctx->X[ 2] = constants[2]; ctx->X[ 3] = constants[3]; -#else - wc_chacha_setkey(ctx->X, key, keySz); + } #endif ctx->left = 0; /* resets state */ @@ -220,7 +259,8 @@ int wc_Chacha_SetKey(ChaCha* ctx, const byte* key, word32 keySz) return 0; } -#if !defined(USE_INTEL_CHACHA_SPEEDUP) && !defined(USE_ARM_CHACHA_SPEEDUP) +#if (!defined(USE_INTEL_CHACHA_SPEEDUP) && !defined(USE_ARM_CHACHA_SPEEDUP)) || \ + defined(WOLFSSL_ARM_CHACHA_NEED_C) /** * Converts word into bytes with rotations having been done. */ @@ -267,7 +307,8 @@ extern void chacha_encrypt_avx2(ChaCha* ctx, const byte* m, byte* c, #endif -#if !defined(USE_INTEL_CHACHA_SPEEDUP) && !defined(USE_ARM_CHACHA_SPEEDUP) +#if (!defined(USE_INTEL_CHACHA_SPEEDUP) && !defined(USE_ARM_CHACHA_SPEEDUP)) || \ + defined(WOLFSSL_ARM_CHACHA_NEED_C) /** * Encrypt a stream of bytes */ @@ -366,23 +407,39 @@ int wc_Chacha_Process(ChaCha* ctx, byte* output, const byte* input, return 0; } #elif defined(USE_ARM_CHACHA_SPEEDUP) - /* Handle left over bytes from last block. 
*/ - if ((msglen > 0) && (ctx->left > 0)) { - byte* over = ((byte*)ctx->over) + CHACHA_CHUNK_BYTES - ctx->left; - word32 l = min(msglen, ctx->left); - - wc_chacha_use_over(over, output, input, l); +#ifdef WOLFSSL_ARM_CHACHA_NEON_FALLBACK + if (chacha_use_neon()) +#endif +#ifndef WOLFSSL_ARM_CHACHA_C_ONLY + { + /* Handle left over bytes from last block. */ + if ((msglen > 0) && (ctx->left > 0)) { + byte* over = ((byte*)ctx->over) + CHACHA_CHUNK_BYTES - ctx->left; + word32 l = min(msglen, ctx->left); + + wc_chacha_use_over(over, output, input, l); + + ctx->left -= l; + input += l; + output += l; + msglen -= l; + } - ctx->left -= l; - input += l; - output += l; - msglen -= l; + if (msglen != 0) { + wc_chacha_crypt_bytes(ctx, output, input, msglen); + } + return 0; } - - if (msglen != 0) { - wc_chacha_crypt_bytes(ctx, output, input, msglen); +#endif +#ifdef WOLFSSL_ARM_CHACHA_NEED_C +#ifdef WOLFSSL_ARM_CHACHA_NEON_FALLBACK + else +#endif + { + wc_Chacha_encrypt_bytes(ctx, input, output, msglen); + return 0; } - return 0; +#endif #else wc_Chacha_encrypt_bytes(ctx, input, output, msglen); return 0; diff --git a/wolfcrypt/src/cpuid.c b/wolfcrypt/src/cpuid.c index 2c3670234a6..28323457fd3 100644 --- a/wolfcrypt/src/cpuid.c +++ b/wolfcrypt/src/cpuid.c @@ -144,6 +144,7 @@ #define CPUID_AARCH64_FEAT_SHA3 ((word64)1 << 32) #define CPUID_AARCH64_FEAT_SM3 ((word64)1 << 36) #define CPUID_AARCH64_FEAT_SM4 ((word64)1 << 40) +#define CPUID_AARCH64_FEAT_ASIMD ((word64)0xf << 20) /* AdvSIMD; 0xf = absent */ #ifdef WOLFSSL_AARCH64_PRIVILEGE_MODE /* https://developer.arm.com/documentation/ddi0601/2024-09/AArch64-Registers @@ -156,6 +157,19 @@ old_cpuid_flags = WC_CPUID_INITIALIZER; word64 features; + #ifndef WOLFSSL_ARMASM_NO_NEON + __asm__ __volatile ( + "mrs %[feat], ID_AA64PFR0_EL1\n" + : [feat] "=r" (features) + : + : + ); + + if ((features & CPUID_AARCH64_FEAT_ASIMD) != + CPUID_AARCH64_FEAT_ASIMD) + new_cpuid_flags |= CPUID_ASIMD; + #endif + __asm__ __volatile ( "mrs %[feat], ID_AA64ISAR0_EL1\n" : [feat] "=r" 
(features) @@ -163,6 +177,7 @@ : ); + #ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO if (features & CPUID_AARCH64_FEAT_AES) new_cpuid_flags |= CPUID_AES; if (features & CPUID_AARCH64_FEAT_AES_PMULL) { @@ -171,16 +186,27 @@ } if (features & CPUID_AARCH64_FEAT_SHA256) new_cpuid_flags |= CPUID_SHA256; + #endif + #ifdef WOLFSSL_ARMASM_CRYPTO_SHA512 if (features & CPUID_AARCH64_FEAT_SHA256_512) new_cpuid_flags |= CPUID_SHA256 | CPUID_SHA512; + #endif + #if !defined(WOLFSSL_AARCH64_NO_SQRDMLSH) if (features & CPUID_AARCH64_FEAT_RDM) new_cpuid_flags |= CPUID_RDM; + #endif + #ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 if (features & CPUID_AARCH64_FEAT_SHA3) new_cpuid_flags |= CPUID_SHA3; + #endif + #ifdef WOLFSSL_ARMASM_CRYPTO_SM3 if (features & CPUID_AARCH64_FEAT_SM3) new_cpuid_flags |= CPUID_SM3; + #endif + #ifdef WOLFSSL_ARMASM_CRYPTO_SM4 if (features & CPUID_AARCH64_FEAT_SM4) new_cpuid_flags |= CPUID_SM4; + #endif (void)wolfSSL_Atomic_Uint_CompareExchange (&cpuid_flags, &old_cpuid_flags, new_cpuid_flags); @@ -200,6 +226,11 @@ old_cpuid_flags = WC_CPUID_INITIALIZER; word64 hwcaps = getauxval(AT_HWCAP); + #ifndef WOLFSSL_ARMASM_NO_NEON + if (hwcaps & HWCAP_ASIMD) + new_cpuid_flags |= CPUID_ASIMD; + #endif + #ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO if (hwcaps & HWCAP_AES) new_cpuid_flags |= CPUID_AES; @@ -247,12 +278,18 @@ old_cpuid_flags = WC_CPUID_INITIALIZER; word64 features = android_getCpuFeatures(); + #ifndef WOLFSSL_ARMASM_NO_NEON + /* All Android AArch64 chips support NEON. 
*/ + new_cpuid_flags |= CPUID_ASIMD; + #endif + #ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO if (features & ANDROID_CPU_ARM_FEATURE_AES) new_cpuid_flags |= CPUID_AES; if (features & ANDROID_CPU_ARM_FEATURE_PMULL) new_cpuid_flags |= CPUID_PMULL; if (features & ANDROID_CPU_ARM_FEATURE_SHA2) new_cpuid_flags |= CPUID_SHA256; + #endif (void)wolfSSL_Atomic_Uint_CompareExchange (&cpuid_flags, &old_cpuid_flags, new_cpuid_flags); @@ -279,18 +316,31 @@ if (WOLFSSL_ATOMIC_LOAD(cpuid_flags) == WC_CPUID_INITIALIZER) { cpuid_flags_t new_cpuid_flags = 0, old_cpuid_flags = WC_CPUID_INITIALIZER; + + #ifndef WOLFSSL_ARMASM_NO_NEON + /* All Mac AArch64 chips support NEON. */ + new_cpuid_flags |= CPUID_ASIMD; + #endif + #ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO if (cpuid_get_sysctlbyname("hw.optional.arm.FEAT_AES") != 0) new_cpuid_flags |= CPUID_AES; if (cpuid_get_sysctlbyname("hw.optional.arm.FEAT_PMULL") != 0) new_cpuid_flags |= CPUID_PMULL; if (cpuid_get_sysctlbyname("hw.optional.arm.FEAT_SHA256") != 0) new_cpuid_flags |= CPUID_SHA256; + #endif + #ifdef WOLFSSL_ARMASM_CRYPTO_SHA512 if (cpuid_get_sysctlbyname("hw.optional.arm.FEAT_SHA512") != 0) new_cpuid_flags |= CPUID_SHA512; + #endif + #if !defined(WOLFSSL_AARCH64_NO_SQRDMLSH) if (cpuid_get_sysctlbyname("hw.optional.arm.FEAT_RDM") != 0) new_cpuid_flags |= CPUID_RDM; + #endif + #ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 if (cpuid_get_sysctlbyname("hw.optional.arm.FEAT_SHA3") != 0) new_cpuid_flags |= CPUID_SHA3; + #endif #ifdef WOLFSSL_ARMASM_CRYPTO_SM3 new_cpuid_flags |= CPUID_SM3; #endif @@ -316,24 +366,40 @@ elf_aux_info(AT_HWCAP, &features, sizeof(features)); - if (features & CPUID_AARCH64_FEAT_AES) - new_cpuid_flags |= CPUID_AES; - if (features & CPUID_AARCH64_FEAT_AES_PMULL) { + #ifndef WOLFSSL_ARMASM_NO_NEON + if (features & HWCAP_ASIMD) + new_cpuid_flags |= CPUID_ASIMD; + #endif + + #ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO + if (features & HWCAP_AES) new_cpuid_flags |= CPUID_AES; + if (features & HWCAP_PMULL) new_cpuid_flags |= CPUID_PMULL; - } - if 
(features & CPUID_AARCH64_FEAT_SHA256) + if (features & HWCAP_SHA2) new_cpuid_flags |= CPUID_SHA256; - if (features & CPUID_AARCH64_FEAT_SHA256_512) - new_cpuid_flags |= CPUID_SHA256 | CPUID_SHA512; - if (features & CPUID_AARCH64_FEAT_RDM) + #endif + + #ifdef WOLFSSL_ARMASM_CRYPTO_SHA512 + if (features & HWCAP_SHA512) + new_cpuid_flags |= CPUID_SHA512; + #endif + #if defined(HWCAP_ASIMDRDM) && !defined(WOLFSSL_AARCH64_NO_SQRDMLSH) + if (features & HWCAP_ASIMDRDM) new_cpuid_flags |= CPUID_RDM; - if (features & CPUID_AARCH64_FEAT_SHA3) + #endif + #ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + if (features & HWCAP_SHA3) new_cpuid_flags |= CPUID_SHA3; - if (features & CPUID_AARCH64_FEAT_SM3) + #endif + #ifdef WOLFSSL_ARMASM_CRYPTO_SM3 + if (features & HWCAP_SM3) new_cpuid_flags |= CPUID_SM3; - if (features & CPUID_AARCH64_FEAT_SM4) + #endif + #ifdef WOLFSSL_ARMASM_CRYPTO_SM4 + if (features & HWCAP_SM4) new_cpuid_flags |= CPUID_SM4; + #endif (void)wolfSSL_Atomic_Uint_CompareExchange (&cpuid_flags, &old_cpuid_flags, new_cpuid_flags); @@ -345,6 +411,9 @@ if (WOLFSSL_ATOMIC_LOAD(cpuid_flags) == WC_CPUID_INITIALIZER) { cpuid_flags_t new_cpuid_flags = 0, old_cpuid_flags = WC_CPUID_INITIALIZER; + #ifndef WOLFSSL_ARMASM_NO_NEON + new_cpuid_flags |= CPUID_ASIMD; + #endif #ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO new_cpuid_flags |= CPUID_AES; new_cpuid_flags |= CPUID_PMULL; diff --git a/wolfcrypt/src/poly1305.c b/wolfcrypt/src/poly1305.c index 42627121019..af24420fd58 100644 --- a/wolfcrypt/src/poly1305.c +++ b/wolfcrypt/src/poly1305.c @@ -209,7 +209,8 @@ WOLFSSL_LOCAL void poly1305_final_avx2(Poly1305* ctx, byte* mac); #endif #elif defined(POLY130564) -#if !defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_RISCV_ASM) +#if (!defined(WOLFSSL_ARMASM) || defined(WOLFSSL_ARM_POLY1305_NEED_C)) && \ + !defined(WOLFSSL_RISCV_ASM) static word64 U8TO64(const byte* p) { return @@ -272,35 +273,74 @@ static WC_INLINE void u32tole64(const word32 inLe32, byte outLe64[8]) } -#if !defined(WOLFSSL_RISCV_ASM) -/* -This 
local function operates on a message with a given number of bytes -with a given ctx pointer to a Poly1305 structure. -*/ -static int poly1305_blocks(Poly1305* ctx, const unsigned char *m, - size_t bytes) +/* The portable C Poly1305 implementation is needed for builds without an + * assembly back end, and as the runtime fallback on aarch64 when NEON is + * unavailable. */ +#if (!defined(USE_INTEL_POLY1305_SPEEDUP) && !defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_RISCV_ASM)) || defined(WOLFSSL_ARM_POLY1305_NEED_C) + #define WOLFSSL_POLY1305_C +#endif + +#ifdef WOLFSSL_POLY1305_C + +/* On aarch64 the assembly owns the r/h/pad context fields, so the C state lives + * in the appended c_* fields; every other build uses the standard fields. */ +#ifdef WOLFSSL_ARM_POLY1305_NEED_C + #define POLY1305_CTX_R c_r + #define POLY1305_CTX_H c_h + #define POLY1305_CTX_PAD c_pad +#else + #define POLY1305_CTX_R r + #define POLY1305_CTX_H h + #define POLY1305_CTX_PAD pad +#endif + +static void poly1305_c_setkey(Poly1305* ctx, const byte* key) { -#ifdef USE_INTEL_POLY1305_SPEEDUP - /* AVX2 is handled in wc_Poly1305Update. 
*/ - SAVE_VECTOR_REGISTERS(return _svr_ret;); - poly1305_blocks_avx(ctx, m, bytes); - RESTORE_VECTOR_REGISTERS(); - return 0; -#elif defined(WOLFSSL_ARMASM) && defined(__aarch64__) - poly1305_arm64_blocks(ctx, m, bytes); - return 0; -#elif defined(WOLFSSL_ARMASM) && defined(WOLFSSL_ARMASM_THUMB2) - poly1305_blocks_thumb2_16(ctx, m, bytes, 1); - return 0; -#elif defined(WOLFSSL_ARMASM) -#ifndef WOLFSSL_ARMASM_NO_NEON - poly1305_arm32_blocks(ctx, m, bytes); - return 0; +#if defined(POLY130564) + word64 t0,t1; + + /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ + t0 = U8TO64(key + 0); + t1 = U8TO64(key + 8); + + ctx->POLY1305_CTX_R[0] = ( t0 ) & 0xffc0fffffff; + ctx->POLY1305_CTX_R[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff; + ctx->POLY1305_CTX_R[2] = ((t1 >> 24) ) & 0x00ffffffc0f; + + ctx->POLY1305_CTX_H[0] = 0; + ctx->POLY1305_CTX_H[1] = 0; + ctx->POLY1305_CTX_H[2] = 0; + + ctx->POLY1305_CTX_PAD[0] = U8TO64(key + 16); + ctx->POLY1305_CTX_PAD[1] = U8TO64(key + 24); #else - poly1305_arm32_blocks_16(ctx, m, bytes, 1); - return 0; + /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ + ctx->r[0] = (U8TO32(key + 0) ) & 0x3ffffff; + ctx->r[1] = (U8TO32(key + 3) >> 2) & 0x3ffff03; + ctx->r[2] = (U8TO32(key + 6) >> 4) & 0x3ffc0ff; + ctx->r[3] = (U8TO32(key + 9) >> 6) & 0x3f03fff; + ctx->r[4] = (U8TO32(key + 12) >> 8) & 0x00fffff; + + ctx->h[0] = 0; + ctx->h[1] = 0; + ctx->h[2] = 0; + ctx->h[3] = 0; + ctx->h[4] = 0; + + ctx->pad[0] = U8TO32(key + 16); + ctx->pad[1] = U8TO32(key + 20); + ctx->pad[2] = U8TO32(key + 24); + ctx->pad[3] = U8TO32(key + 28); #endif -#elif defined(POLY130564) + + ctx->leftover = 0; + ctx->finished = 0; +} + +static int poly1305_c_blocks(Poly1305* ctx, const byte* m, size_t bytes) +{ +#if defined(POLY130564) const word64 hibit = (ctx->finished) ? 
0 : ((word64)1 << 40); /* 1 << 128 */ word64 r0,r1,r2; word64 s1,s2; @@ -308,13 +348,13 @@ static int poly1305_blocks(Poly1305* ctx, const unsigned char *m, word64 c; poly1305_word128 d0,d1,d2,d; - r0 = ctx->r[0]; - r1 = ctx->r[1]; - r2 = ctx->r[2]; + r0 = ctx->POLY1305_CTX_R[0]; + r1 = ctx->POLY1305_CTX_R[1]; + r2 = ctx->POLY1305_CTX_R[2]; - h0 = ctx->h[0]; - h1 = ctx->h[1]; - h2 = ctx->h[2]; + h0 = ctx->POLY1305_CTX_H[0]; + h1 = ctx->POLY1305_CTX_H[1]; + h2 = ctx->POLY1305_CTX_H[2]; s1 = r1 * (5 << 2); s2 = r2 * (5 << 2); @@ -346,9 +386,9 @@ static int poly1305_blocks(Poly1305* ctx, const unsigned char *m, bytes -= POLY1305_BLOCK_SIZE; } - ctx->h[0] = h0; - ctx->h[1] = h1; - ctx->h[2] = h2; + ctx->POLY1305_CTX_H[0] = h0; + ctx->POLY1305_CTX_H[1] = h1; + ctx->POLY1305_CTX_H[2] = h2; return 0; @@ -489,202 +529,20 @@ static int poly1305_blocks(Poly1305* ctx, const unsigned char *m, #endif return 0; - #endif /* end of 64 bit cpu blocks or 32 bit cpu */ } -/* -This local function is used for the last call when a message with a given -number of bytes is less than the block size. -*/ -static WC_INLINE int poly1305_block(Poly1305* ctx, const unsigned char *m) -{ -#ifdef USE_INTEL_POLY1305_SPEEDUP - /* No call to poly1305_block when AVX2, AVX2 does 4 blocks at a time. */ - SAVE_VECTOR_REGISTERS(return _svr_ret;); - poly1305_block_avx(ctx, m); - RESTORE_VECTOR_REGISTERS(); - return 0; -#elif defined(WOLFSSL_ARMASM) && defined(WOLFSSL_ARMASM_THUMB2) - poly1305_blocks_thumb2_16(ctx, m, POLY1305_BLOCK_SIZE, !ctx->finished); - return 0; -#elif defined(WOLFSSL_ARMASM) && !defined(__aarch64__) - poly1305_arm32_blocks_16(ctx, m, POLY1305_BLOCK_SIZE, !ctx->finished); - return 0; -#elif defined(WOLFSSL_ARMASM) - /* Only called from finished. 
*/ - poly1305_arm64_block_16(ctx, m); - return 0; -#else - return poly1305_blocks(ctx, m, POLY1305_BLOCK_SIZE); -#endif -} - -int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz) -{ -#if defined(POLY130564) && !defined(USE_INTEL_POLY1305_SPEEDUP) && \ - !defined(WOLFSSL_ARMASM) - word64 t0,t1; -#endif - - if (key == NULL) - return BAD_FUNC_ARG; - -#ifdef CHACHA_AEAD_TEST - word32 k; - printf("Poly key used:\n"); - for (k = 0; k < keySz; k++) { - printf("%02x", key[k]); - if ((k+1) % 8 == 0) - printf("\n"); - } - printf("\n"); -#endif - - if ((ctx == NULL) || (key == NULL) || (keySz != 32)) { - return BAD_FUNC_ARG; - } - -#ifdef USE_INTEL_POLY1305_SPEEDUP - cpuid_get_flags_ex(&intel_flags); - SAVE_VECTOR_REGISTERS(return _svr_ret;); - #ifdef HAVE_INTEL_AVX2 - if (IS_INTEL_AVX2(intel_flags)) - poly1305_setkey_avx2(ctx, key); - else - #endif - poly1305_setkey_avx(ctx, key); - RESTORE_VECTOR_REGISTERS(); - ctx->started = 0; -#elif defined(WOLFSSL_ARMASM) - poly1305_set_key(ctx, key); - ctx->finished = 0; -#elif defined(POLY130564) - - /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ - t0 = U8TO64(key + 0); - t1 = U8TO64(key + 8); - - ctx->r[0] = ( t0 ) & 0xffc0fffffff; - ctx->r[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff; - ctx->r[2] = ((t1 >> 24) ) & 0x00ffffffc0f; - - /* h (accumulator) = 0 */ - ctx->h[0] = 0; - ctx->h[1] = 0; - ctx->h[2] = 0; - - /* save pad for later */ - ctx->pad[0] = U8TO64(key + 16); - ctx->pad[1] = U8TO64(key + 24); - - ctx->leftover = 0; - ctx->finished = 0; - -#else /* if not 64 bit then use 32 bit */ - - /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ - ctx->r[0] = (U8TO32(key + 0) ) & 0x3ffffff; - ctx->r[1] = (U8TO32(key + 3) >> 2) & 0x3ffff03; - ctx->r[2] = (U8TO32(key + 6) >> 4) & 0x3ffc0ff; - ctx->r[3] = (U8TO32(key + 9) >> 6) & 0x3f03fff; - ctx->r[4] = (U8TO32(key + 12) >> 8) & 0x00fffff; - - /* h = 0 */ - ctx->h[0] = 0; - ctx->h[1] = 0; - ctx->h[2] = 0; - ctx->h[3] = 0; - ctx->h[4] = 0; - - /* save pad for later */ - 
ctx->pad[0] = U8TO32(key + 16); - ctx->pad[1] = U8TO32(key + 20); - ctx->pad[2] = U8TO32(key + 24); - ctx->pad[3] = U8TO32(key + 28); - - ctx->leftover = 0; - ctx->finished = 0; - -#endif - - return 0; -} - -int wc_Poly1305Final(Poly1305* ctx, byte* mac) +static void poly1305_c_final(Poly1305* ctx, byte* mac) { -#ifdef USE_INTEL_POLY1305_SPEEDUP -#elif defined(WOLFSSL_ARMASM) -#elif defined(POLY130564) - +#if defined(POLY130564) word64 h0,h1,h2,c; word64 g0,g1,g2; word64 t0,t1; -#else - - word32 h0,h1,h2,h3,h4,c; - word32 g0,g1,g2,g3,g4; -#ifdef WOLFSSL_W64_WRAPPER - w64wrapper f; -#else - word64 f; -#endif - word32 mask; - -#endif - - if (ctx == NULL || mac == NULL) - return BAD_FUNC_ARG; - -#ifdef USE_INTEL_POLY1305_SPEEDUP - SAVE_VECTOR_REGISTERS(return _svr_ret;); - #ifdef HAVE_INTEL_AVX2 - if (IS_INTEL_AVX2(intel_flags)) - poly1305_final_avx2(ctx, mac); - else - #endif - poly1305_final_avx(ctx, mac); - RESTORE_VECTOR_REGISTERS(); -#elif defined(WOLFSSL_ARMASM) - #if !defined(WOLFSSL_ARMASM_THUMB2) && !defined(WOLFSSL_ARMASM_NO_NEON) && \ - !defined(__aarch64__) - if (ctx->leftover >= POLY1305_BLOCK_SIZE) { - size_t len = ctx->leftover & (~(POLY1305_BLOCK_SIZE - 1)); - poly1305_arm32_blocks(ctx, ctx->buffer, len); - ctx->leftover -= len; - if (ctx->leftover) { - XMEMCPY(ctx->buffer, ctx->buffer + len, ctx->leftover); - } - } - #endif - if (ctx->leftover) { - size_t i = ctx->leftover; - ctx->buffer[i++] = 1; - for (; i < POLY1305_BLOCK_SIZE; i++) { - ctx->buffer[i] = 0; - } - ctx->finished = 1; - poly1305_block(ctx, ctx->buffer); - } - - poly1305_final(ctx, mac); -#elif defined(POLY130564) - - /* process the remaining block */ - if (ctx->leftover) { - size_t i = ctx->leftover; - ctx->buffer[i] = 1; - for (i = i + 1; i < POLY1305_BLOCK_SIZE; i++) - ctx->buffer[i] = 0; - ctx->finished = 1; - poly1305_block(ctx, ctx->buffer); - } - /* fully carry h */ - h0 = ctx->h[0]; - h1 = ctx->h[1]; - h2 = ctx->h[2]; + h0 = ctx->POLY1305_CTX_H[0]; + h1 = ctx->POLY1305_CTX_H[1]; 
+ h2 = ctx->POLY1305_CTX_H[2]; c = (h1 >> 44); h1 &= 0xfffffffffff; h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff; @@ -710,8 +568,8 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac) h2 = (h2 & c) | g2; /* h = (h + pad) */ - t0 = ctx->pad[0]; - t1 = ctx->pad[1]; + t0 = ctx->POLY1305_CTX_PAD[0]; + t1 = ctx->POLY1305_CTX_PAD[1]; h0 += (( t0 ) & 0xfffffffffff) ; c = (h0 >> 44); h0 &= 0xfffffffffff; @@ -728,26 +586,25 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac) U64TO8(mac + 8, h1); /* zero out the state */ - ctx->h[0] = 0; - ctx->h[1] = 0; - ctx->h[2] = 0; - ctx->r[0] = 0; - ctx->r[1] = 0; - ctx->r[2] = 0; - ctx->pad[0] = 0; - ctx->pad[1] = 0; + ctx->POLY1305_CTX_H[0] = 0; + ctx->POLY1305_CTX_H[1] = 0; + ctx->POLY1305_CTX_H[2] = 0; + ctx->POLY1305_CTX_R[0] = 0; + ctx->POLY1305_CTX_R[1] = 0; + ctx->POLY1305_CTX_R[2] = 0; + ctx->POLY1305_CTX_PAD[0] = 0; + ctx->POLY1305_CTX_PAD[1] = 0; #else /* if not 64 bit then use 32 bit */ - /* process the remaining block */ - if (ctx->leftover) { - size_t i = ctx->leftover; - ctx->buffer[i++] = 1; - for (; i < POLY1305_BLOCK_SIZE; i++) - ctx->buffer[i] = 0; - ctx->finished = 1; - poly1305_block(ctx, ctx->buffer); - } + word32 h0,h1,h2,h3,h4,c; + word32 g0,g1,g2,g3,g4; +#ifdef WOLFSSL_W64_WRAPPER + w64wrapper f; +#else + word64 f; +#endif + word32 mask; /* fully carry h */ h0 = ctx->h[0]; @@ -837,7 +694,235 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac) ctx->pad[1] = 0; ctx->pad[2] = 0; ctx->pad[3] = 0; +#endif +} + +#undef POLY1305_CTX_R +#undef POLY1305_CTX_H +#undef POLY1305_CTX_PAD + +#endif /* WOLFSSL_POLY1305_C */ + +#ifdef WOLFSSL_ARM_POLY1305_NEED_C +#ifdef WOLFSSL_ARM_POLY1305_NEON_FALLBACK +static cpuid_flags_t poly1305_cpuid_flags = WC_CPUID_INITIALIZER; +/* Return non-zero when NEON/ASIMD is present and the asm path should run. 
*/ +static WC_INLINE int poly1305_use_neon(void) +{ + cpuid_get_flags_ex(&poly1305_cpuid_flags); + return IS_AARCH64_ASIMD(poly1305_cpuid_flags); +} +#endif + +/* Dispatch each Poly1305 operation to the NEON assembly or the C + * implementation, choosing at runtime when both are available. */ +void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char* m, size_t bytes) +{ +#ifdef WOLFSSL_ARM_POLY1305_C_ONLY + poly1305_c_blocks(ctx, m, bytes); +#else + if (poly1305_use_neon()) + poly1305_arm64_blocks(ctx, m, bytes); + else + poly1305_c_blocks(ctx, m, bytes); +#endif +} +void poly1305_block_aarch64(Poly1305* ctx, const unsigned char* m) +{ +#ifdef WOLFSSL_ARM_POLY1305_C_ONLY + poly1305_c_blocks(ctx, m, POLY1305_BLOCK_SIZE); +#else + if (poly1305_use_neon()) + poly1305_arm64_block_16(ctx, m); + else + poly1305_c_blocks(ctx, m, POLY1305_BLOCK_SIZE); +#endif +} + +static void poly1305_setkey_aarch64(Poly1305* ctx, const byte* key) +{ +#ifdef WOLFSSL_ARM_POLY1305_C_ONLY + poly1305_c_setkey(ctx, key); +#else + if (poly1305_use_neon()) + poly1305_set_key(ctx, key); + else + poly1305_c_setkey(ctx, key); +#endif +} + +static void poly1305_final_aarch64(Poly1305* ctx, byte* mac) +{ +#ifdef WOLFSSL_ARM_POLY1305_C_ONLY + poly1305_c_final(ctx, mac); +#else + if (poly1305_use_neon()) + poly1305_final(ctx, mac); + else + poly1305_c_final(ctx, mac); +#endif +} + +#endif /* WOLFSSL_ARM_POLY1305_NEED_C */ + +#if !defined(WOLFSSL_RISCV_ASM) +/* +This local function operates on a message with a given number of bytes +with a given ctx pointer to a Poly1305 structure. +*/ +static int poly1305_blocks(Poly1305* ctx, const unsigned char *m, + size_t bytes) +{ +#ifdef USE_INTEL_POLY1305_SPEEDUP + /* AVX2 is handled in wc_Poly1305Update. 
*/ + SAVE_VECTOR_REGISTERS(return _svr_ret;); + poly1305_blocks_avx(ctx, m, bytes); + RESTORE_VECTOR_REGISTERS(); + return 0; +#elif defined(WOLFSSL_ARMASM) && defined(__aarch64__) + poly1305_blocks_aarch64(ctx, m, bytes); + return 0; +#elif defined(WOLFSSL_ARMASM) && defined(WOLFSSL_ARMASM_THUMB2) + poly1305_blocks_thumb2_16(ctx, m, bytes, 1); + return 0; +#elif defined(WOLFSSL_ARMASM) +#ifndef WOLFSSL_ARMASM_NO_NEON + poly1305_arm32_blocks(ctx, m, bytes); + return 0; +#else + poly1305_arm32_blocks_16(ctx, m, bytes, 1); + return 0; +#endif +#else + return poly1305_c_blocks(ctx, m, bytes); +#endif +} + +/* +This local function is used for the last call when a message with a given +number of bytes is less than the block size. +*/ +static WC_INLINE int poly1305_block(Poly1305* ctx, const unsigned char *m) +{ +#ifdef USE_INTEL_POLY1305_SPEEDUP + /* No call to poly1305_block when AVX2, AVX2 does 4 blocks at a time. */ + SAVE_VECTOR_REGISTERS(return _svr_ret;); + poly1305_block_avx(ctx, m); + RESTORE_VECTOR_REGISTERS(); + return 0; +#elif defined(WOLFSSL_ARMASM) && defined(WOLFSSL_ARMASM_THUMB2) + poly1305_blocks_thumb2_16(ctx, m, POLY1305_BLOCK_SIZE, !ctx->finished); + return 0; +#elif defined(WOLFSSL_ARMASM) && !defined(__aarch64__) + poly1305_arm32_blocks_16(ctx, m, POLY1305_BLOCK_SIZE, !ctx->finished); + return 0; +#elif defined(WOLFSSL_ARMASM) + poly1305_block_aarch64(ctx, m); + return 0; +#else + return poly1305_blocks(ctx, m, POLY1305_BLOCK_SIZE); +#endif +} + +int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz) +{ + + if (key == NULL) + return BAD_FUNC_ARG; + +#ifdef CHACHA_AEAD_TEST + word32 k; + printf("Poly key used:\n"); + for (k = 0; k < keySz; k++) { + printf("%02x", key[k]); + if ((k+1) % 8 == 0) + printf("\n"); + } + printf("\n"); +#endif + + if ((ctx == NULL) || (key == NULL) || (keySz != 32)) { + return BAD_FUNC_ARG; + } + +#ifdef USE_INTEL_POLY1305_SPEEDUP + cpuid_get_flags_ex(&intel_flags); + SAVE_VECTOR_REGISTERS(return _svr_ret;); + 
#ifdef HAVE_INTEL_AVX2 + if (IS_INTEL_AVX2(intel_flags)) + poly1305_setkey_avx2(ctx, key); + else + #endif + poly1305_setkey_avx(ctx, key); + RESTORE_VECTOR_REGISTERS(); + ctx->started = 0; +#elif defined(WOLFSSL_ARMASM) +#ifdef __aarch64__ + poly1305_setkey_aarch64(ctx, key); +#else + poly1305_set_key(ctx, key); +#endif + ctx->finished = 0; +#else + poly1305_c_setkey(ctx, key); +#endif + + return 0; +} + +int wc_Poly1305Final(Poly1305* ctx, byte* mac) +{ + if (ctx == NULL || mac == NULL) + return BAD_FUNC_ARG; + +#ifdef USE_INTEL_POLY1305_SPEEDUP + SAVE_VECTOR_REGISTERS(return _svr_ret;); + #ifdef HAVE_INTEL_AVX2 + if (IS_INTEL_AVX2(intel_flags)) + poly1305_final_avx2(ctx, mac); + else + #endif + poly1305_final_avx(ctx, mac); + RESTORE_VECTOR_REGISTERS(); +#elif defined(WOLFSSL_ARMASM) + #if !defined(WOLFSSL_ARMASM_THUMB2) && !defined(WOLFSSL_ARMASM_NO_NEON) && \ + !defined(__aarch64__) + if (ctx->leftover >= POLY1305_BLOCK_SIZE) { + size_t len = ctx->leftover & (~(POLY1305_BLOCK_SIZE - 1)); + poly1305_arm32_blocks(ctx, ctx->buffer, len); + ctx->leftover -= len; + if (ctx->leftover) { + XMEMCPY(ctx->buffer, ctx->buffer + len, ctx->leftover); + } + } + #endif + if (ctx->leftover) { + size_t i = ctx->leftover; + ctx->buffer[i++] = 1; + for (; i < POLY1305_BLOCK_SIZE; i++) { + ctx->buffer[i] = 0; + } + ctx->finished = 1; + poly1305_block(ctx, ctx->buffer); + } + +#ifdef __aarch64__ + poly1305_final_aarch64(ctx, mac); +#else + poly1305_final(ctx, mac); +#endif +#else + /* process the remaining block */ + if (ctx->leftover) { + size_t i = ctx->leftover; + ctx->buffer[i++] = 1; + for (; i < POLY1305_BLOCK_SIZE; i++) + ctx->buffer[i] = 0; + ctx->finished = 1; + poly1305_block(ctx, ctx->buffer); + } + poly1305_c_final(ctx, mac); #endif return 0; diff --git a/wolfcrypt/src/sha256.c b/wolfcrypt/src/sha256.c index 047c57dade8..c22333f8728 100644 --- a/wolfcrypt/src/sha256.c +++ b/wolfcrypt/src/sha256.c @@ -274,6 +274,9 @@ on the specific device platform. 
#define SHA256_SETTRANSFORM_ARGS void #endif static void Sha256_SetTransform(SHA256_SETTRANSFORM_ARGS); +#elif defined(WOLFSSL_ARMASM) && defined(__aarch64__) && \ + !defined(WOLF_CRYPTO_CB_ONLY_SHA256) +static void Sha256_SetTransform(void); #endif static int InitSha256(wc_Sha256* sha256) @@ -310,6 +313,9 @@ static int InitSha256(wc_Sha256* sha256) #else Sha256_SetTransform(); #endif +#elif defined(WOLFSSL_ARMASM) && defined(__aarch64__) && \ + !defined(WOLF_CRYPTO_CB_ONLY_SHA256) + Sha256_SetTransform(); #endif #ifdef WOLFSSL_MAXQ10XX_CRYPTO @@ -1151,7 +1157,101 @@ static int Transform_Sha256(wc_Sha256* sha256, const byte* data) #define XTRANSFORM Transform_Sha256 #define XTRANSFORM_LEN Transform_Sha256_Len -#elif defined(WOLFSSL_ARMASM) && !defined(WOLF_CRYPTO_CB_ONLY_SHA256) +#elif defined(WOLFSSL_ARMASM) && defined(__aarch64__) && \ + !defined(WOLF_CRYPTO_CB_ONLY_SHA256) + +static int transform_check = 0; +static cpuid_flags_t cpuid_flags = WC_CPUID_INITIALIZER; + +static int Transform_Sha256(wc_Sha256* sha256, const byte* data); +static int Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, + word32 len); + +/* Initialize to the software fallback so the pointer is never NULL if it is + * read before Sha256_SetTransform() has published the selected variant. 
*/ +static int (*Transform_Sha256_Len_p)(wc_Sha256* sha256, const byte* data, + word32 len) = Transform_Sha256_Len; + +static WC_INLINE int Transform_Sha256_aarch64(wc_Sha256* sha256, + const byte* data) +{ + return (*Transform_Sha256_Len_p)(sha256, data, WC_SHA256_BLOCK_SIZE); +} + +static WC_INLINE int Transform_Sha256_Len_aarch64(wc_Sha256* sha256, + const byte* data, word32 len) +{ + return (*Transform_Sha256_Len_p)(sha256, data, len); +} + +#if !defined(WOLFSSL_ARMASM_NO_NEON) +#if !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) +static int Transform_Sha256_Len_crypto_aarch64(wc_Sha256* sha256, + const byte* data, word32 len) +{ + Transform_Sha256_Len_crypto(sha256, data, len); + return 0; +} +#endif + +static int Transform_Sha256_Len_neon_aarch64(wc_Sha256* sha256, + const byte* data, word32 len) +{ + Transform_Sha256_Len_neon(sha256, data, len); + return 0; +} +#endif + +static int Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, + word32 len) +{ + int ret = 0; + + while (len >= WC_SHA256_BLOCK_SIZE) { + word32 buffer[WC_SHA256_BLOCK_SIZE / sizeof(word32)]; + + XMEMCPY(buffer, data, WC_SHA256_BLOCK_SIZE); + #ifdef LITTLE_ENDIAN_ORDER + ByteReverseWords(buffer, buffer, WC_SHA256_BLOCK_SIZE); + #endif + ret = Transform_Sha256(sha256, (const byte*)buffer); + if (ret != 0) + break; + data += WC_SHA256_BLOCK_SIZE; + len -= WC_SHA256_BLOCK_SIZE; + } + + return ret; +} + +static void Sha256_SetTransform(void) +{ + if (transform_check) + return; + + cpuid_get_flags_ex(&cpuid_flags); + +#if !defined(WOLFSSL_ARMASM_NO_NEON) +#if !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (IS_AARCH64_SHA256(cpuid_flags)) { + Transform_Sha256_Len_p = Transform_Sha256_Len_crypto_aarch64; + } + else +#endif + if (IS_AARCH64_ASIMD(cpuid_flags)) { + Transform_Sha256_Len_p = Transform_Sha256_Len_neon_aarch64; + } + else +#endif + { + Transform_Sha256_Len_p = Transform_Sha256_Len; + } + + transform_check = 1; +} + +#define XTRANSFORM Transform_Sha256_aarch64 +#define XTRANSFORM_LEN 
Transform_Sha256_Len_aarch64 int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId) { @@ -1171,6 +1271,37 @@ int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId) (void)devId; #endif +#ifdef WOLFSSL_SMALL_STACK_CACHE + sha256->W = (word32*)XMALLOC(sizeof(word32) * WC_SHA256_BLOCK_SIZE, + sha256->heap, DYNAMIC_TYPE_DIGEST); + if (sha256->W == NULL) + return MEMORY_E; +#endif + + return ret; +} + +#define NEED_SOFT_SHA256 + +#elif defined(WOLFSSL_ARMASM) && !defined(WOLF_CRYPTO_CB_ONLY_SHA256) + +int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId) +{ + int ret = 0; + + if (sha256 == NULL) + return BAD_FUNC_ARG; + ret = InitSha256(sha256); + if (ret != 0) + return ret; + + sha256->heap = heap; +#ifdef WOLF_CRYPTO_CB + sha256->devId = devId; + sha256->devCtx = NULL; +#else + (void)devId; +#endif #ifdef WOLFSSL_SMALL_STACK_CACHE sha256->W = NULL; @@ -1203,6 +1334,7 @@ static WC_INLINE int Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, #endif return 0; } + #define XTRANSFORM Transform_Sha256 #define XTRANSFORM_LEN Transform_Sha256_Len @@ -1645,7 +1777,7 @@ static WC_INLINE int Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, if (sha256 == NULL) { return BAD_FUNC_ARG; } - if (data == NULL && len == 0) { + if (len == 0) { /* valid, but do nothing */ return 0; } @@ -1912,11 +2044,21 @@ static WC_INLINE int Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, } #ifdef WOLFSSL_ARMASM + #ifdef __aarch64__ + if (Transform_Sha256_Len_p == Transform_Sha256_Len) { + return Transform_Sha256(sha256, data); + } + else + #endif { byte buffer[WC_SHA256_BLOCK_SIZE]; ByteReverseWords((word32*)buffer, (word32*)data, WC_SHA256_BLOCK_SIZE); + #ifdef __aarch64__ + return Transform_Sha256_aarch64(sha256, buffer); + #else return Transform_Sha256(sha256, buffer); + #endif } #else return Transform_Sha256(sha256, data); @@ -2253,6 +2395,9 @@ static WC_INLINE int Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, #else 
Sha256_SetTransform(); #endif + #elif defined(WOLFSSL_ARMASM) && defined(__aarch64__) && \ + !defined(WOLF_CRYPTO_CB_ONLY_SHA256) + Sha256_SetTransform(); #endif #ifdef WOLFSSL_HASH_FLAGS sha224->flags = 0; @@ -2342,7 +2487,7 @@ static WC_INLINE int Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, if (sha224 == NULL) { return BAD_FUNC_ARG; } - if (data == NULL && len == 0) { + if (len == 0) { /* valid, but do nothing */ return 0; } diff --git a/wolfcrypt/src/sha512.c b/wolfcrypt/src/sha512.c index b2f57b13b86..87b4305edfd 100644 --- a/wolfcrypt/src/sha512.c +++ b/wolfcrypt/src/sha512.c @@ -1422,22 +1422,127 @@ static int InitSha512_256(wc_Sha512* sha512) #elif defined(WOLFSSL_ARMASM) -#if defined(__aarch64__) && defined(WOLFSSL_ARMASM_NO_NEON) -#error "No SHA-512 implementation." -#endif +#ifdef __aarch64__ + +/* AArch64: choose the SHA-512 crypto extension, NEON or the software + * implementation at runtime based on CPU features, so a core that lacks the + * crypto extension and/or NEON still has a working SHA-512. */ +#define NEED_SOFT_SHA512 static int transform_check = 0; -#if defined(__aarch64__) && defined(WOLFSSL_ARMASM_CRYPTO_SHA512) -static word32 cpuid_flags = 0; -static int cpuid_flags_set = 0; +static cpuid_flags_t cpuid_flags = WC_CPUID_INITIALIZER; + +static int _Transform_Sha512(wc_Sha512* sha512); +static int Transform_Sha512_C(wc_Sha512* sha512, const byte* data); +static int Transform_Sha512_Len_C(wc_Sha512* sha512, const byte* data, + word32 len); + +/* Initialize to the software fallback so the pointers are never NULL if they + * are read before Sha512_SetTransform() has published the selected variant. */ +static int (*Transform_Sha512_p)(wc_Sha512* sha512, const byte* data) + = Transform_Sha512_C; +static int (*Transform_Sha512_Len_p)(wc_Sha512* sha512, const byte* data, + word32 len) = Transform_Sha512_Len_C; + +/* Software fallback adapters in the asm (sha512, data[, len]) form. 
The asm + * transforms consume raw big-endian input and byte-reverse internally, so the + * software path mirrors that by reversing the block before _Transform_Sha512() + * (which reads host-endian words from sha512->buffer). */ +static int Transform_Sha512_C(wc_Sha512* sha512, const byte* data) +{ + if (data != (const byte*)sha512->buffer) + XMEMCPY(sha512->buffer, data, WC_SHA512_BLOCK_SIZE); +#ifdef LITTLE_ENDIAN_ORDER + ByteReverseWords64(sha512->buffer, sha512->buffer, WC_SHA512_BLOCK_SIZE); #endif + return _Transform_Sha512(sha512); +} +static int Transform_Sha512_Len_C(wc_Sha512* sha512, const byte* data, + word32 len) +{ + int ret = 0; + + while (len >= WC_SHA512_BLOCK_SIZE) { + ret = Transform_Sha512_C(sha512, data); + if (ret != 0) + break; + data += WC_SHA512_BLOCK_SIZE; + len -= WC_SHA512_BLOCK_SIZE; + } -#if defined(__aarch64__) && defined(WOLFSSL_ARMASM_CRYPTO_SHA512) -static void Transform_Sha512_crypto(wc_Sha512* sha512, const byte* data) + return ret; +} + +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA512 +static int Transform_Sha512_crypto_aarch64(wc_Sha512* sha512, const byte* data) { Transform_Sha512_Len_crypto(sha512, data, WC_SHA512_BLOCK_SIZE); + return 0; +} +static int Transform_Sha512_Len_crypto_aarch64(wc_Sha512* sha512, + const byte* data, word32 len) +{ + Transform_Sha512_Len_crypto(sha512, data, len); + return 0; +} +#endif +#ifndef WOLFSSL_ARMASM_NO_NEON +static int Transform_Sha512_neon_aarch64(wc_Sha512* sha512, const byte* data) +{ + Transform_Sha512_Len_neon(sha512, data, WC_SHA512_BLOCK_SIZE); + return 0; +} +static int Transform_Sha512_Len_neon_aarch64(wc_Sha512* sha512, + const byte* data, word32 len) +{ + Transform_Sha512_Len_neon(sha512, data, len); + return 0; } #endif + +static WC_INLINE int Transform_Sha512(wc_Sha512 *sha512, const byte* data) +{ + return (*Transform_Sha512_p)(sha512, data); +} +static WC_INLINE int Transform_Sha512_Len(wc_Sha512 *sha512, const byte* data, + word32 len) +{ + return (*Transform_Sha512_Len_p)(sha512, 
data, len); +} + +static void Sha512_SetTransform(void) +{ + if (transform_check) + return; + + cpuid_get_flags_ex(&cpuid_flags); + +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA512 + if (IS_AARCH64_SHA512(cpuid_flags)) { + Transform_Sha512_p = Transform_Sha512_crypto_aarch64; + Transform_Sha512_Len_p = Transform_Sha512_Len_crypto_aarch64; + } + else +#endif +#ifndef WOLFSSL_ARMASM_NO_NEON + if (IS_AARCH64_ASIMD(cpuid_flags)) { + Transform_Sha512_p = Transform_Sha512_neon_aarch64; + Transform_Sha512_Len_p = Transform_Sha512_Len_neon_aarch64; + } + else +#endif + { + Transform_Sha512_p = Transform_Sha512_C; + Transform_Sha512_Len_p = Transform_Sha512_Len_C; + } + + transform_check = 1; +} + +#else /* !__aarch64__ : 32-bit Arm (Thumb2 / ARMv7) */ + +static int transform_check = 0; + #if !defined(WOLFSSL_ARMASM_THUMB2) && !defined(WOLFSSL_ARMASM_NO_NEON) static void Transform_Sha512_neon(wc_Sha512* sha512, const byte* data) { @@ -1472,20 +1577,6 @@ static void Sha512_SetTransform(void) if (transform_check) return; -#if defined(__aarch64__) && defined(WOLFSSL_ARMASM_CRYPTO_SHA512) - if (!cpuid_flags_set) { - cpuid_flags = cpuid_get_flags(); - cpuid_flags_set = 1; - } -#endif - -#if defined(__aarch64__) && defined(WOLFSSL_ARMASM_CRYPTO_SHA512) - if (IS_AARCH64_SHA512(cpuid_flags)) { - Transform_Sha512_p = Transform_Sha512_crypto; - Transform_Sha512_Len_p = Transform_Sha512_Len_crypto; - } - else -#endif #if !defined(WOLFSSL_ARMASM_THUMB2) && !defined(WOLFSSL_ARMASM_NO_NEON) { Transform_Sha512_p = Transform_Sha512_neon; @@ -1501,6 +1592,8 @@ static void Sha512_SetTransform(void) transform_check = 1; } +#endif /* __aarch64__ */ + #else #define Transform_Sha512(sha512) _Transform_Sha512(sha512) @@ -1611,7 +1704,7 @@ int wc_InitSha512_256_ex(wc_Sha512* sha512, void* heap, int devId) #endif /* WOLFSSL_SHA512 */ -#ifndef WOLFSSL_ARMASM +#if !defined(WOLFSSL_ARMASM) || defined(NEED_SOFT_SHA512) static const word64 K512[80] = { W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd), @@ 
-1804,7 +1897,7 @@ static WC_INLINE int Sha512Update(wc_Sha512* sha512, const byte* data, word32 le } #endif #ifdef WOLFSSL_ARMASM - Transform_Sha512(sha512, (const byte*)sha512->buffer); + ret = Transform_Sha512(sha512, (const byte*)sha512->buffer); #elif !defined(WOLFSSL_ESP32_CRYPT) || \ defined(NO_WOLFSSL_ESP32_CRYPT_HASH) || \ defined(NO_WOLFSSL_ESP32_CRYPT_HASH_SHA512) @@ -1833,7 +1926,9 @@ static WC_INLINE int Sha512Update(wc_Sha512* sha512, const byte* data, word32 le if (len >= WC_SHA512_BLOCK_SIZE) { word32 blocksLen = len & ~((word32)WC_SHA512_BLOCK_SIZE-1); - Transform_Sha512_Len(sha512, data, blocksLen); + ret = Transform_Sha512_Len(sha512, data, blocksLen); + if (ret != 0) + return ret; data += blocksLen; len -= blocksLen; } @@ -1993,9 +2088,7 @@ int wc_Sha512Update(wc_Sha512* sha512, const byte* data, word32 len) static WC_INLINE int Sha512Final(wc_Sha512* sha512) { -#ifndef WOLFSSL_ARMASM - int ret; -#endif + int ret = 0; byte* local; if (sha512 == NULL) { @@ -2042,7 +2135,9 @@ static WC_INLINE int Sha512Final(wc_Sha512* sha512) #endif /* LITTLE_ENDIAN_ORDER */ #ifdef WOLFSSL_ARMASM - Transform_Sha512(sha512, (const byte*)sha512->buffer); + ret = Transform_Sha512(sha512, (const byte*)sha512->buffer); + if (ret != 0) + return ret; #else #if defined(WOLFSSL_USE_ESP32_CRYPT_HASH_HW) && \ !defined(NO_WOLFSSL_ESP32_CRYPT_HASH_SHA512) @@ -2123,7 +2218,9 @@ static WC_INLINE int Sha512Final(wc_Sha512* sha512) #endif #ifdef WOLFSSL_ARMASM - Transform_Sha512(sha512, (const byte*)sha512->buffer); + ret = Transform_Sha512(sha512, (const byte*)sha512->buffer); + if (ret != 0) + return ret; #else #if !defined(WOLFSSL_ESP32_CRYPT) || \ defined(NO_WOLFSSL_ESP32_CRYPT_HASH) || \ diff --git a/wolfssl/wolfcrypt/cpuid.h b/wolfssl/wolfcrypt/cpuid.h index bb7e68436b4..d7ec162d195 100644 --- a/wolfssl/wolfcrypt/cpuid.h +++ b/wolfssl/wolfcrypt/cpuid.h @@ -90,16 +90,18 @@ typedef word32 cpuid_flags_t; #define CPUID_SM3 0x0040 /* SM3 digest */ #define CPUID_SM4 0x0080 /* SM4 
enc/dec */ #define CPUID_SB 0x0100 /* Speculation barrier */ - - #define IS_AARCH64_AES(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_AES) - #define IS_AARCH64_PMULL(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_PMULL) - #define IS_AARCH64_SHA256(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_SHA256) - #define IS_AARCH64_SHA512(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_SHA512) - #define IS_AARCH64_RDM(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_RDM) - #define IS_AARCH64_SHA3(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_SHA3) - #define IS_AARCH64_SM3(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_SM3) - #define IS_AARCH64_SM4(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_SM4) - #define IS_AARCH64_SB(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_SB) + #define CPUID_ASIMD 0x0200 /* ASIMD - NEON */ + + #define IS_AARCH64_AES(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_AES) + #define IS_AARCH64_PMULL(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_PMULL) + #define IS_AARCH64_SHA256(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_SHA256) + #define IS_AARCH64_SHA512(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_SHA512) + #define IS_AARCH64_RDM(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_RDM) + #define IS_AARCH64_SHA3(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_SHA3) + #define IS_AARCH64_SM3(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_SM3) + #define IS_AARCH64_SM4(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_SM4) + #define IS_AARCH64_SB(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_SB) + #define IS_AARCH64_ASIMD(f) (WOLFSSL_ATOMIC_COERCE_UINT(f) & CPUID_ASIMD) #endif diff --git a/wolfssl/wolfcrypt/poly1305.h b/wolfssl/wolfcrypt/poly1305.h index ffae004fe49..72cd20148f7 100644 --- a/wolfssl/wolfcrypt/poly1305.h +++ b/wolfssl/wolfcrypt/poly1305.h @@ -63,6 +63,18 @@ #define POLY130532 #endif +/* The aarch64 Poly1305 assembly is NEON-only. Provide a software fallback: + * dispatch on ASIMD at runtime when NEON is built in, or use only the C path + * when NEON is disabled at build time. 
*/ +#if defined(WOLFSSL_ARMASM) && defined(__aarch64__) + #ifdef WOLFSSL_ARMASM_NO_NEON + #define WOLFSSL_ARM_POLY1305_C_ONLY + #else + #define WOLFSSL_ARM_POLY1305_NEON_FALLBACK + #endif + #define WOLFSSL_ARM_POLY1305_NEED_C +#endif + enum { POLY1305 = 7, POLY1305_BLOCK_SIZE = 16, @@ -100,6 +112,14 @@ typedef struct Poly1305 { word64 leftover; unsigned char buffer[POLY1305_BLOCK_SIZE]; unsigned char finished; +#ifdef WOLFSSL_ARM_POLY1305_NEED_C + /* Software fallback state (radix 2^44), used when NEON is unavailable. + * Appended after the assembly state so the asm field offsets are unchanged. + */ + word64 c_r[3]; + word64 c_h[3]; + word64 c_pad[2]; +#endif #elif defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_ARMASM_THUMB2) && \ !defined(WOLFSSL_ARMASM_NO_NEON) /* NEON implementation for ARM32 */