From 064812cc1521e21f43badda4b98c977a1aca3a9a Mon Sep 17 00:00:00 2001
From: kaleb-himes <kaleb@wolfssl.com>
Date: Mon, 22 Jun 2026 13:41:30 -0600
Subject: [PATCH] Phase 3: Security and FIPS Compliance Audit

---
 IDE/WIN-SRTP-KDF-140-3/test.vcxproj         |  10 +
 IDE/WIN-SRTP-KDF-140-3/wolfssl-fips.vcxproj |  10 +
 configure.ac                                |  48 +-
 fips-hash.sh                                |   6 +-
 linuxkm/Kbuild                              |  37 +-
 linuxkm/Makefile                            |  28 +-
 linuxkm/linuxkm_memory.c                    |  14 +
 linuxkm/linuxkm_memory.h                    |   8 +
 linuxkm/linuxkm_wc_port.h                   |  98 ++--
 linuxkm/module_hooks.c                      |  16 +-
 linuxkm/pie_redirect_table.c                |  80 ++-
 linuxkm/x86_vector_register_glue.c          |  36 +-
 src/include.am                              |  65 ++-
 tests/api/test_aes.c                        |  13 +-
 tests/api/test_evp_pkey.c                   |   4 +-
 tests/api/test_ossl_rsa.c                   |  10 +-
 tests/api/test_slhdsa.c                     |   8 +-
 wolfcrypt/benchmark/fips_cast_bench.c       | 363 ++++++++++++++
 wolfcrypt/benchmark/include.am              |  10 +
 wolfcrypt/src/aes.c                         | 214 +++++++-
 wolfcrypt/src/aes_asm.S                     |  15 +-
 wolfcrypt/src/aes_xts_asm.S                 | 512 ++++++++++++++++++++
 wolfcrypt/src/cpuid.c                       |  16 +
 wolfcrypt/src/dh.c                          |  32 +-
 wolfcrypt/src/error.c                       |  15 +
 wolfcrypt/src/ge_operations.c               |   9 +-
 wolfcrypt/src/port/arm/armv8-32-sha3-asm.S  |  10 +
 wolfcrypt/src/random.c                      | 142 +++++-
 wolfcrypt/src/rsa.c                         |  24 +-
 wolfcrypt/src/sha256.c                      |  30 +-
 wolfcrypt/src/sha512.c                      |  17 +
 wolfcrypt/src/wc_lms.c                      |   8 +
 wolfcrypt/src/wc_lms_impl.c                 |  14 +-
 wolfcrypt/src/wc_mldsa.c                    | 120 ++++-
 wolfcrypt/src/wc_mlkem.c                    | 123 +++--
 wolfcrypt/src/wc_mlkem_poly.c               |   9 +
 wolfcrypt/src/wc_slhdsa.c                   | 173 ++++++-
 wolfcrypt/src/wc_xmss.c                     |   8 +
 wolfcrypt/src/wc_xmss_impl.c                |   9 +
 wolfcrypt/test/test.c                       | 310 +++++++++++-
 wolfssl/wolfcrypt/aes.h                     |   8 +-
 wolfssl/wolfcrypt/error-crypt.h             |  14 +-
 wolfssl/wolfcrypt/fips_test.h               |  30 +-
 wolfssl/wolfcrypt/random.h                  |  22 +-
 wolfssl/wolfcrypt/settings.h                |  21 +
 45 files changed, 2529 insertions(+), 240 deletions(-)
 create mode 100644 wolfcrypt/benchmark/fips_cast_bench.c
diff --git a/IDE/WIN-SRTP-KDF-140-3/test.vcxproj b/IDE/WIN-SRTP-KDF-140-3/test.vcxproj
index a41ff9ac49f..2429f2fe9bb 100644
--- a/IDE/WIN-SRTP-KDF-140-3/test.vcxproj
+++ b/IDE/WIN-SRTP-KDF-140-3/test.vcxproj
@@ -162,7 +162,13 @@
       <OptimizeReferences>true</OptimizeReferences>
       <EnableCOMDATFolding>true</EnableCOMDATFolding>
       <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+      <!-- FIPS in-core integrity (WIN-SRTP-KDF README): ASLR OFF so every
+           build loads at a fixed base and the in-core hash stays stable.
+           Mirrors the Release|x64 config; FixedBaseAddress (/FIXED) strips
+           the .reloc section so the i386 image is pinned at its preferred
+           ImageBase (0x00400000), matching the proven x86_64 (U4) behavior. -->
       <RandomizedBaseAddress>false</RandomizedBaseAddress>
+      <FixedBaseAddress>true</FixedBaseAddress>
     </Link>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
@@ -177,6 +183,10 @@
     </ClCompile>
     <Link>
       <GenerateDebugInformation>true</GenerateDebugInformation>
+      <!-- FIPS in-core integrity (WIN-SRTP-KDF README): ASLR OFF so every
+           build loads at a fixed base and the in-core hash stays stable. -->
+      <RandomizedBaseAddress>false</RandomizedBaseAddress>
+      <FixedBaseAddress>true</FixedBaseAddress>
       <SubSystem>Console</SubSystem>
       <AdditionalDependencies>ws2_32.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
       <OptimizeReferences>true</OptimizeReferences>
diff --git a/IDE/WIN-SRTP-KDF-140-3/wolfssl-fips.vcxproj b/IDE/WIN-SRTP-KDF-140-3/wolfssl-fips.vcxproj
index 65bb39fffa2..390b38f0e92 100644
--- a/IDE/WIN-SRTP-KDF-140-3/wolfssl-fips.vcxproj
+++ b/IDE/WIN-SRTP-KDF-140-3/wolfssl-fips.vcxproj
@@ -314,6 +314,16 @@
     <ClCompile Include="..\..\src\dtls13.c" />
     <ClCompile Include="..\..\src\dtls.c" />
     <ClCompile Include="..\..\wolfcrypt\src\wc_encrypt.c" />
+    <!-- FIPS v7.0.0 post-quantum module sources (FIPS 203/204/205, SP 800-208).
+         In-boundary placement is by .fipsA$/.fipsB$ section, not project order. -->
+    <ClCompile Include="..\..\wolfcrypt\src\wc_mlkem.c" />
+    <ClCompile Include="..\..\wolfcrypt\src\wc_mlkem_poly.c" />
+    <ClCompile Include="..\..\wolfcrypt\src\wc_mldsa.c" />
+    <ClCompile Include="..\..\wolfcrypt\src\wc_lms.c" />
+    <ClCompile Include="..\..\wolfcrypt\src\wc_lms_impl.c" />
+    <ClCompile Include="..\..\wolfcrypt\src\wc_xmss.c" />
+    <ClCompile Include="..\..\wolfcrypt\src\wc_xmss_impl.c" />
+    <ClCompile Include="..\..\wolfcrypt\src\wc_slhdsa.c" />
     <ClCompile Include="..\..\wolfcrypt\src\wolfcrypt_last.c" />
     <ClCompile Include="..\..\wolfcrypt\src\wolfmath.c" />
     <ClCompile Include="..\..\wolfcrypt\src\wolfevent.c" />
diff --git a/configure.ac b/configure.ac
index c1d2dd089ef..82af885af65 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3777,13 +3777,22 @@ then
             AC_MSG_NOTICE([32bit ARMv4 found])
             ;;
         *)
-            AM_CPPFLAGS="$AM_CPPFLAGS -mfpu=crypto-neon-fp-armv8 -marm"
+            # AArch32 ARMv8 crypto-extension asm (armv8-32-*-asm.S: sha256h,
+            # aese/aesmc, pmull) needs an explicit -march=armv8-a+crypto.  The
+            # ARMv8-A crypto extension is OPTIONAL, so the SHA/AES instructions
+            # are gated by the "+crypto" arch feature -- NOT enabled by -mfpu
+            # alone, and NOT by a bare -march=armv8-a (nor -mcpu=cortex-a53 on
+            # some toolchains).  Cross toolchains whose default -mcpu is ARMv7
+            # (e.g. Xilinx Vitis cortex-a9) otherwise reject them with
+            # "selected processor does not support sha256h.32 in ARM mode".
+            # Mirrors the in-kernel ARM armasm enablement (port/arm/*.S crypto).
+            AM_CPPFLAGS="$AM_CPPFLAGS -march=armv8-a+crypto -mfpu=crypto-neon-fp-armv8 -marm"
             # Include options.h
             AM_CCASFLAGS="$AM_CCASFLAGS -DEXTERNAL_OPTS_OPENVPN"
             ENABLED_ARMASM_CRYPTO=yes
             ENABLED_ARMASM_NEON=yes
             ENABLED_ARM_32=yes
-            AC_MSG_NOTICE([32bit ARMv8 found, setting mfpu to crypto-neon-fp-armv8])
+            AC_MSG_NOTICE([32bit ARMv8 found, setting -march=armv8-a+crypto + mfpu=crypto-neon-fp-armv8])
             ;;
         esac
     esac
@@ -6315,13 +6324,7 @@ AS_CASE([$FIPS_VERSION],
             -DWC_RSA_NO_PADDING \
             -DECC_USER_CURVES \
             -DHAVE_ECC384 \
-            -DHAVE_ECC521 \
-            -DWOLFSSL_VALIDATE_FFC_IMPORT \
-            -DHAVE_FFDHE_Q \
-            -DHAVE_FFDHE_3072 \
-            -DHAVE_FFDHE_4096 \
-            -DHAVE_FFDHE_6144 \
-            -DHAVE_FFDHE_8192"
+            -DHAVE_ECC521"
 
         # KCAPI API does not support custom k for sign, don't force enable ECC key sizes and don't use seed callback
         AS_IF([test "x$ENABLED_KCAPI_ECC" = "xno"],
@@ -6335,6 +6338,20 @@ AS_CASE([$FIPS_VERSION],
                 -DHAVE_ECC256"])
 
         DEFAULT_MAX_CLASSIC_ASYM_KEY_BITS=8192
+
+# Classic DH and DSA are OUT OF SCOPE for the FIPS 140-3 v7 PQ module.
+# (FIPS 186-5 retires DSA; v7 boundary keeps only ECDH/ECDSA + PQ KEM/DSA.)
+# Hard-error if explicitly enabled; otherwise force off and add NO_DH/NO_DSA.
+        AS_IF([test "$enable_dh" = "yes"],
+            [AC_MSG_ERROR([--enable-dh is not supported with --enable-fips=$FIPS_VERSION. Classic finite-field DH is out of scope for the FIPS 140-3 v7 PQ module. Use --enable-fips=v6 if you need DH support.])],
+            [test "$ENABLED_DH" != "no"],
+            [ENABLED_DH="no"; enable_dh="no"; AM_CFLAGS="$AM_CFLAGS -DNO_DH"])
+
+        AS_IF([test "$enable_dsa" = "yes"],
+            [AC_MSG_ERROR([--enable-dsa is not supported with --enable-fips=$FIPS_VERSION. DSA is retired by FIPS 186-5 and is out of scope for the FIPS 140-3 v7 PQ module. Use --enable-fips=v6 if you need DSA support.])],
+            [test "$ENABLED_DSA" != "no"],
+            [ENABLED_DSA="no"; enable_dsa="no"; AM_CFLAGS="$AM_CFLAGS -DNO_DSA"])
+
 # optimizations section
 
 # protocol section
@@ -8889,8 +8906,17 @@ then
     fi
     if test "x$ENABLED_DH" = "xno"
     then
-        ENABLED_DH="yes"
-        AM_CFLAGS="$AM_CFLAGS -DHAVE_DH"
+        # Classic DH is out of scope for the FIPS 140-3 v7 PQ module.
+        # JNI normally auto-enables DH for legacy TLS suites; with FIPS v7+
+        # we report and skip the auto-enable rather than silently turning DH
+        # back on (which would conflict with the boundary).
+        if test "$FIPS_VERSION" = "v7" || test "$FIPS_VERSION" = "ready" || test "$FIPS_VERSION" = "dev"
+        then
+            AC_MSG_NOTICE([JNI enabled but FIPS is $FIPS_VERSION, NOT turning on DH with this module])
+        else
+            ENABLED_DH="yes"
+            AM_CFLAGS="$AM_CFLAGS -DHAVE_DH"
+        fi
     fi
     if test "x$ENABLED_PSK" = "xno"
     then
diff --git a/fips-hash.sh b/fips-hash.sh
index 36f320c0bbd..8f8a1a86317 100755
--- a/fips-hash.sh
+++ b/fips-hash.sh
@@ -13,7 +13,11 @@ then
 fi
 
 OUT=$(./wolfcrypt/test/testwolfcrypt | sed -n 's/hash = \(.*\)/\1/p')
-NEWHASH=$(echo "$OUT" | cut -c1-64)
+# FIPS v7.0.0+ uses HMAC-SHA-512 (128 hex chars); older FIPS versions
+# use HMAC-SHA-256 (64 hex chars).  Take the whole captured hash; the
+# static_assert on sizeof(verifyCore) guards against wrong length at
+# compile time after this script runs.
+NEWHASH=$(echo "$OUT" | head -n1 | tr -d '[:space:]')
 if test -n "$NEWHASH"
 then
     cp wolfcrypt/src/fips_test.c wolfcrypt/src/fips_test.c.bak
diff --git a/linuxkm/Kbuild b/linuxkm/Kbuild
index fe3f823942f..eb14106f5e8 100644
--- a/linuxkm/Kbuild
+++ b/linuxkm/Kbuild
@@ -99,6 +99,18 @@ $(LIBWOLFSSL_NAME)-y := $(WOLFSSL_OBJ_FILES) linuxkm/module_hooks.o linuxkm/modu
 
 ifeq "$(FIPS_OPTEST)" "1"
     $(LIBWOLFSSL_NAME)-y += linuxkm/optest-140-3/linuxkm_optest_wrapper.o
+    # The optest TEST wrapper (#includes test.c / invalid_tests.c) aggregates
+    # several AES contexts per invalid-input test function.  Under WOLFSSL_AESNI
+    # the Aes struct carries an inline ALIGN16 streamData[5*WC_AES_BLOCK_SIZE]
+    # (wolfssl/wolfcrypt/aes.h) plus use_aesni, so a handful of those functions
+    # (aes_{,mac_,ofb_,cfb_,kw_}invalid_data_tests) exceed the conservative i386
+    # THREAD_SIZE/4 = 2048 frame *warning*.  They build cleanly on x86_64 (whose
+    # THREAD_SIZE/4 = 4096 already accommodates them) and run in a kernel thread
+    # well within THREAD_SIZE.  Relax the cap to 4096 for the wrapper ONLY: it is
+    # test/evidence tooling OUTSIDE the FIPS module boundary, so the FIPS module
+    # objects keep the strict MAX_STACK_FRAME_SIZE.  No effect on x86_64 (its
+    # default is already 4096).
+    $(obj)/linuxkm/optest-140-3/linuxkm_optest_wrapper.o: ccflags-y += -Wframe-larger-than=4096
 endif
 
 WOLFSSL_CFLAGS_NO_VECTOR_INSNS := $(CFLAGS_SIMD_DISABLE) $(CFLAGS_FPU_DISABLE)
@@ -127,6 +139,10 @@ ifeq "$(ENABLED_LINUXKM_PIE)" "yes"
           endif
         endif
       endif
+      ifeq ($(KERNEL_ARCH),i386)
+        NO_PIE_FLAG := 1
+        $(info Note: disabling -fPIE on 32-bit x86 -- i386 -fPIE routes every local symbol through the GOT (R_386_GOTOFF), which the wolfCrypt PIE containerization forbids.)
+      endif
     endif
 
     ifdef NO_PIE_FLAG
@@ -217,6 +233,25 @@ $(obj)/wolfcrypt/src/wc_mlkem_asm.o: OBJECT_FILES_NON_STANDARD := y
 $(obj)/wolfcrypt/src/wc_mldsa_asm.o: asflags-y := $(WOLFSSL_ASFLAGS) $(ASFLAGS_FPU_DISABLE_SIMD_ENABLE)
 $(obj)/wolfcrypt/src/wc_mldsa_asm.o: OBJECT_FILES_NON_STANDARD := y
 
+# ARM/ARM64 crypto+NEON asm (wolfcrypt/src/port/arm/*.S) needs the crypto/NEON
+# -march enabled at assembly time.  The wolfSSL ARM asm carries no .arch/.fpu
+# directives, and configure leaves ASFLAGS_*_SIMD_ENABLE empty on ARM: the
+# userspace build inherits +crypto from the toolchain's default -mcpu (e.g.
+# cortex-a72), but the kernel build forces its own baseline -march without it,
+# so the AES/SHA/PMULL instructions are rejected ("selected processor does not
+# support `aesd ...'").  Supply the right -march here per kernel arch.  (The
+# wrong-arch port/arm files are #ifdef'd to empty objects, so the flag is a
+# no-op for them; OBJECT_FILES_NON_STANDARD silences objtool on the hand asm.)
+ifeq ($(CONFIG_ARM64),y)
+    WOLFSSL_ARM_ASM_MARCH := -march=armv8-a+crypto
+else ifeq ($(CONFIG_ARM),y)
+    WOLFSSL_ARM_ASM_MARCH := -march=armv8-a -mfpu=crypto-neon-fp-armv8
+endif
+ifdef WOLFSSL_ARM_ASM_MARCH
+$(obj)/wolfcrypt/src/port/arm/%.o: asflags-y := $(WOLFSSL_ASFLAGS) $(WOLFSSL_ARM_ASM_MARCH)
+$(obj)/wolfcrypt/src/port/arm/%.o: OBJECT_FILES_NON_STANDARD := y
+endif
+
 ifndef READELF
     READELF := readelf
 endif
@@ -325,7 +360,7 @@ RENAME_PIE_TEXT_AND_DATA_SECTIONS := \
 			next;						\
 		    }							\
 		    else if ($$4 == "OBJECT") {				\
-			if (! ($$7 in wolfcrypt_data_sections)) {	\
+			if (! ($$7 in wolfcrypt_data_sections) && ! ($$7 in wolfcrypt_text_sections)) {	\
 			    if ((other_sections[$$7] == ".printk_index") || \
 				(($$8 ~ /^_entry\.[0-9]+$$|^kernel_read_file_str$$/) &&		\
 				 (other_sections[$$7] == ".data.rel.ro.local"))) \
diff --git a/linuxkm/Makefile b/linuxkm/Makefile
index 24a867b9356..1edb65bfa91 100644
--- a/linuxkm/Makefile
+++ b/linuxkm/Makefile
@@ -45,7 +45,17 @@ ifndef SRC_TOP
     SRC_TOP=$(shell dirname $(MODULE_TOP))
 endif
 
-WOLFSSL_CFLAGS=-DHAVE_CONFIG_H -I$(SRC_TOP) -DBUILDING_WOLFSSL $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -Wno-declaration-after-statement -Wno-redundant-decls -DLIBWOLFSSL_GLOBAL_EXTRA_CFLAGS="\" $(KERNEL_EXTRA_CFLAGS)\""
+# -Wno-nested-externs: the Linux kernel's compile-time-assert machinery
+# (<linux/compiler_types.h> _compiletime_assert, reached via the atomic / per-CPU /
+# printk-once macros used in linuxkm/x86_vector_register_glue.c) expands to an
+# "extern void __compiletime_assert_N(void)" declaration *inside* a function body.
+# That is by-design kernel code, but trips wolfSSL's -Wnested-externs -> with
+# -Werror it breaks the i386 + AES-NI kernel build (the glue is only compiled when
+# WOLFSSL_USE_SAVE_VECTOR_REGISTERS is set, i.e. with a PAA enabled).  Suppress it
+# for the whole linuxkm build, same as the two kernel-incompatible warnings already
+# stripped here.  This only silences a diagnostic -> emitted object code (and the
+# FIPS in-core hash) is byte-identical on every arch.
+WOLFSSL_CFLAGS=-DHAVE_CONFIG_H -I$(SRC_TOP) -DBUILDING_WOLFSSL $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -Wno-declaration-after-statement -Wno-redundant-decls -Wno-nested-externs -DLIBWOLFSSL_GLOBAL_EXTRA_CFLAGS="\" $(KERNEL_EXTRA_CFLAGS)\""
 ifdef KERNEL_EXTRA_CFLAGS
     WOLFSSL_CFLAGS += $(KERNEL_EXTRA_CFLAGS)
 endif
@@ -55,7 +65,13 @@ endif
 
 WOLFSSL_ASFLAGS=-DHAVE_CONFIG_H -I$(SRC_TOP) -DBUILDING_WOLFSSL $(AM_CCASFLAGS) $(CCASFLAGS)
 
-WOLFSSL_OBJ_FILES=$(patsubst %.lo, %.o, $(patsubst src/src_libwolfssl_la-%, src/%, $(patsubst src/libwolfssl_la-%, src/%, $(patsubst wolfcrypt/src/src_libwolfssl_la-%, wolfcrypt/src/%, $(src_libwolfssl_la_OBJECTS)))))
+# Strip libtool's per-target object prefix (src_libwolfssl_la-) so Kbuild sees the
+# real source-derived object names.  The innermost patsubst handles the ARM asm
+# under wolfcrypt/src/port/arm/ (armv8-*/armv8-32-*/thumb2-*), which the
+# directory-specific src/ and wolfcrypt/src/ patsubsts below do NOT match --
+# without it, --enable-armasm kernel builds fail with "No rule to make target
+# .../src_libwolfssl_la-armv8-aes-asm.o".
+WOLFSSL_OBJ_FILES=$(patsubst %.lo, %.o, $(patsubst src/src_libwolfssl_la-%, src/%, $(patsubst src/libwolfssl_la-%, src/%, $(patsubst wolfcrypt/src/src_libwolfssl_la-%, wolfcrypt/src/%, $(patsubst wolfcrypt/src/port/arm/src_libwolfssl_la-%, wolfcrypt/src/port/arm/%, $(src_libwolfssl_la_OBJECTS))))))
 
 ifeq "$(ENABLED_CRYPT_TESTS)" "yes"
     WOLFSSL_OBJ_FILES+=wolfcrypt/test/test.o
@@ -197,7 +213,7 @@ GENERATE_RELOC_TAB := $(AWK) '								\
 		next;									\
 	}										\
 	/^0/ {										\
-		if ($$3 !~ "^(R_X86_.*|R_AARCH64_.*|R_ARM.*)$$") {			\
+		if ($$3 !~ "^(R_X86_.*|R_386_.*|R_AARCH64_.*|R_ARM.*)$$") {		\
 		    print "Unexpected relocation type in " cur_seg ":\n" $$0 >"/dev/stderr";		\
 		    ++bad_relocs;							\
 		}									\
@@ -361,12 +377,12 @@ module-update-fips-hash: $(LIBWOLFSSL_NAME).ko
 	readarray -t verifyCore_attrs < <($(READELF) --wide --symbols "$<" | \
 	    sed -E -n 's/^[[:space:]]*[0-9]+: ([0-9a-fA-F]+)[[:space:]]+([0-9]+)[[:space:]]+OBJECT[[:space:]]+[A-Z]+[[:space:]]+[A-Z]+[[:space:]]+'"$${rodata_segment[0]}"'[[:space:]]+verifyCore$$/\1\n\2/p'); \
 	if [[ $${#verifyCore_attrs[@]} != 2 ]]; then echo '  unexpected verifyCore_attrs.' >&2; exit 1; fi; \
-	if [[ "$${verifyCore_attrs[1]}" != "65" ]]; then echo "	 verifyCore has unexpected length $${verifyCore_attrs[1]}." >&2; exit 1; fi; \
+	if [[ "$${verifyCore_attrs[1]}" != "129" ]]; then echo "	 verifyCore has unexpected length $${verifyCore_attrs[1]}." >&2; exit 1; fi; \
 	verifyCore_offset=$$((0x$${rodata_segment[1]} + 0x$${verifyCore_attrs[0]})); \
-	current_verifyCore=$$(dd bs=1 if="$<" skip=$$verifyCore_offset count=64 status=none); \
+	current_verifyCore=$$(dd bs=1 if="$<" skip=$$verifyCore_offset count=128 status=none); \
 	if [[ ! "$$current_verifyCore" =~ [0-9a-fA-F]{64} ]]; then echo "  verifyCore at offset $$verifyCore_offset has unexpected value." >&2; exit 1; fi; \
 	if [[ '$(FIPS_HASH)' == "$$current_verifyCore" ]]; then echo '	Supplied FIPS_HASH matches existing verifyCore -- no update needed.'; exit 0; fi; \
-	echo -n '$(FIPS_HASH)' | dd bs=1 conv=notrunc of="$<" seek=$$verifyCore_offset count=64 status=none && \
+	echo -n '$(FIPS_HASH)' | dd bs=1 conv=notrunc of="$<" seek=$$verifyCore_offset count=128 status=none && \
 	echo "	FIPS verifyCore updated successfully." && \
             if [[ -f '$(LIBWOLFSSL_NAME).ko.signed' ]]; then $(MAKE) $(QFLAG) --no-print-directory --no-silent -C . '$(LIBWOLFSSL_NAME).ko.signed'; fi
 
diff --git a/linuxkm/linuxkm_memory.c b/linuxkm/linuxkm_memory.c
index 32aa241404a..15cb32ad918 100644
--- a/linuxkm/linuxkm_memory.c
+++ b/linuxkm/linuxkm_memory.c
@@ -52,6 +52,8 @@ static const struct reloc_layout_ent {
     [WC_R_X86_64_64]                  = { "R_X86_64_64",                                                ~0UL, 64, .is_signed = 0, .is_relative = 0 },
     [WC_R_X86_64_PC32]                = { "R_X86_64_PC32",                                              ~0UL, 32, .is_signed = 1, .is_relative = 1 },
     [WC_R_X86_64_PLT32]               = { "R_X86_64_PLT32",                                             ~0UL, 32, .is_signed = 1, .is_relative = 1 },
+    [WC_R_386_32]                     = { "R_386_32",                                                   ~0UL, 32, .is_signed = 0, .is_relative = 0 },
+    [WC_R_386_PC32]                   = { "R_386_PC32",                                                 ~0UL, 32, .is_signed = 1, .is_relative = 1 },
     [WC_R_AARCH64_ABS32]              = { "R_AARCH64_ABS32",                                            ~0UL, 32, .is_signed = 1, .is_relative = 0, .is_pages = 0, .is_pair_lo = 0, .is_pair_hi = 0 },
     [WC_R_AARCH64_ABS64]              = { "R_AARCH64_ABS64",                                            ~0UL, 64, .is_signed = 1, .is_relative = 0, .is_pages = 0, .is_pair_lo = 0, .is_pair_hi = 0 },
     [WC_R_AARCH64_ADD_ABS_LO12_NC]    = { "R_AARCH64_ADD_ABS_LO12_NC",    0b00000000001111111111110000000000, 32, .is_signed = 0, .is_relative = 0, .is_pages = 0, .is_pair_lo = 1, .is_pair_hi = 0 },
@@ -64,6 +66,10 @@ static const struct reloc_layout_ent {
     [WC_R_AARCH64_LDST64_ABS_LO12_NC] = { "R_AARCH64_LDST64_ABS_LO12_NC", 0b00000000001111111111110000000000, 32, .is_signed = 0, .is_relative = 0, .is_pages = 0, .is_pair_lo = 1, .is_pair_hi = 0 },
     [WC_R_AARCH64_PREL32]             = { "R_AARCH64_PREL32",                                           ~0UL, 32, .is_signed = 1, .is_relative = 1, .is_pages = 0, .is_pair_lo = 0, .is_pair_hi = 0 },
     [WC_R_ARM_ABS32]                  = { "R_ARM_ABS32",                                                ~0UL, 32, .is_signed = 0, .is_relative = 0, .is_pages = 0, .is_pair_lo = 0, .is_pair_hi = 0 },
+    /* ARM-mode BL/B: signed 24-bit word offset in bits [23:0] (cf. AARCH64_CALL26's
+     * 26-bit field).  Emitted by the arm32 ARM-mode (non-Thumb) kernel module build. */
+    [WC_R_ARM_CALL]                   = { "R_ARM_CALL",                   0b00000000111111111111111111111111, 32, .is_signed = 1, .is_relative = 1, .is_pages = 0, .is_pair_lo = 0, .is_pair_hi = 0 },
+    [WC_R_ARM_JUMP24]                 = { "R_ARM_JUMP24",                 0b00000000111111111111111111111111, 32, .is_signed = 1, .is_relative = 1, .is_pages = 0, .is_pair_lo = 0, .is_pair_hi = 0 },
     [WC_R_ARM_PREL31]                 = { "R_ARM_PREL31",                 0b01111111111111111111111111111111, 32, .is_signed = 1, .is_relative = 1, .is_pages = 0, .is_pair_lo = 0, .is_pair_hi = 0 },
     [WC_R_ARM_REL32]                  = { "R_ARM_REL32",                                                ~0UL, 32, .is_signed = 1, .is_relative = 1, .is_pages = 0, .is_pair_lo = 0, .is_pair_hi = 0 },
     [WC_R_ARM_THM_CALL]               = { "R_ARM_THM_CALL",               0b00000111111111110010111111111111, 32, .is_signed = 1, .is_relative = 1, .is_pages = 0, .is_pair_lo = 0, .is_pair_hi = 0 },
@@ -363,6 +369,12 @@ ssize_t wc_reloc_normalize_segment(
         case WC_R_X86_64_32:
         case WC_R_X86_64_32S:
         case WC_R_X86_64_64:
+        /* i386 shares the x86_64 normalization: R_386_32 is absolute
+         * (is_relative=0), R_386_PC32 is PC-relative (is_relative=1).  The
+         * math below is driven by layout->is_relative/is_signed and is
+         * width-correct via uintptr_t (32-bit on i386). */
+        case WC_R_386_32:
+        case WC_R_386_PC32:
 
             if (dest_seg != WC_R_SEG_OTHER) {
 #ifdef DEBUG_LINUXKM_PIE_SUPPORT
@@ -410,6 +422,8 @@ ssize_t wc_reloc_normalize_segment(
             break;
 
         case WC_R_ARM_ABS32:
+        case WC_R_ARM_CALL:
+        case WC_R_ARM_JUMP24:
         case WC_R_ARM_PREL31:
         case WC_R_ARM_REL32:
         case WC_R_ARM_THM_CALL:
diff --git a/linuxkm/linuxkm_memory.h b/linuxkm/linuxkm_memory.h
index 76e681da805..d5111613d28 100644
--- a/linuxkm/linuxkm_memory.h
+++ b/linuxkm/linuxkm_memory.h
@@ -40,6 +40,12 @@ enum wc_reloc_type {
     WC_R_X86_64_64,
     WC_R_X86_64_PC32,
     WC_R_X86_64_PLT32,
+    /* 32-bit x86 (i386).  With NO_PIE_FLAG the wolfCrypt container emits only
+     * R_386_32 (absolute) and R_386_PC32 (PC-relative); these are semantically
+     * identical to R_X86_64_32 / R_X86_64_PC32 and share their canonicalization
+     * case below. */
+    WC_R_386_32,
+    WC_R_386_PC32,
     WC_R_AARCH64_ABS32,
     WC_R_AARCH64_ABS64,
     WC_R_AARCH64_ADD_ABS_LO12_NC,
@@ -52,6 +58,8 @@ enum wc_reloc_type {
     WC_R_AARCH64_LDST64_ABS_LO12_NC,
     WC_R_AARCH64_PREL32,
     WC_R_ARM_ABS32,
+    WC_R_ARM_CALL,
+    WC_R_ARM_JUMP24,
     WC_R_ARM_PREL31,
     WC_R_ARM_REL32,
     WC_R_ARM_THM_CALL,
diff --git a/linuxkm/linuxkm_wc_port.h b/linuxkm/linuxkm_wc_port.h
index 2a47722ad02..6f1f1881fef 100644
--- a/linuxkm/linuxkm_wc_port.h
+++ b/linuxkm/linuxkm_wc_port.h
@@ -682,8 +682,12 @@
         #define WOLFSSL_USE_SAVE_VECTOR_REGISTERS
     #endif
 
+    /* x86 (kernel_fpu_*) and ARM/ARM64 (kernel_neon_*) share the same
+     * arch-neutral save/restore tracker in x86_vector_register_glue.c; the glue
+     * functions keep their historical wc_*_x86 names on all three arches (they
+     * are outside-boundary glue reached via the PIE redirect table). */
     #if defined(WOLFSSL_USE_SAVE_VECTOR_REGISTERS) && \
-        defined(CONFIG_X86)
+        (defined(CONFIG_X86) || defined(CONFIG_ARM) || defined(CONFIG_ARM64))
 
         extern __must_check int allocate_wolfcrypt_linuxkm_fpu_states(void);
         extern void free_wolfcrypt_linuxkm_fpu_states(void);
@@ -691,18 +695,23 @@
         WOLFSSL_API __must_check int wc_save_vector_registers_x86(enum wc_svr_flags flags);
         WOLFSSL_API void wc_restore_vector_registers_x86(enum wc_svr_flags flags);
 
-        #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 0, 0)
-            #include <asm/i387.h>
-            #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0)
-                /* added by a62b01cd6c */
-                #include <asm-generic/simd.h>
-            #endif
-        #else
-            #include <asm/simd.h>
-            #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0)
-                /* added by 266d051601 */
-                #include <crypto/internal/simd.h>
+        #ifdef CONFIG_X86
+            #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 0, 0)
+                #include <asm/i387.h>
+                #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0)
+                    /* added by a62b01cd6c */
+                    #include <asm-generic/simd.h>
+                #endif
+            #else
+                #include <asm/simd.h>
+                #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0)
+                    /* added by 266d051601 */
+                    #include <crypto/internal/simd.h>
+                #endif
             #endif
+        #else /* CONFIG_ARM || CONFIG_ARM64 */
+            #include <asm/simd.h>   /* may_use_simd() */
+            #include <asm/neon.h>   /* kernel_neon_begin() / kernel_neon_end() */
         #endif
         #ifndef CAN_SAVE_VECTOR_REGISTERS
             #ifdef DEBUG_VECTOR_REGISTER_ACCESS_FUZZING
@@ -742,42 +751,6 @@
             #define REENABLE_VECTOR_REGISTERS() wc_restore_vector_registers_x86(WC_SVR_FLAG_INHIBIT)
         #endif
 
-    #elif defined(WOLFSSL_USE_SAVE_VECTOR_REGISTERS) && (defined(CONFIG_ARM) || defined(CONFIG_ARM64))
-
-        #error kernel module ARM SIMD is not yet tested or usable.
-
-        #include <asm/fpsimd.h>
-
-        static WARN_UNUSED_RESULT inline int save_vector_registers_arm(void)
-        {
-            preempt_disable();
-            if (! may_use_simd()) {
-                preempt_enable();
-                return BAD_STATE_E;
-            } else {
-                fpsimd_preserve_current_state();
-                return 0;
-            }
-        }
-        static inline void restore_vector_registers_arm(void)
-        {
-            fpsimd_restore_current_state();
-            preempt_enable();
-        }
-
-        #ifndef SAVE_VECTOR_REGISTERS
-            #define SAVE_VECTOR_REGISTERS(fail_clause) { int _svr_ret = save_vector_registers_arm(); if (_svr_ret != 0) { fail_clause } }
-        #endif
-        #ifndef SAVE_VECTOR_REGISTERS2
-            #define SAVE_VECTOR_REGISTERS2() save_vector_registers_arm()
-        #endif
-        #ifndef CAN_SAVE_VECTOR_REGISTERS
-            #define CAN_SAVE_VECTOR_REGISTERS() can_save_vector_registers_arm()
-        #endif
-        #ifndef RESTORE_VECTOR_REGISTERS
-            #define RESTORE_VECTOR_REGISTERS() restore_vector_registers_arm()
-        #endif
-
     #elif defined(WOLFSSL_USE_SAVE_VECTOR_REGISTERS)
         #error WOLFSSL_USE_SAVE_VECTOR_REGISTERS is set for an unimplemented architecture.
     #endif /* WOLFSSL_USE_SAVE_VECTOR_REGISTERS */
@@ -958,6 +931,22 @@
     extern int memcmp(const void *s1, const void *s2, size_t n);
 #endif
 
+#ifdef CONFIG_X86_32
+    /* arch/x86/include/asm/string_32.h #defines memcpy/memcmp/memset as
+     * __builtin_* object-like macros (x86_64's string_64.h declares them as
+     * plain functions, so this does not arise on K2).  Left active, those
+     * macros expand inside the PIE redirect-table member declarations below --
+     * "typeof(memcmp) *memcmp;" becomes "... *__builtin_memcmp;" -- so the
+     * table loses its memcmp/memcpy/memset members and the downstream
+     * WC_PIE_INDIRECT_SYM(memcmp) lookups fail to compile.  #undef the macros
+     * here, before the struct; string_32.h still declares the underlying
+     * functions, so typeof() resolves and the members and redirects use the
+     * canonical names.  Mirrors the CONFIG_MIPS handling just above. */
+    #undef memcpy
+    #undef memcmp
+    #undef memset
+#endif
+
     struct wolfssl_linuxkm_pie_redirect_table {
     #ifdef HAVE_FIPS
         typeof(wc_linuxkm_normalize_relocations) *wc_linuxkm_normalize_relocations;
@@ -1091,13 +1080,13 @@
 
         #ifdef WOLFSSL_USE_SAVE_VECTOR_REGISTERS
 
-            #ifdef CONFIG_X86
+            #if defined(CONFIG_X86) || defined(CONFIG_ARM) || defined(CONFIG_ARM64)
                 typeof(allocate_wolfcrypt_linuxkm_fpu_states) *allocate_wolfcrypt_linuxkm_fpu_states;
                 typeof(wc_can_save_vector_registers_x86) *wc_can_save_vector_registers_x86;
                 typeof(free_wolfcrypt_linuxkm_fpu_states) *free_wolfcrypt_linuxkm_fpu_states;
                 typeof(wc_restore_vector_registers_x86) *wc_restore_vector_registers_x86;
                 typeof(wc_save_vector_registers_x86) *wc_save_vector_registers_x86;
-            #else /* !CONFIG_X86 */
+            #else
                 #error WOLFSSL_USE_SAVE_VECTOR_REGISTERS is set for an unimplemented architecture.
             #endif /* arch */
 
@@ -1442,7 +1431,8 @@
     #undef get_current
     #define get_current WC_PIE_INDIRECT_SYM(get_current)
 
-    #if defined(WOLFSSL_USE_SAVE_VECTOR_REGISTERS) && defined(CONFIG_X86)
+    #if defined(WOLFSSL_USE_SAVE_VECTOR_REGISTERS) && \
+        (defined(CONFIG_X86) || defined(CONFIG_ARM) || defined(CONFIG_ARM64))
         #define allocate_wolfcrypt_linuxkm_fpu_states WC_PIE_INDIRECT_SYM(allocate_wolfcrypt_linuxkm_fpu_states)
         #define wc_can_save_vector_registers_x86 WC_PIE_INDIRECT_SYM(wc_can_save_vector_registers_x86)
         #define free_wolfcrypt_linuxkm_fpu_states WC_PIE_INDIRECT_SYM(free_wolfcrypt_linuxkm_fpu_states)
@@ -1751,7 +1741,7 @@
     #if !defined(BUILDING_WOLFSSL)
         /* some caller code needs these. */
         #if defined(WOLFSSL_USE_SAVE_VECTOR_REGISTERS)
-            #if defined(CONFIG_X86)
+            #if defined(CONFIG_X86) || defined(CONFIG_ARM) || defined(CONFIG_ARM64)
                 WOLFSSL_API __must_check int wc_can_save_vector_registers_x86(void);
                 WOLFSSL_API __must_check int wc_save_vector_registers_x86(enum wc_svr_flags flags);
                 WOLFSSL_API void wc_restore_vector_registers_x86(enum wc_svr_flags flags);
@@ -1761,9 +1751,9 @@
                 #ifndef REENABLE_VECTOR_REGISTERS
                     #define REENABLE_VECTOR_REGISTERS() wc_restore_vector_registers_x86(WC_SVR_FLAG_INHIBIT)
                 #endif
-            #else /* !CONFIG_X86 */
+            #else
                 #error WOLFSSL_USE_SAVE_VECTOR_REGISTERS is set for an unimplemented architecture.
-            #endif /* !CONFIG_X86 */
+            #endif
         #endif /* WOLFSSL_USE_SAVE_VECTOR_REGISTERS */
         #ifdef WC_LINUXKM_USE_HEAP_WRAPPERS
             WOLFSSL_API extern void *wc_linuxkm_malloc(size_t size);
diff --git a/linuxkm/module_hooks.c b/linuxkm/module_hooks.c
index 66b953048e4..9a11b7007a4 100644
--- a/linuxkm/module_hooks.c
+++ b/linuxkm/module_hooks.c
@@ -527,7 +527,9 @@ int wc_linuxkm_GenerateSeed_IntelRD(struct OS_Seed* os, byte* output, word32 sz)
 
 #endif /* WC_LINUXKM_RDSEED_IN_GLUE_LAYER */
 
-#if defined(WOLFSSL_USE_SAVE_VECTOR_REGISTERS) && defined(CONFIG_X86)
+#if defined(WOLFSSL_USE_SAVE_VECTOR_REGISTERS) && \
+    (defined(CONFIG_X86) || defined(CONFIG_ARM) || defined(CONFIG_ARM64))
+    /* arch-generic save/restore tracker (kernel_fpu_* on x86, kernel_neon_* on ARM) */
     #include "linuxkm/x86_vector_register_glue.c"
 #endif
 
@@ -1516,7 +1518,8 @@ static int set_up_wolfssl_linuxkm_pie_redirect_table(void) {
 
     wolfssl_linuxkm_pie_redirect_table.get_current = my_get_current_thread;
 
-#if defined(WOLFSSL_USE_SAVE_VECTOR_REGISTERS) && defined(CONFIG_X86)
+#if defined(WOLFSSL_USE_SAVE_VECTOR_REGISTERS) && \
+    (defined(CONFIG_X86) || defined(CONFIG_ARM) || defined(CONFIG_ARM64))
     wolfssl_linuxkm_pie_redirect_table.allocate_wolfcrypt_linuxkm_fpu_states = allocate_wolfcrypt_linuxkm_fpu_states;
     wolfssl_linuxkm_pie_redirect_table.wc_can_save_vector_registers_x86 = wc_can_save_vector_registers_x86;
     wolfssl_linuxkm_pie_redirect_table.free_wolfcrypt_linuxkm_fpu_states = free_wolfcrypt_linuxkm_fpu_states;
@@ -2041,7 +2044,12 @@ static ssize_t FIPS_optest_trig_handler(struct kobject *kobj, struct kobj_attrib
     int ret;
     int argc;
     const char *argv[3];
-    char code_buf[5];
+    /* Holds the textual error code written to the sysfs node, plus a NUL.
+     * Must accommodate the v7.0.0 module's 5-character codes (e.g. "-1015"
+     * ML_KEM_PCT_E, "-1016" ML_DSA_PCT_E, "-1017" DRBG_SHA512_KAT_FIPS_E);
+     * the earlier [5] sizing silently rejected them via the length guard
+     * below.  Sized with headroom for any future wider code. */
+    char code_buf[8];
     size_t corrected_count;
     int i;
 
@@ -2057,7 +2065,7 @@ static ssize_t FIPS_optest_trig_handler(struct kobject *kobj, struct kobj_attrib
         corrected_count = count - 1;
     else
         corrected_count = count;
-    if ((corrected_count < 1) || (corrected_count > 4))
+    if ((corrected_count < 1) || (corrected_count > (sizeof(code_buf) - 1)))
         return -EINVAL;
     XMEMCPY(code_buf, buf, corrected_count);
     code_buf[corrected_count] = 0;
diff --git a/linuxkm/pie_redirect_table.c b/linuxkm/pie_redirect_table.c
index 03be2e04fa0..657aa4c96b6 100644
--- a/linuxkm/pie_redirect_table.c
+++ b/linuxkm/pie_redirect_table.c
@@ -53,8 +53,18 @@ const struct wolfssl_linuxkm_pie_redirect_table
     return &wolfssl_linuxkm_pie_redirect_table;
 }
 
-/* placeholder implementations for missing functions. */
-#if defined(CONFIG_MIPS)
+/* placeholder implementations for missing functions.
+ *
+ * ARM/ARM64 need these for the same reason MIPS does: with vector codegen
+ * enabled (the --enable-armasm build does not pass -mgeneral-regs-only to the
+ * wolfCrypt C files), gcc auto-generates raw memcpy/memset libcalls for
+ * aggregate copies inside the position-independent FIPS container (e.g. in
+ * asn.c / fips_test.c).  Source-level XMEMCPY/XMEMSET are redirected to the
+ * kernel's fast implementations via WC_PIE_INDIRECT_SYM, but compiler-emitted
+ * libcalls bypass that #define, so the container must define its own to stay
+ * self-contained (the in-core integrity check forbids ANY undefined symbol).
+ * (The pure-C C1 build does not auto-vectorize and so never references these.) */
+#if defined(CONFIG_MIPS) || defined(CONFIG_ARM) || defined(CONFIG_ARM64)
     #undef memcpy
     void *memcpy(void *dest, const void *src, size_t n) {
         char *dest_i = (char *)dest;
@@ -74,3 +84,69 @@ const struct wolfssl_linuxkm_pie_redirect_table
         return dest;
     }
 #endif
+
+#if defined(CONFIG_ARM)
+    /* 32-bit ARM has no integer-divide instruction in the baseline ISA, so gcc
+     * emits calls to these EABI runtime helpers for '/' and '%'.  The kernel
+     * exports them (arch/arm/lib/lib1funcs.S), but the self-contained PIE FIPS
+     * container may not reference external symbols (the in-core integrity check
+     * forbids ANY undefined symbol), so provide them here.  Restoring (bit-at-a-
+     * time) division -- correctness over speed; crypto-path divisions are on
+     * small sizes/indices.  Per the EABI, __aeabi_*idivmod return the quotient
+     * in r0 and the remainder in r1, i.e. a little-endian 64-bit value with the
+     * quotient in the low word and the remainder in the high word. */
+    unsigned int __aeabi_uidiv(unsigned int n, unsigned int d);
+    unsigned int __aeabi_uidiv(unsigned int n, unsigned int d) {
+        unsigned int q = 0, r = 0;
+        int i;
+        if (d == 0)
+            return ~0u;
+        for (i = 31; i >= 0; i--) {
+            r = (r << 1) | ((n >> i) & 1u);
+            if (r >= d) {
+                r -= d;
+                q |= (1u << i);
+            }
+        }
+        return q;
+    }
+
+    unsigned long long __aeabi_uidivmod(unsigned int n, unsigned int d);
+    unsigned long long __aeabi_uidivmod(unsigned int n, unsigned int d) {
+        unsigned int q = 0, r = 0;
+        int i;
+        if (d == 0)
+            return (unsigned long long)n << 32; /* quot=0, rem=n */
+        for (i = 31; i >= 0; i--) {
+            r = (r << 1) | ((n >> i) & 1u);
+            if (r >= d) {
+                r -= d;
+                q |= (1u << i);
+            }
+        }
+        return ((unsigned long long)r << 32) | q;
+    }
+
+    int __aeabi_idiv(int n, int d);
+    int __aeabi_idiv(int n, int d) {
+        int neg = (n < 0) ^ (d < 0);
+        unsigned int un = (n < 0) ? (unsigned int)(-(long)n) : (unsigned int)n;
+        unsigned int ud = (d < 0) ? (unsigned int)(-(long)d) : (unsigned int)d;
+        unsigned int uq = __aeabi_uidiv(un, ud);
+        return neg ? -(int)uq : (int)uq;
+    }
+
+    unsigned long long __aeabi_idivmod(int n, int d);
+    unsigned long long __aeabi_idivmod(int n, int d) {
+        int nneg = (n < 0);
+        int qneg = (n < 0) ^ (d < 0);
+        unsigned int un = nneg ? (unsigned int)(-(long)n) : (unsigned int)n;
+        unsigned int ud = (d < 0) ? (unsigned int)(-(long)d) : (unsigned int)d;
+        unsigned long long um = __aeabi_uidivmod(un, ud);
+        unsigned int uq = (unsigned int)um;
+        unsigned int ur = (unsigned int)(um >> 32);
+        int q = qneg ? -(int)uq : (int)uq;
+        int r = nneg ? -(int)ur : (int)ur;
+        return ((unsigned long long)(unsigned int)r << 32) | (unsigned int)q;
+    }
+#endif /* CONFIG_ARM */
diff --git a/linuxkm/x86_vector_register_glue.c b/linuxkm/x86_vector_register_glue.c
index 107c7e11274..af4b9b6d598 100644
--- a/linuxkm/x86_vector_register_glue.c
+++ b/linuxkm/x86_vector_register_glue.c
@@ -23,8 +23,29 @@
 /* included by linuxkm/module_hooks.c */
 #ifndef WC_SKIP_INCLUDED_C_FILES
 
-#if !defined(WOLFSSL_USE_SAVE_VECTOR_REGISTERS) || !defined(CONFIG_X86)
-    #error x86_vector_register_glue.c included in non-vectorized/non-x86 project.
+#if !defined(WOLFSSL_USE_SAVE_VECTOR_REGISTERS) || \
+    !(defined(CONFIG_X86) || defined(CONFIG_ARM) || defined(CONFIG_ARM64))
+    #error vector register glue included in non-vectorized or unsupported-arch project.
+#endif
+
+/* The per-CPU vector-register save/restore tracker below is architecture-neutral
+ * except for the single kernel call that claims/releases the SIMD/FP unit:
+ *   x86        -> kernel_fpu_begin()  / kernel_fpu_end()   (<asm/fpu/api.h>, via the
+ *                                                            <asm/simd.h> include in
+ *                                                            linuxkm_wc_port.h)
+ *   ARM/ARM64  -> kernel_neon_begin() / kernel_neon_end()  (<asm/neon.h>)
+ * Both APIs obey the same context rules the tracker already enforces (may_use_simd(),
+ * hard-IRQ/NMI rejection, preempt/bh/migration disable).  The functions retain their
+ * historical wc_*_x86 names: they are internal glue OUTSIDE the FIPS module boundary
+ * (reached from boundary code only through the PIE redirect table), so keeping the
+ * names leaves the validated x86 symbol set byte-for-byte unchanged. */
+#if defined(CONFIG_X86)
+    #define WC_LINUXKM_FPU_BEGIN() kernel_fpu_begin()
+    #define WC_LINUXKM_FPU_END()   kernel_fpu_end()
+#elif defined(CONFIG_ARM) || defined(CONFIG_ARM64)
+    #include <asm/neon.h>
+    #define WC_LINUXKM_FPU_BEGIN() kernel_neon_begin()
+    #define WC_LINUXKM_FPU_END()   kernel_neon_end()
 #endif
 
 #ifdef WOLFSSL_LINUXKM_VERBOSE_DEBUG
@@ -70,9 +91,12 @@ WARN_UNUSED_RESULT int allocate_wolfcrypt_linuxkm_fpu_states(void)
             wc_linuxkm_fpu_states_n_tracked * sizeof(wc_linuxkm_fpu_states[0]));
 
     if (! wc_linuxkm_fpu_states) {
+        /* cast to unsigned long to match %lu: size_t is 32-bit on arm32 but
+         * 64-bit (== unsigned long) on x86_64/arm64, so the product type differs
+         * by arch.  Cast keeps the format portable across the ARM-generalized glue. */
         pr_err("ERROR: allocation of %lu bytes for "
                "wc_linuxkm_fpu_states failed.\n",
-               nr_cpu_ids * sizeof(wc_linuxkm_fpu_states[0]));
+               (unsigned long)(nr_cpu_ids * sizeof(wc_linuxkm_fpu_states[0])));
         return MEMORY_E;
     }
 
@@ -441,10 +465,10 @@ WARN_UNUSED_RESULT int wc_save_vector_registers_x86(enum wc_svr_flags flags)
         #if IS_ENABLED(CONFIG_PREEMPT_RT)
         preempt_disable();
         #endif
-        kernel_fpu_begin();
+        WC_LINUXKM_FPU_BEGIN();
         pstate = wc_linuxkm_fpu_state_assoc(1, 1);
         if (pstate == NULL) {
-            kernel_fpu_end();
+            WC_LINUXKM_FPU_END();
             #if IS_ENABLED(CONFIG_PREEMPT_RT)
             preempt_enable();
             #endif
@@ -508,7 +532,7 @@ void wc_restore_vector_registers_x86(enum wc_svr_flags flags)
 
     if (pstate->fpu_state == 0U) {
         wc_linuxkm_fpu_state_release(pstate);
-        kernel_fpu_end();
+        WC_LINUXKM_FPU_END();
         #if IS_ENABLED(CONFIG_PREEMPT_RT)
         preempt_enable();
         #endif
diff --git a/src/include.am b/src/include.am
index 4b80e149bac..bd4d2586a2c 100644
--- a/src/include.am
+++ b/src/include.am
@@ -109,17 +109,22 @@ endif
 
 if BUILD_AESNI
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_asm.S
-if BUILD_X86_ASM
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_gcm_x86_asm.S
-else
+# GCM PCLMUL asm is x86_64-only.  aes_gcm_asm.S is internally guarded by
+# WOLFSSL_X86_64_BUILD, so it is empty on 32-bit x86 and GCM there falls back to
+# the portable-C GHASH (still AES-NI for the counter-mode blocks).  The 32-bit GCM
+# asm (aes_gcm_x86_asm.S) is NOT position-independent -- its .text relocations
+# break the FIPS in-core integrity in a shared object -- so it is not compiled.
+# Kaleb 2026-06-17; see memory 32bit-x86-aesni-enablement.
 if BUILD_AESGCM
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_gcm_asm.S
 endif
+# aes_xts_asm.S provides AES_XTS_*_aesni for BOTH x86_64 (WOLFSSL_X86_64_BUILD) and
+# 32-bit x86 (WOLFSSL_X86_BUILD); compile it for either, not only the non-X86_ASM
+# (x86_64) path -- the 32-bit AES-NI XTS section was added 2026-06-17.
 if BUILD_AESXTS
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_xts_asm.S
 endif
 endif
-endif
 
 if BUILD_DES3
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/des3.c
@@ -259,17 +264,22 @@ endif BUILD_PPC64_ASM
 
 if BUILD_AESNI
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_asm.S
-if BUILD_X86_ASM
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_gcm_x86_asm.S
-else
+# GCM PCLMUL asm is x86_64-only.  aes_gcm_asm.S is internally guarded by
+# WOLFSSL_X86_64_BUILD, so it is empty on 32-bit x86 and GCM there falls back to
+# the portable-C GHASH (still AES-NI for the counter-mode blocks).  The 32-bit GCM
+# asm (aes_gcm_x86_asm.S) is NOT position-independent -- its .text relocations
+# break the FIPS in-core integrity in a shared object -- so it is not compiled.
+# Kaleb 2026-06-17; see memory 32bit-x86-aesni-enablement.
 if BUILD_AESGCM
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_gcm_asm.S
 endif
+# aes_xts_asm.S provides AES_XTS_*_aesni for BOTH x86_64 (WOLFSSL_X86_64_BUILD) and
+# 32-bit x86 (WOLFSSL_X86_BUILD); compile it for either, not only the non-X86_ASM
+# (x86_64) path -- the 32-bit AES-NI XTS section was added 2026-06-17.
 if BUILD_AESXTS
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_xts_asm.S
 endif
 endif
-endif
 
 if BUILD_RISCV_ASM
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/riscv/riscv-64-aes.c
@@ -532,17 +542,22 @@ endif BUILD_PPC64_ASM
 
 if BUILD_AESNI
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_asm.S
-if BUILD_X86_ASM
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_gcm_x86_asm.S
-else
+# GCM PCLMUL asm is x86_64-only.  aes_gcm_asm.S is internally guarded by
+# WOLFSSL_X86_64_BUILD, so it is empty on 32-bit x86 and GCM there falls back to
+# the portable-C GHASH (still AES-NI for the counter-mode blocks).  The 32-bit GCM
+# asm (aes_gcm_x86_asm.S) is NOT position-independent -- its .text relocations
+# break the FIPS in-core integrity in a shared object -- so it is not compiled.
+# Kaleb 2026-06-17; see memory 32bit-x86-aesni-enablement.
 if BUILD_AESGCM
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_gcm_asm.S
 endif
+# aes_xts_asm.S provides AES_XTS_*_aesni for BOTH x86_64 (WOLFSSL_X86_64_BUILD) and
+# 32-bit x86 (WOLFSSL_X86_BUILD); compile it for either, not only the non-X86_ASM
+# (x86_64) path -- the 32-bit AES-NI XTS section was added 2026-06-17.
 if BUILD_AESXTS
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_xts_asm.S
 endif
 endif
-endif
 
 if BUILD_RISCV_ASM
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/riscv/riscv-64-aes.c
@@ -867,17 +882,22 @@ endif BUILD_AES
 
 if BUILD_AESNI
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_asm.S
-if BUILD_X86_ASM
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_gcm_x86_asm.S
-else
+# GCM PCLMUL asm is x86_64-only.  aes_gcm_asm.S is internally guarded by
+# WOLFSSL_X86_64_BUILD, so it is empty on 32-bit x86 and GCM there falls back to
+# the portable-C GHASH (still AES-NI for the counter-mode blocks).  The 32-bit GCM
+# asm (aes_gcm_x86_asm.S) is NOT position-independent -- its .text relocations
+# break the FIPS in-core integrity in a shared object -- so it is not compiled.
+# Kaleb 2026-06-17; see memory 32bit-x86-aesni-enablement.
 if BUILD_AESGCM
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_gcm_asm.S
 endif
+# aes_xts_asm.S provides AES_XTS_*_aesni for BOTH x86_64 (WOLFSSL_X86_64_BUILD) and
+# 32-bit x86 (WOLFSSL_X86_BUILD); compile it for either, not only the non-X86_ASM
+# (x86_64) path -- the 32-bit AES-NI XTS section was added 2026-06-17.
 if BUILD_AESXTS
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_xts_asm.S
 endif
 endif
-endif
 
 if BUILD_SHA
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/sha.c
@@ -1708,18 +1728,23 @@ endif
 if !BUILD_FIPS_V2_PLUS
 if BUILD_AESNI
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_asm.S
-if BUILD_X86_ASM
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_gcm_x86_asm.S
-else
+# GCM PCLMUL asm is x86_64-only.  aes_gcm_asm.S is internally guarded by
+# WOLFSSL_X86_64_BUILD, so it is empty on 32-bit x86 and GCM there falls back to
+# the portable-C GHASH (still AES-NI for the counter-mode blocks).  The 32-bit GCM
+# asm (aes_gcm_x86_asm.S) is NOT position-independent -- its .text relocations
+# break the FIPS in-core integrity in a shared object -- so it is not compiled.
+# Kaleb 2026-06-17; see memory 32bit-x86-aesni-enablement.
 if BUILD_AESGCM
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_gcm_asm.S
 endif
+# aes_xts_asm.S provides AES_XTS_*_aesni for BOTH x86_64 (WOLFSSL_X86_64_BUILD) and
+# 32-bit x86 (WOLFSSL_X86_BUILD); compile it for either, not only the non-X86_ASM
+# (x86_64) path -- the 32-bit AES-NI XTS section was added 2026-06-17.
 if BUILD_AESXTS
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_xts_asm.S
 endif
 endif
 endif
-endif
 
 if BUILD_CAMELLIA
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/camellia.c
diff --git a/tests/api/test_aes.c b/tests/api/test_aes.c
index 72221cd04ad..ec767c1319c 100644
--- a/tests/api/test_aes.c
+++ b/tests/api/test_aes.c
@@ -693,7 +693,14 @@ static int test_wc_AesCbcEncryptDecrypt_WithKey(Aes* aes, byte* key,
     ExpectIntEQ(wc_AesCbcEncrypt(aes, cipher, vector, vector_len),
         0);
     ExpectBufEQ(cipher, vector_enc, vector_len);
-#ifdef WOLFSSL_AES_CBC_LENGTH_CHECKS
+    /* The BAD_LENGTH_E enforcement is in the non-FIPS aes.c implementation
+     * (see WOLFSSL_AES_CBC_LENGTH_CHECKS guard there).  FIPSv2 (cert3389)
+     * routes through its own historical wc_AesCbcEncrypt_fips wrapper that
+     * predates this check and silently returns 0 on unaligned input.  Only
+     * v5.x and newer FIPS modules carry the wrapper-level check.  Skip the
+     * assertion for FIPSv2 builds. */
+#if defined(WOLFSSL_AES_CBC_LENGTH_CHECKS) && \
+    (!defined(HAVE_FIPS) || FIPS_VERSION_GE(5,0))
     ExpectIntEQ(wc_AesCbcEncrypt(aes, cipher, vector, vector_len - 1),
         WC_NO_ERR_TRACE(BAD_LENGTH_E));
 #endif
@@ -703,7 +710,9 @@ static int test_wc_AesCbcEncryptDecrypt_WithKey(Aes* aes, byte* key,
     ExpectIntEQ(wc_AesCbcDecrypt(aes, decrypted, cipher,
         WC_AES_BLOCK_SIZE * 2), 0);
     ExpectBufEQ(decrypted, vector, vector_len);
-#ifdef WOLFSSL_AES_CBC_LENGTH_CHECKS
+#if defined(WOLFSSL_AES_CBC_LENGTH_CHECKS) && \
+    (!defined(HAVE_FIPS) || FIPS_VERSION_GE(5,0))
+    /* Same FIPSv2 vs v5+ rationale as the encrypt assertion above. */
     ExpectIntEQ(wc_AesCbcDecrypt(aes, decrypted, cipher,
         WC_AES_BLOCK_SIZE * 2 - 1), WC_NO_ERR_TRACE(BAD_LENGTH_E));
 #else
diff --git a/tests/api/test_evp_pkey.c b/tests/api/test_evp_pkey.c
index 9bdd5b9339d..2e106d16d6a 100644
--- a/tests/api/test_evp_pkey.c
+++ b/tests/api/test_evp_pkey.c
@@ -1526,7 +1526,7 @@ static int test_wolfSSL_EVP_PKEY_sign_verify(int keyType)
     !defined(HAVE_SELFTEST)
 #if !defined(HAVE_FIPS) || (defined(HAVE_FIPS_VERSION) && (HAVE_FIPS_VERSION>2))
         {
-            ExpectNotNull(rsa = RSA_generate_key(2048, 3, NULL, NULL));
+            ExpectNotNull(rsa = RSA_generate_key(2048, 65537, NULL, NULL));
             ExpectIntEQ(EVP_PKEY_assign_RSA(pkey, rsa), WOLFSSL_SUCCESS);
         }
 #endif
@@ -2159,7 +2159,7 @@ int test_wolfSSL_EVP_PKEY_encrypt(void)
         XMEMSET(outDec, 0, rsaKeySz);
     }
 
-    ExpectNotNull(rsa = RSA_generate_key(2048, 3, NULL, NULL));
+    ExpectNotNull(rsa = RSA_generate_key(2048, 65537, NULL, NULL));
     ExpectNotNull(pkey = wolfSSL_EVP_PKEY_new());
     ExpectIntEQ(EVP_PKEY_assign_RSA(pkey, rsa), WOLFSSL_SUCCESS);
     if (EXPECT_FAIL()) {
diff --git a/tests/api/test_ossl_rsa.c b/tests/api/test_ossl_rsa.c
index dc0cee665ba..250d1df0070 100644
--- a/tests/api/test_ossl_rsa.c
+++ b/tests/api/test_ossl_rsa.c
@@ -65,7 +65,7 @@ int test_wolfSSL_RSA(void)
 
     RSA_free(rsa);
     rsa = NULL;
-    ExpectNotNull(rsa = RSA_generate_key(2048, 3, NULL, NULL));
+    ExpectNotNull(rsa = RSA_generate_key(2048, 65537, NULL, NULL));
     ExpectIntEQ(RSA_size(rsa), 256);
 
 #if (!defined(HAVE_FIPS) || FIPS_VERSION3_GT(6,0,0)) && !defined(HAVE_SELFTEST)
@@ -306,7 +306,7 @@ int test_wolfSSL_RSA(void)
     rsa = NULL;
 
 #if !defined(USE_FAST_MATH) || (FP_MAX_BITS >= (3072*2))
-    ExpectNotNull(rsa = RSA_generate_key(3072, 17, NULL, NULL));
+    ExpectNotNull(rsa = RSA_generate_key(3072, 65537, NULL, NULL));
     ExpectIntEQ(RSA_size(rsa), 384);
     ExpectIntEQ(RSA_bits(rsa), 3072);
     RSA_free(rsa);
@@ -461,7 +461,7 @@ int test_wolfSSL_RSA_print(void)
 
     RSA_free(rsa);
     rsa = NULL;
-    ExpectNotNull(rsa = RSA_generate_key(2048, 3, NULL, NULL));
+    ExpectNotNull(rsa = RSA_generate_key(2048, 65537, NULL, NULL));
 
     ExpectIntEQ(RSA_print(bio, rsa, 0), 1);
     ExpectIntEQ(RSA_print(bio, rsa, 4), 1);
@@ -644,11 +644,11 @@ int test_wolfSSL_RSA_meth(void)
     RSA_METHOD *rsa_meth = NULL;
 
 #ifdef WOLFSSL_KEY_GEN
-    ExpectNotNull(rsa = RSA_generate_key(2048, 3, NULL, NULL));
+    ExpectNotNull(rsa = RSA_generate_key(2048, 65537, NULL, NULL));
     RSA_free(rsa);
     rsa = NULL;
 #else
-    ExpectNull(rsa = RSA_generate_key(2048, 3, NULL, NULL));
+    ExpectNull(rsa = RSA_generate_key(2048, 65537, NULL, NULL));
 #endif
 
     ExpectNotNull(RSA_get_default_method());
diff --git a/tests/api/test_slhdsa.c b/tests/api/test_slhdsa.c
index 988bbc579e0..4510319925e 100644
--- a/tests/api/test_slhdsa.c
+++ b/tests/api/test_slhdsa.c
@@ -1081,12 +1081,14 @@ int test_wc_slhdsa_sign_hash(void)
         WC_HASH_TYPE_SHA256, sig, sigLen),
         WC_NO_ERR_TRACE(BAD_LENGTH_E));
 
-    /* Unsupported hashType (FIPS 205 doesn't list WC_HASH_TYPE_NONE) hits
-     * the default branch of slhdsakey_validate_prehash. */
+    /* WC_HASH_TYPE_NONE is the "pure SLH-DSA" sentinel and is never a valid
+     * pre-hash algorithm (FIPS 205 Section 10.2.2 / Table 9).  HashSLH-DSA
+     * signing rejects it with an explicit early check (BAD_FUNC_ARG), not via
+     * the slhdsa_check_hash_for_n() switch default. */
     sigLen = WC_SLHDSA_MAX_SIG_LEN;
     ExpectIntEQ(wc_SlhDsaKey_SignHash(&key, ctx, sizeof(ctx), hash, 32,
         WC_HASH_TYPE_NONE, sig, &sigLen, &rng),
-        WC_NO_ERR_TRACE(NOT_COMPILED_IN));
+        WC_NO_ERR_TRACE(BAD_FUNC_ARG));
 
     /* Test SignHash with SHA-256. */
     sigLen = WC_SLHDSA_MAX_SIG_LEN;
diff --git a/wolfcrypt/benchmark/fips_cast_bench.c b/wolfcrypt/benchmark/fips_cast_bench.c
new file mode 100644
index 00000000000..bd7c0e9dbc6
--- /dev/null
+++ b/wolfcrypt/benchmark/fips_cast_bench.c
@@ -0,0 +1,363 @@
+/* fips_cast_bench.c
+ *
+ * Copyright (C) 2006-2026 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+/* FIPS CAST benchmark.
+ *
+ * Measures the wall-clock cost of each Conditional Algorithm Self-Test (CAST)
+ * defined by the wolfCrypt v7.0.0 FIPS module so operators can budget module
+ * power-on latency on resource-constrained operational environments (DSP,
+ * MCU) where every additional CAST is directly observable as boot-time delay.
+ *
+ * Compiled only when HAVE_FIPS is defined (see wolfcrypt/benchmark/include.am
+ * BUILD_FIPS gate).  Calls wc_RunCast_fips(id) repeatedly per CAST and reports
+ * mean / stddev / min / max for each, plus total time for one pass over all
+ * enabled CASTs (the cost paid by callers that invoke wc_RunAllCast_fips() at
+ * application start).
+ *
+ * Citations:
+ *   FIPS 140-3 sec 7.10 (Self-Tests) - CAST framework
+ *   FIPS 140-3 IG 10.3.A           - Algorithm-by-algorithm CAST coverage
+ *   ISO/IEC 19790:2012 sec 7.10.2  - Conditional self-test execution
+ */
+
+#ifdef HAVE_CONFIG_H
+    #include <config.h>
+#endif
+
+#if !defined(WOLFSSL_USER_SETTINGS) && !defined(WOLFSSL_NO_OPTIONS_H)
+    #include <wolfssl/options.h>
+#endif
+#include <wolfssl/wolfcrypt/settings.h> /* also picks up user_settings.h */
+
+/* fips_cast_bench drives wc_RunCast_fips() / wc_RunAllCast_fips() which were
+ * introduced in the v7.0.0 module's CAST framework.  Older 140-3 modules
+ * (v5.x, v6.0.0) and the FIPSv2 module do not export these symbols, so when
+ * fips-check.sh swaps in an older-flavor fips/ tree this file would otherwise
+ * fail to link.  Gate the entire benchmark on FIPS_VERSION3_GE(7,0,0); for
+ * older flavors we fall through to the empty-main stub at the bottom of the
+ * file so the build still produces an executable. */
+#if defined(HAVE_FIPS) && FIPS_VERSION3_GE(7,0,0)
+
+#include <wolfssl/version.h>
+#include <wolfssl/wolfcrypt/types.h>
+#include <wolfssl/wolfcrypt/error-crypt.h>
+#include <wolfssl/wolfcrypt/fips_test.h>
+#include <wolfssl/wolfcrypt/random.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <limits.h>
+
+#ifdef _WIN32
+    #define WIN32_LEAN_AND_MEAN
+    #include <windows.h>
+#else
+    #include <time.h>
+#endif
+
+
+#define BENCH_DEFAULT_ITERS 10
+
+/* Map FIPS_CAST_* enum value to a printable name.  Kept in sync with
+ * wolfssl/wolfcrypt/fips_test.h FipsCastId enum. */
+static const char* cast_name(int id)
+{
+    switch (id) {
+        case FIPS_CAST_AES_CBC:           return "AES-CBC";
+        case FIPS_CAST_AES_GCM:           return "AES-GCM";
+        case FIPS_CAST_HMAC_SHA1:         return "HMAC-SHA-1";
+        case FIPS_CAST_HMAC_SHA2_256:     return "HMAC-SHA2-256";
+        case FIPS_CAST_HMAC_SHA2_512:     return "HMAC-SHA2-512";
+        case FIPS_CAST_HMAC_SHA3_256:     return "HMAC-SHA3-256";
+        case FIPS_CAST_DRBG:              return "DRBG (SHA-256)";
+        case FIPS_CAST_RSA_SIGN_PKCS1v15: return "RSA-SIGN-PKCS1v15";
+        case FIPS_CAST_ECC_CDH:           return "ECC-CDH";
+        case FIPS_CAST_ECC_PRIMITIVE_Z:   return "ECC-Primitive-Z";
+        case FIPS_CAST_DH_PRIMITIVE_Z:    return "DH-Primitive-Z";
+        case FIPS_CAST_ECDSA:             return "ECDSA";
+        case FIPS_CAST_KDF_TLS12:         return "KDF-TLS12";
+        case FIPS_CAST_KDF_TLS13:         return "KDF-TLS13";
+        case FIPS_CAST_KDF_SSH:           return "KDF-SSH";
+#if defined(FIPS_VERSION_GE) && FIPS_VERSION_GE(6,0)
+        case FIPS_CAST_KDF_SRTP:          return "KDF-SRTP";
+        case FIPS_CAST_ED25519:           return "Ed25519";
+        case FIPS_CAST_ED448:             return "Ed448";
+        case FIPS_CAST_PBKDF2:            return "PBKDF2";
+#endif
+#if defined(FIPS_VERSION_GE) && FIPS_VERSION_GE(7,0)
+        case FIPS_CAST_AES_ECB:           return "AES-ECB";
+        case FIPS_CAST_ML_KEM:            return "ML-KEM";
+        case FIPS_CAST_ML_DSA:            return "ML-DSA";
+        case FIPS_CAST_LMS:               return "LMS";
+        case FIPS_CAST_XMSS:              return "XMSS";
+        case FIPS_CAST_DRBG_SHA512:       return "DRBG (SHA-512)";
+        case FIPS_CAST_SLH_DSA:           return "SLH-DSA";
+        case FIPS_CAST_AES_CMAC:          return "AES-CMAC";
+        case FIPS_CAST_SHAKE:             return "SHAKE";
+        case FIPS_CAST_AES_KW:            return "AES-KW";
+#endif
+        default:                          return "(unknown)";
+    }
+}
+
+
+/* Monotonic clock in nanoseconds.  POSIX clock_gettime(CLOCK_MONOTONIC) on
+ * Unix-like systems; QueryPerformanceCounter on Windows. */
+static long long now_ns(void)
+{
+#ifdef _WIN32
+    static LARGE_INTEGER freq = { 0 };
+    LARGE_INTEGER count;
+    if (freq.QuadPart == 0)
+        QueryPerformanceFrequency(&freq);
+    QueryPerformanceCounter(&count);
+    /* Multiply before divide to keep precision; freq is typically 10MHz. */
+    return (long long)((count.QuadPart * 1000000000LL) / freq.QuadPart);
+#else
+    struct timespec ts;
+    if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0)
+        return 0;
+    return (long long)ts.tv_sec * 1000000000LL + (long long)ts.tv_nsec;
+#endif
+}
+
+
+/* Run a single CAST iters times, populate stats (in milliseconds).
+ * Returns 0 on success, non-zero on first CAST failure. */
+static int run_one_cast(int id, int iters,
+                        double* out_mean_ms, double* out_stddev_ms,
+                        double* out_min_ms, double* out_max_ms)
+{
+    int i;
+    long long total = 0;
+    long long mn = LLONG_MAX;
+    long long mx = 0;
+    long long* samples;
+    double mean_ns;
+    double variance_acc = 0.0;
+
+    if (iters <= 0)
+        return BAD_FUNC_ARG;
+
+    samples = (long long*)XMALLOC((size_t)iters * sizeof(long long), NULL,
+                                  DYNAMIC_TYPE_TMP_BUFFER);
+    if (samples == NULL)
+        return MEMORY_E;
+
+    for (i = 0; i < iters; i++) {
+        long long t0, t1, dt;
+        int rc;
+
+        t0 = now_ns();
+        rc = wc_RunCast_fips(id);
+        t1 = now_ns();
+        if (rc != 0) {
+            XFREE(samples, NULL, DYNAMIC_TYPE_TMP_BUFFER);
+            return rc;
+        }
+        dt = t1 - t0;
+        if (dt < 0)
+            dt = 0;
+        samples[i] = dt;
+        total += dt;
+        if (dt < mn)
+            mn = dt;
+        if (dt > mx)
+            mx = dt;
+    }
+
+    mean_ns = (double)total / (double)iters;
+    for (i = 0; i < iters; i++) {
+        double d = (double)samples[i] - mean_ns;
+        variance_acc += d * d;
+    }
+    XFREE(samples, NULL, DYNAMIC_TYPE_TMP_BUFFER);
+
+    *out_mean_ms   = mean_ns / 1.0e6;
+    *out_stddev_ms = sqrt(variance_acc / (double)iters) / 1.0e6;
+    *out_min_ms    = (double)mn / 1.0e6;
+    *out_max_ms    = (double)mx / 1.0e6;
+    return 0;
+}
+
+
+static void usage(const char* prog)
+{
+    printf("usage: %s [-i ITERS] [-c CAST_ID] [-l]\n", prog);
+    printf("  -i ITERS    iterations per CAST (default %d)\n",
+           BENCH_DEFAULT_ITERS);
+    printf("  -c CAST_ID  benchmark only the named CAST id\n");
+    printf("  -l          list CAST ids and names; do not run\n");
+    printf("  -h          show this help\n");
+}
+
+
+int main(int argc, char** argv)
+{
+    int iters = BENCH_DEFAULT_ITERS;
+    int single = -1;
+    int list_only = 0;
+    int i;
+    int first, last;
+    int failures = 0;
+    int run_count = 0;
+    double total_mean_ms = 0.0;
+
+    for (i = 1; i < argc; i++) {
+        if (XSTRCMP(argv[i], "-i") == 0 && i + 1 < argc) {
+            iters = atoi(argv[++i]);
+            if (iters <= 0) {
+                fprintf(stderr, "-i requires a positive iteration count\n");
+                return 2;
+            }
+        } else if (XSTRCMP(argv[i], "-c") == 0 && i + 1 < argc) {
+            single = atoi(argv[++i]);
+        } else if (XSTRCMP(argv[i], "-l") == 0) {
+            list_only = 1;
+        } else if (XSTRCMP(argv[i], "-h") == 0
+                || XSTRCMP(argv[i], "--help") == 0) {
+            usage(argv[0]);
+            return 0;
+        } else {
+            fprintf(stderr, "unknown argument: %s\n", argv[i]);
+            usage(argv[0]);
+            return 2;
+        }
+    }
+
+    if (list_only) {
+        printf("FIPS CAST IDs (FIPS_CAST_COUNT = %d):\n", FIPS_CAST_COUNT);
+        for (i = 0; i < FIPS_CAST_COUNT; i++)
+            printf("  %2d  %s\n", i, cast_name(i));
+        return 0;
+    }
+
+    if (single >= 0 && single >= FIPS_CAST_COUNT) {
+        fprintf(stderr, "CAST id %d out of range (0..%d)\n",
+                single, FIPS_CAST_COUNT - 1);
+        return 2;
+    }
+
+    printf("wolfCrypt FIPS CAST benchmark\n");
+    printf("Library version: %s\n", LIBWOLFSSL_VERSION_STRING);
+    printf("FIPS_CAST_COUNT: %d\n", FIPS_CAST_COUNT);
+    printf("Iterations per CAST: %d\n", iters);
+    printf("Clock: %s\n",
+#ifdef _WIN32
+           "QueryPerformanceCounter"
+#else
+           "clock_gettime(CLOCK_MONOTONIC)"
+#endif
+           );
+    printf("\n");
+
+    /* Register the default DRBG seed callback (mirrors benchmark.c and
+     * wolfcrypt/test/test.c).  Builds with WC_RNG_SEED_CB - which include
+     * the FIPS optest CFLAGS - require every application that initializes
+     * the RNG to register a seed generator before _InitRng can produce a
+     * working DRBG; without it, wc_InitRng inside the ECC_PRIMITIVE_Z and
+     * ECDSA CASTs returns -199 (RNG_FAILURE_E) and the dependent CASTs
+     * cascade-fail. */
+#ifdef WC_RNG_SEED_CB
+    {
+        int seed_cb_rc = wc_SetSeed_Cb(WC_GENERATE_SEED_DEFAULT);
+        if (seed_cb_rc != 0) {
+            fprintf(stderr,
+                "wc_SetSeed_Cb returned %d - DRBG-using CASTs will fail.\n",
+                seed_cb_rc);
+        }
+    }
+#endif
+
+    /* Prime: run every CAST once via wc_RunAllCast_fips() so each CAST
+     * reaches FIPS_CAST_STATE_SUCCESS before we begin measuring.  This
+     * isolates the per-CAST KAT runtime cost from the cascading
+     * recursive-CAST init chain that fires on the first invocation of a
+     * cold CAST whose KAT internally calls FIPS-wrapped primitives whose
+     * own CASTs are still in INIT state.  Customers calling
+     * wc_RunAllCast_fips() at boot pay this one-time cost up front, so
+     * priming here matches that real-world workflow. */
+    {
+        int prime_rc = wc_RunAllCast_fips();
+        if (prime_rc != 0) {
+            fprintf(stderr,
+                "wc_RunAllCast_fips() prime returned %d - some CASTs may have failed.\n"
+                "Per-CAST measurements continue but failed CASTs will report errors.\n\n",
+                prime_rc);
+        }
+    }
+
+    printf("ID | Name                | Mean(ms) | StdDev(ms) | Min(ms) "
+           "| Max(ms)\n");
+    printf("---+---------------------+----------+------------+---------"
+           "+---------\n");
+
+    first = (single >= 0) ? single : 0;
+    last  = (single >= 0) ? single + 1 : FIPS_CAST_COUNT;
+
+    for (i = first; i < last; i++) {
+        double mean_ms = 0, sd_ms = 0, mn_ms = 0, mx_ms = 0;
+        int rc = run_one_cast(i, iters, &mean_ms, &sd_ms, &mn_ms, &mx_ms);
+        if (rc != 0) {
+            printf("%2d | %-19s | FAILED rc=%d (%s)\n",
+                   i, cast_name(i), rc, wc_GetErrorString(rc));
+            failures++;
+            continue;
+        }
+        printf("%2d | %-19s | %8.3f | %10.3f | %7.3f | %7.3f\n",
+               i, cast_name(i), mean_ms, sd_ms, mn_ms, mx_ms);
+        total_mean_ms += mean_ms;
+        run_count++;
+    }
+
+    printf("\n");
+    if (run_count > 0) {
+        printf("Sum of mean CAST times (one wc_RunAllCast_fips() pass): "
+               "%.3f ms\n", total_mean_ms);
+    }
+    if (failures > 0) {
+        printf("WARN: %d CAST(s) failed.\n", failures);
+        return 1;
+    }
+    return 0;
+}
+
+#else /* !(HAVE_FIPS && FIPS_VERSION3_GE(7,0,0)) */
+
+#include <stdio.h>
+
+int main(void)
+{
+#ifndef HAVE_FIPS
+    fprintf(stderr,
+            "fips_cast_bench: built without HAVE_FIPS - nothing to measure\n");
+#else
+    fprintf(stderr,
+            "fips_cast_bench: requires v7.0.0+ FIPS module "
+            "(wc_RunCast_fips / wc_RunAllCast_fips were added in v7) - "
+            "nothing to measure on this older module flavor\n");
+#endif
+    return 0;
+}
+
+#endif /* HAVE_FIPS && FIPS_VERSION3_GE(7,0,0) */
diff --git a/wolfcrypt/benchmark/include.am b/wolfcrypt/benchmark/include.am
index 22cecbdaefe..130343a14e1 100644
--- a/wolfcrypt/benchmark/include.am
+++ b/wolfcrypt/benchmark/include.am
@@ -10,6 +10,16 @@ wolfcrypt_benchmark_benchmark_LDADD        = src/libwolfssl@LIBSUFFIX@.la $(LIB_
 wolfcrypt_benchmark_benchmark_DEPENDENCIES = src/libwolfssl@LIBSUFFIX@.la
 noinst_HEADERS += wolfcrypt/benchmark/benchmark.h
 
+# FIPS CAST benchmark - measures wc_RunCast_fips() execution time per CAST.
+# Helps operators of resource-constrained operational environments budget
+# module power-on latency.  Compiled only when FIPS is enabled.
+if BUILD_FIPS
+noinst_PROGRAMS += wolfcrypt/benchmark/fips_cast_bench
+wolfcrypt_benchmark_fips_cast_bench_SOURCES      = wolfcrypt/benchmark/fips_cast_bench.c
+wolfcrypt_benchmark_fips_cast_bench_LDADD        = src/libwolfssl@LIBSUFFIX@.la $(LIB_STATIC_ADD) -lm
+wolfcrypt_benchmark_fips_cast_bench_DEPENDENCIES = src/libwolfssl@LIBSUFFIX@.la
+endif
+
 endif
 endif
 
diff --git a/wolfcrypt/src/aes.c b/wolfcrypt/src/aes.c
index 6806acbc965..95386f5f3d0 100644
--- a/wolfcrypt/src/aes.c
+++ b/wolfcrypt/src/aes.c
@@ -136,6 +136,16 @@ block cipher mechanism that uses n-bit binary string parameter key with 128-bits
 
 #include <wolfssl/wolfcrypt/cpuid.h>
 
+/* The dedicated GCM (PCLMUL/GHASH) assembly is x86_64-only: the 32-bit GCM asm
+ * (aes_gcm_x86_asm.S) is not position-independent and its .text relocations break
+ * the FIPS module in-core integrity in a shared object.  On 32-bit x86, GCM uses
+ * the portable-C GHASH with AES-NI block encryption (AES-NI still engaged for the
+ * counter-mode blocks) -- mirrors the existing x86_64-gating of the GCM-AVX path.
+ * Kaleb 2026-06-17; see memory 32bit-x86-aesni-enablement. */
+#if defined(WOLFSSL_AESNI) && defined(WOLFSSL_X86_64_BUILD)
+    #define WC_AESNI_GCM
+#endif
+
 #ifdef WOLF_CRYPTO_CB
     #include <wolfssl/wolfcrypt/cryptocb.h>
 #endif
@@ -913,6 +923,124 @@ static void Check_CPU_support_HwCrypto(Aes* aes)
 }
 #endif /* __aarch64__ && !WOLFSSL_ARMASM_NO_HW_CRYPTO */
 
+/* In a Linux kernel module the 32-bit ARM AES asm (ARMv8 AArch32 AES/PMULL crypto
+ * extension + NEON) MUST run between kernel_neon_begin()/end() or the first SIMD
+ * instruction faults "undefined instruction".  wolfSSL never bracketed the
+ * in-kernel ARM crypto (it only happened to work on aarch64 because that kernel
+ * tolerates it).  We wrap every AES_*_AARCH32 entry with SAVE/RESTORE_VECTOR_
+ * REGISTERS, then #define-redirect the call sites below.  Defined before the
+ * #defines so the wrappers reference the real asm (no recursion).  Scoped to
+ * !__aarch64__ so the aarch64 path is byte-identical.  (FIPS 197 AES, SP 800-38D
+ * AES-GCM.)  On a (process-context-only here) save failure the op is skipped
+ * rather than crashing -- never reached by POST/optest/harness. */
+#if defined(WOLFSSL_USE_SAVE_VECTOR_REGISTERS) && !defined(__aarch64__) && \
+    !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO)
+    static WC_INLINE void wc_svr_AES_set_key_AARCH32(const byte* userKey,
+        int keylen, byte* key, int dir) {
+        if (SAVE_VECTOR_REGISTERS2() != 0) return;
+        AES_set_key_AARCH32(userKey, keylen, key, dir);
+        RESTORE_VECTOR_REGISTERS();
+    }
+    static WC_INLINE void wc_svr_AES_encrypt_AARCH32(const byte* inBlock,
+        byte* outBlock, byte* key, int nr) {
+        if (SAVE_VECTOR_REGISTERS2() != 0) return;
+        AES_encrypt_AARCH32(inBlock, outBlock, key, nr);
+        RESTORE_VECTOR_REGISTERS();
+    }
+    static WC_INLINE void wc_svr_AES_decrypt_AARCH32(const byte* inBlock,
+        byte* outBlock, byte* key, int nr) {
+        if (SAVE_VECTOR_REGISTERS2() != 0) return;
+        AES_decrypt_AARCH32(inBlock, outBlock, key, nr);
+        RESTORE_VECTOR_REGISTERS();
+    }
+    static WC_INLINE void wc_svr_AES_encrypt_blocks_AARCH32(const byte* in,
+        byte* out, word32 sz, byte* key, int nr) {
+        if (SAVE_VECTOR_REGISTERS2() != 0) return;
+        AES_encrypt_blocks_AARCH32(in, out, sz, key, nr);
+        RESTORE_VECTOR_REGISTERS();
+    }
+    static WC_INLINE void wc_svr_AES_decrypt_blocks_AARCH32(const byte* in,
+        byte* out, word32 sz, byte* key, int nr) {
+        if (SAVE_VECTOR_REGISTERS2() != 0) return;
+        AES_decrypt_blocks_AARCH32(in, out, sz, key, nr);
+        RESTORE_VECTOR_REGISTERS();
+    }
+    static WC_INLINE void wc_svr_AES_CBC_encrypt_AARCH32(const byte* in,
+        byte* out, word32 sz, byte* reg, byte* key, int rounds) {
+        if (SAVE_VECTOR_REGISTERS2() != 0) return;
+        AES_CBC_encrypt_AARCH32(in, out, sz, reg, key, rounds);
+        RESTORE_VECTOR_REGISTERS();
+    }
+    static WC_INLINE void wc_svr_AES_CBC_decrypt_AARCH32(const byte* in,
+        byte* out, word32 sz, byte* reg, byte* key, int rounds) {
+        if (SAVE_VECTOR_REGISTERS2() != 0) return;
+        AES_CBC_decrypt_AARCH32(in, out, sz, reg, key, rounds);
+        RESTORE_VECTOR_REGISTERS();
+    }
+    static WC_INLINE void wc_svr_AES_CTR_encrypt_AARCH32(const byte* in,
+        byte* out, word32 sz, byte* reg, byte* key, byte* tmp, word32* left,
+        word32 rounds) {
+        if (SAVE_VECTOR_REGISTERS2() != 0) return;
+        AES_CTR_encrypt_AARCH32(in, out, sz, reg, key, tmp, left, rounds);
+        RESTORE_VECTOR_REGISTERS();
+    }
+    static WC_INLINE void wc_svr_AES_GCM_set_key_AARCH32(const byte* nonce,
+        const byte* key, byte* gcm_h, int nr) {
+        if (SAVE_VECTOR_REGISTERS2() != 0) return;
+        AES_GCM_set_key_AARCH32(nonce, key, gcm_h, nr);
+        RESTORE_VECTOR_REGISTERS();
+    }
+    static WC_INLINE void wc_svr_AES_GCM_encrypt_AARCH32(const byte* in,
+        byte* out, word32 sz, const byte* nonce, word32 nonceSz, byte* tag,
+        word32 tagSz, const byte* aad, word32 aadSz, byte* key, byte* gcm_h,
+        byte* tmp, byte* reg, int nr) {
+        if (SAVE_VECTOR_REGISTERS2() != 0) return;
+        AES_GCM_encrypt_AARCH32(in, out, sz, nonce, nonceSz, tag, tagSz, aad,
+            aadSz, key, gcm_h, tmp, reg, nr);
+        RESTORE_VECTOR_REGISTERS();
+    }
+    static WC_INLINE int wc_svr_AES_GCM_decrypt_AARCH32(const byte* in,
+        byte* out, word32 sz, const byte* nonce, word32 nonceSz, const byte* tag,
+        word32 tagSz, const byte* aad, word32 aadSz, byte* key, byte* gcm_h,
+        byte* tmp, byte* reg, int nr) {
+        int _ret, _svr = SAVE_VECTOR_REGISTERS2();
+        if (_svr != 0) return _svr;
+        _ret = AES_GCM_decrypt_AARCH32(in, out, sz, nonce, nonceSz, tag, tagSz,
+            aad, aadSz, key, gcm_h, tmp, reg, nr);
+        RESTORE_VECTOR_REGISTERS();
+        return _ret;
+    }
+    #define AES_set_key_AARCH32        wc_svr_AES_set_key_AARCH32
+    #define AES_encrypt_AARCH32        wc_svr_AES_encrypt_AARCH32
+    #define AES_decrypt_AARCH32        wc_svr_AES_decrypt_AARCH32
+    #define AES_encrypt_blocks_AARCH32 wc_svr_AES_encrypt_blocks_AARCH32
+    #define AES_decrypt_blocks_AARCH32 wc_svr_AES_decrypt_blocks_AARCH32
+    #define AES_CBC_encrypt_AARCH32    wc_svr_AES_CBC_encrypt_AARCH32
+    #define AES_CBC_decrypt_AARCH32    wc_svr_AES_CBC_decrypt_AARCH32
+    #define AES_CTR_encrypt_AARCH32    wc_svr_AES_CTR_encrypt_AARCH32
+    #define AES_GCM_set_key_AARCH32    wc_svr_AES_GCM_set_key_AARCH32
+    #define AES_GCM_encrypt_AARCH32    wc_svr_AES_GCM_encrypt_AARCH32
+    #define AES_GCM_decrypt_AARCH32    wc_svr_AES_GCM_decrypt_AARCH32
+    #ifdef WOLFSSL_AES_XTS
+    static WC_INLINE void wc_svr_AES_XTS_encrypt_AARCH32(const byte* in,
+        byte* out, word32 sz, const byte* i, byte* key, byte* key2, byte* tmp,
+        int nr) {
+        if (SAVE_VECTOR_REGISTERS2() != 0) return;
+        AES_XTS_encrypt_AARCH32(in, out, sz, i, key, key2, tmp, nr);
+        RESTORE_VECTOR_REGISTERS();
+    }
+    static WC_INLINE void wc_svr_AES_XTS_decrypt_AARCH32(const byte* in,
+        byte* out, word32 sz, const byte* i, byte* key, byte* key2, byte* tmp,
+        int nr) {
+        if (SAVE_VECTOR_REGISTERS2() != 0) return;
+        AES_XTS_decrypt_AARCH32(in, out, sz, i, key, key2, tmp, nr);
+        RESTORE_VECTOR_REGISTERS();
+    }
+    #define AES_XTS_encrypt_AARCH32    wc_svr_AES_XTS_encrypt_AARCH32
+    #define AES_XTS_decrypt_AARCH32    wc_svr_AES_XTS_decrypt_AARCH32
+    #endif /* WOLFSSL_AES_XTS */
+#endif /* WOLFSSL_USE_SAVE_VECTOR_REGISTERS && !__aarch64__ && !NO_HW_CRYPTO */
+
 #if defined(WOLFSSL_AES_DIRECT) || defined(HAVE_AESCCM) || \
     defined(WOLFSSL_AESGCM_STREAM)
 static WARN_UNUSED_RESULT int wc_AesEncrypt(Aes* aes, const byte* inBlock,
@@ -4571,6 +4699,14 @@ static WARN_UNUSED_RESULT int wc_AesDecrypt(Aes* aes, const byte* inBlock,
     static int AesSetKey(Aes* aes, const byte* userKey, word32 keylen,
             const byte* iv, int dir)
     {
+        /* Reject invalid AES key lengths early (FIPS 197: 128/192/256 only).
+         * The lenient wc_AesSetKeyDirect entry only bounds-checks keylen, so
+         * without this a zero/invalid keylen would be accepted here on 32-bit ARM
+         * armasm -- the C software path rejects it in wc_AesSetKeyLocal, so match
+         * that ("check early and BAD_FUNC_ARG out"). */
+        if (userKey == NULL || (keylen != 16 && keylen != 24 && keylen != 32)) {
+            return BAD_FUNC_ARG;
+        }
     #if defined(WOLFSSL_AES_COUNTER) || defined(WOLFSSL_AES_CFB) || \
         defined(WOLFSSL_AES_OFB) || defined(WOLFSSL_AES_XTS) || \
         defined(WOLFSSL_AES_CTS)
@@ -7751,8 +7887,16 @@ static WC_INLINE void IncrementGcmCounter(byte* inOutCtr)
 #endif
 #endif /* !FREESCALE_LTC_AES_GCM */
 
+/* SP 800-38D AES-GCM software GHASH (FlattenSzInBits length block + RIGHTSHIFTX +
+ * the GCM table GMULT/GHASH below).  On 32-bit ARM with --enable-armasm the HW
+ * GCM only accelerates the one-shot path; the STREAMING GHASH (GHASH_INIT/UPDATE/
+ * FINAL) has no 32-bit asm implementation (only __aarch64__ + PMULL does) and so
+ * uses this software path.  Therefore this block must also be compiled when
+ * WOLFSSL_AESGCM_STREAM is enabled, even for arm32 armasm HW-crypto -- otherwise
+ * GHASH_FINAL's fallback GHASH_LEN_BLOCK references an undefined FlattenSzInBits
+ * (arm64 already gets it via __aarch64__). */
 #if !defined(WOLFSSL_ARMASM) || defined(__aarch64__) || \
-    defined(WOLFSSL_ARMASM_NO_HW_CRYPTO)
+    defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) || defined(WOLFSSL_AESGCM_STREAM)
 #if defined(GCM_SMALL) || defined(GCM_TABLE) || defined(GCM_TABLE_4BIT)
 
 static WC_INLINE void FlattenSzInBits(byte* buf, word32 sz)
@@ -7913,7 +8057,7 @@ void GenerateM0(Gcm* gcm)
     #define HAVE_INTEL_AVX2
 #endif
 
-#if defined(WOLFSSL_AESNI) && defined(GCM_TABLE_4BIT) && \
+#if defined(WC_AESNI_GCM) && defined(GCM_TABLE_4BIT) && \
     defined(WC_C_DYNAMIC_FALLBACK)
 void GCM_generate_m0_aesni(const unsigned char *h, unsigned char *m)
                            XASM_LINK("GCM_generate_m0_aesni");
@@ -8001,6 +8145,27 @@ int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len)
 #ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO
     #if !defined(__aarch64__)
         AES_GCM_set_key_AARCH32(iv, (byte*)aes->key, aes->gcm.H, aes->rounds);
+        #if defined(GCM_TABLE) || defined(GCM_TABLE_4BIT)
+        {
+            /* The 32-bit ARM asm GCM keeps gcm->H in the PMULL (bit-reflected)
+             * field representation for its one-shot GHASH (vmull.p64).  The C
+             * STREAMING GHASH (wc_AesGcmEncrypt/DecryptUpdate/Final) instead uses
+             * the gcm->M0 table, which GenerateM0() builds from the *standard*
+             * H = E_K(0) byte order -- not derivable from the PMULL H.  Recompute
+             * the standard H with the standard AES block encrypt into gcm->H,
+             * build M0 from it, then restore the PMULL H for the one-shot path.
+             * Without this, streaming AES-GCM mis-authenticates (AES_GCM_AUTH_E).
+             * SP 800-38D AES-GCM GHASH. */
+            ALIGN16 byte gcmStdH[WC_AES_BLOCK_SIZE];
+            ALIGN16 byte gcmZero[WC_AES_BLOCK_SIZE];
+            XMEMSET(gcmZero, 0, WC_AES_BLOCK_SIZE);
+            XMEMCPY(gcmStdH, aes->gcm.H, WC_AES_BLOCK_SIZE);
+            AES_encrypt_AARCH32(gcmZero, aes->gcm.H, (byte*)aes->key,
+                (int)aes->rounds);
+            GenerateM0(&aes->gcm);
+            XMEMCPY(aes->gcm.H, gcmStdH, WC_AES_BLOCK_SIZE);
+        }
+        #endif
     #else
         if (aes->use_aes_hw_crypto && aes->use_pmull_hw_crypto) {
             AES_GCM_set_key_AARCH64(iv, (byte*)aes->key, aes->gcm.H,
@@ -8043,7 +8208,7 @@ int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len)
 
         if (ret == 0) {
 #if defined(GCM_TABLE) || defined(GCM_TABLE_4BIT)
-    #if defined(WOLFSSL_AESNI) && defined(GCM_TABLE_4BIT)
+    #if defined(WC_AESNI_GCM) && defined(GCM_TABLE_4BIT)
             if (aes->use_aesni) {
         #if defined(WC_C_DYNAMIC_FALLBACK)
             #ifdef HAVE_INTEL_AVX2
@@ -8104,7 +8269,8 @@ int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len)
 }
 
 
-#ifdef WOLFSSL_AESNI
+
+#ifdef WC_AESNI_GCM
 
 void AES_GCM_encrypt_aesni(const unsigned char *in, unsigned char *out,
                      const unsigned char* addt, const unsigned char* ivec,
@@ -8158,8 +8324,14 @@ void AES_GCM_decrypt_avx2(const unsigned char *in, unsigned char *out,
 
 #endif /* WOLFSSL_AESNI */
 
+/* SP 800-38D software GHASH (GMULT / GHASH / GHASH_ONE_BLOCK_SW per GCM table mode).
+ * As with FlattenSzInBits above, 32-bit ARM --enable-armasm has no assembly STREAMING
+ * GHASH (only __aarch64__ + PMULL does), so when WOLFSSL_AESGCM_STREAM is enabled the
+ * streaming GHASH_INIT/UPDATE/FINAL use this software path and need these symbols
+ * compiled even for arm32 armasm HW-crypto.  Widen the guard accordingly (no effect on
+ * x86 / arm64, which already satisfy it -> their in-core hash is unchanged). */
 #if !defined(WOLFSSL_ARMASM) || defined(__aarch64__) || \
-    defined(WOLFSSL_ARMASM_NO_HW_CRYPTO)
+    defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) || defined(WOLFSSL_AESGCM_STREAM)
 #if defined(GCM_SMALL)
 static void GMULT(byte* X, byte* Y)
 {
@@ -10533,7 +10705,7 @@ int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
     ret = AES_GCM_encrypt_ASM(aes, out, in, sz, iv, ivSz, authTag, authTagSz,
         authIn, authInSz);
 #else
-#ifdef WOLFSSL_AESNI
+#ifdef WC_AESNI_GCM
     if (aes->use_aesni) {
 #ifdef HAVE_INTEL_AVX2
         if (IS_INTEL_AVX2(intel_flags)) {
@@ -11145,7 +11317,7 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
                      const byte* authIn, word32 authInSz)
 {
     int ret;
-#ifdef WOLFSSL_AESNI
+#ifdef WC_AESNI_GCM
     int res = WC_NO_ERR_TRACE(AES_GCM_AUTH_E);
 #endif
 
@@ -11291,7 +11463,7 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
             authTagSz, authIn, authInSz);
     }
 #else
-#ifdef WOLFSSL_AESNI
+#ifdef WC_AESNI_GCM
     if (aes->use_aesni) {
 #ifdef HAVE_INTEL_AVX2
         if (IS_INTEL_AVX2(intel_flags)) {
@@ -11334,6 +11506,16 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
 
     VECTOR_REGISTERS_POP;
 
+    /* FIPS 140-3 / SP 800-38D: on authentication failure, the decrypted-but-
+     * unauthenticated plaintext in `out` must not be released to the caller.
+     * Wipe it here so a caller that ignores the return value cannot observe
+     * plaintext derived from forged ciphertext.  All software paths (AES-NI,
+     * AVX1/2, ARM HW/NEON, C fallback) funnel through `ret` here, so this
+     * single guard covers every sub-implementation. */
+    if (ret == WC_NO_ERR_TRACE(AES_GCM_AUTH_E) && out != NULL && sz > 0) {
+        ForceZero(out, sz);
+    }
+
     return ret;
 }
 #endif
@@ -11502,7 +11684,7 @@ static WARN_UNUSED_RESULT int AesGcmFinal_C(
     return 0;
 }
 
-#ifdef WOLFSSL_AESNI
+#ifdef WC_AESNI_GCM
 
 #ifdef __cplusplus
     extern "C" {
@@ -12705,7 +12887,7 @@ int wc_AesGcmInit(Aes* aes, const byte* key, word32 len, const byte* iv,
         if (iv != NULL) {
             /* Initialize with the IV. */
 
-        #ifdef WOLFSSL_AESNI
+        #ifdef WC_AESNI_GCM
             if (aes->use_aesni) {
                 SAVE_VECTOR_REGISTERS(return _svr_ret;);
                 ret = AesGcmInit_aesni(aes, iv, ivSz);
@@ -12832,7 +13014,7 @@ int wc_AesGcmEncryptUpdate(Aes* aes, byte* out, const byte* in, word32 sz,
     if (ret == 0) {
         /* Encrypt with AAD and/or plaintext. */
 
-    #ifdef WOLFSSL_AESNI
+    #ifdef WC_AESNI_GCM
         if (aes->use_aesni) {
             SAVE_VECTOR_REGISTERS(return _svr_ret;);
             ret = AesGcmEncryptUpdate_aesni(aes, out, in, sz, authIn, authInSz);
@@ -12892,7 +13074,7 @@ int wc_AesGcmEncryptFinal(Aes* aes, byte* authTag, word32 authTagSz)
 
     if (ret == 0) {
         /* Calculate authentication tag. */
-    #ifdef WOLFSSL_AESNI
+    #ifdef WC_AESNI_GCM
         if (aes->use_aesni) {
             SAVE_VECTOR_REGISTERS(return _svr_ret;);
             ret = AesGcmEncryptFinal_aesni(aes, authTag, authTagSz);
@@ -12976,7 +13158,7 @@ int wc_AesGcmDecryptUpdate(Aes* aes, byte* out, const byte* in, word32 sz,
 
     if (ret == 0) {
         /* Decrypt with AAD and/or cipher text. */
-    #ifdef WOLFSSL_AESNI
+    #ifdef WC_AESNI_GCM
         if (aes->use_aesni) {
             SAVE_VECTOR_REGISTERS(return _svr_ret;);
             ret = AesGcmDecryptUpdate_aesni(aes, out, in, sz, authIn, authInSz);
@@ -13034,7 +13216,7 @@ int wc_AesGcmDecryptFinal(Aes* aes, const byte* authTag, word32 authTagSz)
 
     if (ret == 0) {
         /* Calculate authentication tag and compare with one passed in.. */
-    #ifdef WOLFSSL_AESNI
+    #ifdef WC_AESNI_GCM
         if (aes->use_aesni) {
             SAVE_VECTOR_REGISTERS(return _svr_ret;);
             ret = AesGcmDecryptFinal_aesni(aes, authTag, authTagSz);
@@ -13061,6 +13243,10 @@ int wc_AesGcmDecryptFinal(Aes* aes, const byte* authTag, word32 authTagSz)
         }
     }
 
+    /* Streaming decrypt cannot zeroize prior Update output buffers from here
+     * (Final does not see them).  On AES_GCM_AUTH_E, the caller is responsible
+     * for treating all Update-produced plaintext as invalid and wiping it.
+     * See PL-R34 Security Policy section 8 (Operational Rules). */
     return ret;
 }
 #endif /* HAVE_AES_DECRYPT || HAVE_AESGCM_DECRYPT */
diff --git a/wolfcrypt/src/aes_asm.S b/wolfcrypt/src/aes_asm.S
index 0371ca8cb22..3ee9983b20b 100644
--- a/wolfcrypt/src/aes_asm.S
+++ b/wolfcrypt/src/aes_asm.S
@@ -1831,11 +1831,16 @@ _AES_ECB_decrypt_AESNI:
         push	%edi
         push	%esi
         push	%ebx
-        movl	20(%esp), %edi
-        movl	24(%esp), %esi
-        movl	28(%esp), %edx
-        movl	32(%esp), %ecx
-        movl	36(%esp), %eax
+        # 3 pushes above add 12 bytes; args (return addr at 0) are therefore at
+        # 16/20/24/28/32(%esp), matching AES_ECB_encrypt_AESNI.  The original
+        # 32-bit decrypt used 20/24/28/32/36 (off by 4 -- offsets for 4 pushes),
+        # so it read the args shifted by one and dereferenced garbage -> segfault.
+        # This path was never exercised until 32-bit AES-NI was enabled 2026-06-17.
+        movl	16(%esp), %edi
+        movl	20(%esp), %esi
+        movl	24(%esp), %edx
+        movl	28(%esp), %ecx
+        movl	32(%esp), %eax
 
 
         movl    %edx, %ebx
diff --git a/wolfcrypt/src/aes_xts_asm.S b/wolfcrypt/src/aes_xts_asm.S
index 09045c6d8f7..ded430c211d 100644
--- a/wolfcrypt/src/aes_xts_asm.S
+++ b/wolfcrypt/src/aes_xts_asm.S
@@ -2786,6 +2786,518 @@ L_AES_XTS_decrypt_update_avx1_done_dec:
 #endif /* __APPLE__ */
 #endif /* HAVE_INTEL_AVX1 */
 #endif /* WOLFSSL_X86_64_BUILD */
+#ifdef WOLFSSL_X86_BUILD
+/* =========================================================================
+ * 32-bit Intel (i386) AES-NI AES-XTS.  Single-block ports of the x86_64
+ * AES_XTS_*_aesni routines above: SAME algorithm and SAME KAT output, but using
+ * only xmm0-xmm7 and the i386 cdecl stack ABI.  The x86_64 versions use xmm8-15
+ * and r8-r15 (none available in 32-bit) and a 4-block pipeline; this port drops
+ * the pipeline and processes one block at a time, which is correct and still
+ * AES-NI-accelerated.  Added 2026-06-17 so 32-bit Intel AES-NI builds
+ * (host_cpu=x86 -> WOLFSSL_X86_BUILD) link + run AES-XTS, matching x86_64.  The
+ * x86_64 section above is byte-for-byte unchanged.
+ *
+ * The GF(2^128) constant {0x87,1,1,1} is materialized on the stack (PIC-safe; a
+ * 32-bit shared object would otherwise need a GOT relocation to reach .data).
+ *
+ * Conventions: %ebp = frame pointer, %edi = running byte offset.
+ *   xmm0 = tweak T   xmm1 = data block   xmm2,xmm3 = round-key scratch
+ *   xmm4 = GF scratch  xmm5 = tweak' (decrypt CTS)  xmm6 = GF const  xmm7 = CTS buf
+ * ========================================================================= */
+
+/* Encrypt the block in \blk with expanded key whose base is GP reg \k; the AES
+ * round count is read from memory operand \nr.  Clobbers xmm2,xmm3. */
+.macro AESENC_BLK k, nr, blk
+        pxor    (\k), \blk
+        movdqu  16(\k), %xmm2
+        aesenc  %xmm2, \blk
+        movdqu  32(\k), %xmm2
+        aesenc  %xmm2, \blk
+        movdqu  48(\k), %xmm2
+        aesenc  %xmm2, \blk
+        movdqu  64(\k), %xmm2
+        aesenc  %xmm2, \blk
+        movdqu  80(\k), %xmm2
+        aesenc  %xmm2, \blk
+        movdqu  96(\k), %xmm2
+        aesenc  %xmm2, \blk
+        movdqu  112(\k), %xmm2
+        aesenc  %xmm2, \blk
+        movdqu  128(\k), %xmm2
+        aesenc  %xmm2, \blk
+        movdqu  144(\k), %xmm2
+        aesenc  %xmm2, \blk
+        cmpl    $11, \nr
+        movdqu  160(\k), %xmm2
+        jl      .L_encblk_last\@
+        aesenc  %xmm2, \blk
+        movdqu  176(\k), %xmm3
+        aesenc  %xmm3, \blk
+        cmpl    $13, \nr
+        movdqu  192(\k), %xmm2
+        jl      .L_encblk_last\@
+        aesenc  %xmm2, \blk
+        movdqu  208(\k), %xmm3
+        aesenc  %xmm3, \blk
+        movdqu  224(\k), %xmm2
+.L_encblk_last\@:
+        aesenclast %xmm2, \blk
+.endm
+
+/* Decrypt the block in \blk with inverse-cipher key base \k, rounds at \nr. */
+.macro AESDEC_BLK k, nr, blk
+        pxor    (\k), \blk
+        movdqu  16(\k), %xmm2
+        aesdec  %xmm2, \blk
+        movdqu  32(\k), %xmm2
+        aesdec  %xmm2, \blk
+        movdqu  48(\k), %xmm2
+        aesdec  %xmm2, \blk
+        movdqu  64(\k), %xmm2
+        aesdec  %xmm2, \blk
+        movdqu  80(\k), %xmm2
+        aesdec  %xmm2, \blk
+        movdqu  96(\k), %xmm2
+        aesdec  %xmm2, \blk
+        movdqu  112(\k), %xmm2
+        aesdec  %xmm2, \blk
+        movdqu  128(\k), %xmm2
+        aesdec  %xmm2, \blk
+        movdqu  144(\k), %xmm2
+        aesdec  %xmm2, \blk
+        cmpl    $11, \nr
+        movdqu  160(\k), %xmm2
+        jl      .L_decblk_last\@
+        aesdec  %xmm2, \blk
+        movdqu  176(\k), %xmm3
+        aesdec  %xmm3, \blk
+        cmpl    $13, \nr
+        movdqu  192(\k), %xmm2
+        jl      .L_decblk_last\@
+        aesdec  %xmm2, \blk
+        movdqu  208(\k), %xmm3
+        aesdec  %xmm3, \blk
+        movdqu  224(\k), %xmm2
+.L_decblk_last\@:
+        aesdeclast %xmm2, \blk
+.endm
+
+/* \dst = \src doubled in GF(2^128) (XTS tweak * alpha).  xmm6 = {0x87,1,1,1}. */
+.macro TWEAKDBL dst, src
+        movdqa  \src, %xmm4
+.ifnc \dst,\src
+        movdqa  \src, \dst
+.endif
+        psrad   $31, %xmm4
+        pslld   $1, \dst
+        pshufd  $0x93, %xmm4, %xmm4
+        pand    %xmm6, %xmm4
+        pxor    %xmm4, \dst
+.endm
+
+/* Build {0x87,1,1,1} into xmm6 using the 16-byte stack scratch at (%esp). */
+.macro GF_CONST
+        movl    $0x87, (%esp)
+        movl    $1, 4(%esp)
+        movl    $1, 8(%esp)
+        movl    $1, 12(%esp)
+        movdqu  (%esp), %xmm6
+.endm
+
+/* void AES_XTS_init_aesni(unsigned char* i, const unsigned char* tweak_key,
+ *                         int tweak_nr); */
+#ifndef __APPLE__
+.text
+.globl	AES_XTS_init_aesni
+.type	AES_XTS_init_aesni,@function
+.align	16
+AES_XTS_init_aesni:
+#else
+.section	__TEXT,__text
+.globl	_AES_XTS_init_aesni
+.p2align	4
+_AES_XTS_init_aesni:
+#endif /* __APPLE__ */
+        push    %ebp
+        movl    %esp, %ebp
+        movl    8(%ebp), %eax            /* i */
+        movdqu  (%eax), %xmm1
+        movl    12(%ebp), %ecx           /* tweak_key */
+        AESENC_BLK %ecx, 16(%ebp), %xmm1 /* nr = tweak_nr */
+        movl    8(%ebp), %eax
+        movdqu  %xmm1, (%eax)
+        pop     %ebp
+        ret
+#ifndef __APPLE__
+.size	AES_XTS_init_aesni,.-AES_XTS_init_aesni
+#endif /* __APPLE__ */
+
+/* void AES_XTS_encrypt_aesni(const unsigned char* in, unsigned char* out,
+ *      word32 sz, const unsigned char* i, const unsigned char* key,
+ *      const unsigned char* key2, int nr);
+ * args: 8=in 12=out 16=sz 20=i 24=key 28=key2 32=nr */
+#ifndef __APPLE__
+.text
+.globl	AES_XTS_encrypt_aesni
+.type	AES_XTS_encrypt_aesni,@function
+.align	16
+AES_XTS_encrypt_aesni:
+#else
+.section	__TEXT,__text
+.globl	_AES_XTS_encrypt_aesni
+.p2align	4
+_AES_XTS_encrypt_aesni:
+#endif /* __APPLE__ */
+        push    %ebp
+        movl    %esp, %ebp
+        push    %ebx
+        push    %esi
+        push    %edi
+        subl    $32, %esp
+        GF_CONST
+        /* T = AES_key2(i) */
+        movl    20(%ebp), %eax
+        movdqu  (%eax), %xmm0
+        movl    28(%ebp), %ecx
+        AESENC_BLK %ecx, 32(%ebp), %xmm0
+        xorl    %edi, %edi               /* offset */
+        movl    16(%ebp), %edx
+        andl    $0xfffffff0, %edx        /* full = sz & ~15 (encrypt all full blocks) */
+.L_xe_loop:
+        cmpl    %edx, %edi
+        jge     .L_xe_loop_done
+        movl    8(%ebp), %eax
+        movdqu  (%eax,%edi), %xmm1
+        pxor    %xmm0, %xmm1
+        movl    24(%ebp), %ecx
+        AESENC_BLK %ecx, 32(%ebp), %xmm1
+        pxor    %xmm0, %xmm1
+        movl    12(%ebp), %eax
+        movdqu  %xmm1, (%eax,%edi)
+        TWEAKDBL %xmm0, %xmm0
+        addl    $16, %edi
+        jmp     .L_xe_loop
+.L_xe_loop_done:
+        movl    16(%ebp), %eax
+        cmpl    %eax, %edi
+        je      .L_xe_done               /* exact multiple of 16 -> no CTS */
+        /* ciphertext stealing: steal from the last full ciphertext block */
+        subl    $16, %edi
+        movl    12(%ebp), %eax
+        movdqu  (%eax,%edi), %xmm7
+        addl    $16, %edi
+        movdqu  %xmm7, (%esp)            /* buf = C_{m-1} */
+        xorl    %edx, %edx               /* j */
+.L_xe_cts:
+        movzbl  (%esp,%edx), %ecx        /* tmp1 = buf[j] */
+        movl    8(%ebp), %esi
+        movzbl  (%esi,%edi), %ebx        /* tmp2 = in[off] */
+        movl    12(%ebp), %esi
+        movb    %cl, (%esi,%edi)         /* out[off] = tmp1 */
+        movb    %bl, (%esp,%edx)         /* buf[j] = tmp2 */
+        incl    %edi
+        incl    %edx
+        movl    16(%ebp), %eax
+        cmpl    %eax, %edi
+        jl      .L_xe_cts
+        subl    %edx, %edi               /* off = m*16 */
+        movdqu  (%esp), %xmm1
+        subl    $16, %edi                /* off = (m-1)*16 */
+        pxor    %xmm0, %xmm1
+        movl    24(%ebp), %ecx
+        AESENC_BLK %ecx, 32(%ebp), %xmm1
+        pxor    %xmm0, %xmm1
+        movl    12(%ebp), %eax
+        movdqu  %xmm1, (%eax,%edi)
+.L_xe_done:
+        addl    $32, %esp
+        pop     %edi
+        pop     %esi
+        pop     %ebx
+        pop     %ebp
+        ret
+#ifndef __APPLE__
+.size	AES_XTS_encrypt_aesni,.-AES_XTS_encrypt_aesni
+#endif /* __APPLE__ */
+
+/* void AES_XTS_encrypt_update_aesni(const unsigned char* in, unsigned char* out,
+ *      word32 sz, const unsigned char* key, unsigned char* i, int nr);
+ * args: 8=in 12=out 16=sz 20=key 24=i 28=nr.  Tweak is read from *i (already
+ * encrypted) and the advanced tweak written back to *i. */
+#ifndef __APPLE__
+.text
+.globl	AES_XTS_encrypt_update_aesni
+.type	AES_XTS_encrypt_update_aesni,@function
+.align	16
+AES_XTS_encrypt_update_aesni:
+#else
+.section	__TEXT,__text
+.globl	_AES_XTS_encrypt_update_aesni
+.p2align	4
+_AES_XTS_encrypt_update_aesni:
+#endif /* __APPLE__ */
+        push    %ebp
+        movl    %esp, %ebp
+        push    %ebx
+        push    %esi
+        push    %edi
+        subl    $32, %esp
+        GF_CONST
+        movl    24(%ebp), %eax           /* i (tweak, already encrypted) */
+        movdqu  (%eax), %xmm0
+        xorl    %edi, %edi
+        movl    16(%ebp), %edx
+        andl    $0xfffffff0, %edx
+.L_xeu_loop:
+        cmpl    %edx, %edi
+        jge     .L_xeu_loop_done
+        movl    8(%ebp), %eax
+        movdqu  (%eax,%edi), %xmm1
+        pxor    %xmm0, %xmm1
+        movl    20(%ebp), %ecx
+        AESENC_BLK %ecx, 28(%ebp), %xmm1
+        pxor    %xmm0, %xmm1
+        movl    12(%ebp), %eax
+        movdqu  %xmm1, (%eax,%edi)
+        TWEAKDBL %xmm0, %xmm0
+        addl    $16, %edi
+        jmp     .L_xeu_loop
+.L_xeu_loop_done:
+        movl    16(%ebp), %eax
+        cmpl    %eax, %edi
+        je      .L_xeu_done
+        subl    $16, %edi
+        movl    12(%ebp), %eax
+        movdqu  (%eax,%edi), %xmm7
+        addl    $16, %edi
+        movdqu  %xmm7, (%esp)
+        xorl    %edx, %edx
+.L_xeu_cts:
+        movzbl  (%esp,%edx), %ecx
+        movl    8(%ebp), %esi
+        movzbl  (%esi,%edi), %ebx
+        movl    12(%ebp), %esi
+        movb    %cl, (%esi,%edi)
+        movb    %bl, (%esp,%edx)
+        incl    %edi
+        incl    %edx
+        movl    16(%ebp), %eax
+        cmpl    %eax, %edi
+        jl      .L_xeu_cts
+        subl    %edx, %edi
+        movdqu  (%esp), %xmm1
+        subl    $16, %edi
+        pxor    %xmm0, %xmm1
+        movl    20(%ebp), %ecx
+        AESENC_BLK %ecx, 28(%ebp), %xmm1
+        pxor    %xmm0, %xmm1
+        movl    12(%ebp), %eax
+        movdqu  %xmm1, (%eax,%edi)
+.L_xeu_done:
+        movl    24(%ebp), %eax           /* write advanced tweak back to *i */
+        movdqu  %xmm0, (%eax)
+        addl    $32, %esp
+        pop     %edi
+        pop     %esi
+        pop     %ebx
+        pop     %ebp
+        ret
+#ifndef __APPLE__
+.size	AES_XTS_encrypt_update_aesni,.-AES_XTS_encrypt_update_aesni
+#endif /* __APPLE__ */
+
+/* void AES_XTS_decrypt_aesni(const unsigned char* in, unsigned char* out,
+ *      word32 sz, const unsigned char* i, const unsigned char* key,
+ *      const unsigned char* key2, int nr);
+ * args: 8=in 12=out 16=sz 20=i 24=key 28=key2 32=nr */
+#ifndef __APPLE__
+.text
+.globl	AES_XTS_decrypt_aesni
+.type	AES_XTS_decrypt_aesni,@function
+.align	16
+AES_XTS_decrypt_aesni:
+#else
+.section	__TEXT,__text
+.globl	_AES_XTS_decrypt_aesni
+.p2align	4
+_AES_XTS_decrypt_aesni:
+#endif /* __APPLE__ */
+        push    %ebp
+        movl    %esp, %ebp
+        push    %ebx
+        push    %esi
+        push    %edi
+        subl    $32, %esp
+        GF_CONST
+        /* T = AES_key2(i) (tweak is ENCRYPTED, even for decrypt) */
+        movl    20(%ebp), %eax
+        movdqu  (%eax), %xmm0
+        movl    28(%ebp), %ecx
+        AESENC_BLK %ecx, 32(%ebp), %xmm0
+        xorl    %edi, %edi
+        movl    16(%ebp), %eax
+        movl    %eax, %edx
+        andl    $0xfffffff0, %edx        /* full */
+        cmpl    %eax, %edx
+        je      .L_xd_bound              /* exact multiple -> bound = full */
+        subl    $16, %edx                /* partial -> reserve last full block */
+.L_xd_bound:
+.L_xd_loop:
+        cmpl    %edx, %edi
+        jge     .L_xd_loop_done
+        movl    8(%ebp), %eax
+        movdqu  (%eax,%edi), %xmm1
+        pxor    %xmm0, %xmm1
+        movl    24(%ebp), %ecx
+        AESDEC_BLK %ecx, 32(%ebp), %xmm1
+        pxor    %xmm0, %xmm1
+        movl    12(%ebp), %eax
+        movdqu  %xmm1, (%eax,%edi)
+        TWEAKDBL %xmm0, %xmm0
+        addl    $16, %edi
+        jmp     .L_xd_loop
+.L_xd_loop_done:
+        movl    16(%ebp), %eax
+        cmpl    %eax, %edi
+        je      .L_xd_done               /* no partial */
+        /* last_31 CTS: T' = 2T (xmm5); decrypt last full block with T' */
+        TWEAKDBL %xmm5, %xmm0
+        movl    8(%ebp), %eax
+        movdqu  (%eax,%edi), %xmm1
+        pxor    %xmm5, %xmm1
+        movl    24(%ebp), %ecx
+        AESDEC_BLK %ecx, 32(%ebp), %xmm1
+        pxor    %xmm5, %xmm1
+        movdqu  %xmm1, (%esp)            /* buf */
+        addl    $16, %edi
+        xorl    %edx, %edx
+.L_xd_cts:
+        movzbl  (%esp,%edx), %ecx
+        movl    8(%ebp), %esi
+        movzbl  (%esi,%edi), %ebx
+        movl    12(%ebp), %esi
+        movb    %cl, (%esi,%edi)
+        movb    %bl, (%esp,%edx)
+        incl    %edi
+        incl    %edx
+        movl    16(%ebp), %eax
+        cmpl    %eax, %edi
+        jl      .L_xd_cts
+        subl    %edx, %edi               /* off = m*16 */
+        movdqu  (%esp), %xmm1
+        pxor    %xmm0, %xmm1             /* ^ T (current) */
+        movl    24(%ebp), %ecx
+        AESDEC_BLK %ecx, 32(%ebp), %xmm1
+        pxor    %xmm0, %xmm1
+        subl    $16, %edi                /* off = (m-1)*16 */
+        movl    12(%ebp), %eax
+        movdqu  %xmm1, (%eax,%edi)
+.L_xd_done:
+        addl    $32, %esp
+        pop     %edi
+        pop     %esi
+        pop     %ebx
+        pop     %ebp
+        ret
+#ifndef __APPLE__
+.size	AES_XTS_decrypt_aesni,.-AES_XTS_decrypt_aesni
+#endif /* __APPLE__ */
+
+/* void AES_XTS_decrypt_update_aesni(const unsigned char* in, unsigned char* out,
+ *      word32 sz, const unsigned char* key, unsigned char* i, int nr);
+ * args: 8=in 12=out 16=sz 20=key 24=i 28=nr.  Tweak from *i, advanced back to *i. */
+#ifndef __APPLE__
+.text
+.globl	AES_XTS_decrypt_update_aesni
+.type	AES_XTS_decrypt_update_aesni,@function
+.align	16
+AES_XTS_decrypt_update_aesni:
+#else
+.section	__TEXT,__text
+.globl	_AES_XTS_decrypt_update_aesni
+.p2align	4
+_AES_XTS_decrypt_update_aesni:
+#endif /* __APPLE__ */
+        push    %ebp
+        movl    %esp, %ebp
+        push    %ebx
+        push    %esi
+        push    %edi
+        subl    $32, %esp
+        GF_CONST
+        movl    24(%ebp), %eax           /* i (tweak) */
+        movdqu  (%eax), %xmm0
+        xorl    %edi, %edi
+        movl    16(%ebp), %eax
+        movl    %eax, %edx
+        andl    $0xfffffff0, %edx
+        cmpl    %eax, %edx
+        je      .L_xdu_bound
+        subl    $16, %edx
+.L_xdu_bound:
+.L_xdu_loop:
+        cmpl    %edx, %edi
+        jge     .L_xdu_loop_done
+        movl    8(%ebp), %eax
+        movdqu  (%eax,%edi), %xmm1
+        pxor    %xmm0, %xmm1
+        movl    20(%ebp), %ecx
+        AESDEC_BLK %ecx, 28(%ebp), %xmm1
+        pxor    %xmm0, %xmm1
+        movl    12(%ebp), %eax
+        movdqu  %xmm1, (%eax,%edi)
+        TWEAKDBL %xmm0, %xmm0
+        addl    $16, %edi
+        jmp     .L_xdu_loop
+.L_xdu_loop_done:
+        movl    16(%ebp), %eax
+        cmpl    %eax, %edi
+        je      .L_xdu_done
+        TWEAKDBL %xmm5, %xmm0
+        movl    8(%ebp), %eax
+        movdqu  (%eax,%edi), %xmm1
+        pxor    %xmm5, %xmm1
+        movl    20(%ebp), %ecx
+        AESDEC_BLK %ecx, 28(%ebp), %xmm1
+        pxor    %xmm5, %xmm1
+        movdqu  %xmm1, (%esp)
+        addl    $16, %edi
+        xorl    %edx, %edx
+.L_xdu_cts:
+        movzbl  (%esp,%edx), %ecx
+        movl    8(%ebp), %esi
+        movzbl  (%esi,%edi), %ebx
+        movl    12(%ebp), %esi
+        movb    %cl, (%esi,%edi)
+        movb    %bl, (%esp,%edx)
+        incl    %edi
+        incl    %edx
+        movl    16(%ebp), %eax
+        cmpl    %eax, %edi
+        jl      .L_xdu_cts
+        subl    %edx, %edi
+        movdqu  (%esp), %xmm1
+        pxor    %xmm0, %xmm1
+        movl    20(%ebp), %ecx
+        AESDEC_BLK %ecx, 28(%ebp), %xmm1
+        pxor    %xmm0, %xmm1
+        subl    $16, %edi
+        movl    12(%ebp), %eax
+        movdqu  %xmm1, (%eax,%edi)
+.L_xdu_done:
+        movl    24(%ebp), %eax
+        movdqu  %xmm0, (%eax)
+        addl    $32, %esp
+        pop     %edi
+        pop     %esi
+        pop     %ebx
+        pop     %ebp
+        ret
+#ifndef __APPLE__
+.size	AES_XTS_decrypt_update_aesni,.-AES_XTS_decrypt_update_aesni
+#endif /* __APPLE__ */
+
+#endif /* WOLFSSL_X86_BUILD */
 #endif /* WOLFSSL_AES_XTS */
 
 #if defined(__linux__) && defined(__ELF__)
diff --git a/wolfcrypt/src/cpuid.c b/wolfcrypt/src/cpuid.c
index 2c3670234a6..6b5335ad47d 100644
--- a/wolfcrypt/src/cpuid.c
+++ b/wolfcrypt/src/cpuid.c
@@ -75,6 +75,22 @@
         #define cpuid(a,b,c) __cpuidex((int*)a,b,c)
     #endif /* _MSC_VER */
 
+    /* On the 32-bit x86 Linux kernel (WOLFSSL_LINUXKM + WOLFSSL_X86_BUILD), the
+     * UAPI header <asm/ptrace-abi.h> -- pulled in transitively via
+     * processor.h -> math_emu.h -> ptrace.h on i386 only -- already #defines
+     * EAX/EBX/ECX/EDX as ptrace register *indices* with DIFFERENT values
+     * (EAX=6, EBX=0, ECX=1, EDX=2).  We use these names as cpuid()-result
+     * array indices (0..3), so the collision is a real bug, not a cosmetic
+     * -Werror redefinition: leaving the kernel's values in place would index
+     * reg[6] (past "unsigned int reg[5]") and mis-compare the vendor string.
+     * #undef first so our indices win.  This is a no-op where the names are
+     * not predefined (x86_64 kernel uses a processor.h chain without
+     * math_emu.h, and all user-space builds), so generated code off
+     * i386-kernel is byte-identical. */
+    #undef EAX
+    #undef EBX
+    #undef ECX
+    #undef EDX
     #define EAX 0
     #define EBX 1
     #define ECX 2
diff --git a/wolfcrypt/src/dh.c b/wolfcrypt/src/dh.c
index 3e9abe3d869..378c5544b72 100644
--- a/wolfcrypt/src/dh.c
+++ b/wolfcrypt/src/dh.c
@@ -1422,8 +1422,20 @@ int wc_DhGeneratePublic(DhKey* key, byte* priv, word32 privSz,
     #if FIPS_VERSION_GE(5,0) || defined(WOLFSSL_VALIDATE_DH_KEYGEN)
     if (ret == 0)
         ret = _ffc_validate_public_key(key, pub, *pubSz, NULL, 0, 0);
-    if (ret == 0)
-        ret = _ffc_pairwise_consistency_test(key, pub, *pubSz, priv, privSz);
+    if (ret == 0) {
+        /* Pairwise Consistency Test per SP 800-56A r3 sec 5.6.2.1.4
+         * (FFC key pair).  FIPS 140-3 IG 10.3.B requires a PCT after
+         * KeyGen for key-establishment algorithms; on failure under a
+         * FIPS build the error is remapped to DH_PCT_E so the FIPS
+         * module's DEGRADE_STATE handler transitions FIPS_CAST_DH_
+         * PRIMITIVE_Z to the error state. */
+        ret = _ffc_pairwise_consistency_test(key, pub, *pubSz, priv,
+                                             privSz);
+    #ifdef HAVE_FIPS
+        if (ret != 0)
+            ret = DH_PCT_E;
+    #endif
+    }
     #endif /* FIPS V5 or later || WOLFSSL_VALIDATE_DH_KEYGEN */
 
     return ret;
@@ -1446,8 +1458,20 @@ static int wc_DhGenerateKeyPair_Sync(DhKey* key, WC_RNG* rng,
 #if FIPS_VERSION_GE(5,0) || defined(WOLFSSL_VALIDATE_DH_KEYGEN)
     if (ret == 0)
         ret = _ffc_validate_public_key(key, pub, *pubSz, NULL, 0, 0);
-    if (ret == 0)
-        ret = _ffc_pairwise_consistency_test(key, pub, *pubSz, priv, *privSz);
+    if (ret == 0) {
+        /* Pairwise Consistency Test per SP 800-56A r3 sec 5.6.2.1.4
+         * (FFC key pair).  FIPS 140-3 IG 10.3.B requires a PCT after
+         * KeyGen for key-establishment algorithms; on failure under a
+         * FIPS build the error is remapped to DH_PCT_E so the FIPS
+         * module's DEGRADE_STATE handler transitions FIPS_CAST_DH_
+         * PRIMITIVE_Z to the error state. */
+        ret = _ffc_pairwise_consistency_test(key, pub, *pubSz, priv,
+                                             *privSz);
+    #ifdef HAVE_FIPS
+        if (ret != 0)
+            ret = DH_PCT_E;
+    #endif
+    }
 #endif /* FIPS V5 or later || WOLFSSL_VALIDATE_DH_KEYGEN */
 
     return ret;
diff --git a/wolfcrypt/src/error.c b/wolfcrypt/src/error.c
index 0f70a84cc8b..be0acd033d3 100644
--- a/wolfcrypt/src/error.c
+++ b/wolfcrypt/src/error.c
@@ -692,6 +692,21 @@ const char* wc_GetErrorString(int error)
     case SLH_DSA_KAT_FIPS_E:
         return "SLH-DSA Known Answer Test check FIPS error";
 
+    case SLH_DSA_PCT_E:
+        return "wolfcrypt SLH-DSA Pairwise Consistency Test Failure";
+
+    case CMAC_KAT_FIPS_E:
+        return "AES-CMAC Known Answer Test FIPS error";
+
+    case SHAKE_KAT_FIPS_E:
+        return "SHAKE Known Answer Test FIPS error";
+
+    case DH_PCT_E:
+        return "wolfcrypt DH (FFC) Pairwise Consistency Test Failure";
+
+    case AES_KW_KAT_FIPS_E:
+        return "AES-KW Known Answer Test FIPS error";
+
     case SEQ_OVERFLOW_E:
         return "Sequence counter would overflow";
 
diff --git a/wolfcrypt/src/ge_operations.c b/wolfcrypt/src/ge_operations.c
index 5d3157628d2..2fbe29241bd 100644
--- a/wolfcrypt/src/ge_operations.c
+++ b/wolfcrypt/src/ge_operations.c
@@ -10196,9 +10196,14 @@ void ge_tobytes_nct(unsigned char *s,const ge_p2 *h)
 /* if HAVE_ED25519 but not HAVE_CURVE25519, and an asm implementation is built,
  * then curve25519() won't get its WOLFSSL_LOCAL attribute unless we dummy-call
  * it here.
- */
+ * This assumes the asm port still emits curve25519() when X25519 is disabled --
+ * true for the x86 and 64-bit ARM (armv8-curve25519) ports, but NOT for the
+ * 32-bit ARM port (armv8-32-curve25519 gates curve25519() on HAVE_CURVE25519),
+ * so the dummy-call would reference an undefined symbol there.  Exclude
+ * arm32 armasm (RFC 7748 / SP 800-186 X25519). */
 #if defined(CURVED25519_ASM) && defined(WOLFSSL_API_PREFIX_MAP) && \
-    !defined(HAVE_CURVE25519) && !defined(FREESCALE_LTC_ECC)
+    !defined(HAVE_CURVE25519) && !defined(FREESCALE_LTC_ECC) && \
+    (!defined(WOLFSSL_ARMASM) || defined(__aarch64__))
 WOLFSSL_LOCAL void _wc_curve25519_dummy(void);
 WOLFSSL_LOCAL void _wc_curve25519_dummy(void) {
     (void)curve25519((byte *)0, (byte *)0, (const byte *)0);
diff --git a/wolfcrypt/src/port/arm/armv8-32-sha3-asm.S b/wolfcrypt/src/port/arm/armv8-32-sha3-asm.S
index 6d2f0172994..128e2e91997 100644
--- a/wolfcrypt/src/port/arm/armv8-32-sha3-asm.S
+++ b/wolfcrypt/src/port/arm/armv8-32-sha3-asm.S
@@ -27,6 +27,16 @@
 
 #include <wolfssl/wolfcrypt/libwolfssl_sources_asm.h>
 
+/* Honor WC_SHA3_NO_ASM exactly as sha3.c does: when the build forces the C SHA-3
+ * (e.g. every Linux kernel-module build sets -DWC_SHA3_NO_ASM via KERNEL_MODE_DEFAULTS),
+ * this NEON BlockSha3 must NOT be emitted -- otherwise on 32-bit ARM it provides a
+ * second BlockSha3 that multiply-defines against sha3.c's C BlockSha3 at link time.
+ * (arm64's sha3 asm is gated on WOLFSSL_ARMASM_CRYPTO_SHA3 (default off) so it never
+ * hit this; the arm32 NEON path is only gated on WOLFSSL_ARMASM_NO_NEON.) FIPS 202. */
+#ifdef WC_SHA3_NO_ASM
+    #undef WOLFSSL_ARMASM
+#endif
+
 #ifdef WOLFSSL_ARMASM
 #if !defined(__aarch64__) && !defined(WOLFSSL_ARMASM_THUMB2)
 #ifndef WOLFSSL_ARMASM_INLINE
diff --git a/wolfcrypt/src/random.c b/wolfcrypt/src/random.c
index 609e7fb6b45..f51d376fab6 100644
--- a/wolfcrypt/src/random.c
+++ b/wolfcrypt/src/random.c
@@ -416,6 +416,24 @@ int wc_DrbgState_MutexFree(void)
 static int LockDrbgState(void)
 {
 #ifndef SINGLE_THREADED
+#ifndef WOLFSSL_MUTEX_INITIALIZER
+    /* Platforms lacking a static mutex initializer (e.g. Windows
+     * CRITICAL_SECTION) must initialize drbgStateMutex at run time.
+     * wolfCrypt_Init() does so, but the FIPS 140-3 module's pre-operational
+     * self test exercises the DRBG -- and therefore this lock -- from a
+     * load-time constructor that runs before any wolfCrypt_Init() call.
+     * Locking an uninitialized CRITICAL_SECTION is undefined behavior: it
+     * tolerates the single-threaded POST but faults (0xC0000005) on the
+     * degraded-mode CAST re-run.  Initialize on demand here
+     * (wc_DrbgState_MutexInit() is idempotent); the first lock is the
+     * single-threaded POST, so this is race-free.  The mutex guards the
+     * SP 800-90A DRBG enable/disable state, which must stay consistent. */
+    {
+        int initRet = wc_DrbgState_MutexInit();
+        if (initRet != 0)
+            return initRet;
+    }
+#endif
     return wc_LockMutex(&drbgStateMutex);
 #else
     return 0;
@@ -3617,23 +3635,90 @@ int wc_FreeNetRandom(void)
 #if defined(HAVE_INTEL_RDRAND) || defined(HAVE_INTEL_RDSEED) || \
     defined(HAVE_AMD_RDSEED)
 
-#ifdef WOLFSSL_ASYNC_CRYPT
-    /* need more retries if multiple cores */
-    #define INTELRD_RETRY (32 * 8)
-#else
-    #define INTELRD_RETRY 32
+/* INTELRD_RETRY bounds the RDRAND/RDSEED retry loop (IntelRDseed64_r /
+ * IntelRDrand* below).  RDSEED legitimately sets CF=0 (failure) when the on-chip
+ * conditioned entropy has not been replenished since the last read; per Intel's
+ * DRNG implementation guidance software must retry.  Overridable via -D for OEs
+ * that need a different budget. */
+#ifndef INTELRD_RETRY
+    #if defined(WOLFSSL_LINUXKM)
+        /* Linux-kernel-module: the boot-time FIPS CASTs poll RDSEED during
+         * module_init (wc_RunAllCast_fips -> ECDSA CAST -> wc_InitRng ->
+         * wc_GenerateSeed) while the platform RNG is warming up and the hardware
+         * RDSEED source is heavily contended -- under virtualization especially,
+         * where the guest's RDSEED is funneled to the (busy) host CPU.  RDSEED
+         * then returns CF=0 far more than the 32-retry userspace default
+         * tolerates, which made --enable-amdrdseed / --enable-intelrdseed kernel
+         * modules fail the ECDSA CAST and refuse to load.  Use a large retry
+         * budget: it is a ceiling, not a fixed cost -- RDSEED succeeds in ~1 read
+         * once entropy is available, so post-boot use is unaffected. */
+        #define INTELRD_RETRY 100000
+    #elif defined(WOLFSSL_ASYNC_CRYPT)
+        /* need more retries if multiple cores */
+        #define INTELRD_RETRY (32 * 8)
+    #else
+        #define INTELRD_RETRY 32
+    #endif
 #endif
 
 #if defined(HAVE_INTEL_RDSEED) || defined(HAVE_AMD_RDSEED)
 
+/* Vendor tag used by the optional FIPS_CODE_REVIEW evidence prints below.
+ * Both the Intel RDSEED and AMD RDSEED entropy sources funnel through the same
+ * primitive (the x86 RDSEED instruction is implemented by both vendors); only
+ * the underlying CPU vendor differs.  At compile time exactly one of
+ * HAVE_INTEL_RDSEED / HAVE_AMD_RDSEED is expected to be set for a given OE, so
+ * this string resolves unambiguously per build. */
+#if defined(HAVE_AMD_RDSEED)
+#define WC_RDSEED_VENDOR "AMD"
+#else
+#define WC_RDSEED_VENDOR "Intel"
+#endif
+
 #ifndef USE_INTEL_INTRINSICS
 
-    /* return 0 on success */
+    /* return 0 on success.
+     *
+     * E27 Public Use Document (NIST CMVP entropy source disclosure) explicitly
+     * documents that wolfSSL polls the x86 CF (Carry Flag) to determine
+     * whether a given RDSEED invocation delivered a usable conditioned entropy
+     * sample.  The rdseed instruction semantics are:
+     *   CF = 1 (set)   -> destination register holds 64 bits of conditioned
+     *                     entropy on this cycle; sample is fresh and usable.
+     *   CF = 0 (clear) -> hardware seed pool was empty this cycle; the
+     *                     destination register MUST NOT be used; caller is
+     *                     required to retry (handled by IntelRDseed64_r
+     *                     below, which loops up to INTELRD_RETRY times).
+     * The "setc %1" mnemonic materialises CF into the byte-sized output bound
+     * to (ok); the "=qm" output constraint constrains that byte to a q-class
+     * register (one that has an addressable low byte) so setc can target it. */
     static WC_INLINE int IntelRDseed64(word64* seed)
     {
         unsigned char ok;
 
         __asm__ volatile("rdseed %0; setc %1":"=r"(*seed), "=qm"(ok));
+#ifdef FIPS_CODE_REVIEW
+        /* One-shot tracer: confirm this path is alive on the first call.
+         * After that go silent so the per-chunk RDSEED prints don't drown
+         * the optest sanity-log (RDSEED is invoked per 64-bit chunk for
+         * every DRBG instantiate/reseed; flooding the log obscures real
+         * signal).  Per-request entropy volume is still surfaced by the
+         * outer wc_GenerateSeed_IntelRD print below. */
+        {
+            static int printed_asm = 0;
+            if (!printed_asm) {
+                printed_asm = 1;
+                printf("FIPS_CODE_REVIEW IntelRDseed64 [asm path, %s] "
+                       "(one-shot): delivered %u bits, CF=%u\n",
+                       WC_RDSEED_VENDOR, (unsigned)(sizeof(word64) * 8u),
+                       (unsigned)ok);
+            }
+        }
+#endif
+        /* Ternary: CF set (ok != 0) -> 64 bits of conditioned entropy
+         * captured in *seed, return 0 (success). CF clear (ok == 0) -> sample
+         * invalid, return -1 so the retry wrapper IntelRDseed64_r() tries
+         * again. */
         return (ok) ? 0 : -1;
     }
 
@@ -3641,7 +3726,14 @@ int wc_FreeNetRandom(void)
     /* The compiler Visual Studio uses does not allow inline assembly.
      * It does allow for Intel intrinsic functions. */
 
-    /* return 0 on success */
+    /* return 0 on success.
+     *
+     * E27 PUD (NIST CMVP) cited path: _rdseed64_step is the compiler intrinsic
+     * front-end for the same RDSEED instruction documented in the asm path
+     * above.  The intrinsic returns 1 when CF was set by the underlying RDSEED
+     * (i.e. the 64-bit conditioned entropy sample in *seed is valid this
+     * cycle) and 0 when CF was clear (caller MUST retry; *seed MUST NOT be
+     * consumed). */
 # ifdef __GNUC__
     __attribute__((target("rdseed")))
 # endif
@@ -3650,6 +3742,23 @@ int wc_FreeNetRandom(void)
         int ok;
 
         ok = _rdseed64_step((unsigned long long*) seed);
+#ifdef FIPS_CODE_REVIEW
+        /* One-shot tracer; see asm-path comment above for rationale. */
+        {
+            static int printed_intrinsic = 0;
+            if (!printed_intrinsic) {
+                printed_intrinsic = 1;
+                printf("FIPS_CODE_REVIEW IntelRDseed64 [intrinsic path, %s] "
+                       "(one-shot): delivered %u bits, "
+                       "intrinsic_ret=%d (== CF)\n",
+                       WC_RDSEED_VENDOR, (unsigned)(sizeof(word64) * 8u), ok);
+            }
+        }
+#endif
+        /* intrinsic_ret == 1 -> CF was set, 64 bits of conditioned entropy
+         * captured in *seed; return 0 to signal success to the retry wrapper.
+         * intrinsic_ret == 0 -> CF was clear; return -1 so the retry wrapper
+         * re-attempts. */
         return (ok) ? 0 : -1;
     }
 
@@ -3662,6 +3771,12 @@ static WC_INLINE int IntelRDseed64_r(word64* rnd)
     for (i = 0; i < INTELRD_RETRY; i++) {
         if (IntelRDseed64(rnd) == 0)
             return 0;
+        /* Give the hardware entropy source a chance to replenish between
+         * attempts (Intel DRNG guidance) and yield the CPU when it is safe to
+         * block.  WC_RELAX_LONG_LOOP() is a no-op where blocking is unsafe, so
+         * this only ever helps -- e.g. it lets other work (and the entropy
+         * conditioner) run during a long boot-time RDSEED starvation. */
+        WC_RELAX_LONG_LOOP();
     }
     return -1;
 }
@@ -3675,6 +3790,19 @@ static int wc_GenerateSeed_IntelRD(OS_Seed* os, byte* output, word32 sz)
 
     (void)os;
 
+#ifdef FIPS_CODE_REVIEW
+    /* Each conditioned entropy sample produced by IntelRDseed64() is 64 bits
+     * wide.  This entry-level trace makes the per-request entropy volume
+     * obvious in evidence logs: sz bytes requested -> ceil(sz/8) RDSEED
+     * invocations expected (plus the two-or-three sanity-status reads on the
+     * first ever call into this function). */
+    printf("FIPS_CODE_REVIEW wc_GenerateSeed_IntelRD [%s]: "
+           "requested %u bytes = %u bits "
+           "(expect %u RDSEED 64-bit samples)\n",
+           WC_RDSEED_VENDOR, (unsigned)sz, (unsigned)(sz * 8u),
+           (unsigned)((sz + sizeof(word64) - 1u) / sizeof(word64)));
+#endif
+
     if (!IS_INTEL_RDSEED(intel_flags))
         return -1;
 
diff --git a/wolfcrypt/src/rsa.c b/wolfcrypt/src/rsa.c
index 0a6b6143a7a..d313f81ac95 100644
--- a/wolfcrypt/src/rsa.c
+++ b/wolfcrypt/src/rsa.c
@@ -5153,9 +5153,15 @@ static WC_INLINE int RsaSizeCheck(int size)
     }
 
 #ifdef HAVE_FIPS
-    /* Key size requirements for CAVP */
+    /* Approved RSA key sizes per FIPS 186-5 sec 5.1 and NIST SP 800-131Ar2
+     * sec 4 Table 2 (Asymmetric Key Establishment) - 2048, 3072, 4096 only.
+     * 1024-bit RSA was deprecated for FIPS-Approved key generation by
+     * SP 800-131Ar2 effective 2014-01-01 and is disallowed thereafter.  The
+     * outer wc_MakeRsaKey_fips wrapper already gates on WC_RSA_FIPS_GEN_MIN,
+     * but RsaSizeCheck itself is reached by library-internal paths that do
+     * not pass through that wrapper - defense-in-depth removal here closes
+     * the gap. */
     switch (size) {
-        case 1024:
         case 2048:
         case 3072:
         case 4096:
@@ -5415,6 +5421,20 @@ int wc_MakeRsaKey(RsaKey* key, int size, long e, WC_RNG* rng)
         goto out;
     }
 
+#ifdef HAVE_FIPS
+    /* FIPS 186-5 sec 5.2 (Public Verification Exponent e): 2^16 + 1 <= e <
+     * 2^256 and e odd.  The general non-FIPS check above accepts e >= 3 odd;
+     * the FIPS Approved range is narrower.  e is a long here so the upper
+     * bound 2^256 is structurally satisfied on any LP64 / LLP64 platform
+     * (long is at most 64 bits), but the lower bound 65537 must be enforced
+     * explicitly.  Defense-in-depth even though FIPS application code
+     * conventionally passes e = 65537 (RSA_F4). */
+    if (e < 65537L) {
+        err = BAD_FUNC_ARG;
+        goto out;
+    }
+#endif
+
 #if defined(WOLFSSL_CRYPTOCELL)
     err = cc310_RSA_GenerateKeyPair(key, size, e);
     goto out;
diff --git a/wolfcrypt/src/sha256.c b/wolfcrypt/src/sha256.c
index 047c57dade8..ddb6b304fa8 100644
--- a/wolfcrypt/src/sha256.c
+++ b/wolfcrypt/src/sha256.c
@@ -1179,14 +1179,32 @@ int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId)
     return ret;
 }
 
+/* The 32-bit ARM SHA-256 NEON/crypto-extension transforms use vector registers,
+ * so in a Linux kernel module they MUST be bracketed with SAVE/RESTORE_VECTOR_
+ * REGISTERS (kernel_neon_begin/end) or the first SIMD instruction faults with
+ * "undefined instruction".  Scoped to !__aarch64__ so the aarch64 path is
+ * unchanged.  (FIPS 180-4 SHA-256.) */
+#if defined(WOLFSSL_USE_SAVE_VECTOR_REGISTERS) && !defined(__aarch64__) && \
+    !defined(WOLFSSL_ARMASM_THUMB2) && !defined(WOLFSSL_ARMASM_NO_NEON)
+    #define WC_SHA256_ARM_SVR_BEGIN(fail) SAVE_VECTOR_REGISTERS(fail)
+    #define WC_SHA256_ARM_SVR_END()       RESTORE_VECTOR_REGISTERS()
+#else
+    #define WC_SHA256_ARM_SVR_BEGIN(fail) WC_DO_NOTHING
+    #define WC_SHA256_ARM_SVR_END()       WC_DO_NOTHING
+#endif
+
 static WC_INLINE int Transform_Sha256(wc_Sha256* sha256, const byte* data)
 {
 #if defined(WOLFSSL_ARMASM_THUMB2) || defined(WOLFSSL_ARMASM_NO_NEON)
     Transform_Sha256_Len_base(sha256, data, WC_SHA256_BLOCK_SIZE);
-#elif defined(WOLFSSL_ARMASM_NO_HW_CRYPTO)
-    Transform_Sha256_Len_neon(sha256, data, WC_SHA256_BLOCK_SIZE);
 #else
+    WC_SHA256_ARM_SVR_BEGIN(return _svr_ret;);
+  #if defined(WOLFSSL_ARMASM_NO_HW_CRYPTO)
+    Transform_Sha256_Len_neon(sha256, data, WC_SHA256_BLOCK_SIZE);
+  #else
     Transform_Sha256_Len_crypto(sha256, data, WC_SHA256_BLOCK_SIZE);
+  #endif
+    WC_SHA256_ARM_SVR_END();
 #endif
     return 0;
 }
@@ -1196,10 +1214,14 @@ static WC_INLINE int Transform_Sha256_Len(wc_Sha256* sha256, const byte* data,
 {
 #if defined(WOLFSSL_ARMASM_THUMB2) || defined(WOLFSSL_ARMASM_NO_NEON)
     Transform_Sha256_Len_base(sha256, data, len);
-#elif defined(WOLFSSL_ARMASM_NO_HW_CRYPTO)
-    Transform_Sha256_Len_neon(sha256, data, len);
 #else
+    WC_SHA256_ARM_SVR_BEGIN(return _svr_ret;);
+  #if defined(WOLFSSL_ARMASM_NO_HW_CRYPTO)
+    Transform_Sha256_Len_neon(sha256, data, len);
+  #else
     Transform_Sha256_Len_crypto(sha256, data, len);
+  #endif
+    WC_SHA256_ARM_SVR_END();
 #endif
     return 0;
 }
diff --git a/wolfcrypt/src/sha512.c b/wolfcrypt/src/sha512.c
index b2f57b13b86..009f538c44b 100644
--- a/wolfcrypt/src/sha512.c
+++ b/wolfcrypt/src/sha512.c
@@ -1455,15 +1455,32 @@ static void (*Transform_Sha512_p)(wc_Sha512* sha512, const byte* data) = NULL;
 static void (*Transform_Sha512_Len_p)(wc_Sha512* sha512, const byte* data,
     word32 len) = NULL;
 
+/* The 32-bit ARM SHA-512 NEON transform uses vector registers, so in a Linux
+ * kernel module the asm calls below MUST be bracketed by SAVE/RESTORE_VECTOR_
+ * REGISTERS (kernel_neon_begin/end) or the first NEON instruction faults with
+ * "undefined instruction".  Scoped to !__aarch64__ so the aarch64 path (and the
+ * THUMB2/NO_NEON base path) is unchanged.  (FIPS 180-4 SHA-512.) */
+#if defined(WOLFSSL_USE_SAVE_VECTOR_REGISTERS) && !defined(__aarch64__) && \
+    !defined(WOLFSSL_ARMASM_THUMB2) && !defined(WOLFSSL_ARMASM_NO_NEON)
+    #define WC_SHA512_ARM_SVR_BEGIN(fail) SAVE_VECTOR_REGISTERS(fail)
+    #define WC_SHA512_ARM_SVR_END()       RESTORE_VECTOR_REGISTERS()
+#else
+    #define WC_SHA512_ARM_SVR_BEGIN(fail) WC_DO_NOTHING
+    #define WC_SHA512_ARM_SVR_END()       WC_DO_NOTHING
+#endif
 static WC_INLINE int Transform_Sha512(wc_Sha512 *sha512, const byte* data)
 {
+    WC_SHA512_ARM_SVR_BEGIN(return _svr_ret;);
     (*Transform_Sha512_p)(sha512, data);
+    WC_SHA512_ARM_SVR_END();
     return 0;
 }
 static WC_INLINE int Transform_Sha512_Len(wc_Sha512 *sha512, const byte* data,
     word32 len)
 {
+    WC_SHA512_ARM_SVR_BEGIN(return _svr_ret;);
     (*Transform_Sha512_Len_p)(sha512, data, len);
+    WC_SHA512_ARM_SVR_END();
     return 0;
 }
 
diff --git a/wolfcrypt/src/wc_lms.c b/wolfcrypt/src/wc_lms.c
index f2b62ea5901..d223632ec19 100644
--- a/wolfcrypt/src/wc_lms.c
+++ b/wolfcrypt/src/wc_lms.c
@@ -26,6 +26,14 @@
 #if FIPS_VERSION3_GE(2,0,0)
     /* set NO_WRAPPERS before headers, use direct internal f()s not wrappers */
     #define FIPS_NO_WRAPPERS
+
+    /* Windows orders the FIPS in-core integrity boundary by named linker
+     * sections.  Keep LMS (SP 800-208) code/const inside the boundary,
+     * sorted between sha3 (.fipsA$n) and fips.c (.fipsA$o). */
+    #ifdef USE_WINDOWS_API
+        #pragma code_seg(".fipsA$nc")
+        #pragma const_seg(".fipsB$nc")
+    #endif
 #endif
 #include <wolfssl/wolfcrypt/wc_lms.h>
 #include <wolfssl/wolfcrypt/hash.h>
diff --git a/wolfcrypt/src/wc_lms_impl.c b/wolfcrypt/src/wc_lms_impl.c
index 25e89901a9b..ba449062ca8 100644
--- a/wolfcrypt/src/wc_lms_impl.c
+++ b/wolfcrypt/src/wc_lms_impl.c
@@ -39,6 +39,15 @@
 
 #include <wolfssl/wolfcrypt/libwolfssl_sources.h>
 
+#if FIPS_VERSION3_GE(2,0,0)
+    /* Keep this LMS (SP 800-208) implementation's code/const inside the FIPS
+     * in-core integrity boundary (Windows orders it by named sections). */
+    #ifdef USE_WINDOWS_API
+        #pragma code_seg(".fipsA$ne")
+        #pragma const_seg(".fipsB$ne")
+    #endif
+#endif
+
 #include <wolfssl/wolfcrypt/wc_lms.h>
 
 #ifdef NO_INLINE
@@ -2317,7 +2326,10 @@ static int wc_lms_treehash_update(LmsState* state, LmsPrivState* privState,
     byte* left = dp + LMS_D_LEN;
     byte* temp = left + params->hash_len;
     WC_DECLARE_VAR(stack, byte, (LMS_MAX_HEIGHT + 1) * LMS_MAX_NODE_LEN, 0);
-    byte* sp;
+    /* Init to NULL: sp is set and used only on the ret==0 path, but 32-bit ARM
+     * gcc cannot correlate the two separate `if (ret == 0)` guards and reports a
+     * false-positive -Wmaybe-uninitialized (x86_64/aarch64 gcc do not). */
+    byte* sp = NULL;
     word32 max_cb = (word32)1 << params->cacheBits;
     word32 i;
 
diff --git a/wolfcrypt/src/wc_mldsa.c b/wolfcrypt/src/wc_mldsa.c
index 45e247e889b..1c028be496b 100644
--- a/wolfcrypt/src/wc_mldsa.c
+++ b/wolfcrypt/src/wc_mldsa.c
@@ -140,6 +140,14 @@
 #if FIPS_VERSION3_GE(2,0,0)
     /* set NO_WRAPPERS before headers, use direct internal f()s not wrappers */
     #define FIPS_NO_WRAPPERS
+
+    /* Windows orders the FIPS in-core integrity boundary by named linker
+     * sections.  Keep ML-DSA (FIPS 204) code/const inside the boundary,
+     * sorted between sha3 (.fipsA$n) and fips.c (.fipsA$o). */
+    #ifdef USE_WINDOWS_API
+        #pragma code_seg(".fipsA$nc")
+        #pragma const_seg(".fipsB$nc")
+    #endif
 #endif
 
 #ifndef WOLFSSL_MLDSA_NO_ASN1
@@ -760,8 +768,103 @@ static int mldsa_hash256_ctx_msg(wc_Shake* shake256, const byte* tr,
  * @return  0 on success.
  * @return  BAD_FUNC_ARG if hash algorithm not known.
  */
-static int mldsa_get_hash_oid(int hash, byte* oidBuffer, word32* oidLen)
+/* HashML-DSA PH-vs-paramSet enforcement.
+ *
+ * FIPS 204 sec. 5.4 (Table 4) restricts the pre-hash PH for HashML-DSA to
+ * algorithms whose collision-resistance strength meets or exceeds the
+ * parameter set's claimed security level.  Any other PH must be rejected
+ * for BOTH sigGen and sigVer:
+ *   ML-DSA-44 (128-bit):  SHA2-256, SHA2-384, SHA2-512, SHA2-512/256,
+ *                         SHA3-256, SHA3-384, SHA3-512,
+ *                         SHAKE-128, SHAKE-256
+ *   ML-DSA-65 (192-bit):  SHA2-384, SHA2-512, SHA3-384, SHA3-512, SHAKE-256
+ *   ML-DSA-87 (256-bit):  SHA2-512, SHA3-512, SHAKE-256
+ *
+ * Returns 0 if (hashAlg, level) is an approved combination.  Returns
+ * BAD_FUNC_ARG otherwise -- including for any hash not on the approved
+ * list (SHA-224, SHA-512/224, SHA3-224, etc.).
+ */
+static int mldsa_check_hash_for_level(int hashAlg, byte level)
 {
+    int strengthBits;  /* collision-resistance strength of the chosen hash */
+    int requiredBits;  /* security level required by the paramSet */
+
+    switch (hashAlg) {
+    #ifndef NO_SHA256
+        case WC_HASH_TYPE_SHA256:
+            strengthBits = 128;
+            break;
+    #endif
+    #ifdef WOLFSSL_SHA384
+        case WC_HASH_TYPE_SHA384:
+            strengthBits = 192;
+            break;
+    #endif
+    #ifdef WOLFSSL_SHA512
+        case WC_HASH_TYPE_SHA512:
+            strengthBits = 256;
+            break;
+        #ifndef WOLFSSL_NOSHA512_256
+        case WC_HASH_TYPE_SHA512_256:
+            /* SHA-512/256 has 128-bit collision resistance (truncated). */
+            strengthBits = 128;
+            break;
+        #endif
+    #endif
+    #ifdef WOLFSSL_SHA3
+        #ifndef WOLFSSL_NOSHA3_256
+        case WC_HASH_TYPE_SHA3_256:
+            strengthBits = 128;
+            break;
+        #endif
+        #ifndef WOLFSSL_NOSHA3_384
+        case WC_HASH_TYPE_SHA3_384:
+            strengthBits = 192;
+            break;
+        #endif
+        #ifndef WOLFSSL_NOSHA3_512
+        case WC_HASH_TYPE_SHA3_512:
+            strengthBits = 256;
+            break;
+        #endif
+    #endif
+    #ifdef WOLFSSL_SHAKE128
+        case WC_HASH_TYPE_SHAKE128:
+            strengthBits = 128;
+            break;
+    #endif
+    #ifdef WOLFSSL_SHAKE256
+        case WC_HASH_TYPE_SHAKE256:
+            strengthBits = 256;
+            break;
+    #endif
+        default:
+            /* Hash not on the FIPS 204 Table 4 approved list (e.g. SHA-224,
+             * SHA-512/224, SHA3-224, MD5).  Reject regardless of level. */
+            return BAD_FUNC_ARG;
+    }
+
+    switch (level) {
+        case WC_ML_DSA_44:
+            requiredBits = 128;
+            break;
+        case WC_ML_DSA_65:
+            requiredBits = 192;
+            break;
+        case WC_ML_DSA_87:
+            requiredBits = 256;
+            break;
+        default:
+            return BAD_FUNC_ARG;
+    }
+
+    if (strengthBits < requiredBits) {
+        return BAD_FUNC_ARG;
+    }
+    return 0;
+}
+
+static int mldsa_get_hash_oid(int hash, byte* oidBuffer, word32* oidLen){
     int ret = 0;
     const byte* oid;
 
@@ -9453,11 +9556,17 @@ static int mldsa_sign_ctx_hash_with_seed(wc_MlDsaKey* key,
     byte oidMsgHash[MLDSA_HASH_OID_LEN + WC_MAX_DIGEST_SIZE];
     word32 oidMsgHashLen = 0;
 
-    /* Check that the input hash length is valid. */
+    /* Check that the input hash length is valid (guards against caller-side
+     * buffer overruns before we touch hash). */
     if ((int)hashLen != wc_HashGetDigestSize((enum wc_HashType)hashAlg)) {
         ret = BAD_LENGTH_E;
     }
 
+    /* FIPS 204 sec. 5.4 Table 4: enforce hash <-> paramSet matching. */
+    if (ret == 0) {
+        ret = mldsa_check_hash_for_level(hashAlg, key->level);
+    }
+
     if (ret == 0) {
         XMEMCPY(seedMu, seed, MLDSA_RND_SZ);
 
@@ -10126,12 +10235,17 @@ static int mldsa_verify_ctx_hash(wc_MlDsaKey* key, const byte* ctx,
     if ((key == NULL) || (key->params == NULL)) {
         ret = BAD_FUNC_ARG;
     }
-    /* Check that the input hash length is valid. */
+    /* Check that the input hash length is valid (guards against caller-side
+     * buffer overruns before we touch hash). */
     if ((ret == 0) &&
         ((int)hashLen != wc_HashGetDigestSize((enum wc_HashType)hashAlg)))
     {
         ret = BAD_LENGTH_E;
     }
+    /* FIPS 204 sec. 5.4 Table 4: enforce hash <-> paramSet matching. */
+    if (ret == 0) {
+        ret = mldsa_check_hash_for_level(hashAlg, key->level);
+    }
 
     if (ret == 0) {
         /* Step 6: Hash public key. */
diff --git a/wolfcrypt/src/wc_mlkem.c b/wolfcrypt/src/wc_mlkem.c
index d03539f7053..dac31afa6e4 100644
--- a/wolfcrypt/src/wc_mlkem.c
+++ b/wolfcrypt/src/wc_mlkem.c
@@ -81,6 +81,14 @@
 #if FIPS_VERSION3_GE(2,0,0)
     /* set NO_WRAPPERS before headers, use direct internal f()s not wrappers */
     #define FIPS_NO_WRAPPERS
+
+    /* Windows orders the FIPS in-core integrity boundary by named linker
+     * sections.  Keep ML-KEM (FIPS 203) code/const inside the boundary,
+     * sorted between sha3 (.fipsA$n) and fips.c (.fipsA$o). */
+    #ifdef USE_WINDOWS_API
+        #pragma code_seg(".fipsA$na")
+        #pragma const_seg(".fipsB$na")
+    #endif
 #endif
 
 #include <wolfssl/wolfcrypt/wc_mlkem.h>
@@ -694,49 +702,12 @@ int wc_MlKemKey_MakeKey(MlKemKey* key, WC_RNG* rng)
         ret = wc_MlKemKey_MakeKeyWithRandom(key, rand, sizeof(rand));
     }
 
-#ifdef HAVE_FIPS
-    /* Pairwise Consistency Test (PCT) per FIPS 140-3 / ISO 19790:2012
-     * Section 7.10.3.3: encapsulate with ek, decapsulate with dk,
-     * verify shared secrets match. */
-    if (ret == 0) {
-        WC_DECLARE_VAR(pct_ct, byte, WC_ML_KEM_MAX_CIPHER_TEXT_SIZE,
-            key->heap);
-        byte pct_ss1[WC_ML_KEM_SS_SZ];
-        byte pct_ss2[WC_ML_KEM_SS_SZ];
-        word32 ctSz = 0;
-
-        WC_ALLOC_VAR_EX(pct_ct, byte, WC_ML_KEM_MAX_CIPHER_TEXT_SIZE,
-            key->heap, DYNAMIC_TYPE_TMP_BUFFER, ret = MEMORY_E);
-
-        if (ret == 0)
-            ret = wc_MlKemKey_CipherTextSize(key, &ctSz);
-
-        if (ret == 0)
-            ret = wc_MlKemKey_Encapsulate(key, pct_ct, pct_ss1, rng);
-
-        if (ret == 0)
-            ret = wc_MlKemKey_Decapsulate(key, pct_ss2, pct_ct, ctSz);
-
-        if (ret == 0) {
-            if (XMEMCMP(pct_ss1, pct_ss2, WC_ML_KEM_SS_SZ) != 0)
-                ret = ML_KEM_PCT_E;
-        }
-
-        ForceZero(pct_ss1, sizeof(pct_ss1));
-        ForceZero(pct_ss2, sizeof(pct_ss2));
-        if (WC_VAR_OK(pct_ct))
-            ForceZero(pct_ct, WC_ML_KEM_MAX_CIPHER_TEXT_SIZE);
-
-        WC_FREE_VAR_EX(pct_ct, key->heap, DYNAMIC_TYPE_TMP_BUFFER);
-
-        /* FIPS 140-3 IG 10.3.A (TE10.35.02): a key pair that fails the PCT
-         * must be rendered unusable.  Zeroize the generated key material so
-         * a caller that ignores the return value cannot use it. */
-        if (ret != 0) {
-            wc_MlKemKey_Free(key);
-        }
-    }
-#endif /* HAVE_FIPS */
+    /* PCT now lives in wc_MlKemKey_MakeKeyWithRandom() (called above) so
+     * that BOTH the random-seeded path (this function) and the
+     * caller-supplied-seed path (direct invocation of MakeKeyWithRandom)
+     * exercise the FIPS 140-3 IG 10.3.A 1.B Pairwise Consistency Test.
+     * Audit A16-1: previously the PCT lived only here, leaving the
+     * deterministic-seed entry uncovered. */
 
     /* Ensure seeds are zeroized. */
     ForceZero((void*)rand, (word32)sizeof(rand));
@@ -985,8 +956,70 @@ int wc_MlKemKey_MakeKeyWithRandom(MlKemKey* key, const unsigned char* rand,
     ForceZero(e, (size_t)(k * MLKEM_N) * sizeof(sword16));
 #endif
 
-    /* Note: PCT is performed in wc_MlKemKey_MakeKey() which calls this
-     * function and has the RNG parameter needed for encapsulation. */
+#ifdef HAVE_FIPS
+    /* Pairwise Consistency Test (PCT) per FIPS 140-3 IG 10.3.A 1.B and
+     * ISO/IEC 19790:2012 Section 7.10.3.3: encapsulate with the freshly
+     * generated encapsulation key (ek), decapsulate the ciphertext with
+     * the matching decapsulation key (dk), and verify the recovered
+     * shared secret matches.  This entry point (MakeKeyWithRandom) is
+     * a deterministic key-gen path with no caller-supplied RNG; the PCT
+     * uses wc_MlKemKey_EncapsulateWithRandom() with a fixed 32-byte test
+     * value for `m` (FIPS 203 Algorithm 17 input).  The encapsulation
+     * `m` does not need to be unpredictable for the PCT - it only needs
+     * the encap/decap roundtrip to recover the same shared secret.
+     *
+     * Audit A16-1: previously the PCT lived only in wc_MlKemKey_MakeKey
+     * which generates `rand` from the DRBG, leaving callers of this
+     * deterministic-seed entry without PCT coverage. */
+    if (ret == 0) {
+        WC_DECLARE_VAR(pct_ct, byte, WC_ML_KEM_MAX_CIPHER_TEXT_SIZE,
+            key->heap);
+        byte pct_ss1[WC_ML_KEM_SS_SZ];
+        byte pct_ss2[WC_ML_KEM_SS_SZ];
+        word32 pct_ctSz = 0;
+        /* Fixed 32-byte test pattern for FIPS 203 Alg 17 `m` parameter.
+         * Value is arbitrary - PCT only requires encap/decap roundtrip,
+         * not encap unpredictability. */
+        static const byte pct_m[WC_ML_KEM_ENC_RAND_SZ] = {
+            0xAB, 0xAB, 0xAB, 0xAB, 0xAB, 0xAB, 0xAB, 0xAB,
+            0xAB, 0xAB, 0xAB, 0xAB, 0xAB, 0xAB, 0xAB, 0xAB,
+            0xAB, 0xAB, 0xAB, 0xAB, 0xAB, 0xAB, 0xAB, 0xAB,
+            0xAB, 0xAB, 0xAB, 0xAB, 0xAB, 0xAB, 0xAB, 0xAB
+        };
+
+        WC_ALLOC_VAR_EX(pct_ct, byte, WC_ML_KEM_MAX_CIPHER_TEXT_SIZE,
+            key->heap, DYNAMIC_TYPE_TMP_BUFFER, ret = MEMORY_E);
+
+        if (ret == 0)
+            ret = wc_MlKemKey_CipherTextSize(key, &pct_ctSz);
+
+        if (ret == 0)
+            ret = wc_MlKemKey_EncapsulateWithRandom(key, pct_ct, pct_ss1,
+                pct_m, (int)sizeof(pct_m));
+
+        if (ret == 0)
+            ret = wc_MlKemKey_Decapsulate(key, pct_ss2, pct_ct, pct_ctSz);
+
+        if (ret == 0) {
+            if (XMEMCMP(pct_ss1, pct_ss2, WC_ML_KEM_SS_SZ) != 0)
+                ret = ML_KEM_PCT_E;
+        }
+
+        ForceZero(pct_ss1, sizeof(pct_ss1));
+        ForceZero(pct_ss2, sizeof(pct_ss2));
+        if (WC_VAR_OK(pct_ct))
+            ForceZero(pct_ct, WC_ML_KEM_MAX_CIPHER_TEXT_SIZE);
+
+        WC_FREE_VAR_EX(pct_ct, key->heap, DYNAMIC_TYPE_TMP_BUFFER);
+
+        /* FIPS 140-3 IG 10.3.A (TE10.35.02): a key pair that fails the PCT
+         * must be rendered unusable.  Zeroize the generated key material so
+         * a caller that ignores the return value cannot use it. */
+        if (ret != 0) {
+            wc_MlKemKey_Free(key);
+        }
+    }
+#endif /* HAVE_FIPS */
 
     return ret;
 }
diff --git a/wolfcrypt/src/wc_mlkem_poly.c b/wolfcrypt/src/wc_mlkem_poly.c
index aa3d7835d5d..533a3198e02 100644
--- a/wolfcrypt/src/wc_mlkem_poly.c
+++ b/wolfcrypt/src/wc_mlkem_poly.c
@@ -69,6 +69,15 @@
 
 #include <wolfssl/wolfcrypt/libwolfssl_sources.h>
 
+#if FIPS_VERSION3_GE(2,0,0)
+    /* Keep this ML-KEM (FIPS 203) implementation's code/const inside the FIPS
+     * in-core integrity boundary (Windows orders it by named sections). */
+    #ifdef USE_WINDOWS_API
+        #pragma code_seg(".fipsA$nb")
+        #pragma const_seg(".fipsB$nb")
+    #endif
+#endif
+
 #ifdef WC_MLKEM_NO_ASM
     #undef USE_INTEL_SPEEDUP
     #undef WOLFSSL_ARMASM
diff --git a/wolfcrypt/src/wc_slhdsa.c b/wolfcrypt/src/wc_slhdsa.c
index b3cfb56349b..0b776553610 100644
--- a/wolfcrypt/src/wc_slhdsa.c
+++ b/wolfcrypt/src/wc_slhdsa.c
@@ -24,6 +24,14 @@
 #if FIPS_VERSION3_GE(2,0,0)
     /* set NO_WRAPPERS before headers, use direct internal f()s not wrappers */
     #define FIPS_NO_WRAPPERS
+
+    /* Windows orders the FIPS in-core integrity boundary by named linker
+     * sections.  Keep SLH-DSA (FIPS 205) code/const inside the boundary,
+     * sorted between sha3 (.fipsA$n) and fips.c (.fipsA$o). */
+    #ifdef USE_WINDOWS_API
+        #pragma code_seg(".fipsA$nh")
+        #pragma const_seg(".fipsB$nh")
+    #endif
 #endif
 
 #include <wolfssl/wolfcrypt/wc_slhdsa.h>
@@ -6999,6 +7007,49 @@ int wc_SlhDsaKey_MakeKey(SlhDsaKey* key, WC_RNG* rng)
             key->sk + 2 * n, n);
     }
 
+#ifdef HAVE_FIPS
+    /* Pairwise Consistency Test (PCT) per FIPS 140-3 IG 10.3.A (TE10.35.02):
+     * sign with the new sk, verify with the matching pk.  SLH-DSA is a
+     * stateless hash-based signature scheme (FIPS 205), so the relaxed PCT
+     * rule for stateful HBS (LMS/XMSS) does not apply -- PCT runs on every
+     * KeyGen.  SignDeterministic avoids consuming RNG state; heap allocation
+     * is used because SLH-DSA signatures can reach ~50 KB.  The paramSet is
+     * known by this point, so allocate the exact signature length for this
+     * variant rather than the family-wide worst case -- the difference
+     * across SLH-DSA variants is roughly 8 KB to 50 KB. */
+    if (ret == 0) {
+        static const byte pct_msg[] = "wolfSSL SLH-DSA PCT";
+        word32 pct_sigLen = key->params->sigLen;
+        byte* pct_sig = (byte*)XMALLOC(pct_sigLen, NULL,
+            DYNAMIC_TYPE_TMP_BUFFER);
+        word32 pct_sigSz = pct_sigLen;
+
+        if (pct_sig == NULL) {
+            ret = MEMORY_E;
+        }
+        if (ret == 0) {
+            ret = wc_SlhDsaKey_SignDeterministic(key, NULL, 0,
+                pct_msg, sizeof(pct_msg), pct_sig, &pct_sigSz);
+        }
+        if (ret == 0) {
+            ret = wc_SlhDsaKey_Verify(key, NULL, 0,
+                pct_msg, sizeof(pct_msg), pct_sig, pct_sigSz);
+            if (ret != 0) {
+                ret = SLH_DSA_PCT_E;
+            }
+        }
+        if (pct_sig != NULL) {
+            ForceZero(pct_sig, pct_sigLen);
+            XFREE(pct_sig, NULL, DYNAMIC_TYPE_TMP_BUFFER);
+        }
+        /* IG 10.3.A (TE10.35.02): a key pair that fails the PCT must be
+         * rendered unusable. */
+        if (ret != 0) {
+            wc_SlhDsaKey_Free(key);
+        }
+    }
+#endif /* HAVE_FIPS */
+
     return ret;
 }
 
@@ -7945,6 +7996,99 @@ static const byte slhdsakey_oid_sha3_512[] = {
 #endif
 #endif
 
+/* HashSLH-DSA PH-vs-paramSet enforcement.
+ *
+ * FIPS 205 sec. 10.2.2 (Table 9) restricts the pre-hash PH for HashSLH-DSA
+ * to algorithms whose collision-resistance strength meets or exceeds the
+ * parameter set's security level (encoded as key->params->n in bytes):
+ *   n = 16 (128-bit):  SHA2-256, SHA2-384, SHA2-512, SHA2-512/256,
+ *                      SHA3-256, SHA3-384, SHA3-512,
+ *                      SHAKE-128, SHAKE-256
+ *   n = 24 (192-bit):  SHA2-384, SHA2-512, SHA3-384, SHA3-512, SHAKE-256
+ *   n = 32 (256-bit):  SHA2-512, SHA3-512, SHAKE-256
+ *
+ * Returns 0 if (hashType, n) is an approved combination.  Returns
+ * BAD_FUNC_ARG otherwise -- including for any hash not on the approved
+ * list (SHA-224, SHA-512/224, SHA3-224, etc.).
+ */
+static int slhdsa_check_hash_for_n(enum wc_HashType hashType, byte n)
+{
+    int strengthBits;
+    int requiredBits;
+
+    switch ((int)hashType) {
+    #ifndef NO_SHA256
+        case WC_HASH_TYPE_SHA256:
+            strengthBits = 128;
+            break;
+    #endif
+    #ifdef WOLFSSL_SHA384
+        case WC_HASH_TYPE_SHA384:
+            strengthBits = 192;
+            break;
+    #endif
+    #ifdef WOLFSSL_SHA512
+        case WC_HASH_TYPE_SHA512:
+            strengthBits = 256;
+            break;
+        #ifndef WOLFSSL_NOSHA512_256
+        case WC_HASH_TYPE_SHA512_256:
+            /* SHA-512/256 has 128-bit collision resistance (truncated). */
+            strengthBits = 128;
+            break;
+        #endif
+    #endif
+    #ifdef WOLFSSL_SHA3
+        #ifndef WOLFSSL_NOSHA3_256
+        case WC_HASH_TYPE_SHA3_256:
+            strengthBits = 128;
+            break;
+        #endif
+        #ifndef WOLFSSL_NOSHA3_384
+        case WC_HASH_TYPE_SHA3_384:
+            strengthBits = 192;
+            break;
+        #endif
+        #ifndef WOLFSSL_NOSHA3_512
+        case WC_HASH_TYPE_SHA3_512:
+            strengthBits = 256;
+            break;
+        #endif
+    #endif
+    #ifdef WOLFSSL_SHAKE128
+        case WC_HASH_TYPE_SHAKE128:
+            strengthBits = 128;
+            break;
+    #endif
+    #ifdef WOLFSSL_SHAKE256
+        case WC_HASH_TYPE_SHAKE256:
+            strengthBits = 256;
+            break;
+    #endif
+        default:
+            /* Hash not on the FIPS 205 Table 9 approved list. */
+            return BAD_FUNC_ARG;
+    }
+
+    if (n == WC_SLHDSA_N_128) {
+        requiredBits = 128;
+    }
+    else if (n == WC_SLHDSA_N_192) {
+        requiredBits = 192;
+    }
+    else if (n == WC_SLHDSA_N_256) {
+        requiredBits = 256;
+    }
+    else {
+        return BAD_FUNC_ARG;
+    }
+
+    if (strengthBits < requiredBits) {
+        return BAD_FUNC_ARG;
+    }
+    return 0;
+}
+
 /* Validate the caller-supplied pre-hashed digest length and look up the
  * corresponding OID for the chosen hash algorithm.
  *
@@ -8162,6 +8306,16 @@ static int slhdsakey_signhash_external(SlhDsaKey* key, const byte* ctx,
             (sigSz == NULL)) {
         ret = BAD_FUNC_ARG;
     }
+    /* HashSLH-DSA requires an explicit, approved pre-hash algorithm.
+     * WC_HASH_TYPE_NONE is the "pure SLH-DSA" sentinel used by the non
+     * pre-hash Sign/Verify paths and is never valid here.  Reject it
+     * explicitly (FIPS 205 Section 10.2.2 / Table 9) rather than relying on
+     * the slhdsa_check_hash_for_n() switch default below, so the rejection
+     * survives any future reordering of the validators or the addition of a
+     * WC_HASH_TYPE_NONE case to that switch. */
+    else if (hashType == WC_HASH_TYPE_NONE) {
+        ret = BAD_FUNC_ARG;
+    }
     /* Check sig buffer is large enough to hold generated signature. */
     else if (*sigSz < key->params->sigLen) {
         ret = BAD_LENGTH_E;
@@ -8171,6 +8325,12 @@ static int slhdsakey_signhash_external(SlhDsaKey* key, const byte* ctx,
         /* Alg 23, Step 6: Return error. */
         ret = BAD_FUNC_ARG;
     }
+    /* FIPS 205 sec. 10.2.2 Table 9: enforce PH <-> paramSet matching before
+     * pre-hashing the message.  Rejects PHs whose collision-resistance
+     * strength is below the paramSet's security level (n). */
+    if (ret == 0) {
+        ret = slhdsa_check_hash_for_n(hashType, key->params->n);
+    }
     if (ret == 0) {
         /* Alg 23, Steps 8-23: Validate caller-supplied pre-hashed digest length
          * and select OID for the chosen hash algorithm. */
@@ -8405,8 +8565,11 @@ int wc_SlhDsaKey_SignHash(SlhDsaKey* key, const byte* ctx, byte ctxSz,
         ret = MISSING_KEY;
     }
     /* First sanity check on hashType; the downstream prehash validator does
-     * the detailed check for the actual type. */
-    else if ((word32)hashType > (word32)WC_HASH_TYPE_MAX) {
+     * the detailed check for the actual type.  Reject WC_HASH_TYPE_NONE here
+     * too -- it is never a valid pre-hash (FIPS 205 Section 10.2.2 / Table 9),
+     * so a known-invalid call fails before consuming DRBG output below. */
+    else if ((hashType == WC_HASH_TYPE_NONE) ||
+             ((word32)hashType > (word32)WC_HASH_TYPE_MAX)) {
         ret = BAD_FUNC_ARG;
     }
 
@@ -8535,6 +8698,12 @@ int wc_SlhDsaKey_VerifyHash(SlhDsaKey* key, const byte* ctx, byte ctxSz,
     }
 
 #ifdef WOLF_CRYPTO_CB
+    /* FIPS 205 sec. 10.2.2 Table 9: enforce PH <-> paramSet matching.
+     * Rejects PHs whose collision-resistance strength is below the
+     * paramSet's security level (n). */
+    if (ret == 0) {
+        ret = slhdsa_check_hash_for_n(hashType, key->params->n);
+    }
     if (ret == 0) {
     #ifndef WOLF_CRYPTO_CB_FIND
         if (key->devId != INVALID_DEVID)
diff --git a/wolfcrypt/src/wc_xmss.c b/wolfcrypt/src/wc_xmss.c
index 9ad311d0bab..1e723333f2f 100644
--- a/wolfcrypt/src/wc_xmss.c
+++ b/wolfcrypt/src/wc_xmss.c
@@ -26,6 +26,14 @@
 #if FIPS_VERSION3_GE(2,0,0)
     /* set NO_WRAPPERS before headers, use direct internal f()s not wrappers */
     #define FIPS_NO_WRAPPERS
+
+    /* Windows orders the FIPS in-core integrity boundary by named linker
+     * sections.  Keep XMSS (SP 800-208) code/const inside the boundary,
+     * sorted between sha3 (.fipsA$n) and fips.c (.fipsA$o). */
+    #ifdef USE_WINDOWS_API
+        #pragma code_seg(".fipsA$nf")
+        #pragma const_seg(".fipsB$nf")
+    #endif
 #endif
 #include <wolfssl/wolfcrypt/wc_xmss.h>
 #include <wolfssl/wolfcrypt/hash.h>
diff --git a/wolfcrypt/src/wc_xmss_impl.c b/wolfcrypt/src/wc_xmss_impl.c
index d1598c4d372..814495130fe 100644
--- a/wolfcrypt/src/wc_xmss_impl.c
+++ b/wolfcrypt/src/wc_xmss_impl.c
@@ -31,6 +31,15 @@
 
 #include <wolfssl/wolfcrypt/libwolfssl_sources.h>
 
+#if FIPS_VERSION3_GE(2,0,0)
+    /* Keep this XMSS (SP 800-208) implementation's code/const inside the FIPS
+     * in-core integrity boundary (Windows orders it by named sections). */
+    #ifdef USE_WINDOWS_API
+        #pragma code_seg(".fipsA$ng")
+        #pragma const_seg(".fipsB$ng")
+    #endif
+#endif
+
 #include <wolfssl/wolfcrypt/wc_xmss.h>
 #include <wolfssl/wolfcrypt/hash.h>
 
diff --git a/wolfcrypt/test/test.c b/wolfcrypt/test/test.c
index e95bba5ae5e..75d42847e4c 100644
--- a/wolfcrypt/test/test.c
+++ b/wolfcrypt/test/test.c
@@ -55884,6 +55884,142 @@ static wc_test_ret_t mldsa_param_test(int param, WC_RNG* rng)
 #endif
     return ret;
 }
+
+#if !defined(WOLFSSL_DILITHIUM_NO_SIGN) && \
+    !defined(WOLFSSL_DILITHIUM_NO_VERIFY)
+/* Negative test: HashML-DSA must reject pre-hash algorithms whose collision
+ * resistance is below the parameter set's claimed security strength.
+ *
+ * Per FIPS 204 sec. 5.4, Table 4 (Approved PH algorithms for HashML-DSA):
+ *   ML-DSA-44 (128-bit security):  SHA2-256, SHA2-384, SHA2-512, SHA2-512/256,
+ *                                  SHA3-256, SHA3-384, SHA3-512,
+ *                                  SHAKE-128, SHAKE-256
+ *   ML-DSA-65 (192-bit security):  SHA2-384, SHA2-512, SHA3-384, SHA3-512,
+ *                                  SHAKE-256
+ *   ML-DSA-87 (256-bit security):  SHA2-512, SHA3-512, SHAKE-256
+ *
+ * This test attempts sigGen and sigVer with disallowed (paramSet, hash) pairs
+ * and asserts both reject the call (non-zero return).  Before the in-module
+ * hash-vs-paramSet check exists, wc_dilithium_sign_ctx_hash and
+ * wc_dilithium_verify_ctx_hash happily proceed with any compiled-in hash,
+ * so this test is expected to FAIL until the check is added. */
+static wc_test_ret_t mldsa_hash_paramset_rejection_test(WC_RNG* rng)
+{
+    wc_test_ret_t ret = 0;
+    int           i;
+#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_NO_MALLOC)
+    dilithium_key* key = NULL;
+    byte*          sig = NULL;
+#else
+    dilithium_key  key[1];
+    byte           sig[DILITHIUM_MAX_SIG_SIZE];
+#endif
+    word32         sigLen;
+    int            verified;
+
+    /* Fixed-content digests; for a rejection test the bytes don't matter,
+     * only the (paramSet, hashAlg, hashLen) tuple. Sizes match each hash's
+     * digest length so the existing length sanity check inside
+     * wc_dilithium_*_ctx_hash() does NOT short-circuit before reaching the
+     * hash-vs-paramSet gate we are validating here. */
+    static const byte hash32[32] = {  /* SHA-256 digest size */
+        0xBA,0x78,0x16,0xBF,0x8F,0x01,0xCF,0xEA,
+        0x41,0x41,0x40,0xDE,0x5D,0xAE,0x22,0x23,
+        0xB0,0x03,0x61,0xA3,0x96,0x17,0x7A,0x9C,
+        0xB4,0x10,0xFF,0x61,0xF2,0x00,0x15,0xAD
+    };
+    static const byte hash48[48] = {  /* SHA-384 digest size */
+        0xCB,0x00,0x75,0x3F,0x45,0xA3,0x5E,0x8B,
+        0xB5,0xA0,0x3D,0x69,0x9A,0xC6,0x50,0x07,
+        0x27,0x2C,0x32,0xAB,0x0E,0xDE,0xD1,0x63,
+        0x1A,0x8B,0x60,0x5A,0x43,0xFF,0x5B,0xED,
+        0x80,0x86,0x07,0x2B,0xA1,0xE7,0xCC,0x23,
+        0x58,0xBA,0xEC,0xA1,0x34,0xC8,0x25,0xA7
+    };
+
+    struct {
+        int         level;
+        int         hashAlg;
+        const byte* hash;
+        word32      hashLen;
+    } forbidden[] = {
+        /* ML-DSA-65 needs >=192-bit collision strength; SHA-256 = 128-bit. */
+        { WC_ML_DSA_65, WC_HASH_TYPE_SHA256, hash32, 32 },
+        /* ML-DSA-87 needs >=256-bit collision strength; SHA-384 = 192-bit. */
+        { WC_ML_DSA_87, WC_HASH_TYPE_SHA384, hash48, 48 }
+    };
+
+#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_NO_MALLOC)
+    key = (dilithium_key*)XMALLOC(sizeof(*key), HEAP_HINT,
+        DYNAMIC_TYPE_TMP_BUFFER);
+    sig = (byte*)XMALLOC(DILITHIUM_MAX_SIG_SIZE, HEAP_HINT,
+        DYNAMIC_TYPE_TMP_BUFFER);
+    if ((key == NULL) || (sig == NULL)) {
+        ERROR_OUT(WC_TEST_RET_ENC_ERRNO, neg_out);
+    }
+#endif
+    XMEMSET(sig, 0, DILITHIUM_MAX_SIG_SIZE);
+
+    for (i = 0; i < (int)(sizeof(forbidden) / sizeof(forbidden[0])); i++) {
+    #ifdef WOLFSSL_NO_ML_DSA_65
+        if (forbidden[i].level == WC_ML_DSA_65) continue;
+    #endif
+    #ifdef WOLFSSL_NO_ML_DSA_87
+        if (forbidden[i].level == WC_ML_DSA_87) continue;
+    #endif
+
+        ret = wc_dilithium_init_ex(key, NULL, devId);
+        if (ret != 0) {
+            ERROR_OUT(WC_TEST_RET_ENC_EC(ret), neg_out);
+        }
+        ret = wc_dilithium_set_level(key, (byte)forbidden[i].level);
+        if (ret != 0) {
+            wc_dilithium_free(key);
+            ERROR_OUT(WC_TEST_RET_ENC_EC(ret), neg_out);
+        }
+        ret = wc_dilithium_make_key(key, rng);
+        if (ret != 0) {
+            wc_dilithium_free(key);
+            ERROR_OUT(WC_TEST_RET_ENC_EC(ret), neg_out);
+        }
+
+        sigLen = (word32)wc_dilithium_sig_size(key);
+
+        /* sigGen with disallowed PH must be REJECTED. */
+        PRIVATE_KEY_UNLOCK();
+        ret = wc_dilithium_sign_ctx_hash(NULL, 0, forbidden[i].hashAlg,
+            forbidden[i].hash, forbidden[i].hashLen, sig, &sigLen, key, rng);
+        PRIVATE_KEY_LOCK();
+        if (ret == 0) {
+            /* Module did NOT reject -- this is the missing-enforcement bug. */
+            wc_dilithium_free(key);
+            ERROR_OUT(WC_TEST_RET_ENC_NC, neg_out);
+        }
+
+        /* sigVer with disallowed PH must ALSO be REJECTED. */
+        verified = -1;
+        sigLen = (word32)wc_dilithium_sig_size(key);
+        ret = wc_dilithium_verify_ctx_hash(sig, sigLen, NULL, 0,
+            forbidden[i].hashAlg, forbidden[i].hash, forbidden[i].hashLen,
+            &verified, key);
+        if (ret == 0) {
+            wc_dilithium_free(key);
+            ERROR_OUT(WC_TEST_RET_ENC_NC, neg_out);
+        }
+
+        wc_dilithium_free(key);
+        ret = 0;
+    }
+
+neg_out:
+#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_NO_MALLOC)
+    if (sig != NULL) XFREE(sig, HEAP_HINT, DYNAMIC_TYPE_TMP_BUFFER);
+    if (key != NULL) XFREE(key, HEAP_HINT, DYNAMIC_TYPE_TMP_BUFFER);
+#endif
+    return ret;
+}
+#endif /* !WOLFSSL_DILITHIUM_NO_SIGN && !WOLFSSL_DILITHIUM_NO_VERIFY */
+
 #endif
 
 #if defined(WC_MLDSA_CACHE_MATRIX_A) && \
@@ -56327,6 +56463,18 @@ WOLFSSL_TEST_SUBROUTINE wc_test_ret_t mldsa_test(void)
 #endif /* (WOLFSSL_MLDSA_PUBLIC_KEY && !WOLFSSL_MLDSA_NO_VERIFY) ||
         * (WOLFSSL_MLDSA_PRIVATE_KEY && !WOLFSSL_MLDSA_NO_SIGN) */
 
+#if !defined(WOLFSSL_MLDSA_NO_MAKE_KEY) && \
+    !defined(WOLFSSL_MLDSA_NO_SIGN) && \
+    !defined(WOLFSSL_MLDSA_NO_VERIFY) && \
+    (!defined(WOLFSSL_NO_ML_DSA_65) || !defined(WOLFSSL_NO_ML_DSA_87))
+    /* FIPS 204 sec. 5.4 -- HashML-DSA must reject pre-hashes weaker than
+     * the parameter set's security level. */
+    ret = mldsa_hash_paramset_rejection_test(&rng);
+    if (ret != 0) {
+        ERROR_OUT(ret, out);
+    }
+#endif
+
 #if !defined(WOLFSSL_MLDSA_NO_MAKE_KEY) || \
     !defined(WOLFSSL_MLDSA_NO_VERIFY) || \
     defined(WOLFSSL_MLDSA_PRIVATE_KEY) || \
@@ -57772,29 +57920,18 @@ static wc_test_ret_t slhdsa_test_param(enum SlhDsaParam param)
         ERROR_OUT(WC_TEST_RET_ENC_EC(ret), out);
     }
 
-    /* HashSLH-DSA takes the caller's pre-hashed digest as input. */
+    /* HashSLH-DSA takes the caller's pre-hashed digest as input.  SHAKE-256
+     * is universally approved by FIPS 205 sec. 10.2.2 Table 9 across all
+     * SLH-DSA-{128,192,256} variants, so use it unconditionally for the
+     * positive round-trip path -- avoids tripping the in-module
+     * hash-vs-paramSet validation gate for higher-security paramSets. */
     {
-#ifdef WOLFSSL_SLHDSA_SHA2
-        enum wc_HashType phType = SLHDSA_IS_SHA2(param) ?
-            WC_HASH_TYPE_SHA256 : WC_HASH_TYPE_SHAKE256;
-#else
         enum wc_HashType phType = WC_HASH_TYPE_SHAKE256;
-#endif
         byte digest[WC_SHA3_512_DIGEST_SIZE];
-        word32 digestLen;
+        word32 digestLen = WC_SHA3_512_DIGEST_SIZE;
 
-#ifdef WOLFSSL_SLHDSA_SHA2
-        if (phType == WC_HASH_TYPE_SHA256) {
-            ret = wc_Sha256Hash(msg, (word32)sizeof(msg), digest);
-            digestLen = WC_SHA256_DIGEST_SIZE;
-        }
-        else
-#endif
-        {
-            ret = wc_Shake256Hash(msg, (word32)sizeof(msg), digest,
-                WC_SHA3_512_DIGEST_SIZE);
-            digestLen = WC_SHA3_512_DIGEST_SIZE;
-        }
+        ret = wc_Shake256Hash(msg, (word32)sizeof(msg), digest,
+            WC_SHA3_512_DIGEST_SIZE);
         if (ret != 0) {
             ERROR_OUT(WC_TEST_RET_ENC_EC(ret), out);
         }
@@ -57813,9 +57950,13 @@ static wc_test_ret_t slhdsa_test_param(enum SlhDsaParam param)
         ERROR_OUT(WC_TEST_RET_ENC_EC(ret), out);
     }
 
-    /* Additional pre-hash test: SHA-384 exercises a different OID path */
+    /* Additional pre-hash test: SHA-384 exercises a different OID path.
+     * Skip for SLH-DSA-256 because SHA-384 (192-bit collision) is below the
+     * 256-bit security level required by FIPS 205 sec. 10.2.2 Table 9. */
 #ifdef WOLFSSL_SHA384
-    {
+    /* Skip SHA-384 for SLH-DSA-256: 192-bit collision strength below the
+     * 256-bit security level (FIPS 205 sec. 10.2.2 Table 9). */
+    if (key->params->n != WC_SLHDSA_N_256) {
         byte digest384[WC_SHA384_DIGEST_SIZE];
 
         ret = wc_Sha384Hash(msg, (word32)sizeof(msg), digest384);
@@ -57875,6 +58016,98 @@ static wc_test_ret_t slhdsa_test_param(enum SlhDsaParam param)
 
     return ret;
 }
+
+/* Negative test: HashSLH-DSA must reject pre-hash algorithms whose collision
+ * resistance is below the parameter set's claimed security strength.
+ *
+ * Per FIPS 205 sec. 10.2.2, Table 9 (Approved PH for HashSLH-DSA):
+ *   SLH-DSA-*-128*  (128-bit):  SHA2-256, SHA2-384, SHA2-512, SHA2-512/256,
+ *                               SHA3-256, SHA3-384, SHA3-512,
+ *                               SHAKE-128, SHAKE-256
+ *   SLH-DSA-*-192*  (192-bit):  SHA2-384, SHA2-512, SHA3-384, SHA3-512,
+ *                               SHAKE-256
+ *   SLH-DSA-*-256*  (256-bit):  SHA2-512, SHA3-512, SHAKE-256
+ *
+ * This test attempts sigGen / sigVer with a disallowed (paramSet, hash) pair
+ * and asserts both reject the call.  Before the in-module hash-vs-paramSet
+ * check exists, wc_SlhDsaKey_SignHash / wc_SlhDsaKey_VerifyHash happily
+ * proceed with any compiled-in hash, so this test is expected to FAIL until
+ * the check is added. */
+static wc_test_ret_t slhdsa_hash_paramset_rejection_test(enum SlhDsaParam param)
+{
+    int    ret = 0;
+    WC_RNG rng;
+    SlhDsaKey key[1];
+    byte   sig[WC_SLHDSA_MAX_SIG_LEN];
+    word32 sigLen;
+    static const byte msg[] = {
+        0x48,0x65,0x6c,0x6c,0x6f,0x20,0x57,0x6f,
+        0x72,0x6c,0x64,0x21
+    };
+    byte   ctx[1];
+    /* Hash that is BELOW the security level of every 192/256-bit paramSet
+     * tested below.  SHA-256 (128-bit collision) is approved only for the
+     * 128-bit SLH-DSA paramSets, so any 192/256-bit paramSet must reject it. */
+    enum wc_HashType badHash = WC_HASH_TYPE_SHA256;
+
+    XMEMSET(&key, 0, sizeof(key));
+
+#ifndef HAVE_FIPS
+    ret = wc_InitRng_ex(&rng, HEAP_HINT, devId);
+#else
+    ret = wc_InitRng(&rng);
+#endif
+    if (ret != 0) return WC_TEST_RET_ENC_EC(ret);
+
+    ret = wc_SlhDsaKey_Init(key, param, NULL, INVALID_DEVID);
+    if (ret != 0) {
+        wc_FreeRng(&rng);
+        return WC_TEST_RET_ENC_EC(ret);
+    }
+
+    ret = wc_SlhDsaKey_MakeKey(key, &rng);
+    if (ret != 0) {
+        wc_SlhDsaKey_Free(key);
+        wc_FreeRng(&rng);
+        return WC_TEST_RET_ENC_EC(ret);
+    }
+
+    /* Only enforce on paramSets above 128-bit security; SHA-256 is approved
+     * for 128-bit so wouldn't be a rejection target there. */
+    if (key->params->n == WC_SLHDSA_N_128) {
+        wc_SlhDsaKey_Free(key);
+        wc_FreeRng(&rng);
+        return 0;
+    }
+
+    /* sigGen with too-weak PH must be REJECTED. */
+    sigLen = WC_SLHDSA_MAX_SIG_LEN;
+    PRIVATE_KEY_UNLOCK();
+    ret = wc_SlhDsaKey_SignHash(key, ctx, 0, msg, (word32)sizeof(msg),
+        badHash, sig, &sigLen, &rng);
+    PRIVATE_KEY_LOCK();
+    if (ret == 0) {
+        /* Module did NOT reject -- this is the missing-enforcement bug. */
+        wc_SlhDsaKey_Free(key);
+        wc_FreeRng(&rng);
+        return WC_TEST_RET_ENC_NC;
+    }
+
+    /* sigVer with too-weak PH must ALSO be REJECTED. */
+    sigLen = WC_SLHDSA_MAX_SIG_LEN;
+    XMEMSET(sig, 0, sigLen);
+    ret = wc_SlhDsaKey_VerifyHash(key, ctx, 0, msg, (word32)sizeof(msg),
+        badHash, sig, sigLen);
+    if (ret == 0) {
+        wc_SlhDsaKey_Free(key);
+        wc_FreeRng(&rng);
+        return WC_TEST_RET_ENC_NC;
+    }
+
+    wc_SlhDsaKey_Free(key);
+    wc_FreeRng(&rng);
+    return 0;
+}
 #endif
 
 /* True iff slhdsa_test() actually emits at least one `goto out;` /
@@ -59868,6 +60101,41 @@ wc_test_ret_t slhdsa_test(void)
     }
 #endif
 
+    /* FIPS 205 sec. 10.2.2 -- HashSLH-DSA must reject pre-hashes whose
+     * collision strength is below the paramSet's security level.  Use any
+     * available 192- or 256-bit paramSet to exercise the rejection.  The
+     * 128-bit paramSets allow SHA-256, so they are not useful as targets
+     * here. */
+#ifdef WOLFSSL_SLHDSA_PARAM_192S
+    ret = slhdsa_hash_paramset_rejection_test(SLHDSA_SHAKE192S);
+    if (ret != 0) {
+        wc_test_render_error_message("SLHDSA_SHAKE192S (hash-paramset reject)",
+            0);
+        goto out;
+    }
+#elif defined(WOLFSSL_SLHDSA_PARAM_256S)
+    ret = slhdsa_hash_paramset_rejection_test(SLHDSA_SHAKE256S);
+    if (ret != 0) {
+        wc_test_render_error_message("SLHDSA_SHAKE256S (hash-paramset reject)",
+            0);
+        goto out;
+    }
+#elif defined(WOLFSSL_SLHDSA_PARAM_SHA2_192S)
+    ret = slhdsa_hash_paramset_rejection_test(SLHDSA_SHA2_192S);
+    if (ret != 0) {
+        wc_test_render_error_message("SLHDSA_SHA2_192S (hash-paramset reject)",
+            0);
+        goto out;
+    }
+#elif defined(WOLFSSL_SLHDSA_PARAM_SHA2_256S)
+    ret = slhdsa_hash_paramset_rejection_test(SLHDSA_SHA2_256S);
+    if (ret != 0) {
+        wc_test_render_error_message("SLHDSA_SHA2_256S (hash-paramset reject)",
+            0);
+        goto out;
+    }
+#endif
+
 #endif /* !WOLFSSL_SLHDSA_VERIFY_ONLY */
 
 #if defined(WOLF_PRIVATE_KEY_ID) && \
diff --git a/wolfssl/wolfcrypt/aes.h b/wolfssl/wolfcrypt/aes.h
index e3d7637470d..e5b8b2ded9d 100644
--- a/wolfssl/wolfcrypt/aes.h
+++ b/wolfssl/wolfcrypt/aes.h
@@ -66,8 +66,14 @@ typedef struct Gcm {
 #endif
 
 WOLFSSL_LOCAL void GenerateM0(Gcm* gcm);
+/* The two-byte-pointer GMULT signature is the GCM_SMALL form only.  Other GCM
+ * table modes (GCM_TABLE / GCM_TABLE_4BIT) use a static GMULT taking a table
+ * argument (byte m[N][16]), so this prototype must be scoped to GCM_SMALL --
+ * otherwise on 32-bit ARM armasm with WOLFSSL_AESGCM_STREAM (which now compiles
+ * the software table GHASH for the streaming path) it conflicts with the
+ * table-mode GMULT.  See SP 800-38D AES-GCM GHASH. */
 #if !defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \
-    !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO)
+    !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) && defined(GCM_SMALL)
 WOLFSSL_LOCAL void GMULT(byte* X, byte* Y);
 #endif
 WOLFSSL_LOCAL void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c,
diff --git a/wolfssl/wolfcrypt/error-crypt.h b/wolfssl/wolfcrypt/error-crypt.h
index 5b089f118b4..286a8739f44 100644
--- a/wolfssl/wolfcrypt/error-crypt.h
+++ b/wolfssl/wolfcrypt/error-crypt.h
@@ -327,9 +327,17 @@ enum wolfCrypt_ErrorCodes {
     ML_DSA_PCT_E        = -1016, /* ML-DSA Pairwise Consistency Test failure */
     DRBG_SHA512_KAT_FIPS_E = -1017, /* SHA-512 DRBG KAT failure */
     SLH_DSA_KAT_FIPS_E  = -1018, /* SLH-DSA CAST KAT failure */
-
-    WC_SPAN2_LAST_E     = -1018, /* Update to indicate last used error code */
-    WC_LAST_E           = -1018, /* the last code used either here or in
+    SLH_DSA_PCT_E       = -1019, /* SLH-DSA Pairwise Consistency Test failure */
+    CMAC_KAT_FIPS_E     = -1020, /* AES-CMAC KAT failure (vendor-elected) */
+    SHAKE_KAT_FIPS_E    = -1021, /* SHAKE KAT failure (vendor-elected) */
+    DH_PCT_E            = -1022, /* DH (FFC) Pairwise Consistency Test
+                                  * failure (SP 800-56A r3 sec 5.6.2.1.4,
+                                  * FIPS 140-3 IG 10.3.B) */
+    AES_KW_KAT_FIPS_E   = -1023, /* AES-KW KAT failure (vendor-elected,
+                                  * SP 800-38F sec 6.2 / RFC 3394) */
+
+    WC_SPAN2_LAST_E     = -1023, /* Update to indicate last used error code */
+    WC_LAST_E           = -1023, /* the last code used either here or in
                                   * error-ssl.h */
 
     WC_SPAN2_MIN_CODE_E = -1999, /* Last usable code in span 2 */
diff --git a/wolfssl/wolfcrypt/fips_test.h b/wolfssl/wolfcrypt/fips_test.h
index de2b506df2c..41467b0ee2a 100644
--- a/wolfssl/wolfcrypt/fips_test.h
+++ b/wolfssl/wolfcrypt/fips_test.h
@@ -31,8 +31,23 @@
     extern "C" {
 #endif
 
-/* Added for FIPS v5.3 or later */
-#if defined(FIPS_VERSION_GE) && FIPS_VERSION_GE(5,3)
+/* Added for FIPS v5.3 or later.
+ *
+ * v7.0.0 and later upgrade the in-core integrity HMAC to SHA-512 (with a
+ * 512-bit key) for NSA 2.0 compliance.  Customers that must avoid SHA-256
+ * anywhere in the validated module can therefore use the v7 module without
+ * residual SHA-256 integrity material.  v5.3 and v6.x retain HMAC-SHA-256.
+ */
+#if defined(FIPS_VERSION_GE) && FIPS_VERSION_GE(7,0)
+    #ifdef WOLFSSL_SHA512
+        #define FIPS_IN_CORE_DIGEST_SIZE 64
+        #define FIPS_IN_CORE_HASH_TYPE   WC_SHA512
+        #define FIPS_IN_CORE_KEY_SZ      64
+        #define FIPS_IN_CORE_VERIFY_SZ   FIPS_IN_CORE_KEY_SZ
+    #else
+        #error FIPS v7+ integrity test requires WOLFSSL_SHA512
+    #endif
+#elif defined(FIPS_VERSION_GE) && FIPS_VERSION_GE(5,3)
     /* Determine FIPS in core hash type and size */
     #ifndef NO_SHA256
         #define FIPS_IN_CORE_DIGEST_SIZE 32
@@ -62,7 +77,11 @@ enum FipsCastId {
     FIPS_CAST_RSA_SIGN_PKCS1v15 =  7,
     FIPS_CAST_ECC_CDH           =  8,
     FIPS_CAST_ECC_PRIMITIVE_Z   =  9,
-    FIPS_CAST_DH_PRIMITIVE_Z    = 10,
+    FIPS_CAST_DH_PRIMITIVE_Z    = 10,  /* RETIRED (v7+): classic DH dropped
+                                        * from the FIPS 140-3 v7 PQ module
+                                        * boundary.  Preserved for ABI --
+                                        * do not reuse this id, no longer
+                                        * triggered. */
     FIPS_CAST_ECDSA             = 11,
     FIPS_CAST_KDF_TLS12         = 12,
     FIPS_CAST_KDF_TLS13         = 13,
@@ -80,7 +99,10 @@ enum FipsCastId {
     FIPS_CAST_XMSS              = 23,
     FIPS_CAST_DRBG_SHA512       = 24,
     FIPS_CAST_SLH_DSA           = 25,
-    FIPS_CAST_COUNT             = 26
+    FIPS_CAST_AES_CMAC          = 26,
+    FIPS_CAST_SHAKE             = 27,
+    FIPS_CAST_AES_KW            = 28,
+    FIPS_CAST_COUNT             = 29
 };
 
 enum FipsCastStateId {
diff --git a/wolfssl/wolfcrypt/random.h b/wolfssl/wolfcrypt/random.h
index 102f05d6b55..3747ea268f3 100644
--- a/wolfssl/wolfcrypt/random.h
+++ b/wolfssl/wolfcrypt/random.h
@@ -57,8 +57,12 @@
     #define DRBG_SEED_LEN (440/8)
 #endif
 
+/* Size of the DRBG seed (SHA-512) */
 #ifdef WOLFSSL_DRBG_SHA512
-    #define DRBG_SHA512_SEED_LEN (888/8)  /* 111 bytes per SP 800-90A Table 2 */
+    #ifndef DRBG_SHA512_SEED_LEN
+        #define DRBG_SHA512_SEED_LEN (888/8)  /* 111 bytes per SP 800-90A
+                                               * Table 2 */
+    #endif
 #endif
 
 
@@ -212,12 +216,20 @@ struct OS_Seed {
          */
         #define ENTROPY_SCALE_FACTOR  (512)
     #elif defined(HAVE_INTEL_RDSEED) || defined(HAVE_INTEL_RDRAND)
-        /* The value of 2 applies to Intel's RDSEED which provides about
-         * 0.5 bits minimum of entropy per bit. The value of 4 gives a
-         * conservative margin for FIPS. */
+        /* Intel RDSEED nominally provides about 0.5 bits min entropy per
+         * bit (NIST CMVP cert3389 PUD).  In FIPS mode we previously used
+         * ENTROPY_SCALE_FACTOR=8 (256-byte seed) on Intel and 512 (16384-
+         * byte seed) on AMD, asymmetric per-vendor.  As of v7 we adopt the
+         * AMD worst-case scale of 512 on Intel too: the AMD "Tyzen V1xxxx"
+         * PUD Table 3 documents 0.656040 bits per 128-bit block as the
+         * absolute floor across the entire CMVP-validated AMD family, and
+         * we use that same worst-case oversampling on Intel rather than
+         * trusting the higher Intel PUD claim, so a single seeding budget
+         * covers any x86 OE we deploy on.  Non-FIPS Intel builds keep the
+         * lighter scale=2 (Intel-PUD-derived) for performance. */
         #if defined(HAVE_FIPS) && defined(HAVE_FIPS_VERSION) && \
             (HAVE_FIPS_VERSION >= 2)
-            #define ENTROPY_SCALE_FACTOR (2*4)
+            #define ENTROPY_SCALE_FACTOR (512)
         #else
             /* Not FIPS, but Intel RDSEED, only double. */
             #define ENTROPY_SCALE_FACTOR (2)
diff --git a/wolfssl/wolfcrypt/settings.h b/wolfssl/wolfcrypt/settings.h
index 9f699145847..adf6dd75338 100644
--- a/wolfssl/wolfcrypt/settings.h
+++ b/wolfssl/wolfcrypt/settings.h
@@ -557,6 +557,17 @@
     #endif
     /* blinding adds API not available yet in FIPS mode */
     #undef WC_RSA_BLINDING
+
+    /* NIST SP 800-38A sec 6.2 specifies CBC operates on plaintext that is
+     * a multiple of the block size; the cipher does not implement padding
+     * (project_aes_no_padding_policy).  Force the wc_AesCbcEncrypt /
+     * wc_AesCbcDecrypt block-alignment check on for FIPS builds so a
+     * length not a multiple of WC_AES_BLOCK_SIZE returns BAD_LENGTH_E
+     * rather than silently truncating to the largest aligned prefix in
+     * the underlying implementation. */
+    #ifndef WOLFSSL_AES_CBC_LENGTH_CHECKS
+        #define WOLFSSL_AES_CBC_LENGTH_CHECKS
+    #endif
 #endif
 
 /* old FIPS has only AES_BLOCK_SIZE. */
@@ -3998,8 +4009,18 @@
         #undef HAVE_PUBLIC_FFDHE
     #endif
 
+    /* LinuxKM lkcapi previously needed a 4-byte minimum AES-GCM
+     * authentication tag for certain kernel-side test vectors.  Per
+     * NIST SP 800-38D sec 5.2.1.2 / sec 8.2 a minimum tag length of 96 bits
+     * (12 bytes) provides robust integrity for general-purpose use; FIPS
+     * 140-3 IG C.H reaffirms this 96-bit minimum for Approved-mode AES-GCM.
+     * Gate the 32-bit-tag relaxation on non-FIPS builds only so the
+     * v7.0.0 module's Approved configuration retains the full 96-bit
+     * minimum in all linuxkm and non-linuxkm scenarios. */
+#ifndef HAVE_FIPS
     #undef WOLFSSL_MIN_AUTH_TAG_SZ
     #define WOLFSSL_MIN_AUTH_TAG_SZ 4
+#endif
 
     #if defined(LINUXKM_LKCAPI_REGISTER) && !defined(WOLFSSL_ASN_INT_LEAD_0_ANY)
         /* kernel 5.10 crypto manager tests key(s) that fail unless leading