diff --git a/src/row/arch/neon/endian.rs b/src/row/arch/neon/endian.rs
new file mode 100644
index 00000000..6f800f7b
--- /dev/null
+++ b/src/row/arch/neon/endian.rs
@@ -0,0 +1,109 @@
+//! Endian-aware u16/u32 SIMD loaders for AArch64 NEON.
+#![allow(dead_code)] // tier kernels (Phase 2 rollout PRs) will consume these
+//!
+//! Each helper takes a raw byte pointer to LE-encoded (or BE-encoded) data
+//! and returns a NEON vector containing the elements in **host-native** byte
+//! order, ready for native u16/u32 SIMD math.
+//!
+//! The host-native conversion is monomorphized at compile time via
+//! `cfg(target_endian = ...)`:
+//! - `load_le_*` is a no-op on LE targets, byte-swap on BE targets
+//! - `load_be_*` is byte-swap on LE targets, no-op on BE targets
+//!
+//! Tier kernels call the generic dispatchers `load_endian_u16x8::<BE>` and
+//! `load_endian_u32x4::<BE>` from their own `const BE: bool` contexts.
+//! The `if BE { ... } else { ... }` in the dispatcher is eliminated by the
+//! compiler — each monomorphization sees only one branch.
+
+use core::arch::aarch64::*;
+
+// ---- u16x8 loaders ---------------------------------------------------------
+
+/// Loads 8 × u16 from `ptr` (LE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 16 readable bytes, aligned to at least 1 byte.
+/// Caller must have NEON enabled (via `#[target_feature(enable = "neon")]`).
+#[inline(always)]
+pub(crate) unsafe fn load_le_u16x8(ptr: *const u8) -> uint16x8_t {
+    let v = unsafe { vld1q_u16(ptr.cast()) };
+    #[cfg(target_endian = "big")]
+    let v = unsafe { vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(v))) };
+    v
+}
+
+/// Loads 8 × u16 from `ptr` (BE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 16 readable bytes, aligned to at least 1 byte.
+/// Caller must have NEON enabled (via `#[target_feature(enable = "neon")]`).
+#[inline(always)]
+pub(crate) unsafe fn load_be_u16x8(ptr: *const u8) -> uint16x8_t {
+    let v = unsafe { vld1q_u16(ptr.cast()) };
+    #[cfg(target_endian = "little")]
+    let v = unsafe { vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(v))) };
+    v
+}
+
+/// Generic dispatcher: routes to `load_le_u16x8` or `load_be_u16x8` based on
+/// the compile-time `BE` const parameter. The unused branch is eliminated by
+/// the compiler when the caller is monomorphized.
+///
+/// # Safety
+///
+/// Same as `load_le_u16x8` / `load_be_u16x8`.
+#[inline(always)]
+pub(crate) unsafe fn load_endian_u16x8<const BE: bool>(ptr: *const u8) -> uint16x8_t {
+    if BE {
+        unsafe { load_be_u16x8(ptr) }
+    } else {
+        unsafe { load_le_u16x8(ptr) }
+    }
+}
+
+// ---- u32x4 loaders ---------------------------------------------------------
+
+/// Loads 4 × u32 from `ptr` (LE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 16 readable bytes, aligned to at least 1 byte.
+/// Caller must have NEON enabled.
+#[inline(always)]
+pub(crate) unsafe fn load_le_u32x4(ptr: *const u8) -> uint32x4_t {
+    let v = unsafe { vld1q_u32(ptr.cast()) };
+    #[cfg(target_endian = "big")]
+    let v = unsafe { vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(v))) };
+    v
+}
+
+/// Loads 4 × u32 from `ptr` (BE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 16 readable bytes, aligned to at least 1 byte.
+/// Caller must have NEON enabled.
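+///
+/// A minimal illustrative sketch of a call site (the wrapping function is an
+/// assumption for illustration, not part of this crate):
+///
+/// ```ignore
+/// #[target_feature(enable = "neon")]
+/// unsafe fn first_be_u32(src: &[u8; 16]) -> u32 {
+///     let v = unsafe { load_be_u32x4(src.as_ptr()) };
+///     // e.g. bytes [0x01, 0x02, 0x03, 0x04, ..] -> lane 0 == 0x01020304
+///     unsafe { vgetq_lane_u32::<0>(v) }
+/// }
+/// ```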
+#[inline(always)]
+pub(crate) unsafe fn load_be_u32x4(ptr: *const u8) -> uint32x4_t {
+    let v = unsafe { vld1q_u32(ptr.cast()) };
+    #[cfg(target_endian = "little")]
+    let v = unsafe { vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(v))) };
+    v
+}
+
+/// Generic dispatcher: routes to `load_le_u32x4` or `load_be_u32x4` based on
+/// the compile-time `BE` const parameter.
+///
+/// # Safety
+///
+/// Same as `load_le_u32x4` / `load_be_u32x4`.
+#[inline(always)]
+pub(crate) unsafe fn load_endian_u32x4<const BE: bool>(ptr: *const u8) -> uint32x4_t {
+    if BE {
+        unsafe { load_be_u32x4(ptr) }
+    } else {
+        unsafe { load_le_u32x4(ptr) }
+    }
+}
diff --git a/src/row/arch/neon/mod.rs b/src/row/arch/neon/mod.rs
index c46d02c1..86b8d00a 100644
--- a/src/row/arch/neon/mod.rs
+++ b/src/row/arch/neon/mod.rs
@@ -39,6 +39,7 @@ pub(super) use crate::{ColorMatrix, row::scalar};
 
 pub(crate) mod alpha_extract;
 mod ayuv64;
+pub(crate) mod endian;
 mod gray;
 mod hsv;
 pub(crate) mod legacy_rgb;
diff --git a/src/row/arch/neon/tests/endian.rs b/src/row/arch/neon/tests/endian.rs
new file mode 100644
index 00000000..79c7cb8c
--- /dev/null
+++ b/src/row/arch/neon/tests/endian.rs
@@ -0,0 +1,191 @@
+use crate::row::arch::neon::endian::*;
+
+// Helper: extract uint16x8_t to a stack array.
+unsafe fn u16x8_to_arr(v: core::arch::aarch64::uint16x8_t) -> [u16; 8] {
+    let mut out = [0u16; 8];
+    unsafe { core::arch::aarch64::vst1q_u16(out.as_mut_ptr(), v) };
+    out
+}
+
+// Helper: extract uint32x4_t to a stack array.
+unsafe fn u32x4_to_arr(v: core::arch::aarch64::uint32x4_t) -> [u32; 4] {
+    let mut out = [0u32; 4];
+    unsafe { core::arch::aarch64::vst1q_u32(out.as_mut_ptr(), v) };
+    out
+}
+
+// ---- LE loader on LE host (no-op) ------------------------------------------
+
+/// On a LE host, `load_le_u16x8` must NOT swap bytes — the in-memory LE
+/// layout already matches host-native order.
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn neon_load_le_u16x8_noop_on_le_host() {
+    // 0x0102 stored LE = bytes [0x02, 0x01]; host reads as 0x0102.
+    let input: [u8; 16] = [
+        0x02, 0x01, // u16[0] = 0x0102
+        0x04, 0x03, // u16[1] = 0x0304
+        0x06, 0x05, // u16[2] = 0x0506
+        0x08, 0x07, // u16[3] = 0x0708
+        0x0a, 0x09, // u16[4] = 0x090a
+        0x0c, 0x0b, // u16[5] = 0x0b0c
+        0x0e, 0x0d, // u16[6] = 0x0d0e
+        0x10, 0x0f, // u16[7] = 0x0f10
+    ];
+    let v = unsafe { load_le_u16x8(input.as_ptr()) };
+    let got = unsafe { u16x8_to_arr(v) };
+    assert_eq!(
+        got,
+        [
+            0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10
+        ],
+        "load_le_u16x8 must not swap on LE host"
+    );
+}
+
+/// On a BE host, `load_le_u16x8` MUST swap bytes so the host-native value
+/// matches the original LE value.
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "big")]
+fn neon_load_le_u16x8_swaps_on_be_host() {
+    // Same byte layout as above; on a BE host the raw vld1q_u16 would give
+    // the byte-swapped values, so load_le_u16x8 must reverse that.
+    let input: [u8; 16] = [
+        0x02, 0x01, 0x04, 0x03, 0x06, 0x05, 0x08, 0x07, 0x0a, 0x09, 0x0c, 0x0b, 0x0e, 0x0d, 0x10, 0x0f,
+    ];
+    let v = unsafe { load_le_u16x8(input.as_ptr()) };
+    let got = unsafe { u16x8_to_arr(v) };
+    assert_eq!(
+        got,
+        [
+            0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10
+        ],
+        "load_le_u16x8 must swap bytes on BE host to restore LE value"
+    );
+}
+
+// ---- BE loader on LE host (swap) -------------------------------------------
+
+/// On a LE host, `load_be_u16x8` MUST swap bytes — BE-encoded data has the
+/// most-significant byte first, which needs swapping for LE-native math.
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn neon_load_be_u16x8_swaps_on_le_host() {
+    // 0x0102 stored BE = bytes [0x01, 0x02].
+    let input: [u8; 16] = [
+        0x01, 0x02, // u16[0] = 0x0102 in BE encoding
+        0x03, 0x04, // u16[1] = 0x0304
+        0x05, 0x06, // u16[2] = 0x0506
+        0x07, 0x08, // u16[3] = 0x0708
+        0x09, 0x0a, // u16[4] = 0x090a
+        0x0b, 0x0c, // u16[5] = 0x0b0c
+        0x0d, 0x0e, // u16[6] = 0x0d0e
+        0x0f, 0x10, // u16[7] = 0x0f10
+    ];
+    let v = unsafe { load_be_u16x8(input.as_ptr()) };
+    let got = unsafe { u16x8_to_arr(v) };
+    assert_eq!(
+        got,
+        [
+            0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10
+        ],
+        "load_be_u16x8 must swap on LE host to convert BE→LE-native"
+    );
+}
+
+/// On a BE host, `load_be_u16x8` must NOT swap — BE-encoded data already
+/// matches host-native order.
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "big")]
+fn neon_load_be_u16x8_noop_on_be_host() {
+    let input: [u8; 16] = [
+        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
+    ];
+    let v = unsafe { load_be_u16x8(input.as_ptr()) };
+    let got = unsafe { u16x8_to_arr(v) };
+    assert_eq!(
+        got,
+        [
+            0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10
+        ],
+        "load_be_u16x8 must not swap on BE host"
+    );
+}
+
+// ---- u32x4 LE loader on LE host (no-op) ------------------------------------
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn neon_load_le_u32x4_noop_on_le_host() {
+    let input: [u8; 16] = [
+        0x04, 0x03, 0x02, 0x01, // u32[0] = 0x01020304 LE
+        0x08, 0x07, 0x06, 0x05, // u32[1] = 0x05060708
+        0x0c, 0x0b, 0x0a, 0x09, // u32[2] = 0x090a0b0c
+        0x10, 0x0f, 0x0e, 0x0d, // u32[3] = 0x0d0e0f10
+    ];
+    let v = unsafe { load_le_u32x4(input.as_ptr()) };
+    let got = unsafe { u32x4_to_arr(v) };
+    assert_eq!(
+        got,
+        [0x01020304, 0x05060708, 0x090a0b0c, 0x0d0e0f10],
+        "load_le_u32x4 must not swap on LE host"
+    );
+}
+
+// ---- u32x4 BE loader on LE host (swap) -------------------------------------
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn neon_load_be_u32x4_swaps_on_le_host() {
+    let input: [u8; 16] = [
+        0x01, 0x02, 0x03, 0x04, // u32[0] = 0x01020304 BE
+        0x05, 0x06, 0x07, 0x08, // u32[1] = 0x05060708
+        0x09, 0x0a, 0x0b, 0x0c, // u32[2] = 0x090a0b0c
+        0x0d, 0x0e, 0x0f, 0x10, // u32[3] = 0x0d0e0f10
+    ];
+    let v = unsafe { load_be_u32x4(input.as_ptr()) };
+    let got = unsafe { u32x4_to_arr(v) };
+    assert_eq!(
+        got,
+        [0x01020304, 0x05060708, 0x090a0b0c, 0x0d0e0f10],
+        "load_be_u32x4 must swap on LE host"
+    );
+}
+
+// ---- Generic dispatcher consistency ----------------------------------------
+
+/// Verify `load_endian_u16x8::<false>` routes to the LE loader.
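+/// With `BE = false` the big-endian branch in the dispatcher is statically
+/// dead, so this should compile down to the same bare `vld1q_u16` as the
+/// direct call.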
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn neon_load_endian_u16x8_le_dispatcher() {
+    let input: [u8; 16] = [
+        0x02, 0x01, 0x04, 0x03, 0x06, 0x05, 0x08, 0x07, 0x0a, 0x09, 0x0c, 0x0b, 0x0e, 0x0d, 0x10, 0x0f,
+    ];
+    let direct = unsafe { load_le_u16x8(input.as_ptr()) };
+    let via_dispatch = unsafe { load_endian_u16x8::<false>(input.as_ptr()) };
+    let d = unsafe { u16x8_to_arr(direct) };
+    let g = unsafe { u16x8_to_arr(via_dispatch) };
+    assert_eq!(d, g, "load_endian_u16x8::<false> must match load_le_u16x8");
+}
+
+/// Verify `load_endian_u16x8::<true>` routes to the BE loader.
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn neon_load_endian_u16x8_be_dispatcher() {
+    let input: [u8; 16] = [
+        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
+    ];
+    let direct = unsafe { load_be_u16x8(input.as_ptr()) };
+    let via_dispatch = unsafe { load_endian_u16x8::<true>(input.as_ptr()) };
+    let d = unsafe { u16x8_to_arr(direct) };
+    let g = unsafe { u16x8_to_arr(via_dispatch) };
+    assert_eq!(d, g, "load_endian_u16x8::<true> must match load_be_u16x8");
+}
diff --git a/src/row/arch/neon/tests/mod.rs b/src/row/arch/neon/tests/mod.rs
index 225fa2cf..aa1e28ac 100644
--- a/src/row/arch/neon/tests/mod.rs
+++ b/src/row/arch/neon/tests/mod.rs
@@ -1,6 +1,7 @@
 use super::*;
 
 mod ayuv64;
+mod endian;
 mod high_bit_4_2_0;
 mod high_bit_4_4_4_and_pn;
 mod legacy_rgb;
diff --git a/src/row/arch/wasm_simd128/endian.rs b/src/row/arch/wasm_simd128/endian.rs
new file mode 100644
index 00000000..e2c812c6
--- /dev/null
+++ b/src/row/arch/wasm_simd128/endian.rs
@@ -0,0 +1,123 @@
+//! Endian-aware u16/u32 SIMD loaders for WebAssembly simd128.
+#![allow(dead_code)] // tier kernels (Phase 2 rollout PRs) will consume these
+//!
+//! Each helper takes a raw byte pointer to LE-encoded (or BE-encoded) data
+//! and returns a `v128` vector containing the elements in **host-native** byte
+//! order, ready for native u16/u32 SIMD math.
+//!
+//! The host-native conversion is monomorphized at compile time via
+//! `cfg(target_endian = ...)`:
+//! - `load_le_*` is a no-op on LE targets (wasm32 is LE), byte-swap on BE
+//! - `load_be_*` is byte-swap on LE targets, no-op on BE targets
+//!
+//! Byte-swap is implemented with `u8x16_swizzle`, which behaves like SSSE3
+//! `_mm_shuffle_epi8` for in-range indices; out-of-range indices (≥ 16) zero
+//! the output lane. The shuffle indices are expressed as `i8x16` constants
+//! (negative values zero-out lanes as well, but all our indices are 0..15,
+//! so they are in range as written).
+
+use core::arch::wasm32::*;
+
+// ---- u16x8 loaders ---------------------------------------------------------
+
+/// Loads 8 × u16 from `ptr` (LE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 16 readable bytes. Caller must have simd128
+/// enabled via `#[target_feature(enable = "simd128")]`.
+#[inline(always)]
+pub(crate) unsafe fn load_le_u16x8(ptr: *const u8) -> v128 {
+    let v = unsafe { v128_load(ptr.cast()) };
+    #[cfg(target_endian = "big")]
+    let v = {
+        // swap bytes within each u16 lane: [1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14]
+        let mask = i8x16(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+        u8x16_swizzle(v, mask)
+    };
+    v
+}
+
+/// Loads 8 × u16 from `ptr` (BE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 16 readable bytes. Caller must have simd128
+/// enabled.
+#[inline(always)]
+pub(crate) unsafe fn load_be_u16x8(ptr: *const u8) -> v128 {
+    let v = unsafe { v128_load(ptr.cast()) };
+    #[cfg(target_endian = "little")]
+    let v = {
+        let mask = i8x16(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+        u8x16_swizzle(v, mask)
+    };
+    v
+}
+
+/// Generic dispatcher: routes to `load_le_u16x8` or `load_be_u16x8` based on
+/// the compile-time `BE` const parameter.
+///
+/// # Safety
+///
+/// Same as `load_le_u16x8` / `load_be_u16x8`.
+#[inline(always)]
+pub(crate) unsafe fn load_endian_u16x8<const BE: bool>(ptr: *const u8) -> v128 {
+    if BE {
+        unsafe { load_be_u16x8(ptr) }
+    } else {
+        unsafe { load_le_u16x8(ptr) }
+    }
+}
+
+// ---- u32x4 loaders ---------------------------------------------------------
+
+/// Loads 4 × u32 from `ptr` (LE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 16 readable bytes. Caller must have simd128
+/// enabled.
+#[inline(always)]
+pub(crate) unsafe fn load_le_u32x4(ptr: *const u8) -> v128 {
+    let v = unsafe { v128_load(ptr.cast()) };
+    #[cfg(target_endian = "big")]
+    let v = {
+        // swap bytes within each u32 lane: [3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12]
+        let mask = i8x16(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
+        u8x16_swizzle(v, mask)
+    };
+    v
+}
+
+/// Loads 4 × u32 from `ptr` (BE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 16 readable bytes. Caller must have simd128
+/// enabled.
+#[inline(always)]
+pub(crate) unsafe fn load_be_u32x4(ptr: *const u8) -> v128 {
+    let v = unsafe { v128_load(ptr.cast()) };
+    #[cfg(target_endian = "little")]
+    let v = {
+        let mask = i8x16(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
+        u8x16_swizzle(v, mask)
+    };
+    v
+}
+
+/// Generic dispatcher: routes to `load_le_u32x4` or `load_be_u32x4` based on
+/// the compile-time `BE` const parameter.
+///
+/// # Safety
+///
+/// Same as `load_le_u32x4` / `load_be_u32x4`.
+#[inline(always)]
+pub(crate) unsafe fn load_endian_u32x4<const BE: bool>(ptr: *const u8) -> v128 {
+    if BE {
+        unsafe { load_be_u32x4(ptr) }
+    } else {
+        unsafe { load_le_u32x4(ptr) }
+    }
+}
diff --git a/src/row/arch/wasm_simd128/mod.rs b/src/row/arch/wasm_simd128/mod.rs
index 8d4ed835..eed6e69d 100644
--- a/src/row/arch/wasm_simd128/mod.rs
+++ b/src/row/arch/wasm_simd128/mod.rs
@@ -42,6 +42,7 @@ pub(super) use crate::{ColorMatrix, row::scalar};
 
 pub(crate) mod alpha_extract;
 mod ayuv64;
+pub(crate) mod endian;
 mod gray;
 mod hsv;
 pub(crate) mod legacy_rgb;
diff --git a/src/row/arch/wasm_simd128/tests/endian.rs b/src/row/arch/wasm_simd128/tests/endian.rs
new file mode 100644
index 00000000..7647eabd
--- /dev/null
+++ b/src/row/arch/wasm_simd128/tests/endian.rs
@@ -0,0 +1,173 @@
+use crate::row::arch::wasm_simd128::endian::*;
+
+// Helper: extract v128 to a stack array of 8 u16 lanes.
+unsafe fn v128_to_u16x8(v: core::arch::wasm32::v128) -> [u16; 8] {
+    let mut out = [0u16; 8];
+    unsafe { core::arch::wasm32::v128_store(out.as_mut_ptr().cast(), v) };
+    out
+}
+
+// Helper: extract v128 to a stack array of 4 u32 lanes.
+unsafe fn v128_to_u32x4(v: core::arch::wasm32::v128) -> [u32; 4] {
+    let mut out = [0u32; 4];
+    unsafe { core::arch::wasm32::v128_store(out.as_mut_ptr().cast(), v) };
+    out
+}
+
+// ---- LE loader on LE host (no-op) ------------------------------------------
+
+/// On a LE host (wasm32 is always LE), `load_le_u16x8` must NOT swap bytes.
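+/// (The `#[cfg(target_endian = "big")]` twins below are never built on
+/// wasm32, which is always LE; they mirror the BE-host tests in the other
+/// backends.)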
+#[test]
+#[cfg(target_endian = "little")]
+fn wasm_load_le_u16x8_noop_on_le_host() {
+    let input: [u8; 16] = [
+        0x02, 0x01, // u16[0] = 0x0102
+        0x04, 0x03, // u16[1] = 0x0304
+        0x06, 0x05, // u16[2] = 0x0506
+        0x08, 0x07, // u16[3] = 0x0708
+        0x0a, 0x09, // u16[4] = 0x090a
+        0x0c, 0x0b, // u16[5] = 0x0b0c
+        0x0e, 0x0d, // u16[6] = 0x0d0e
+        0x10, 0x0f, // u16[7] = 0x0f10
+    ];
+    let v = unsafe { load_le_u16x8(input.as_ptr()) };
+    let got = unsafe { v128_to_u16x8(v) };
+    assert_eq!(
+        got,
+        [
+            0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10
+        ],
+        "wasm load_le_u16x8 must not swap on LE host"
+    );
+}
+
+/// On a BE host, `load_le_u16x8` MUST swap bytes.
+#[test]
+#[cfg(target_endian = "big")]
+fn wasm_load_le_u16x8_swaps_on_be_host() {
+    let input: [u8; 16] = [
+        0x02, 0x01, 0x04, 0x03, 0x06, 0x05, 0x08, 0x07, 0x0a, 0x09, 0x0c, 0x0b, 0x0e, 0x0d, 0x10, 0x0f,
+    ];
+    let v = unsafe { load_le_u16x8(input.as_ptr()) };
+    let got = unsafe { v128_to_u16x8(v) };
+    assert_eq!(
+        got,
+        [
+            0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10
+        ],
+        "wasm load_le_u16x8 must swap on BE host"
+    );
+}
+
+// ---- BE loader on LE host (swap) -------------------------------------------
+
+/// On a LE host, `load_be_u16x8` MUST swap bytes.
+#[test]
+#[cfg(target_endian = "little")]
+fn wasm_load_be_u16x8_swaps_on_le_host() {
+    let input: [u8; 16] = [
+        0x01, 0x02, // u16[0] = 0x0102 BE
+        0x03, 0x04, // u16[1] = 0x0304
+        0x05, 0x06, // u16[2] = 0x0506
+        0x07, 0x08, // u16[3] = 0x0708
+        0x09, 0x0a, // u16[4] = 0x090a
+        0x0b, 0x0c, // u16[5] = 0x0b0c
+        0x0d, 0x0e, // u16[6] = 0x0d0e
+        0x0f, 0x10, // u16[7] = 0x0f10
+    ];
+    let v = unsafe { load_be_u16x8(input.as_ptr()) };
+    let got = unsafe { v128_to_u16x8(v) };
+    assert_eq!(
+        got,
+        [
+            0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10
+        ],
+        "wasm load_be_u16x8 must swap on LE host"
+    );
+}
+
+/// On a BE host, `load_be_u16x8` must NOT swap.
+#[test]
+#[cfg(target_endian = "big")]
+fn wasm_load_be_u16x8_noop_on_be_host() {
+    let input: [u8; 16] = [
+        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
+    ];
+    let v = unsafe { load_be_u16x8(input.as_ptr()) };
+    let got = unsafe { v128_to_u16x8(v) };
+    assert_eq!(
+        got,
+        [
+            0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10
+        ],
+        "wasm load_be_u16x8 must not swap on BE host"
+    );
+}
+
+// ---- u32x4 LE loader on LE host (no-op) ------------------------------------
+
+#[test]
+#[cfg(target_endian = "little")]
+fn wasm_load_le_u32x4_noop_on_le_host() {
+    let input: [u8; 16] = [
+        0x04, 0x03, 0x02, 0x01, // u32[0] = 0x01020304 LE
+        0x08, 0x07, 0x06, 0x05, // u32[1] = 0x05060708
+        0x0c, 0x0b, 0x0a, 0x09, // u32[2] = 0x090a0b0c
+        0x10, 0x0f, 0x0e, 0x0d, // u32[3] = 0x0d0e0f10
+    ];
+    let v = unsafe { load_le_u32x4(input.as_ptr()) };
+    let got = unsafe { v128_to_u32x4(v) };
+    assert_eq!(
+        got,
+        [0x01020304, 0x05060708, 0x090a0b0c, 0x0d0e0f10],
+        "wasm load_le_u32x4 must not swap on LE host"
+    );
+}
+
+// ---- u32x4 BE loader on LE host (swap) -------------------------------------
+
+#[test]
+#[cfg(target_endian = "little")]
+fn wasm_load_be_u32x4_swaps_on_le_host() {
+    let input: [u8; 16] = [
+        0x01, 0x02, 0x03, 0x04, // u32[0] = 0x01020304 BE
+        0x05, 0x06, 0x07, 0x08, // u32[1] = 0x05060708
+        0x09, 0x0a, 0x0b, 0x0c, // u32[2] = 0x090a0b0c
+        0x0d, 0x0e, 0x0f, 0x10, // u32[3] = 0x0d0e0f10
+    ];
+    let v = unsafe { load_be_u32x4(input.as_ptr()) };
+    let got = unsafe { v128_to_u32x4(v) };
+    assert_eq!(
+        got,
+        [0x01020304, 0x05060708, 0x090a0b0c, 0x0d0e0f10],
+        "wasm load_be_u32x4 must swap on LE host"
+    );
+}
+
+// ---- Generic dispatcher consistency ----------------------------------------
+
+#[test]
+#[cfg(target_endian = "little")]
+fn wasm_load_endian_u16x8_le_dispatcher() {
+    let input: [u8; 16] = [
+        0x02, 0x01, 0x04, 0x03, 0x06, 0x05, 0x08, 0x07, 0x0a, 0x09, 0x0c, 0x0b, 0x0e, 0x0d, 0x10, 0x0f,
+    ];
+    let direct = unsafe { load_le_u16x8(input.as_ptr()) };
+    let via_dispatch = unsafe { load_endian_u16x8::<false>(input.as_ptr()) };
+    let d = unsafe { v128_to_u16x8(direct) };
+    let g = unsafe { v128_to_u16x8(via_dispatch) };
+    assert_eq!(d, g, "load_endian_u16x8::<false> must match load_le_u16x8");
+}
+
+#[test]
+#[cfg(target_endian = "little")]
+fn wasm_load_endian_u16x8_be_dispatcher() {
+    let input: [u8; 16] = [
+        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
+    ];
+    let direct = unsafe { load_be_u16x8(input.as_ptr()) };
+    let via_dispatch = unsafe { load_endian_u16x8::<true>(input.as_ptr()) };
+    let d = unsafe { v128_to_u16x8(direct) };
+    let g = unsafe { v128_to_u16x8(via_dispatch) };
+    assert_eq!(d, g, "load_endian_u16x8::<true> must match load_be_u16x8");
+}
diff --git a/src/row/arch/wasm_simd128/tests/mod.rs b/src/row/arch/wasm_simd128/tests/mod.rs
index 660bc752..0ed1c869 100644
--- a/src/row/arch/wasm_simd128/tests/mod.rs
+++ b/src/row/arch/wasm_simd128/tests/mod.rs
@@ -5,6 +5,7 @@ use super::*;
 use crate::row::scalar::planar_gbr_f16 as scalar_f16;
 
 mod ayuv64;
+mod endian;
 mod high_bit_4_2_0;
 mod high_bit_4_4_4_and_pn;
 mod legacy_rgb;
diff --git a/src/row/arch/x86_avx2/endian.rs b/src/row/arch/x86_avx2/endian.rs
new file mode 100644
index 00000000..d2dd6995
--- /dev/null
+++ b/src/row/arch/x86_avx2/endian.rs
@@ -0,0 +1,133 @@
+//! Endian-aware u16/u32 SIMD loaders for x86_64 AVX2.
+#![allow(dead_code)] // tier kernels (Phase 2 rollout PRs) will consume these
+//!
+//! Each helper takes a raw byte pointer to LE-encoded (or BE-encoded) data
+//! and returns a `__m256i` vector containing the elements in **host-native**
+//! byte order, ready for native u16/u32 SIMD math.
+//!
+//! The host-native conversion is monomorphized at compile time via
+//! `cfg(target_endian = ...)`:
+//! - `load_le_*` is a no-op on LE targets (all real x86), byte-swap on BE
+//! - `load_be_*` is byte-swap on LE targets, no-op on BE targets
+//!
+//! Byte-swap is implemented with `_mm256_shuffle_epi8` (AVX2) using
+//! compile-time shuffle masks. The masks replicate the 128-bit SSE pattern
+//! across both 128-bit lanes of the 256-bit register — AVX2's `vpshufb`
+//! operates per-lane, so the same within-lane byte permutation is applied to
+//! both lanes independently.
+
+use core::arch::x86_64::*;
+
+// ---- Byte-swap shuffle masks -----------------------------------------------
+
+/// AVX2 `_mm256_shuffle_epi8` mask that swaps bytes within every 2-byte (u16)
+/// lane across both 128-bit halves.
+const BYTESWAP_MASK_U16: __m256i = unsafe {
+    core::mem::transmute([
+        // low 128-bit lane
+        1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
+        // high 128-bit lane (identical pattern)
+        1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
+    ])
+};
+
+/// AVX2 `_mm256_shuffle_epi8` mask that swaps bytes within every 4-byte (u32)
+/// lane across both 128-bit halves.
+const BYTESWAP_MASK_U32: __m256i = unsafe {
+    core::mem::transmute([
+        // low 128-bit lane
+        3u8, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
+        // high 128-bit lane (identical pattern)
+        3u8, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
+    ])
+};
+
+// ---- u16x16 loaders --------------------------------------------------------
+
+/// Loads 16 × u16 from `ptr` (LE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 32 readable bytes. Caller must have AVX2
+/// enabled via `#[target_feature(enable = "avx2")]`.
+#[inline(always)]
+pub(crate) unsafe fn load_le_u16x16(ptr: *const u8) -> __m256i {
+    let v = unsafe { _mm256_loadu_si256(ptr.cast()) };
+    #[cfg(target_endian = "big")]
+    let v = unsafe { _mm256_shuffle_epi8(v, BYTESWAP_MASK_U16) };
+    v
+}
+
+/// Loads 16 × u16 from `ptr` (BE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 32 readable bytes. Caller must have AVX2
+/// enabled.
+#[inline(always)]
+pub(crate) unsafe fn load_be_u16x16(ptr: *const u8) -> __m256i {
+    let v = unsafe { _mm256_loadu_si256(ptr.cast()) };
+    #[cfg(target_endian = "little")]
+    let v = unsafe { _mm256_shuffle_epi8(v, BYTESWAP_MASK_U16) };
+    v
+}
+
+/// Generic dispatcher: routes to `load_le_u16x16` or `load_be_u16x16` based
+/// on the compile-time `BE` const parameter.
+///
+/// # Safety
+///
+/// Same as `load_le_u16x16` / `load_be_u16x16`.
+#[inline(always)]
+pub(crate) unsafe fn load_endian_u16x16<const BE: bool>(ptr: *const u8) -> __m256i {
+    if BE {
+        unsafe { load_be_u16x16(ptr) }
+    } else {
+        unsafe { load_le_u16x16(ptr) }
+    }
+}
+
+// ---- u32x8 loaders ---------------------------------------------------------
+
+/// Loads 8 × u32 from `ptr` (LE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 32 readable bytes. Caller must have AVX2
+/// enabled.
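+///
+/// A minimal illustrative sketch of a call site (the wrapping function is an
+/// assumption for illustration, not part of this crate):
+///
+/// ```ignore
+/// #[target_feature(enable = "avx2")]
+/// unsafe fn double_le_u32x8(src: &[u8; 32]) -> __m256i {
+///     // Lanes are host-native regardless of host endianness, so plain
+///     // u32 lane math is valid right after the load.
+///     let v = unsafe { load_le_u32x8(src.as_ptr()) };
+///     unsafe { _mm256_add_epi32(v, v) }
+/// }
+/// ```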
+#[inline(always)]
+pub(crate) unsafe fn load_le_u32x8(ptr: *const u8) -> __m256i {
+    let v = unsafe { _mm256_loadu_si256(ptr.cast()) };
+    #[cfg(target_endian = "big")]
+    let v = unsafe { _mm256_shuffle_epi8(v, BYTESWAP_MASK_U32) };
+    v
+}
+
+/// Loads 8 × u32 from `ptr` (BE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 32 readable bytes. Caller must have AVX2
+/// enabled.
+#[inline(always)]
+pub(crate) unsafe fn load_be_u32x8(ptr: *const u8) -> __m256i {
+    let v = unsafe { _mm256_loadu_si256(ptr.cast()) };
+    #[cfg(target_endian = "little")]
+    let v = unsafe { _mm256_shuffle_epi8(v, BYTESWAP_MASK_U32) };
+    v
+}
+
+/// Generic dispatcher: routes to `load_le_u32x8` or `load_be_u32x8` based on
+/// the compile-time `BE` const parameter.
+///
+/// # Safety
+///
+/// Same as `load_le_u32x8` / `load_be_u32x8`.
+#[inline(always)]
+pub(crate) unsafe fn load_endian_u32x8<const BE: bool>(ptr: *const u8) -> __m256i {
+    if BE {
+        unsafe { load_be_u32x8(ptr) }
+    } else {
+        unsafe { load_le_u32x8(ptr) }
+    }
+}
diff --git a/src/row/arch/x86_avx2/mod.rs b/src/row/arch/x86_avx2/mod.rs
index bf7be11b..abe1c871 100644
--- a/src/row/arch/x86_avx2/mod.rs
+++ b/src/row/arch/x86_avx2/mod.rs
@@ -59,6 +59,7 @@ pub(super) use crate::{
 
 mod alpha_extract;
 mod ayuv64;
+pub(crate) mod endian;
 mod gray;
 mod hsv;
 pub(crate) mod legacy_rgb;
diff --git a/src/row/arch/x86_avx2/tests/endian.rs b/src/row/arch/x86_avx2/tests/endian.rs
new file mode 100644
index 00000000..ff394233
--- /dev/null
+++ b/src/row/arch/x86_avx2/tests/endian.rs
@@ -0,0 +1,204 @@
+use crate::row::arch::x86_avx2::endian::*;
+
+// Helper: extract __m256i to a stack array of 16 u16 lanes.
+#[cfg(target_arch = "x86_64")]
+unsafe fn m256i_to_u16x16(v: core::arch::x86_64::__m256i) -> [u16; 16] {
+    let mut out = [0u16; 16];
+    unsafe { core::arch::x86_64::_mm256_storeu_si256(out.as_mut_ptr().cast(), v) };
+    out
+}
+
+// Helper: extract __m256i to a stack array of 8 u32 lanes.
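+// (Same pattern as the u16 helper above; `_mm256_storeu_si256` is an
+// unaligned store, so a plain stack array is fine.)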
+#[cfg(target_arch = "x86_64")]
+unsafe fn m256i_to_u32x8(v: core::arch::x86_64::__m256i) -> [u32; 8] {
+    let mut out = [0u32; 8];
+    unsafe { core::arch::x86_64::_mm256_storeu_si256(out.as_mut_ptr().cast(), v) };
+    out
+}
+
+// ---- LE loader on LE host (no-op) ------------------------------------------
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 AVX2 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn avx2_load_le_u16x16_noop_on_le_host() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
+    let input: [u8; 32] = [
+        0x02, 0x01, 0x04, 0x03, 0x06, 0x05, 0x08, 0x07, 0x0a, 0x09, 0x0c, 0x0b, 0x0e, 0x0d, 0x10, 0x0f,
+        0x12, 0x11, 0x14, 0x13, 0x16, 0x15, 0x18, 0x17, 0x1a, 0x19, 0x1c, 0x1b, 0x1e, 0x1d, 0x20, 0x1f,
+    ];
+    let v = unsafe { load_le_u16x16(input.as_ptr()) };
+    let got = unsafe { m256i_to_u16x16(v) };
+    assert_eq!(
+        got,
+        [
+            0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10, 0x1112, 0x1314, 0x1516,
+            0x1718, 0x191a, 0x1b1c, 0x1d1e, 0x1f20,
+        ],
+        "AVX2 load_le_u16x16 must not swap on LE host"
+    );
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 AVX2 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "big")]
+fn avx2_load_le_u16x16_swaps_on_be_host() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
+    let input: [u8; 32] = [
+        0x02, 0x01, 0x04, 0x03, 0x06, 0x05, 0x08, 0x07, 0x0a, 0x09, 0x0c, 0x0b, 0x0e, 0x0d, 0x10, 0x0f,
+        0x12, 0x11, 0x14, 0x13, 0x16, 0x15, 0x18, 0x17, 0x1a, 0x19, 0x1c, 0x1b, 0x1e, 0x1d, 0x20, 0x1f,
+    ];
+    let v = unsafe { load_le_u16x16(input.as_ptr()) };
+    let got = unsafe { m256i_to_u16x16(v) };
+    assert_eq!(
+        got,
+        [
+            0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10, 0x1112, 0x1314, 0x1516,
+            0x1718, 0x191a, 0x1b1c, 0x1d1e, 0x1f20,
+        ],
+        "AVX2 load_le_u16x16 must swap on BE host"
+    );
+}
+
+// ---- BE loader on LE host (swap) -------------------------------------------
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 AVX2 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn avx2_load_be_u16x16_swaps_on_le_host() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
+    let input: [u8; 32] = [
+        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
+        0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
+    ];
+    let v = unsafe { load_be_u16x16(input.as_ptr()) };
+    let got = unsafe { m256i_to_u16x16(v) };
+    assert_eq!(
+        got,
+        [
+            0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10, 0x1112, 0x1314, 0x1516,
+            0x1718, 0x191a, 0x1b1c, 0x1d1e, 0x1f20,
+        ],
+        "AVX2 load_be_u16x16 must swap on LE host"
+    );
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 AVX2 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "big")]
+fn avx2_load_be_u16x16_noop_on_be_host() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
+    let input: [u8; 32] = [
+        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
+        0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
+    ];
+    let v = unsafe { load_be_u16x16(input.as_ptr()) };
+    let got = unsafe { m256i_to_u16x16(v) };
+    assert_eq!(
+        got,
+        [
+            0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10, 0x1112, 0x1314, 0x1516,
+            0x1718, 0x191a, 0x1b1c, 0x1d1e, 0x1f20,
+        ],
+        "AVX2 load_be_u16x16 must not swap on BE host"
+    );
+}
+
+// ---- u32x8 LE loader on LE host (no-op) ------------------------------------
+#[test]
+#[cfg_attr(miri, ignore = "x86 AVX2 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn avx2_load_le_u32x8_noop_on_le_host() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
+    let input: [u8; 32] = [
+        0x04, 0x03, 0x02, 0x01, 0x08, 0x07, 0x06, 0x05, 0x0c, 0x0b, 0x0a, 0x09, 0x10, 0x0f, 0x0e, 0x0d,
+        0x14, 0x13, 0x12, 0x11, 0x18, 0x17, 0x16, 0x15, 0x1c, 0x1b, 0x1a, 0x19, 0x20, 0x1f, 0x1e, 0x1d,
+    ];
+    let v = unsafe { load_le_u32x8(input.as_ptr()) };
+    let got = unsafe { m256i_to_u32x8(v) };
+    assert_eq!(
+        got,
+        [
+            0x01020304, 0x05060708, 0x090a0b0c, 0x0d0e0f10, 0x11121314, 0x15161718, 0x191a1b1c,
+            0x1d1e1f20,
+        ],
+        "AVX2 load_le_u32x8 must not swap on LE host"
+    );
+}
+
+// ---- u32x8 BE loader on LE host (swap) -------------------------------------
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 AVX2 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn avx2_load_be_u32x8_swaps_on_le_host() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
+    let input: [u8; 32] = [
+        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
+        0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
+    ];
+    let v = unsafe { load_be_u32x8(input.as_ptr()) };
+    let got = unsafe { m256i_to_u32x8(v) };
+    assert_eq!(
+        got,
+        [
+            0x01020304, 0x05060708, 0x090a0b0c, 0x0d0e0f10, 0x11121314, 0x15161718, 0x191a1b1c,
+            0x1d1e1f20,
+        ],
+        "AVX2 load_be_u32x8 must swap on LE host"
+    );
+}
+
+// ---- Generic dispatcher consistency ----------------------------------------
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 AVX2 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn avx2_load_endian_u16x16_le_dispatcher() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
+    let input: [u8; 32] = [
+        0x02, 0x01, 0x04, 0x03, 0x06, 0x05, 0x08, 0x07, 0x0a, 0x09, 0x0c, 0x0b, 0x0e, 0x0d, 0x10, 0x0f,
+        0x12, 0x11, 0x14, 0x13, 0x16, 0x15, 0x18, 0x17, 0x1a, 0x19, 0x1c, 0x1b, 0x1e, 0x1d, 0x20, 0x1f,
+    ];
+    let direct = unsafe { load_le_u16x16(input.as_ptr()) };
+    let via_dispatch = unsafe { load_endian_u16x16::<false>(input.as_ptr()) };
+    let d = unsafe { m256i_to_u16x16(direct) };
+    let g = unsafe { m256i_to_u16x16(via_dispatch) };
+    assert_eq!(
+        d, g,
+        "load_endian_u16x16::<false> must match load_le_u16x16"
+    );
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 AVX2 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn avx2_load_endian_u16x16_be_dispatcher() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
+    let input: [u8; 32] = [
+        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
+        0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
+    ];
+    let direct = unsafe { load_be_u16x16(input.as_ptr()) };
+    let via_dispatch = unsafe { load_endian_u16x16::<true>(input.as_ptr()) };
+    let d = unsafe { m256i_to_u16x16(direct) };
+    let g = unsafe { m256i_to_u16x16(via_dispatch) };
+    assert_eq!(d, g, "load_endian_u16x16::<true> must match load_be_u16x16");
+}
diff --git a/src/row/arch/x86_avx2/tests/mod.rs b/src/row/arch/x86_avx2/tests/mod.rs
index 0acc31f3..fcb6df4b 100644
--- a/src/row/arch/x86_avx2/tests/mod.rs
+++ b/src/row/arch/x86_avx2/tests/mod.rs
@@ -1,4 +1,5 @@
 mod ayuv64;
+mod endian;
 mod high_bit_4_2_0;
 mod high_bit_4_4_4_and_pn;
 mod legacy_rgb;
diff --git a/src/row/arch/x86_avx512/endian.rs b/src/row/arch/x86_avx512/endian.rs
new file mode 100644
index 00000000..b886d90a
--- /dev/null
+++ b/src/row/arch/x86_avx512/endian.rs
@@ -0,0 +1,135 @@
+//! Endian-aware u16/u32 SIMD loaders for x86_64 AVX-512 (F + BW).
+#![allow(dead_code)] // tier kernels (Phase 2 rollout PRs) will consume these
+//!
+//! Each helper takes a raw byte pointer to LE-encoded (or BE-encoded) data
+//! and returns a `__m512i` vector containing the elements in **host-native**
+//! byte order, ready for native u16/u32 SIMD math.
+//!
+//! The host-native conversion is monomorphized at compile time via
+//! `cfg(target_endian = ...)`:
+//! - `load_le_*` is a no-op on LE targets (all real x86), byte-swap on BE
+//! - `load_be_*` is byte-swap on LE targets, no-op on BE targets
+//!
+//! Byte-swap is implemented with `_mm512_shuffle_epi8` (AVX-512BW) using
+//! compile-time shuffle masks. AVX-512's `vpshufb` operates per 128-bit
+//! lane, so the mask replicates the same within-lane byte permutation across
+//! all four 128-bit lanes of the 512-bit register.
+
+use core::arch::x86_64::*;
+
+// ---- Byte-swap shuffle masks -----------------------------------------------
+
+/// AVX-512BW `_mm512_shuffle_epi8` mask that swaps bytes within every 2-byte
+/// (u16) lane across all four 128-bit lanes.
+const BYTESWAP_MASK_U16: __m512i = unsafe {
+    core::mem::transmute([
+        // lane 0
+        1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, // lane 1
+        1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, // lane 2
+        1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, // lane 3
+        1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
+    ])
+};
+
+/// AVX-512BW `_mm512_shuffle_epi8` mask that swaps bytes within every 4-byte
+/// (u32) lane across all four 128-bit lanes.
+const BYTESWAP_MASK_U32: __m512i = unsafe {
+    core::mem::transmute([
+        // lane 0
+        3u8, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, // lane 1
+        3u8, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, // lane 2
+        3u8, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, // lane 3
+        3u8, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
+    ])
+};
+
+// ---- u16x32 loaders --------------------------------------------------------
+
+/// Loads 32 × u16 from `ptr` (LE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 64 readable bytes. Caller must have AVX-512F
+/// and AVX-512BW enabled via
+/// `#[target_feature(enable = "avx512f,avx512bw")]`.
+#[inline(always)]
+pub(crate) unsafe fn load_le_u16x32(ptr: *const u8) -> __m512i {
+    let v = unsafe { _mm512_loadu_si512(ptr.cast()) };
+    #[cfg(target_endian = "big")]
+    let v = unsafe { _mm512_shuffle_epi8(v, BYTESWAP_MASK_U16) };
+    v
+}
+
+/// Loads 32 × u16 from `ptr` (BE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 64 readable bytes. Caller must have AVX-512F
+/// and AVX-512BW enabled.
+#[inline(always)]
+pub(crate) unsafe fn load_be_u16x32(ptr: *const u8) -> __m512i {
+    let v = unsafe { _mm512_loadu_si512(ptr.cast()) };
+    #[cfg(target_endian = "little")]
+    let v = unsafe { _mm512_shuffle_epi8(v, BYTESWAP_MASK_U16) };
+    v
+}
+
+/// Generic dispatcher: routes to `load_le_u16x32` or `load_be_u16x32` based
+/// on the compile-time `BE` const parameter.
+///
+/// # Safety
+///
+/// Same as `load_le_u16x32` / `load_be_u16x32`.
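+///
+/// A minimal sketch of the intended tier-kernel call pattern, assuming a
+/// hypothetical kernel that is itself generic over `const BE: bool` (the
+/// kernel shape is an assumption; only the dispatcher below is real):
+///
+/// ```ignore
+/// #[target_feature(enable = "avx512f,avx512bw")]
+/// unsafe fn kernel_row<const BE: bool>(src: *const u8, vectors: usize) {
+///     for i in 0..vectors {
+///         let v = unsafe { load_endian_u16x32::<BE>(src.add(i * 64)) };
+///         // ... native u16 math on `v`; the untaken endian branch is
+///         // statically dead in each monomorphization ...
+///     }
+/// }
+/// ```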
+#[inline(always)]
+pub(crate) unsafe fn load_endian_u16x32<const BE: bool>(ptr: *const u8) -> __m512i {
+    if BE {
+        unsafe { load_be_u16x32(ptr) }
+    } else {
+        unsafe { load_le_u16x32(ptr) }
+    }
+}
+
+// ---- u32x16 loaders --------------------------------------------------------
+
+/// Loads 16 × u32 from `ptr` (LE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 64 readable bytes. Caller must have AVX-512F
+/// and AVX-512BW enabled.
+#[inline(always)]
+pub(crate) unsafe fn load_le_u32x16(ptr: *const u8) -> __m512i {
+    let v = unsafe { _mm512_loadu_si512(ptr.cast()) };
+    #[cfg(target_endian = "big")]
+    let v = unsafe { _mm512_shuffle_epi8(v, BYTESWAP_MASK_U32) };
+    v
+}
+
+/// Loads 16 × u32 from `ptr` (BE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 64 readable bytes. Caller must have AVX-512F
+/// and AVX-512BW enabled.
+#[inline(always)]
+pub(crate) unsafe fn load_be_u32x16(ptr: *const u8) -> __m512i {
+    let v = unsafe { _mm512_loadu_si512(ptr.cast()) };
+    #[cfg(target_endian = "little")]
+    let v = unsafe { _mm512_shuffle_epi8(v, BYTESWAP_MASK_U32) };
+    v
+}
+
+/// Generic dispatcher: routes to `load_le_u32x16` or `load_be_u32x16` based
+/// on the compile-time `BE` const parameter.
+///
+/// # Safety
+///
+/// Same as `load_le_u32x16` / `load_be_u32x16`.
+#[inline(always)]
+pub(crate) unsafe fn load_endian_u32x16<const BE: bool>(ptr: *const u8) -> __m512i {
+    if BE {
+        unsafe { load_be_u32x16(ptr) }
+    } else {
+        unsafe { load_le_u32x16(ptr) }
+    }
+}
diff --git a/src/row/arch/x86_avx512/mod.rs b/src/row/arch/x86_avx512/mod.rs
index f93dece5..ec820dc3 100644
--- a/src/row/arch/x86_avx512/mod.rs
+++ b/src/row/arch/x86_avx512/mod.rs
@@ -73,6 +73,7 @@ pub(super) use crate::{
 
 mod alpha_extract;
 mod ayuv64;
+pub(crate) mod endian;
 mod gray;
 mod hsv;
 pub(crate) mod legacy_rgb;
diff --git a/src/row/arch/x86_avx512/tests/endian.rs b/src/row/arch/x86_avx512/tests/endian.rs
new file mode 100644
index 00000000..a5f5dd7c
--- /dev/null
+++ b/src/row/arch/x86_avx512/tests/endian.rs
@@ -0,0 +1,213 @@
+use crate::row::arch::x86_avx512::endian::*;
+
+// Helper: extract __m512i to a stack array of 32 u16 lanes.
+#[cfg(target_arch = "x86_64")]
+unsafe fn m512i_to_u16x32(v: core::arch::x86_64::__m512i) -> [u16; 32] {
+    let mut out = [0u16; 32];
+    unsafe { core::arch::x86_64::_mm512_storeu_si512(out.as_mut_ptr().cast(), v) };
+    out
+}
+
+// Helper: extract __m512i to a stack array of 16 u32 lanes.
+#[cfg(target_arch = "x86_64")]
+unsafe fn m512i_to_u32x16(v: core::arch::x86_64::__m512i) -> [u32; 16] {
+    let mut out = [0u32; 16];
+    unsafe { core::arch::x86_64::_mm512_storeu_si512(out.as_mut_ptr().cast(), v) };
+    out
+}
+
+// ---- LE loader on LE host (no-op) ------------------------------------------
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 AVX-512 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn avx512_load_le_u16x32_noop_on_le_host() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
+    // Build 64 bytes: pairs [lo, hi] for values 0x0102..0x2021.
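+    // e.g. i = 0 -> bytes [0x02, 0x01] -> LE u16 0x0102;
+    //      i = 31 -> bytes [0x21, 0x20] -> LE u16 0x2021.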
+    let mut input = [0u8; 64];
+    for i in 0usize..32 {
+        // LE encoding: low byte first
+        input[i * 2] = ((i + 1) as u8).wrapping_add(1); // low byte
+        input[i * 2 + 1] = (i + 1) as u8; // high byte
+    }
+    let v = unsafe { load_le_u16x32(input.as_ptr()) };
+    let got = unsafe { m512i_to_u16x32(v) };
+    let expected: [u16; 32] = core::array::from_fn(|i| {
+        let i = i as u16;
+        ((i + 1) << 8) | (i + 2)
+    });
+    assert_eq!(
+        got, expected,
+        "AVX-512 load_le_u16x32 must not swap on LE host"
+    );
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 AVX-512 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "big")]
+fn avx512_load_le_u16x32_swaps_on_be_host() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
+    let mut input = [0u8; 64];
+    for i in 0usize..32 {
+        input[i * 2] = ((i + 1) as u8).wrapping_add(1);
+        input[i * 2 + 1] = (i + 1) as u8;
+    }
+    let v = unsafe { load_le_u16x32(input.as_ptr()) };
+    let got = unsafe { m512i_to_u16x32(v) };
+    let expected: [u16; 32] = core::array::from_fn(|i| {
+        let i = i as u16;
+        ((i + 1) << 8) | (i + 2)
+    });
+    assert_eq!(got, expected, "AVX-512 load_le_u16x32 must swap on BE host");
+}
+
+// ---- BE loader on LE host (swap) -------------------------------------------
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 AVX-512 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn avx512_load_be_u16x32_swaps_on_le_host() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
+    // BE encoding: high byte first.
+    let mut input = [0u8; 64];
+    for i in 0usize..32 {
+        input[i * 2] = (i + 1) as u8; // high byte
+        input[i * 2 + 1] = ((i + 1) as u8).wrapping_add(1); // low byte
+    }
+    let v = unsafe { load_be_u16x32(input.as_ptr()) };
+    let got = unsafe { m512i_to_u16x32(v) };
+    let expected: [u16; 32] = core::array::from_fn(|i| {
+        let i = i as u16;
+        ((i + 1) << 8) | (i + 2)
+    });
+    assert_eq!(got, expected, "AVX-512 load_be_u16x32 must swap on LE host");
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 AVX-512 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "big")]
+fn avx512_load_be_u16x32_noop_on_be_host() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
+    let mut input = [0u8; 64];
+    for i in 0usize..32 {
+        input[i * 2] = (i + 1) as u8;
+        input[i * 2 + 1] = ((i + 1) as u8).wrapping_add(1);
+    }
+    let v = unsafe { load_be_u16x32(input.as_ptr()) };
+    let got = unsafe { m512i_to_u16x32(v) };
+    let expected: [u16; 32] = core::array::from_fn(|i| {
+        let i = i as u16;
+        ((i + 1) << 8) | (i + 2)
+    });
+    assert_eq!(
+        got, expected,
+        "AVX-512 load_be_u16x32 must not swap on BE host"
+    );
+}
+
+// ---- u32x16 LE loader on LE host (no-op) -----------------------------------
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 AVX-512 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn avx512_load_le_u32x16_noop_on_le_host() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
+    let input: [u8; 64] = [
+        0x04, 0x03, 0x02, 0x01, 0x08, 0x07, 0x06, 0x05, 0x0c, 0x0b, 0x0a, 0x09, 0x10, 0x0f, 0x0e, 0x0d,
+        0x14, 0x13, 0x12, 0x11, 0x18, 0x17, 0x16, 0x15, 0x1c, 0x1b, 0x1a, 0x19, 0x20, 0x1f, 0x1e, 0x1d,
+        0x24, 0x23, 0x22, 0x21, 0x28, 0x27, 0x26, 0x25, 0x2c, 0x2b, 0x2a, 0x29, 0x30, 0x2f, 0x2e, 0x2d,
+        0x34, 0x33, 0x32, 0x31, 0x38, 0x37, 0x36, 0x35, 0x3c, 0x3b, 0x3a, 0x39, 0x40, 0x3f, 0x3e, 0x3d,
+    ];
+    let v = unsafe { load_le_u32x16(input.as_ptr()) };
+    let got = unsafe { m512i_to_u32x16(v) };
+    assert_eq!(
+        got,
+        [
+            0x01020304, 0x05060708, 0x090a0b0c, 0x0d0e0f10, 0x11121314, 0x15161718, 0x191a1b1c,
+            0x1d1e1f20, 0x21222324, 0x25262728, 0x292a2b2c, 0x2d2e2f30, 0x31323334, 0x35363738,
+            0x393a3b3c, 0x3d3e3f40,
+        ],
+        "AVX-512 load_le_u32x16 must not swap on LE host"
+    );
+}
+
+// ---- u32x16 BE loader on LE host (swap) ------------------------------------
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 AVX-512 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn avx512_load_be_u32x16_swaps_on_le_host() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
+    let input: [u8; 64] = [
+        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
+        0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
+        0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
+        0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40,
+    ];
+    let v = unsafe { load_be_u32x16(input.as_ptr()) };
+    let got = unsafe { m512i_to_u32x16(v) };
+    assert_eq!(
+        got,
+        [
+            0x01020304, 0x05060708, 0x090a0b0c, 0x0d0e0f10, 0x11121314, 0x15161718, 0x191a1b1c,
+            0x1d1e1f20, 0x21222324, 0x25262728, 0x292a2b2c, 0x2d2e2f30, 0x31323334, 0x35363738,
+            0x393a3b3c, 0x3d3e3f40,
+        ],
+        "AVX-512 load_be_u32x16 must swap on LE host"
+    );
+}
+
+// ---- Generic dispatcher consistency ----------------------------------------
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 AVX-512 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn avx512_load_endian_u16x32_le_dispatcher() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
+    let mut input = [0u8; 64];
+    for i in 0usize..32 {
+        input[i * 2] = ((i + 1) as u8).wrapping_add(1);
+        input[i * 2 + 1] = (i + 1) as u8;
+    }
+    let direct = unsafe { load_le_u16x32(input.as_ptr()) };
+    let via_dispatch = unsafe { load_endian_u16x32::<false>(input.as_ptr()) };
+    let d = unsafe { m512i_to_u16x32(direct) };
+    let g = unsafe { m512i_to_u16x32(via_dispatch) };
+    assert_eq!(
+        d, g,
+        "load_endian_u16x32::<false> must match load_le_u16x32"
+    );
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 AVX-512 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn avx512_load_endian_u16x32_be_dispatcher() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
+    let mut input = [0u8; 64];
+    for i in 0usize..32 {
+        input[i * 2] = (i + 1) as u8;
+        input[i * 2 + 1] = ((i + 1) as u8).wrapping_add(1);
+    }
+    let direct = unsafe { load_be_u16x32(input.as_ptr()) };
+    let via_dispatch = unsafe { load_endian_u16x32::<true>(input.as_ptr()) };
+    let d = unsafe { m512i_to_u16x32(direct) };
+    let g = unsafe { m512i_to_u16x32(via_dispatch) };
+    assert_eq!(d, g, "load_endian_u16x32::<true> must match load_be_u16x32");
+}
diff --git a/src/row/arch/x86_avx512/tests/mod.rs b/src/row/arch/x86_avx512/tests/mod.rs
index a66d6ea6..de1c57ad 100644
--- a/src/row/arch/x86_avx512/tests/mod.rs
+++ b/src/row/arch/x86_avx512/tests/mod.rs
@@ -1,4 +1,5 @@
 mod ayuv64;
+mod endian;
 mod high_bit_4_2_0;
 mod high_bit_4_4_4_and_pn;
 mod legacy_rgb;
diff --git a/src/row/arch/x86_sse41/endian.rs b/src/row/arch/x86_sse41/endian.rs
new file mode 100644
index 00000000..f7dc1d38
--- /dev/null
+++ b/src/row/arch/x86_sse41/endian.rs
@@ -0,0 +1,121 @@
+//! Endian-aware u16/u32 SIMD loaders for x86_64 SSE4.1.
+#![allow(dead_code)] // tier kernels (Phase 2 rollout PRs) will consume these
+//!
+//! Each helper takes a raw byte pointer to LE-encoded (or BE-encoded) data
+//! and returns an `__m128i` vector containing the elements in **host-native**
+//! byte order, ready for native u16/u32 SIMD math.
+//!
+//! The host-native conversion is monomorphized at compile time via
+//! `cfg(target_endian = ...)`:
+//! - `load_le_*` is a no-op on LE targets (all real x86), byte-swap on BE
+//! - `load_be_*` is byte-swap on LE targets, no-op on BE targets
+//!
+//! Byte-swap is implemented with `_mm_shuffle_epi8` (SSSE3, which every
+//! SSE4.1-capable CPU also supports) using compile-time shuffle masks. The
+//! mask constants use `core::mem::transmute` because `__m128i` has no `const`
+//! constructor in stable Rust; the transmutes are always safe — `__m128i` is
+//! a plain 128-bit bag of bits.
+
+use core::arch::x86_64::*;
+
+// ---- Byte-swap shuffle masks -----------------------------------------------
+
+/// SSSE3 `_mm_shuffle_epi8` mask that swaps bytes within every 2-byte (u16)
+/// lane: `[1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14]`.
+const BYTESWAP_MASK_U16: __m128i =
+    unsafe { core::mem::transmute([1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14]) };
+
+/// SSSE3 `_mm_shuffle_epi8` mask that swaps bytes within every 4-byte (u32)
+/// lane: `[3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12]`.
+const BYTESWAP_MASK_U32: __m128i =
+    unsafe { core::mem::transmute([3u8, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12]) };
+
+// ---- u16x8 loaders ---------------------------------------------------------
+
+/// Loads 8 × u16 from `ptr` (LE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 16 readable bytes. Caller must have SSE4.1
+/// (and SSSE3) enabled via `#[target_feature(enable = "sse4.1")]`.
+#[inline(always)]
+pub(crate) unsafe fn load_le_u16x8(ptr: *const u8) -> __m128i {
+    let v = unsafe { _mm_loadu_si128(ptr.cast()) };
+    #[cfg(target_endian = "big")]
+    let v = unsafe { _mm_shuffle_epi8(v, BYTESWAP_MASK_U16) };
+    v
+}
+
+/// Loads 8 × u16 from `ptr` (BE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 16 readable bytes. Caller must have SSE4.1
+/// (and SSSE3) enabled.
+#[inline(always)]
+pub(crate) unsafe fn load_be_u16x8(ptr: *const u8) -> __m128i {
+    let v = unsafe { _mm_loadu_si128(ptr.cast()) };
+    #[cfg(target_endian = "little")]
+    let v = unsafe { _mm_shuffle_epi8(v, BYTESWAP_MASK_U16) };
+    v
+}
+
+/// Generic dispatcher: routes to `load_le_u16x8` or `load_be_u16x8` based on
+/// the compile-time `BE` const parameter.
+///
+/// # Safety
+///
+/// Same as `load_le_u16x8` / `load_be_u16x8`.
+#[inline(always)]
+pub(crate) unsafe fn load_endian_u16x8<const BE: bool>(ptr: *const u8) -> __m128i {
+    if BE {
+        unsafe { load_be_u16x8(ptr) }
+    } else {
+        unsafe { load_le_u16x8(ptr) }
+    }
+}
+
+// ---- u32x4 loaders ---------------------------------------------------------
+
+/// Loads 4 × u32 from `ptr` (LE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 16 readable bytes. Caller must have SSE4.1
+/// (and SSSE3) enabled.
+#[inline(always)]
+pub(crate) unsafe fn load_le_u32x4(ptr: *const u8) -> __m128i {
+    let v = unsafe { _mm_loadu_si128(ptr.cast()) };
+    #[cfg(target_endian = "big")]
+    let v = unsafe { _mm_shuffle_epi8(v, BYTESWAP_MASK_U32) };
+    v
+}
+
+/// Loads 4 × u32 from `ptr` (BE-encoded on disk/wire) into host-native order.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 16 readable bytes. Caller must have SSE4.1
+/// (and SSSE3) enabled.
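+///
+/// A minimal illustrative sketch of a call site (the wrapping function is an
+/// assumption for illustration, not part of this crate):
+///
+/// ```ignore
+/// #[target_feature(enable = "sse4.1")]
+/// unsafe fn first_be_u32(src: &[u8; 16]) -> u32 {
+///     let v = unsafe { load_be_u32x4(src.as_ptr()) };
+///     // lane 0: e.g. bytes [0x01, 0x02, 0x03, 0x04, ..] -> 0x01020304
+///     unsafe { _mm_cvtsi128_si32(v) as u32 }
+/// }
+/// ```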
+#[inline(always)]
+pub(crate) unsafe fn load_be_u32x4(ptr: *const u8) -> __m128i {
+    let v = unsafe { _mm_loadu_si128(ptr.cast()) };
+    #[cfg(target_endian = "little")]
+    let v = unsafe { _mm_shuffle_epi8(v, BYTESWAP_MASK_U32) };
+    v
+}
+
+/// Generic dispatcher: routes to `load_le_u32x4` or `load_be_u32x4` based on
+/// the compile-time `BE` const parameter.
+///
+/// # Safety
+///
+/// Same as `load_le_u32x4` / `load_be_u32x4`.
+#[inline(always)]
+pub(crate) unsafe fn load_endian_u32x4<const BE: bool>(ptr: *const u8) -> __m128i {
+    if BE {
+        unsafe { load_be_u32x4(ptr) }
+    } else {
+        unsafe { load_le_u32x4(ptr) }
+    }
+}
diff --git a/src/row/arch/x86_sse41/mod.rs b/src/row/arch/x86_sse41/mod.rs
index 2a54eb76..22a337f0 100644
--- a/src/row/arch/x86_sse41/mod.rs
+++ b/src/row/arch/x86_sse41/mod.rs
@@ -56,6 +56,7 @@ pub(super) use crate::{
 
 mod alpha_extract;
 mod ayuv64;
+pub(crate) mod endian;
 mod gray;
 mod hsv;
 pub(crate) mod legacy_rgb;
diff --git a/src/row/arch/x86_sse41/tests/endian.rs b/src/row/arch/x86_sse41/tests/endian.rs
new file mode 100644
index 00000000..c6dca72d
--- /dev/null
+++ b/src/row/arch/x86_sse41/tests/endian.rs
@@ -0,0 +1,207 @@
+use crate::row::arch::x86_sse41::endian::*;
+
+// Helper: extract __m128i to a stack array of 8 u16 lanes.
+#[cfg(target_arch = "x86_64")]
+unsafe fn m128i_to_u16x8(v: core::arch::x86_64::__m128i) -> [u16; 8] {
+    let mut out = [0u16; 8];
+    unsafe { core::arch::x86_64::_mm_storeu_si128(out.as_mut_ptr().cast(), v) };
+    out
+}
+
+// Helper: extract __m128i to a stack array of 4 u32 lanes.
+#[cfg(target_arch = "x86_64")]
+unsafe fn m128i_to_u32x4(v: core::arch::x86_64::__m128i) -> [u32; 4] {
+    let mut out = [0u32; 4];
+    unsafe { core::arch::x86_64::_mm_storeu_si128(out.as_mut_ptr().cast(), v) };
+    out
+}
+
+// ---- LE loader on LE host (no-op) ------------------------------------------
+
+/// On a LE host, `load_le_u16x8` must NOT swap bytes.
+#[test]
+#[cfg_attr(miri, ignore = "x86 SSE4.1 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn sse41_load_le_u16x8_noop_on_le_host() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
+    let input: [u8; 16] = [
+        0x02, 0x01, // u16[0] = 0x0102
+        0x04, 0x03, // u16[1] = 0x0304
+        0x06, 0x05, // u16[2] = 0x0506
+        0x08, 0x07, // u16[3] = 0x0708
+        0x0a, 0x09, // u16[4] = 0x090a
+        0x0c, 0x0b, // u16[5] = 0x0b0c
+        0x0e, 0x0d, // u16[6] = 0x0d0e
+        0x10, 0x0f, // u16[7] = 0x0f10
+    ];
+    let v = unsafe { load_le_u16x8(input.as_ptr()) };
+    let got = unsafe { m128i_to_u16x8(v) };
+    assert_eq!(
+        got,
+        [
+            0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10
+        ],
+        "SSE4.1 load_le_u16x8 must not swap on LE host"
+    );
+}
+
+/// On a BE host, `load_le_u16x8` MUST swap bytes.
+#[test]
+#[cfg_attr(miri, ignore = "x86 SSE4.1 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "big")]
+fn sse41_load_le_u16x8_swaps_on_be_host() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
+    let input: [u8; 16] = [
+        0x02, 0x01, 0x04, 0x03, 0x06, 0x05, 0x08, 0x07, 0x0a, 0x09, 0x0c, 0x0b, 0x0e, 0x0d, 0x10, 0x0f,
+    ];
+    let v = unsafe { load_le_u16x8(input.as_ptr()) };
+    let got = unsafe { m128i_to_u16x8(v) };
+    assert_eq!(
+        got,
+        [
+            0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10
+        ],
+        "SSE4.1 load_le_u16x8 must swap on BE host"
+    );
+}
+
+// ---- BE loader on LE host (swap) -------------------------------------------
+
+/// On a LE host, `load_be_u16x8` MUST swap bytes.
+#[test]
+#[cfg_attr(miri, ignore = "x86 SSE4.1 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn sse41_load_be_u16x8_swaps_on_le_host() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
+    let input: [u8; 16] = [
+        0x01, 0x02, // u16[0] = 0x0102 BE
+        0x03, 0x04, // u16[1] = 0x0304
+        0x05, 0x06, // u16[2] = 0x0506
+        0x07, 0x08, // u16[3] = 0x0708
+        0x09, 0x0a, // u16[4] = 0x090a
+        0x0b, 0x0c, // u16[5] = 0x0b0c
+        0x0d, 0x0e, // u16[6] = 0x0d0e
+        0x0f, 0x10, // u16[7] = 0x0f10
+    ];
+    let v = unsafe { load_be_u16x8(input.as_ptr()) };
+    let got = unsafe { m128i_to_u16x8(v) };
+    assert_eq!(
+        got,
+        [
+            0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10
+        ],
+        "SSE4.1 load_be_u16x8 must swap on LE host"
+    );
+}
+
+/// On a BE host, `load_be_u16x8` must NOT swap.
+#[test]
+#[cfg_attr(miri, ignore = "x86 SSE4.1 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "big")]
+fn sse41_load_be_u16x8_noop_on_be_host() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
+    let input: [u8; 16] = [
+        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
+    ];
+    let v = unsafe { load_be_u16x8(input.as_ptr()) };
+    let got = unsafe { m128i_to_u16x8(v) };
+    assert_eq!(
+        got,
+        [
+            0x0102, 0x0304, 0x0506, 0x0708, 0x090a, 0x0b0c, 0x0d0e, 0x0f10
+        ],
+        "SSE4.1 load_be_u16x8 must not swap on BE host"
+    );
+}
+
+// ---- u32x4 LE loader on LE host (no-op) ------------------------------------
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SSE4.1 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn sse41_load_le_u32x4_noop_on_le_host() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
+    let input: [u8; 16] = [
+        0x04, 0x03, 0x02, 0x01, // u32[0] = 0x01020304 LE
+        0x08, 0x07, 0x06, 0x05, // u32[1] = 0x05060708
+        0x0c, 0x0b, 0x0a, 0x09, // u32[2] = 0x090a0b0c
+        0x10, 0x0f, 0x0e, 0x0d, // u32[3] = 0x0d0e0f10
+    ];
+    let v = unsafe { load_le_u32x4(input.as_ptr()) };
+    let got = unsafe { m128i_to_u32x4(v) };
+    assert_eq!(
+        got,
+        [0x01020304, 0x05060708, 0x090a0b0c, 0x0d0e0f10],
+        "SSE4.1 load_le_u32x4 must not swap on LE host"
+    );
+}
+
+// ---- u32x4 BE loader on LE host (swap) -------------------------------------
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SSE4.1 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn sse41_load_be_u32x4_swaps_on_le_host() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
+    let input: [u8; 16] = [
+        0x01, 0x02, 0x03, 0x04, // u32[0] = 0x01020304 BE
+        0x05, 0x06, 0x07, 0x08, // u32[1] = 0x05060708
+        0x09, 0x0a, 0x0b, 0x0c, // u32[2] = 0x090a0b0c
+        0x0d, 0x0e, 0x0f, 0x10, // u32[3] = 0x0d0e0f10
+    ];
+    let v = unsafe { load_be_u32x4(input.as_ptr()) };
+    let got = unsafe { m128i_to_u32x4(v) };
+    assert_eq!(
+        got,
+        [0x01020304, 0x05060708, 0x090a0b0c, 0x0d0e0f10],
+        "SSE4.1 load_be_u32x4 must swap on LE host"
+    );
+}
+
+// ---- Generic dispatcher consistency ----------------------------------------
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SSE4.1 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn sse41_load_endian_u16x8_le_dispatcher() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
+    let input: [u8; 16] = [
+        0x02, 0x01, 0x04, 0x03, 0x06, 0x05, 0x08, 0x07, 0x0a, 0x09, 0x0c, 0x0b, 0x0e, 0x0d, 0x10, 0x0f,
+    ];
+    let direct = unsafe { load_le_u16x8(input.as_ptr()) };
+    let via_dispatch = unsafe { load_endian_u16x8::<false>(input.as_ptr()) };
+    let d = unsafe { m128i_to_u16x8(direct) };
+    let g = unsafe { m128i_to_u16x8(via_dispatch) };
+    assert_eq!(d, g, "load_endian_u16x8::<false> must match load_le_u16x8");
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SSE4.1 SIMD intrinsics unsupported by Miri")]
+#[cfg(target_endian = "little")]
+fn sse41_load_endian_u16x8_be_dispatcher() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
+    let input: [u8; 16] = [
+        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
+    ];
+    let direct = unsafe { load_be_u16x8(input.as_ptr()) };
+    let via_dispatch = unsafe { load_endian_u16x8::<true>(input.as_ptr()) };
+    let d = unsafe { m128i_to_u16x8(direct) };
+    let g = unsafe { m128i_to_u16x8(via_dispatch) };
+    assert_eq!(d, g, "load_endian_u16x8::<true> must match load_be_u16x8");
+}
diff --git a/src/row/arch/x86_sse41/tests/mod.rs b/src/row/arch/x86_sse41/tests/mod.rs
index be4f8679..f5e14528 100644
--- a/src/row/arch/x86_sse41/tests/mod.rs
+++ b/src/row/arch/x86_sse41/tests/mod.rs
@@ -1,4 +1,5 @@
 mod ayuv64;
+mod endian;
 mod high_bit_4_2_0;
 mod high_bit_4_4_4_and_pn;
 mod legacy_rgb;