diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index d1a924704..feb7ad556 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -3,12 +3,12 @@ env:
   # version like 1.70. Note that we only specify MAJOR.MINOR and not PATCH so that bugfixes still
   # come automatically. If the version specified here is no longer the latest stable version,
   # then please feel free to submit a PR that adjusts it along with the potential clippy fixes.
-  RUST_STABLE_VER: "1.88" # In quotes because otherwise (e.g.) 1.70 would be interpreted as 1.7
+  RUST_STABLE_VER: "1.89" # In quotes because otherwise (e.g.) 1.70 would be interpreted as 1.7
   # The purpose of checking with the minimum supported Rust toolchain is to detect its staleness.
   # If the compilation fails, then the version specified here needs to be bumped up to reality.
   # Be sure to also update the rust-version property in the workspace Cargo.toml file,
   # plus all the README.md files of the affected packages.
-  RUST_MIN_VER: "1.88"
+  RUST_MIN_VER: "1.89"
   # List of packages that will be checked with the minimum supported Rust version.
   # This should be limited to packages that are intended for publishing.
   RUST_MIN_VER_PKGS: "-p fearless_simd"
diff --git a/Cargo.toml b/Cargo.toml
index 0158a30a3..b9b4aca49 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,7 +13,7 @@ license = "Apache-2.0 OR MIT"
 repository = "https://github.com/linebender/fearless_simd"
 # Keep in sync with RUST_MIN_VER in .github/workflows/ci.yml, with the relevant README.md files
 # and with the MSRV in the `Unreleased` section of CHANGELOG.md.
-rust-version = "1.88"
+rust-version = "1.89"
 
 [workspace.lints]
 
@@ -47,7 +47,7 @@ clippy.debug_assert_with_mut_call = "warn"
 clippy.doc_markdown = "warn"
 clippy.fn_to_numeric_cast_any = "warn"
 clippy.infinite_loop = "warn"
-clippy.large_stack_arrays = "warn"
+clippy.large_stack_arrays = "allow" # appears to be buggy as of 1.93, fixed in nightly. TODO: re-enable
 clippy.mismatching_type_param_order = "warn"
 clippy.missing_assert_message = "warn"
 clippy.missing_fields_in_debug = "warn"
diff --git a/README.md b/README.md
index 7c3d95fe2..4749d6f9f 100644
--- a/README.md
+++ b/README.md
@@ -59,7 +59,7 @@ It benefited from conversations with Luca Versari, though he is not responsible
 
 ## Minimum supported Rust Version (MSRV)
 
-This version of Fearless SIMD has been verified to compile with **Rust 1.88** and later.
+This version of Fearless SIMD has been verified to compile with **Rust 1.89** and later.
 
 Future versions of Fearless SIMD might increase the Rust version requirement.
 It will not be treated as a breaking change and as such can even happen with small patch releases.
diff --git a/fearless_simd/README.md b/fearless_simd/README.md
index 883be8be3..f18d788a2 100644
--- a/fearless_simd/README.md
+++ b/fearless_simd/README.md
@@ -163,7 +163,7 @@ At least one of `std` and `libm` is required; `std` overrides `libm`.
 
 ## Minimum supported Rust Version (MSRV)
 
-This version of Fearless SIMD has been verified to compile with **Rust 1.88** and later.
+This version of Fearless SIMD has been verified to compile with **Rust 1.89** and later.
 
 Future versions of Fearless SIMD might increase the Rust version requirement.
 It will not be treated as a breaking change and as such can even happen with small patch releases.
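For context on the quoting that the workflow comment above calls out: an unquoted MAJOR.MINOR version is a YAML float, so a trailing zero is silently dropped. A minimal illustrative snippet (not part of the patch; the key names are made up):

env:
  VERSION_QUOTED: "1.70"   # YAML string: stays "1.70"
  VERSION_BARE: 1.70       # YAML float: parsed as 1.7, so the toolchain would resolve to 1.7
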
diff --git a/fearless_simd/src/core_arch/x86/avx512.rs b/fearless_simd/src/core_arch/x86/avx512.rs
new file mode 100644
index 000000000..0c6cfd50e
--- /dev/null
+++ b/fearless_simd/src/core_arch/x86/avx512.rs
@@ -0,0 +1,21 @@
+// Copyright 2025 the Fearless_SIMD Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+//! Access to AVX-512 intrinsics (Ice Lake feature set).
+
+/// A token for AVX-512 intrinsics (Ice Lake feature set) on `x86` and `x86_64`.
+#[derive(Clone, Copy, Debug)]
+pub struct Avx512 {
+    _private: (),
+}
+
+impl Avx512 {
+    /// Create a SIMD token.
+    ///
+    /// # Safety
+    ///
+    /// The required CPU features must be available.
+    pub const unsafe fn new_unchecked() -> Self {
+        Self { _private: () }
+    }
+}
diff --git a/fearless_simd/src/core_arch/x86/mod.rs b/fearless_simd/src/core_arch/x86/mod.rs
index 14f9a2a16..bb267b1ec 100644
--- a/fearless_simd/src/core_arch/x86/mod.rs
+++ b/fearless_simd/src/core_arch/x86/mod.rs
@@ -5,6 +5,7 @@
 
 mod avx;
 mod avx2;
+mod avx512;
 mod fma;
 mod sse;
 mod sse2;
@@ -15,6 +16,7 @@ mod ssse3;
 
 pub use avx::Avx;
 pub use avx2::Avx2;
+pub use avx512::Avx512;
 pub use fma::Fma;
 pub use sse::Sse;
 pub use sse2::Sse2;
diff --git a/fearless_simd/src/generated.rs b/fearless_simd/src/generated.rs
index 9d342539a..67245aae7 100644
--- a/fearless_simd/src/generated.rs
+++ b/fearless_simd/src/generated.rs
@@ -46,6 +46,8 @@
 
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 mod avx2;
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+mod avx512;
 mod fallback;
 #[cfg(target_arch = "aarch64")]
 mod neon;
@@ -59,6 +61,8 @@ mod wasm;
 
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 pub use avx2::*;
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+pub use avx512::*;
 pub use fallback::*;
 #[cfg(target_arch = "aarch64")]
 pub use neon::*;
diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs
index 425d46a11..63aa003de 100644
--- a/fearless_simd/src/generated/avx2.rs
+++ b/fearless_simd/src/generated/avx2.rs
@@ -8645,6 +8645,13 @@ unsafe fn cross_block_alignr_one(
     };
     unsafe { dyn_alignr_256(hi_blocks, lo_blocks, intra_shift) }
 }
+#[doc = r" Concatenates `b` and `a` (each 1 x __m256i = 2 blocks) and extracts 2 blocks starting at byte offset"]
+#[doc = r" `shift_bytes`. Extracts from [b : a] (b in low bytes, a in high bytes), matching alignr semantics."]
+#[inline(always)]
+unsafe fn cross_block_alignr_256x1(a: __m256i, b: __m256i, shift_bytes: usize) -> __m256i {
+    let regs = [b, a];
+    unsafe { cross_block_alignr_one(&regs, 0, shift_bytes) }
+}
 #[doc = r" Concatenates `b` and `a` (each 2 x __m256i = 4 blocks) and extracts 4 blocks starting at byte offset"]
 #[doc = r" `shift_bytes`. Extracts from [b : a] (b in low bytes, a in high bytes), matching alignr semantics."]
 #[inline(always)]
@@ -8661,10 +8668,3 @@ unsafe fn cross_block_alignr_256x2(
         ]
     }
 }
-#[doc = r" Concatenates `b` and `a` (each 1 x __m256i = 2 blocks) and extracts 2 blocks starting at byte offset"]
-#[doc = r" `shift_bytes`. Extracts from [b : a] (b in low bytes, a in high bytes), matching alignr semantics."]
-#[inline(always)]
-unsafe fn cross_block_alignr_256x1(a: __m256i, b: __m256i, shift_bytes: usize) -> __m256i {
-    let regs = [b, a];
-    unsafe { cross_block_alignr_one(&regs, 0, shift_bytes) }
-}
diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs
new file mode 100644
index 000000000..74e391412
--- /dev/null
+++ b/fearless_simd/src/generated/avx512.rs
@@ -0,0 +1,9221 @@
+// Copyright 2025 the Fearless_SIMD Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+// This file is autogenerated by fearless_simd_gen
+
+use crate::{Level, arch_types::ArchTypes, prelude::*, seal::Seal};
+use crate::{
+    f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4,
+    i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32, mask32x4,
+    mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, u16x8, u16x16, u16x32,
+    u32x4, u32x8, u32x16,
+};
+#[cfg(target_arch = "x86")]
+use core::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use core::arch::x86_64::*;
+#[doc = "The SIMD token for the \"AVX-512\" level (Ice Lake feature set)."]
+#[derive(Clone, Copy, Debug)]
+pub struct Avx512 {
+    pub avx512: crate::core_arch::x86::Avx512,
+}
+impl Avx512 {
+    #[doc = r" Create a SIMD token."]
+    #[doc = r""]
+    #[doc = r" # Safety"]
+    #[doc = r""]
+    #[doc = r" The AVX-512 (Ice Lake feature set) CPU features must be available."]
+    #[inline]
+    pub const unsafe fn new_unchecked() -> Self {
+        Self {
+            avx512: unsafe { crate::core_arch::x86::Avx512::new_unchecked() },
+        }
+    }
+}
+impl Seal for Avx512 {}
+impl ArchTypes for Avx512 {
+    type f32x4 = crate::support::Aligned128<__m128>;
+    type i8x16 = crate::support::Aligned128<__m128i>;
+    type u8x16 = crate::support::Aligned128<__m128i>;
+    type mask8x16 = crate::support::Aligned128<__m128i>;
+    type i16x8 = crate::support::Aligned128<__m128i>;
+    type u16x8 = crate::support::Aligned128<__m128i>;
+    type mask16x8 = crate::support::Aligned128<__m128i>;
+    type i32x4 = crate::support::Aligned128<__m128i>;
+    type u32x4 = crate::support::Aligned128<__m128i>;
+    type mask32x4 = crate::support::Aligned128<__m128i>;
+    type f64x2 = crate::support::Aligned128<__m128d>;
+    type mask64x2 = crate::support::Aligned128<__m128i>;
+    type f32x8 = crate::support::Aligned256<__m256>;
+    type i8x32 = crate::support::Aligned256<__m256i>;
+    type u8x32 = crate::support::Aligned256<__m256i>;
+    type mask8x32 = crate::support::Aligned256<__m256i>;
+    type i16x16 = crate::support::Aligned256<__m256i>;
+    type u16x16 = crate::support::Aligned256<__m256i>;
+    type mask16x16 = crate::support::Aligned256<__m256i>;
+    type i32x8 = crate::support::Aligned256<__m256i>;
+    type u32x8 = crate::support::Aligned256<__m256i>;
+    type mask32x8 = crate::support::Aligned256<__m256i>;
+    type f64x4 = crate::support::Aligned256<__m256d>;
+    type mask64x4 = crate::support::Aligned256<__m256i>;
+    type f32x16 = crate::support::Aligned512<__m512>;
+    type i8x64 = crate::support::Aligned512<__m512i>;
+    type u8x64 = crate::support::Aligned512<__m512i>;
+    type mask8x64 = crate::support::Aligned512<__m512i>;
+    type i16x32 = crate::support::Aligned512<__m512i>;
+    type u16x32 = crate::support::Aligned512<__m512i>;
+    type mask16x32 = crate::support::Aligned512<__m512i>;
+    type i32x16 = crate::support::Aligned512<__m512i>;
+    type u32x16 = crate::support::Aligned512<__m512i>;
+    type mask32x16 = crate::support::Aligned512<__m512i>;
+    type f64x8 =
crate::support::Aligned512<__m512d>; + type mask64x8 = crate::support::Aligned512<__m512i>; +} +impl Simd for Avx512 { + type f32s = f32x16; + type f64s = f64x8; + type u8s = u8x64; + type i8s = i8x64; + type u16s = u16x32; + type i16s = i16x32; + type u32s = u32x16; + type i32s = i32x16; + type mask8s = mask8x64; + type mask16s = mask16x32; + type mask32s = mask32x16; + type mask64s = mask64x8; + #[inline(always)] + fn level(self) -> Level { + Level::Avx512(self) + } + #[inline] + fn vectorize R, R>(self, f: F) -> R { + #[target_feature( + enable = "adx,aes,avx512bitalg,avx512bw,avx512cd,avx512dq,avx512f,avx512ifma,avx512vbmi,avx512vbmi2,avx512vl,avx512vnni,avx512vpopcntdq,bmi1,bmi2,cmpxchg16b,fma,gfni,lzcnt,movbe,pclmulqdq,popcnt,rdrand,rdseed,sha,vaes,vpclmulqdq,xsave,xsavec,xsaveopt,xsaves" + )] + unsafe fn vectorize_avx512 R, R>(f: F) -> R { + f() + } + unsafe { vectorize_avx512(f) } + } + #[inline(always)] + fn splat_f32x4(self, val: f32) -> f32x4 { + unsafe { _mm_set1_ps(val).simd_into(self) } + } + #[inline(always)] + fn load_array_f32x4(self, val: [f32; 4usize]) -> f32x4 { + f32x4 { + val: unsafe { core::mem::transmute_copy(&val) }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_f32x4(self, val: &[f32; 4usize]) -> f32x4 { + f32x4 { + val: unsafe { core::mem::transmute_copy(val) }, + simd: self, + } + } + #[inline(always)] + fn as_array_f32x4(self, a: f32x4) -> [f32; 4usize] { + unsafe { core::mem::transmute::<__m128, [f32; 4usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_f32x4(self, a: &f32x4) -> &[f32; 4usize] { + unsafe { core::mem::transmute::<&__m128, &[f32; 4usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_f32x4(self, a: &mut f32x4) -> &mut [f32; 4usize] { + unsafe { core::mem::transmute::<&mut __m128, &mut [f32; 4usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_f32x4(self, a: f32x4, dest: &mut [f32; 4usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const f32, + dest.as_mut_ptr(), + 4usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_f32x4(self, a: u8x16) -> f32x4 { + unsafe { + f32x4 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_f32x4(self, a: f32x4) -> u8x16 { + unsafe { + u8x16 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + unsafe { + if SHIFT >= 4usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_f32x4(b).val.0, + self.cvt_to_bytes_f32x4(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_f32x4(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_f32x4( + self, + a: f32x4, + b: f32x4, + ) -> f32x4 { + self.slide_f32x4::(a, b) + } + #[inline(always)] + fn abs_f32x4(self, a: f32x4) -> f32x4 { + unsafe { _mm_andnot_ps(_mm_set1_ps(-0.0), a.into()).simd_into(self) } + } + #[inline(always)] + fn neg_f32x4(self, a: f32x4) -> f32x4 { + unsafe { _mm_xor_ps(a.into(), _mm_set1_ps(-0.0)).simd_into(self) } + } + #[inline(always)] + fn sqrt_f32x4(self, a: f32x4) -> f32x4 { + unsafe { _mm_sqrt_ps(a.into()).simd_into(self) } + } + #[inline(always)] + fn add_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + unsafe { _mm_add_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + unsafe { _mm_sub_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_f32x4(self, a: 
f32x4, b: f32x4) -> f32x4 { + unsafe { _mm_mul_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn div_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + unsafe { _mm_div_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn copysign_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + unsafe { + let mask = _mm_set1_ps(-0.0); + _mm_or_ps(_mm_and_ps(mask, b.into()), _mm_andnot_ps(mask, a.into())).simd_into(self) + } + } + #[inline(always)] + fn simd_eq_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { + unsafe { _mm_castps_si128(_mm_cmpeq_ps(a.into(), b.into())).simd_into(self) } + } + #[inline(always)] + fn simd_lt_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { + unsafe { _mm_castps_si128(_mm_cmplt_ps(a.into(), b.into())).simd_into(self) } + } + #[inline(always)] + fn simd_le_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { + unsafe { _mm_castps_si128(_mm_cmple_ps(a.into(), b.into())).simd_into(self) } + } + #[inline(always)] + fn simd_ge_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { + unsafe { _mm_castps_si128(_mm_cmpge_ps(a.into(), b.into())).simd_into(self) } + } + #[inline(always)] + fn simd_gt_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { + unsafe { _mm_castps_si128(_mm_cmpgt_ps(a.into(), b.into())).simd_into(self) } + } + #[inline(always)] + fn zip_low_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + unsafe { _mm_unpacklo_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn zip_high_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + unsafe { _mm_unpackhi_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn unzip_low_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + unsafe { _mm_shuffle_ps::<0b10_00_10_00>(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn unzip_high_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + unsafe { _mm_shuffle_ps::<0b11_01_11_01>(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + unsafe { _mm_max_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn min_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + unsafe { _mm_min_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_precise_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + unsafe { + let intermediate = _mm_max_ps(a.into(), b.into()); + let b_is_nan = _mm_cmpunord_ps(b.into(), b.into()); + _mm_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(self) + } + } + #[inline(always)] + fn min_precise_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + unsafe { + let intermediate = _mm_min_ps(a.into(), b.into()); + let b_is_nan = _mm_cmpunord_ps(b.into(), b.into()); + _mm_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(self) + } + } + #[inline(always)] + fn mul_add_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { + unsafe { _mm_fmadd_ps(a.into(), b.into(), c.into()).simd_into(self) } + } + #[inline(always)] + fn mul_sub_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { + unsafe { _mm_fmsub_ps(a.into(), b.into(), c.into()).simd_into(self) } + } + #[inline(always)] + fn floor_f32x4(self, a: f32x4) -> f32x4 { + unsafe { + _mm_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) + } + } + #[inline(always)] + fn ceil_f32x4(self, a: f32x4) -> f32x4 { + unsafe { + _mm_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) + } + } + #[inline(always)] + fn round_ties_even_f32x4(self, a: f32x4) -> f32x4 { + unsafe { + _mm_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(self) + } + } + 
#[inline(always)] + fn fract_f32x4(self, a: f32x4) -> f32x4 { + a - self.trunc_f32x4(a) + } + #[inline(always)] + fn trunc_f32x4(self, a: f32x4) -> f32x4 { + unsafe { + _mm_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) + } + } + #[inline(always)] + fn select_f32x4(self, a: mask32x4, b: f32x4, c: f32x4) -> f32x4 { + unsafe { _mm_blendv_ps(c.into(), b.into(), _mm_castsi128_ps(a.into())).simd_into(self) } + } + #[inline(always)] + fn combine_f32x4(self, a: f32x4, b: f32x4) -> f32x8 { + unsafe { _mm256_setr_m128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_f64_f32x4(self, a: f32x4) -> f64x2 { + unsafe { _mm_castps_pd(a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_i32_f32x4(self, a: f32x4) -> i32x4 { + unsafe { _mm_castps_si128(a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_u8_f32x4(self, a: f32x4) -> u8x16 { + unsafe { _mm_castps_si128(a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_u32_f32x4(self, a: f32x4) -> u32x4 { + unsafe { _mm_castps_si128(a.into()).simd_into(self) } + } + #[inline(always)] + fn cvt_u32_f32x4(self, a: f32x4) -> u32x4 { + unsafe { + let mut converted = _mm_cvttps_epi32(a.into()); + let in_range = _mm_cmplt_ps(a.into(), _mm_set1_ps(2147483648.0)); + let all_in_range = _mm_movemask_ps(in_range) == 0b1111; + if !all_in_range { + let excess = _mm_sub_ps(a.into(), _mm_set1_ps(2147483648.0)); + let excess_converted = _mm_cvttps_epi32(_mm_andnot_ps(in_range, excess)); + converted = _mm_add_epi32(converted, excess_converted); + } + converted.simd_into(self) + } + } + #[inline(always)] + fn cvt_u32_precise_f32x4(self, a: f32x4) -> u32x4 { + unsafe { + let a = _mm_max_ps(a.into(), _mm_setzero_ps()); + let mut converted = _mm_cvttps_epi32(a); + let in_range = _mm_cmplt_ps(a, _mm_set1_ps(2147483648.0)); + let all_in_range = _mm_movemask_ps(in_range) == 0b1111; + if !all_in_range { + let exceeds_unsigned_range = + _mm_castps_si128(_mm_cmplt_ps(_mm_set1_ps(4294967040.0), a)); + let excess = _mm_sub_ps(a, _mm_set1_ps(2147483648.0)); + let excess_converted = _mm_cvttps_epi32(_mm_andnot_ps(in_range, excess)); + converted = _mm_add_epi32(converted, excess_converted); + converted = _mm_blendv_epi8( + converted, + _mm_set1_epi32(u32::MAX.cast_signed()), + exceeds_unsigned_range, + ); + } + converted.simd_into(self) + } + } + #[inline(always)] + fn cvt_i32_f32x4(self, a: f32x4) -> i32x4 { + unsafe { _mm_cvttps_epi32(a.into()).simd_into(self) } + } + #[inline(always)] + fn cvt_i32_precise_f32x4(self, a: f32x4) -> i32x4 { + unsafe { + let a = a.into(); + let mut converted = _mm_cvttps_epi32(a); + let in_range = _mm_cmplt_ps(a, _mm_set1_ps(2147483648.0)); + let all_in_range = _mm_movemask_ps(in_range) == 0b1111; + if !all_in_range { + converted = _mm_blendv_epi8( + _mm_set1_epi32(i32::MAX), + converted, + _mm_castps_si128(in_range), + ); + let is_not_nan = _mm_castps_si128(_mm_cmpord_ps(a, a)); + converted = _mm_and_si128(converted, is_not_nan); + } + converted.simd_into(self) + } + } + #[inline(always)] + fn splat_i8x16(self, val: i8) -> i8x16 { + unsafe { _mm_set1_epi8(val).simd_into(self) } + } + #[inline(always)] + fn load_array_i8x16(self, val: [i8; 16usize]) -> i8x16 { + i8x16 { + val: unsafe { core::mem::transmute_copy(&val) }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_i8x16(self, val: &[i8; 16usize]) -> i8x16 { + i8x16 { + val: unsafe { core::mem::transmute_copy(val) }, + simd: self, + } + } + #[inline(always)] + fn 
as_array_i8x16(self, a: i8x16) -> [i8; 16usize] { + unsafe { core::mem::transmute::<__m128i, [i8; 16usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_i8x16(self, a: &i8x16) -> &[i8; 16usize] { + unsafe { core::mem::transmute::<&__m128i, &[i8; 16usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_i8x16(self, a: &mut i8x16) -> &mut [i8; 16usize] { + unsafe { core::mem::transmute::<&mut __m128i, &mut [i8; 16usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_i8x16(self, a: i8x16, dest: &mut [i8; 16usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const i8, + dest.as_mut_ptr(), + 16usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_i8x16(self, a: u8x16) -> i8x16 { + unsafe { + i8x16 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_i8x16(self, a: i8x16) -> u8x16 { + unsafe { + u8x16 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_i8x16(b).val.0, + self.cvt_to_bytes_i8x16(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_i8x16(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i8x16( + self, + a: i8x16, + b: i8x16, + ) -> i8x16 { + self.slide_i8x16::(a, b) + } + #[inline(always)] + fn add_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + unsafe { _mm_sub_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + unsafe { + let dst_even = _mm_mullo_epi16(a.into(), b.into()); + let dst_odd = + _mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into())); + _mm_or_si128( + _mm_slli_epi16(dst_odd, 8), + _mm_and_si128(dst_even, _mm_set1_epi16(0xFF)), + ) + .simd_into(self) + } + } + #[inline(always)] + fn and_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_i8x16(self, a: i8x16) -> i8x16 { + a ^ !0 + } + #[inline(always)] + fn shl_i8x16(self, a: i8x16, shift: u32) -> i8x16 { + unsafe { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = _mm_unpacklo_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val)); + let hi_16 = _mm_unpackhi_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val)); + let lo_shifted = _mm_sll_epi16(lo_16, shift_count); + let hi_shifted = _mm_sll_epi16(hi_16, shift_count); + _mm_packs_epi16(lo_shifted, hi_shifted).simd_into(self) + } + } + #[inline(always)] + fn shlv_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn shr_i8x16(self, a: i8x16, shift: u32) -> i8x16 { + unsafe { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = _mm_unpacklo_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val)); + let hi_16 = _mm_unpackhi_epi8(val, 
_mm_cmpgt_epi8(_mm_setzero_si128(), val)); + let lo_shifted = _mm_sra_epi16(lo_16, shift_count); + let hi_shifted = _mm_sra_epi16(hi_16, shift_count); + _mm_packs_epi16(lo_shifted, hi_shifted).simd_into(self) + } + } + #[inline(always)] + fn shrv_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn simd_eq_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { + unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn simd_lt_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { + unsafe { _mm_cmpgt_epi8(b.into(), a.into()).simd_into(self) } + } + #[inline(always)] + fn simd_le_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { + unsafe { _mm_cmpeq_epi8(_mm_min_epi8(a.into(), b.into()), a.into()).simd_into(self) } + } + #[inline(always)] + fn simd_ge_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { + unsafe { _mm_cmpeq_epi8(_mm_max_epi8(a.into(), b.into()), a.into()).simd_into(self) } + } + #[inline(always)] + fn simd_gt_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { + unsafe { _mm_cmpgt_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn zip_low_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + unsafe { _mm_unpacklo_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn zip_high_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + unsafe { _mm_unpackhi_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn unzip_low_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + unsafe { + let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); + let t1 = _mm_shuffle_epi8(a.into(), mask); + let t2 = _mm_shuffle_epi8(b.into(), mask); + _mm_unpacklo_epi64(t1, t2).simd_into(self) + } + } + #[inline(always)] + fn unzip_high_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + unsafe { + let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); + let t1 = _mm_shuffle_epi8(a.into(), mask); + let t2 = _mm_shuffle_epi8(b.into(), mask); + _mm_unpackhi_epi64(t1, t2).simd_into(self) + } + } + #[inline(always)] + fn select_i8x16(self, a: mask8x16, b: i8x16, c: i8x16) -> i8x16 { + unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + } + #[inline(always)] + fn min_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + unsafe { _mm_min_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + unsafe { _mm_max_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn combine_i8x16(self, a: i8x16, b: i8x16) -> i8x32 { + unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn neg_i8x16(self, a: i8x16) -> i8x16 { + unsafe { _mm_sub_epi8(_mm_setzero_si128(), a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_u8_i8x16(self, a: i8x16) -> u8x16 { + __m128i::from(a).simd_into(self) + } + #[inline(always)] + fn reinterpret_u32_i8x16(self, a: i8x16) -> u32x4 { + __m128i::from(a).simd_into(self) + } + #[inline(always)] + fn splat_u8x16(self, val: u8) -> u8x16 { + unsafe { _mm_set1_epi8(val.cast_signed()).simd_into(self) } + } + #[inline(always)] + fn load_array_u8x16(self, val: [u8; 16usize]) -> u8x16 { + u8x16 { + val: unsafe { core::mem::transmute_copy(&val) }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_u8x16(self, val: &[u8; 16usize]) -> u8x16 { + u8x16 { + val: unsafe { core::mem::transmute_copy(val) }, + simd: self, + } + } + #[inline(always)] + fn as_array_u8x16(self, a: u8x16) -> [u8; 
16usize] { + unsafe { core::mem::transmute::<__m128i, [u8; 16usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_u8x16(self, a: &u8x16) -> &[u8; 16usize] { + unsafe { core::mem::transmute::<&__m128i, &[u8; 16usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_u8x16(self, a: &mut u8x16) -> &mut [u8; 16usize] { + unsafe { core::mem::transmute::<&mut __m128i, &mut [u8; 16usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_u8x16(self, a: u8x16, dest: &mut [u8; 16usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const u8, + dest.as_mut_ptr(), + 16usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_u8x16(self, a: u8x16) -> u8x16 { + unsafe { + u8x16 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_u8x16(self, a: u8x16) -> u8x16 { + unsafe { + u8x16 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_u8x16(b).val.0, + self.cvt_to_bytes_u8x16(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_u8x16(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u8x16( + self, + a: u8x16, + b: u8x16, + ) -> u8x16 { + self.slide_u8x16::(a, b) + } + #[inline(always)] + fn add_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + unsafe { _mm_sub_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + unsafe { + let dst_even = _mm_mullo_epi16(a.into(), b.into()); + let dst_odd = + _mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into())); + _mm_or_si128( + _mm_slli_epi16(dst_odd, 8), + _mm_and_si128(dst_even, _mm_set1_epi16(0xFF)), + ) + .simd_into(self) + } + } + #[inline(always)] + fn and_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_u8x16(self, a: u8x16) -> u8x16 { + a ^ !0 + } + #[inline(always)] + fn shl_u8x16(self, a: u8x16, shift: u32) -> u8x16 { + unsafe { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128()); + let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128()); + let lo_shifted = _mm_sll_epi16(lo_16, shift_count); + let hi_shifted = _mm_sll_epi16(hi_16, shift_count); + _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self) + } + } + #[inline(always)] + fn shlv_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn shr_u8x16(self, a: u8x16, shift: u32) -> u8x16 { + unsafe { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128()); + let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128()); + let lo_shifted = _mm_srl_epi16(lo_16, shift_count); + let hi_shifted = _mm_srl_epi16(hi_16, 
shift_count); + _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self) + } + } + #[inline(always)] + fn shrv_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn simd_eq_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { + unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn simd_lt_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { + unsafe { + let sign_bit = _mm_set1_epi8(0x80u8.cast_signed()); + let a_signed = _mm_xor_si128(a.into(), sign_bit); + let b_signed = _mm_xor_si128(b.into(), sign_bit); + _mm_cmpgt_epi8(b_signed, a_signed).simd_into(self) + } + } + #[inline(always)] + fn simd_le_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { + unsafe { _mm_cmpeq_epi8(_mm_min_epu8(a.into(), b.into()), a.into()).simd_into(self) } + } + #[inline(always)] + fn simd_ge_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { + unsafe { _mm_cmpeq_epi8(_mm_max_epu8(a.into(), b.into()), a.into()).simd_into(self) } + } + #[inline(always)] + fn simd_gt_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { + unsafe { + let sign_bit = _mm_set1_epi8(0x80u8.cast_signed()); + let a_signed = _mm_xor_si128(a.into(), sign_bit); + let b_signed = _mm_xor_si128(b.into(), sign_bit); + _mm_cmpgt_epi8(a_signed, b_signed).simd_into(self) + } + } + #[inline(always)] + fn zip_low_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + unsafe { _mm_unpacklo_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn zip_high_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + unsafe { _mm_unpackhi_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn unzip_low_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + unsafe { + let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); + let t1 = _mm_shuffle_epi8(a.into(), mask); + let t2 = _mm_shuffle_epi8(b.into(), mask); + _mm_unpacklo_epi64(t1, t2).simd_into(self) + } + } + #[inline(always)] + fn unzip_high_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + unsafe { + let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); + let t1 = _mm_shuffle_epi8(a.into(), mask); + let t2 = _mm_shuffle_epi8(b.into(), mask); + _mm_unpackhi_epi64(t1, t2).simd_into(self) + } + } + #[inline(always)] + fn select_u8x16(self, a: mask8x16, b: u8x16, c: u8x16) -> u8x16 { + unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + } + #[inline(always)] + fn min_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + unsafe { _mm_min_epu8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + unsafe { _mm_max_epu8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn combine_u8x16(self, a: u8x16, b: u8x16) -> u8x32 { + unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn widen_u8x16(self, a: u8x16) -> u16x16 { + unsafe { _mm256_cvtepu8_epi16(a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_u32_u8x16(self, a: u8x16) -> u32x4 { + __m128i::from(a).simd_into(self) + } + #[inline(always)] + fn splat_mask8x16(self, val: i8) -> mask8x16 { + unsafe { _mm_set1_epi8(val).simd_into(self) } + } + #[inline(always)] + fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16 { + mask8x16 { + val: unsafe { core::mem::transmute_copy(&val) }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_mask8x16(self, val: &[i8; 16usize]) -> mask8x16 { + mask8x16 { + val: unsafe { core::mem::transmute_copy(val) }, + simd: 
self, + } + } + #[inline(always)] + fn as_array_mask8x16(self, a: mask8x16) -> [i8; 16usize] { + unsafe { core::mem::transmute::<__m128i, [i8; 16usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_mask8x16(self, a: &mask8x16) -> &[i8; 16usize] { + unsafe { core::mem::transmute::<&__m128i, &[i8; 16usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_mask8x16(self, a: &mut mask8x16) -> &mut [i8; 16usize] { + unsafe { core::mem::transmute::<&mut __m128i, &mut [i8; 16usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_mask8x16(self, a: mask8x16, dest: &mut [i8; 16usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const i8, + dest.as_mut_ptr(), + 16usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_mask8x16(self, a: u8x16) -> mask8x16 { + unsafe { + mask8x16 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_mask8x16(self, a: mask8x16) -> u8x16 { + unsafe { + u8x16 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_mask8x16( + self, + a: mask8x16, + b: mask8x16, + ) -> mask8x16 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_mask8x16(b).val.0, + self.cvt_to_bytes_mask8x16(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_mask8x16(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask8x16( + self, + a: mask8x16, + b: mask8x16, + ) -> mask8x16 { + self.slide_mask8x16::(a, b) + } + #[inline(always)] + fn and_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { + unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { + unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { + unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_mask8x16(self, a: mask8x16) -> mask8x16 { + a ^ !0 + } + #[inline(always)] + fn select_mask8x16( + self, + a: mask8x16, + b: mask8x16, + c: mask8x16, + ) -> mask8x16 { + unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + } + #[inline(always)] + fn simd_eq_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { + unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn any_true_mask8x16(self, a: mask8x16) -> bool { + unsafe { _mm_movemask_epi8(a.into()) as u32 != 0 } + } + #[inline(always)] + fn all_true_mask8x16(self, a: mask8x16) -> bool { + unsafe { _mm_movemask_epi8(a.into()) as u32 == 0xffff } + } + #[inline(always)] + fn any_false_mask8x16(self, a: mask8x16) -> bool { + unsafe { _mm_movemask_epi8(a.into()) as u32 != 0xffff } + } + #[inline(always)] + fn all_false_mask8x16(self, a: mask8x16) -> bool { + unsafe { _mm_movemask_epi8(a.into()) as u32 == 0 } + } + #[inline(always)] + fn combine_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x32 { + unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn splat_i16x8(self, val: i16) -> i16x8 { + unsafe { _mm_set1_epi16(val).simd_into(self) } + } + #[inline(always)] + fn load_array_i16x8(self, val: [i16; 8usize]) -> i16x8 { + i16x8 { + val: unsafe { core::mem::transmute_copy(&val) }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_i16x8(self, val: &[i16; 8usize]) -> i16x8 { + i16x8 { + val: unsafe { 
core::mem::transmute_copy(val) }, + simd: self, + } + } + #[inline(always)] + fn as_array_i16x8(self, a: i16x8) -> [i16; 8usize] { + unsafe { core::mem::transmute::<__m128i, [i16; 8usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_i16x8(self, a: &i16x8) -> &[i16; 8usize] { + unsafe { core::mem::transmute::<&__m128i, &[i16; 8usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_i16x8(self, a: &mut i16x8) -> &mut [i16; 8usize] { + unsafe { core::mem::transmute::<&mut __m128i, &mut [i16; 8usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_i16x8(self, a: i16x8, dest: &mut [i16; 8usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const i16, + dest.as_mut_ptr(), + 8usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_i16x8(self, a: u8x16) -> i16x8 { + unsafe { + i16x8 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_i16x8(self, a: i16x8) -> u8x16 { + unsafe { + u8x16 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_i16x8(b).val.0, + self.cvt_to_bytes_i16x8(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_i16x8(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i16x8( + self, + a: i16x8, + b: i16x8, + ) -> i16x8 { + self.slide_i16x8::(a, b) + } + #[inline(always)] + fn add_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + unsafe { _mm_sub_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + unsafe { _mm_mullo_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn and_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_i16x8(self, a: i16x8) -> i16x8 { + a ^ !0 + } + #[inline(always)] + fn shl_i16x8(self, a: i16x8, shift: u32) -> i16x8 { + unsafe { _mm_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + } + #[inline(always)] + fn shlv_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn shr_i16x8(self, a: i16x8, shift: u32) -> i16x8 { + unsafe { _mm_sra_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + } + #[inline(always)] + fn shrv_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn simd_eq_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { + unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn simd_lt_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { + unsafe { _mm_cmpgt_epi16(b.into(), a.into()).simd_into(self) } + } + #[inline(always)] + fn simd_le_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { + unsafe { _mm_cmpeq_epi16(_mm_min_epi16(a.into(), b.into()), 
a.into()).simd_into(self) } + } + #[inline(always)] + fn simd_ge_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { + unsafe { _mm_cmpeq_epi16(_mm_max_epi16(a.into(), b.into()), a.into()).simd_into(self) } + } + #[inline(always)] + fn simd_gt_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { + unsafe { _mm_cmpgt_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn zip_low_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + unsafe { _mm_unpacklo_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn zip_high_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + unsafe { _mm_unpackhi_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn unzip_low_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + unsafe { + let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); + let t1 = _mm_shuffle_epi8(a.into(), mask); + let t2 = _mm_shuffle_epi8(b.into(), mask); + _mm_unpacklo_epi64(t1, t2).simd_into(self) + } + } + #[inline(always)] + fn unzip_high_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + unsafe { + let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); + let t1 = _mm_shuffle_epi8(a.into(), mask); + let t2 = _mm_shuffle_epi8(b.into(), mask); + _mm_unpackhi_epi64(t1, t2).simd_into(self) + } + } + #[inline(always)] + fn select_i16x8(self, a: mask16x8, b: i16x8, c: i16x8) -> i16x8 { + unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + } + #[inline(always)] + fn min_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + unsafe { _mm_min_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + unsafe { _mm_max_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn combine_i16x8(self, a: i16x8, b: i16x8) -> i16x16 { + unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn neg_i16x8(self, a: i16x8) -> i16x8 { + unsafe { _mm_sub_epi16(_mm_setzero_si128(), a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_u8_i16x8(self, a: i16x8) -> u8x16 { + __m128i::from(a).simd_into(self) + } + #[inline(always)] + fn reinterpret_u32_i16x8(self, a: i16x8) -> u32x4 { + __m128i::from(a).simd_into(self) + } + #[inline(always)] + fn splat_u16x8(self, val: u16) -> u16x8 { + unsafe { _mm_set1_epi16(val.cast_signed()).simd_into(self) } + } + #[inline(always)] + fn load_array_u16x8(self, val: [u16; 8usize]) -> u16x8 { + u16x8 { + val: unsafe { core::mem::transmute_copy(&val) }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_u16x8(self, val: &[u16; 8usize]) -> u16x8 { + u16x8 { + val: unsafe { core::mem::transmute_copy(val) }, + simd: self, + } + } + #[inline(always)] + fn as_array_u16x8(self, a: u16x8) -> [u16; 8usize] { + unsafe { core::mem::transmute::<__m128i, [u16; 8usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_u16x8(self, a: &u16x8) -> &[u16; 8usize] { + unsafe { core::mem::transmute::<&__m128i, &[u16; 8usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_u16x8(self, a: &mut u16x8) -> &mut [u16; 8usize] { + unsafe { core::mem::transmute::<&mut __m128i, &mut [u16; 8usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_u16x8(self, a: u16x8, dest: &mut [u16; 8usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const u16, + dest.as_mut_ptr(), + 8usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_u16x8(self, a: u8x16) -> u16x8 { + unsafe { + u16x8 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } 
+ #[inline(always)] + fn cvt_to_bytes_u16x8(self, a: u16x8) -> u8x16 { + unsafe { + u8x16 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_u16x8(b).val.0, + self.cvt_to_bytes_u16x8(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_u16x8(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u16x8( + self, + a: u16x8, + b: u16x8, + ) -> u16x8 { + self.slide_u16x8::(a, b) + } + #[inline(always)] + fn add_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + unsafe { _mm_sub_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + unsafe { _mm_mullo_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn and_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_u16x8(self, a: u16x8) -> u16x8 { + a ^ !0 + } + #[inline(always)] + fn shl_u16x8(self, a: u16x8, shift: u32) -> u16x8 { + unsafe { _mm_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + } + #[inline(always)] + fn shlv_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn shr_u16x8(self, a: u16x8, shift: u32) -> u16x8 { + unsafe { _mm_srl_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + } + #[inline(always)] + fn shrv_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn simd_eq_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { + unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn simd_lt_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { + unsafe { + let sign_bit = _mm_set1_epi16(0x8000u16.cast_signed()); + let a_signed = _mm_xor_si128(a.into(), sign_bit); + let b_signed = _mm_xor_si128(b.into(), sign_bit); + _mm_cmpgt_epi16(b_signed, a_signed).simd_into(self) + } + } + #[inline(always)] + fn simd_le_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { + unsafe { _mm_cmpeq_epi16(_mm_min_epu16(a.into(), b.into()), a.into()).simd_into(self) } + } + #[inline(always)] + fn simd_ge_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { + unsafe { _mm_cmpeq_epi16(_mm_max_epu16(a.into(), b.into()), a.into()).simd_into(self) } + } + #[inline(always)] + fn simd_gt_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { + unsafe { + let sign_bit = _mm_set1_epi16(0x8000u16.cast_signed()); + let a_signed = _mm_xor_si128(a.into(), sign_bit); + let b_signed = _mm_xor_si128(b.into(), sign_bit); + _mm_cmpgt_epi16(a_signed, b_signed).simd_into(self) + } + } + #[inline(always)] + fn zip_low_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + unsafe { _mm_unpacklo_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn zip_high_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + unsafe { 
_mm_unpackhi_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn unzip_low_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + unsafe { + let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); + let t1 = _mm_shuffle_epi8(a.into(), mask); + let t2 = _mm_shuffle_epi8(b.into(), mask); + _mm_unpacklo_epi64(t1, t2).simd_into(self) + } + } + #[inline(always)] + fn unzip_high_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + unsafe { + let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); + let t1 = _mm_shuffle_epi8(a.into(), mask); + let t2 = _mm_shuffle_epi8(b.into(), mask); + _mm_unpackhi_epi64(t1, t2).simd_into(self) + } + } + #[inline(always)] + fn select_u16x8(self, a: mask16x8, b: u16x8, c: u16x8) -> u16x8 { + unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + } + #[inline(always)] + fn min_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + unsafe { _mm_min_epu16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + unsafe { _mm_max_epu16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn combine_u16x8(self, a: u16x8, b: u16x8) -> u16x16 { + unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_u8_u16x8(self, a: u16x8) -> u8x16 { + __m128i::from(a).simd_into(self) + } + #[inline(always)] + fn reinterpret_u32_u16x8(self, a: u16x8) -> u32x4 { + __m128i::from(a).simd_into(self) + } + #[inline(always)] + fn splat_mask16x8(self, val: i16) -> mask16x8 { + unsafe { _mm_set1_epi16(val).simd_into(self) } + } + #[inline(always)] + fn load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8 { + mask16x8 { + val: unsafe { core::mem::transmute_copy(&val) }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_mask16x8(self, val: &[i16; 8usize]) -> mask16x8 { + mask16x8 { + val: unsafe { core::mem::transmute_copy(val) }, + simd: self, + } + } + #[inline(always)] + fn as_array_mask16x8(self, a: mask16x8) -> [i16; 8usize] { + unsafe { core::mem::transmute::<__m128i, [i16; 8usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_mask16x8(self, a: &mask16x8) -> &[i16; 8usize] { + unsafe { core::mem::transmute::<&__m128i, &[i16; 8usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_mask16x8(self, a: &mut mask16x8) -> &mut [i16; 8usize] { + unsafe { core::mem::transmute::<&mut __m128i, &mut [i16; 8usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_mask16x8(self, a: mask16x8, dest: &mut [i16; 8usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const i16, + dest.as_mut_ptr(), + 8usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_mask16x8(self, a: u8x16) -> mask16x8 { + unsafe { + mask16x8 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_mask16x8(self, a: mask16x8) -> u8x16 { + unsafe { + u8x16 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_mask16x8( + self, + a: mask16x8, + b: mask16x8, + ) -> mask16x8 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_mask16x8(b).val.0, + self.cvt_to_bytes_mask16x8(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_mask16x8(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask16x8( + self, + a: mask16x8, + b: mask16x8, + ) -> mask16x8 { + self.slide_mask16x8::(a, 
b) + } + #[inline(always)] + fn and_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { + unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { + unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { + unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_mask16x8(self, a: mask16x8) -> mask16x8 { + a ^ !0 + } + #[inline(always)] + fn select_mask16x8( + self, + a: mask16x8, + b: mask16x8, + c: mask16x8, + ) -> mask16x8 { + unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + } + #[inline(always)] + fn simd_eq_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { + unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn any_true_mask16x8(self, a: mask16x8) -> bool { + unsafe { _mm_movemask_epi8(a.into()) as u32 != 0 } + } + #[inline(always)] + fn all_true_mask16x8(self, a: mask16x8) -> bool { + unsafe { _mm_movemask_epi8(a.into()) as u32 == 0xffff } + } + #[inline(always)] + fn any_false_mask16x8(self, a: mask16x8) -> bool { + unsafe { _mm_movemask_epi8(a.into()) as u32 != 0xffff } + } + #[inline(always)] + fn all_false_mask16x8(self, a: mask16x8) -> bool { + unsafe { _mm_movemask_epi8(a.into()) as u32 == 0 } + } + #[inline(always)] + fn combine_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x16 { + unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn splat_i32x4(self, val: i32) -> i32x4 { + unsafe { _mm_set1_epi32(val).simd_into(self) } + } + #[inline(always)] + fn load_array_i32x4(self, val: [i32; 4usize]) -> i32x4 { + i32x4 { + val: unsafe { core::mem::transmute_copy(&val) }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_i32x4(self, val: &[i32; 4usize]) -> i32x4 { + i32x4 { + val: unsafe { core::mem::transmute_copy(val) }, + simd: self, + } + } + #[inline(always)] + fn as_array_i32x4(self, a: i32x4) -> [i32; 4usize] { + unsafe { core::mem::transmute::<__m128i, [i32; 4usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_i32x4(self, a: &i32x4) -> &[i32; 4usize] { + unsafe { core::mem::transmute::<&__m128i, &[i32; 4usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_i32x4(self, a: &mut i32x4) -> &mut [i32; 4usize] { + unsafe { core::mem::transmute::<&mut __m128i, &mut [i32; 4usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_i32x4(self, a: i32x4, dest: &mut [i32; 4usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const i32, + dest.as_mut_ptr(), + 4usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_i32x4(self, a: u8x16) -> i32x4 { + unsafe { + i32x4 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_i32x4(self, a: i32x4) -> u8x16 { + unsafe { + u8x16 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + unsafe { + if SHIFT >= 4usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_i32x4(b).val.0, + self.cvt_to_bytes_i32x4(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_i32x4(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i32x4( + self, + a: i32x4, + b: i32x4, + ) -> i32x4 { + self.slide_i32x4::(a, b) + } + #[inline(always)] + fn 
add_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + unsafe { _mm_sub_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + unsafe { _mm_mullo_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn and_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_i32x4(self, a: i32x4) -> i32x4 { + a ^ !0 + } + #[inline(always)] + fn shl_i32x4(self, a: i32x4, shift: u32) -> i32x4 { + unsafe { _mm_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + } + #[inline(always)] + fn shlv_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + unsafe { _mm_sllv_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn shr_i32x4(self, a: i32x4, shift: u32) -> i32x4 { + unsafe { _mm_sra_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + } + #[inline(always)] + fn shrv_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + unsafe { _mm_srav_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn simd_eq_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { + unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn simd_lt_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { + unsafe { _mm_cmpgt_epi32(b.into(), a.into()).simd_into(self) } + } + #[inline(always)] + fn simd_le_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { + unsafe { _mm_cmpeq_epi32(_mm_min_epi32(a.into(), b.into()), a.into()).simd_into(self) } + } + #[inline(always)] + fn simd_ge_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { + unsafe { _mm_cmpeq_epi32(_mm_max_epi32(a.into(), b.into()), a.into()).simd_into(self) } + } + #[inline(always)] + fn simd_gt_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { + unsafe { _mm_cmpgt_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn zip_low_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + unsafe { _mm_unpacklo_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn zip_high_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + unsafe { _mm_unpackhi_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn unzip_low_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + unsafe { + let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); + let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); + _mm_unpacklo_epi64(t1, t2).simd_into(self) + } + } + #[inline(always)] + fn unzip_high_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + unsafe { + let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); + let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); + _mm_unpackhi_epi64(t1, t2).simd_into(self) + } + } + #[inline(always)] + fn select_i32x4(self, a: mask32x4, b: i32x4, c: i32x4) -> i32x4 { + unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + } + #[inline(always)] + fn min_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + unsafe { _mm_min_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + unsafe { _mm_max_epi32(a.into(), b.into()).simd_into(self) } + } + 
#[inline(always)] + fn combine_i32x4(self, a: i32x4, b: i32x4) -> i32x8 { + unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn neg_i32x4(self, a: i32x4) -> i32x4 { + unsafe { _mm_sub_epi32(_mm_setzero_si128(), a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_u8_i32x4(self, a: i32x4) -> u8x16 { + __m128i::from(a).simd_into(self) + } + #[inline(always)] + fn reinterpret_u32_i32x4(self, a: i32x4) -> u32x4 { + __m128i::from(a).simd_into(self) + } + #[inline(always)] + fn cvt_f32_i32x4(self, a: i32x4) -> f32x4 { + unsafe { _mm_cvtepi32_ps(a.into()).simd_into(self) } + } + #[inline(always)] + fn splat_u32x4(self, val: u32) -> u32x4 { + unsafe { _mm_set1_epi32(val.cast_signed()).simd_into(self) } + } + #[inline(always)] + fn load_array_u32x4(self, val: [u32; 4usize]) -> u32x4 { + u32x4 { + val: unsafe { core::mem::transmute_copy(&val) }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_u32x4(self, val: &[u32; 4usize]) -> u32x4 { + u32x4 { + val: unsafe { core::mem::transmute_copy(val) }, + simd: self, + } + } + #[inline(always)] + fn as_array_u32x4(self, a: u32x4) -> [u32; 4usize] { + unsafe { core::mem::transmute::<__m128i, [u32; 4usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_u32x4(self, a: &u32x4) -> &[u32; 4usize] { + unsafe { core::mem::transmute::<&__m128i, &[u32; 4usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_u32x4(self, a: &mut u32x4) -> &mut [u32; 4usize] { + unsafe { core::mem::transmute::<&mut __m128i, &mut [u32; 4usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_u32x4(self, a: u32x4, dest: &mut [u32; 4usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const u32, + dest.as_mut_ptr(), + 4usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_u32x4(self, a: u8x16) -> u32x4 { + unsafe { + u32x4 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_u32x4(self, a: u32x4) -> u8x16 { + unsafe { + u8x16 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + unsafe { + if SHIFT >= 4usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_u32x4(b).val.0, + self.cvt_to_bytes_u32x4(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_u32x4(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u32x4( + self, + a: u32x4, + b: u32x4, + ) -> u32x4 { + self.slide_u32x4::(a, b) + } + #[inline(always)] + fn add_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + unsafe { _mm_sub_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + unsafe { _mm_mullo_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn and_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_u32x4(self, a: u32x4) -> u32x4 { + a ^ !0 + } + #[inline(always)] + fn shl_u32x4(self, a: u32x4, 
shift: u32) -> u32x4 { + unsafe { _mm_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + } + #[inline(always)] + fn shlv_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + unsafe { _mm_sllv_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn shr_u32x4(self, a: u32x4, shift: u32) -> u32x4 { + unsafe { _mm_srl_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + } + #[inline(always)] + fn shrv_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + unsafe { _mm_srlv_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn simd_eq_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { + unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn simd_lt_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { + unsafe { + let sign_bit = _mm_set1_epi32(0x80000000u32.cast_signed()); + let a_signed = _mm_xor_si128(a.into(), sign_bit); + let b_signed = _mm_xor_si128(b.into(), sign_bit); + _mm_cmpgt_epi32(b_signed, a_signed).simd_into(self) + } + } + #[inline(always)] + fn simd_le_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { + unsafe { _mm_cmpeq_epi32(_mm_min_epu32(a.into(), b.into()), a.into()).simd_into(self) } + } + #[inline(always)] + fn simd_ge_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { + unsafe { _mm_cmpeq_epi32(_mm_max_epu32(a.into(), b.into()), a.into()).simd_into(self) } + } + #[inline(always)] + fn simd_gt_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { + unsafe { + let sign_bit = _mm_set1_epi32(0x80000000u32.cast_signed()); + let a_signed = _mm_xor_si128(a.into(), sign_bit); + let b_signed = _mm_xor_si128(b.into(), sign_bit); + _mm_cmpgt_epi32(a_signed, b_signed).simd_into(self) + } + } + #[inline(always)] + fn zip_low_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + unsafe { _mm_unpacklo_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn zip_high_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + unsafe { _mm_unpackhi_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn unzip_low_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + unsafe { + let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); + let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); + _mm_unpacklo_epi64(t1, t2).simd_into(self) + } + } + #[inline(always)] + fn unzip_high_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + unsafe { + let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); + let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); + _mm_unpackhi_epi64(t1, t2).simd_into(self) + } + } + #[inline(always)] + fn select_u32x4(self, a: mask32x4, b: u32x4, c: u32x4) -> u32x4 { + unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + } + #[inline(always)] + fn min_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + unsafe { _mm_min_epu32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + unsafe { _mm_max_epu32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn combine_u32x4(self, a: u32x4, b: u32x4) -> u32x8 { + unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_u8_u32x4(self, a: u32x4) -> u8x16 { + __m128i::from(a).simd_into(self) + } + #[inline(always)] + fn cvt_f32_u32x4(self, a: u32x4) -> f32x4 { + unsafe { + let a = a.into(); + let lo = _mm_blend_epi16::<0xAA>(a, _mm_set1_epi32(0x4B000000)); + let hi = _mm_blend_epi16::<0xAA>(_mm_srli_epi32::<16>(a), _mm_set1_epi32(0x53000000)); + let fhi = _mm_sub_ps( + _mm_castsi128_ps(hi), + 
_mm_set1_ps(f32::from_bits(0x53000080)), + ); + let result = _mm_add_ps(_mm_castsi128_ps(lo), fhi); + result.simd_into(self) + } + } + #[inline(always)] + fn splat_mask32x4(self, val: i32) -> mask32x4 { + unsafe { _mm_set1_epi32(val).simd_into(self) } + } + #[inline(always)] + fn load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4 { + mask32x4 { + val: unsafe { core::mem::transmute_copy(&val) }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_mask32x4(self, val: &[i32; 4usize]) -> mask32x4 { + mask32x4 { + val: unsafe { core::mem::transmute_copy(val) }, + simd: self, + } + } + #[inline(always)] + fn as_array_mask32x4(self, a: mask32x4) -> [i32; 4usize] { + unsafe { core::mem::transmute::<__m128i, [i32; 4usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_mask32x4(self, a: &mask32x4) -> &[i32; 4usize] { + unsafe { core::mem::transmute::<&__m128i, &[i32; 4usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_mask32x4(self, a: &mut mask32x4) -> &mut [i32; 4usize] { + unsafe { core::mem::transmute::<&mut __m128i, &mut [i32; 4usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_mask32x4(self, a: mask32x4, dest: &mut [i32; 4usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const i32, + dest.as_mut_ptr(), + 4usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_mask32x4(self, a: u8x16) -> mask32x4 { + unsafe { + mask32x4 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_mask32x4(self, a: mask32x4) -> u8x16 { + unsafe { + u8x16 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_mask32x4( + self, + a: mask32x4, + b: mask32x4, + ) -> mask32x4 { + unsafe { + if SHIFT >= 4usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_mask32x4(b).val.0, + self.cvt_to_bytes_mask32x4(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_mask32x4(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask32x4( + self, + a: mask32x4, + b: mask32x4, + ) -> mask32x4 { + self.slide_mask32x4::(a, b) + } + #[inline(always)] + fn and_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { + unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { + unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { + unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_mask32x4(self, a: mask32x4) -> mask32x4 { + a ^ !0 + } + #[inline(always)] + fn select_mask32x4( + self, + a: mask32x4, + b: mask32x4, + c: mask32x4, + ) -> mask32x4 { + unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + } + #[inline(always)] + fn simd_eq_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { + unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn any_true_mask32x4(self, a: mask32x4) -> bool { + unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 != 0 } + } + #[inline(always)] + fn all_true_mask32x4(self, a: mask32x4) -> bool { + unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 == 0b1111 } + } + #[inline(always)] + fn any_false_mask32x4(self, a: mask32x4) -> bool { + unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 != 0b1111 } + } + #[inline(always)] + fn 
all_false_mask32x4(self, a: mask32x4) -> bool { + unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 == 0 } + } + #[inline(always)] + fn combine_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x8 { + unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn splat_f64x2(self, val: f64) -> f64x2 { + unsafe { _mm_set1_pd(val).simd_into(self) } + } + #[inline(always)] + fn load_array_f64x2(self, val: [f64; 2usize]) -> f64x2 { + f64x2 { + val: unsafe { core::mem::transmute_copy(&val) }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_f64x2(self, val: &[f64; 2usize]) -> f64x2 { + f64x2 { + val: unsafe { core::mem::transmute_copy(val) }, + simd: self, + } + } + #[inline(always)] + fn as_array_f64x2(self, a: f64x2) -> [f64; 2usize] { + unsafe { core::mem::transmute::<__m128d, [f64; 2usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_f64x2(self, a: &f64x2) -> &[f64; 2usize] { + unsafe { core::mem::transmute::<&__m128d, &[f64; 2usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_f64x2(self, a: &mut f64x2) -> &mut [f64; 2usize] { + unsafe { core::mem::transmute::<&mut __m128d, &mut [f64; 2usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_f64x2(self, a: f64x2, dest: &mut [f64; 2usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const f64, + dest.as_mut_ptr(), + 2usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_f64x2(self, a: u8x16) -> f64x2 { + unsafe { + f64x2 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_f64x2(self, a: f64x2) -> u8x16 { + unsafe { + u8x16 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + unsafe { + if SHIFT >= 2usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_f64x2(b).val.0, + self.cvt_to_bytes_f64x2(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_f64x2(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_f64x2( + self, + a: f64x2, + b: f64x2, + ) -> f64x2 { + self.slide_f64x2::(a, b) + } + #[inline(always)] + fn abs_f64x2(self, a: f64x2) -> f64x2 { + unsafe { _mm_andnot_pd(_mm_set1_pd(-0.0), a.into()).simd_into(self) } + } + #[inline(always)] + fn neg_f64x2(self, a: f64x2) -> f64x2 { + unsafe { _mm_xor_pd(a.into(), _mm_set1_pd(-0.0)).simd_into(self) } + } + #[inline(always)] + fn sqrt_f64x2(self, a: f64x2) -> f64x2 { + unsafe { _mm_sqrt_pd(a.into()).simd_into(self) } + } + #[inline(always)] + fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + unsafe { _mm_add_pd(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + unsafe { _mm_sub_pd(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + unsafe { _mm_mul_pd(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn div_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + unsafe { _mm_div_pd(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn copysign_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + unsafe { + let mask = _mm_set1_pd(-0.0); + _mm_or_pd(_mm_and_pd(mask, b.into()), _mm_andnot_pd(mask, a.into())).simd_into(self) + } + } + #[inline(always)] + fn simd_eq_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { + unsafe { _mm_castpd_si128(_mm_cmpeq_pd(a.into(), b.into())).simd_into(self) } + } + #[inline(always)] + 
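// Editor's note (sketch, not part of the generated diff): `abs_f64x2`, `neg_f64x2` and
// `copysign_f64x2` above only touch the IEEE-754 sign bit, with `-0.0`
// (bit pattern 0x8000_0000_0000_0000) as the mask:
//   abs(a)         = a & !SIGN                  (`_mm_andnot_pd`)
//   -a             = a ^ SIGN                   (`_mm_xor_pd`)
//   copysign(a, b) = (b & SIGN) | (a & !SIGN)   (`_mm_and_pd` / `_mm_andnot_pd` / `_mm_or_pd`)
// Scalar equivalent of the copysign masking, for reference:
fn copysign_via_sign_bit(a: f64, b: f64) -> f64 {
    const SIGN: u64 = 0x8000_0000_0000_0000;
    f64::from_bits((b.to_bits() & SIGN) | (a.to_bits() & !SIGN))
}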
fn simd_lt_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { + unsafe { _mm_castpd_si128(_mm_cmplt_pd(a.into(), b.into())).simd_into(self) } + } + #[inline(always)] + fn simd_le_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { + unsafe { _mm_castpd_si128(_mm_cmple_pd(a.into(), b.into())).simd_into(self) } + } + #[inline(always)] + fn simd_ge_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { + unsafe { _mm_castpd_si128(_mm_cmpge_pd(a.into(), b.into())).simd_into(self) } + } + #[inline(always)] + fn simd_gt_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { + unsafe { _mm_castpd_si128(_mm_cmpgt_pd(a.into(), b.into())).simd_into(self) } + } + #[inline(always)] + fn zip_low_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + unsafe { _mm_unpacklo_pd(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn zip_high_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + unsafe { _mm_unpackhi_pd(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn unzip_low_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + unsafe { _mm_shuffle_pd::<0b00>(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn unzip_high_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + unsafe { _mm_shuffle_pd::<0b11>(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + unsafe { _mm_max_pd(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn min_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + unsafe { _mm_min_pd(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_precise_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + unsafe { + let intermediate = _mm_max_pd(a.into(), b.into()); + let b_is_nan = _mm_cmpunord_pd(b.into(), b.into()); + _mm_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(self) + } + } + #[inline(always)] + fn min_precise_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + unsafe { + let intermediate = _mm_min_pd(a.into(), b.into()); + let b_is_nan = _mm_cmpunord_pd(b.into(), b.into()); + _mm_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(self) + } + } + #[inline(always)] + fn mul_add_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { + unsafe { _mm_fmadd_pd(a.into(), b.into(), c.into()).simd_into(self) } + } + #[inline(always)] + fn mul_sub_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { + unsafe { _mm_fmsub_pd(a.into(), b.into(), c.into()).simd_into(self) } + } + #[inline(always)] + fn floor_f64x2(self, a: f64x2) -> f64x2 { + unsafe { + _mm_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) + } + } + #[inline(always)] + fn ceil_f64x2(self, a: f64x2) -> f64x2 { + unsafe { + _mm_round_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) + } + } + #[inline(always)] + fn round_ties_even_f64x2(self, a: f64x2) -> f64x2 { + unsafe { + _mm_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(self) + } + } + #[inline(always)] + fn fract_f64x2(self, a: f64x2) -> f64x2 { + a - self.trunc_f64x2(a) + } + #[inline(always)] + fn trunc_f64x2(self, a: f64x2) -> f64x2 { + unsafe { + _mm_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) + } + } + #[inline(always)] + fn select_f64x2(self, a: mask64x2, b: f64x2, c: f64x2) -> f64x2 { + unsafe { _mm_blendv_pd(c.into(), b.into(), _mm_castsi128_pd(a.into())).simd_into(self) } + } + #[inline(always)] + fn combine_f64x2(self, a: f64x2, b: f64x2) -> f64x4 { + unsafe { _mm256_setr_m128d(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_f32_f64x2(self, a: 
f64x2) -> f32x4 { + unsafe { _mm_castpd_ps(a.into()).simd_into(self) } + } + #[inline(always)] + fn splat_mask64x2(self, val: i64) -> mask64x2 { + unsafe { _mm_set1_epi64x(val).simd_into(self) } + } + #[inline(always)] + fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2 { + mask64x2 { + val: unsafe { core::mem::transmute_copy(&val) }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_mask64x2(self, val: &[i64; 2usize]) -> mask64x2 { + mask64x2 { + val: unsafe { core::mem::transmute_copy(val) }, + simd: self, + } + } + #[inline(always)] + fn as_array_mask64x2(self, a: mask64x2) -> [i64; 2usize] { + unsafe { core::mem::transmute::<__m128i, [i64; 2usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_mask64x2(self, a: &mask64x2) -> &[i64; 2usize] { + unsafe { core::mem::transmute::<&__m128i, &[i64; 2usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_mask64x2(self, a: &mut mask64x2) -> &mut [i64; 2usize] { + unsafe { core::mem::transmute::<&mut __m128i, &mut [i64; 2usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_mask64x2(self, a: mask64x2, dest: &mut [i64; 2usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const i64, + dest.as_mut_ptr(), + 2usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_mask64x2(self, a: u8x16) -> mask64x2 { + unsafe { + mask64x2 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_mask64x2(self, a: mask64x2) -> u8x16 { + unsafe { + u8x16 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_mask64x2( + self, + a: mask64x2, + b: mask64x2, + ) -> mask64x2 { + unsafe { + if SHIFT >= 2usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_mask64x2(b).val.0, + self.cvt_to_bytes_mask64x2(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_mask64x2(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask64x2( + self, + a: mask64x2, + b: mask64x2, + ) -> mask64x2 { + self.slide_mask64x2::(a, b) + } + #[inline(always)] + fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_mask64x2(self, a: mask64x2) -> mask64x2 { + a ^ !0 + } + #[inline(always)] + fn select_mask64x2( + self, + a: mask64x2, + b: mask64x2, + c: mask64x2, + ) -> mask64x2 { + unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + } + #[inline(always)] + fn simd_eq_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + unsafe { _mm_cmpeq_epi64(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn any_true_mask64x2(self, a: mask64x2) -> bool { + unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0 } + } + #[inline(always)] + fn all_true_mask64x2(self, a: mask64x2) -> bool { + unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0b11 } + } + #[inline(always)] + fn any_false_mask64x2(self, a: mask64x2) -> bool { + unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0b11 } + } + #[inline(always)] + fn all_false_mask64x2(self, a: mask64x2) -> bool { + unsafe { 
_mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0 } + } + #[inline(always)] + fn combine_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x4 { + unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn splat_f32x8(self, val: f32) -> f32x8 { + unsafe { _mm256_set1_ps(val).simd_into(self) } + } + #[inline(always)] + fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8 { + f32x8 { + val: unsafe { core::mem::transmute_copy(&val) }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8 { + f32x8 { + val: unsafe { core::mem::transmute_copy(val) }, + simd: self, + } + } + #[inline(always)] + fn as_array_f32x8(self, a: f32x8) -> [f32; 8usize] { + unsafe { core::mem::transmute::<__m256, [f32; 8usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_f32x8(self, a: &f32x8) -> &[f32; 8usize] { + unsafe { core::mem::transmute::<&__m256, &[f32; 8usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_f32x8(self, a: &mut f32x8) -> &mut [f32; 8usize] { + unsafe { core::mem::transmute::<&mut __m256, &mut [f32; 8usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_f32x8(self, a: f32x8, dest: &mut [f32; 8usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const f32, + dest.as_mut_ptr(), + 8usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_f32x8(self, a: u8x32) -> f32x8 { + unsafe { + f32x8 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_f32x8(self, a: f32x8) -> u8x32 { + unsafe { + u8x32 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = cross_block_alignr_256x1( + self.cvt_to_bytes_f32x8(b).val.0, + self.cvt_to_bytes_f32x8(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_f32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_f32x8( + self, + a: f32x8, + b: f32x8, + ) -> f32x8 { + unsafe { + if SHIFT >= 4usize { + return b; + } + let result = dyn_alignr_256( + self.cvt_to_bytes_f32x8(b).val.0, + self.cvt_to_bytes_f32x8(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_f32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn abs_f32x8(self, a: f32x8) -> f32x8 { + unsafe { _mm256_andnot_ps(_mm256_set1_ps(-0.0), a.into()).simd_into(self) } + } + #[inline(always)] + fn neg_f32x8(self, a: f32x8) -> f32x8 { + unsafe { _mm256_xor_ps(a.into(), _mm256_set1_ps(-0.0)).simd_into(self) } + } + #[inline(always)] + fn sqrt_f32x8(self, a: f32x8) -> f32x8 { + unsafe { _mm256_sqrt_ps(a.into()).simd_into(self) } + } + #[inline(always)] + fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + unsafe { _mm256_add_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + unsafe { _mm256_sub_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + unsafe { _mm256_mul_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn div_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + unsafe { _mm256_div_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn copysign_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + unsafe { + let mask = _mm256_set1_ps(-0.0); + _mm256_or_ps( + _mm256_and_ps(mask, 
b.into()), + _mm256_andnot_ps(mask, a.into()), + ) + .simd_into(self) + } + } + #[inline(always)] + fn simd_eq_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + unsafe { _mm256_castps_si256(_mm256_cmp_ps::<0i32>(a.into(), b.into())).simd_into(self) } + } + #[inline(always)] + fn simd_lt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + unsafe { _mm256_castps_si256(_mm256_cmp_ps::<17i32>(a.into(), b.into())).simd_into(self) } + } + #[inline(always)] + fn simd_le_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + unsafe { _mm256_castps_si256(_mm256_cmp_ps::<18i32>(a.into(), b.into())).simd_into(self) } + } + #[inline(always)] + fn simd_ge_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + unsafe { _mm256_castps_si256(_mm256_cmp_ps::<29i32>(a.into(), b.into())).simd_into(self) } + } + #[inline(always)] + fn simd_gt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + unsafe { _mm256_castps_si256(_mm256_cmp_ps::<30i32>(a.into(), b.into())).simd_into(self) } + } + #[inline(always)] + fn zip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + unsafe { + let lo = _mm256_unpacklo_ps(a.into(), b.into()); + let hi = _mm256_unpackhi_ps(a.into(), b.into()); + _mm256_permute2f128_ps::<0b0010_0000>(lo, hi).simd_into(self) + } + } + #[inline(always)] + fn zip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + unsafe { + let lo = _mm256_unpacklo_ps(a.into(), b.into()); + let hi = _mm256_unpackhi_ps(a.into(), b.into()); + _mm256_permute2f128_ps::<0b0011_0001>(lo, hi).simd_into(self) + } + } + #[inline(always)] + fn unzip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + unsafe { + let t1 = _mm256_permutevar8x32_ps(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); + let t2 = _mm256_permutevar8x32_ps(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); + _mm256_permute2f128_ps::<0b0010_0000>(t1, t2).simd_into(self) + } + } + #[inline(always)] + fn unzip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + unsafe { + let t1 = _mm256_permutevar8x32_ps(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); + let t2 = _mm256_permutevar8x32_ps(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); + _mm256_permute2f128_ps::<0b0011_0001>(t1, t2).simd_into(self) + } + } + #[inline(always)] + fn max_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + unsafe { _mm256_max_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn min_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + unsafe { _mm256_min_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + unsafe { + let intermediate = _mm256_max_ps(a.into(), b.into()); + let b_is_nan = _mm256_cmp_ps::<3i32>(b.into(), b.into()); + _mm256_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(self) + } + } + #[inline(always)] + fn min_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + unsafe { + let intermediate = _mm256_min_ps(a.into(), b.into()); + let b_is_nan = _mm256_cmp_ps::<3i32>(b.into(), b.into()); + _mm256_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(self) + } + } + #[inline(always)] + fn mul_add_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { + unsafe { _mm256_fmadd_ps(a.into(), b.into(), c.into()).simd_into(self) } + } + #[inline(always)] + fn mul_sub_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { + unsafe { _mm256_fmsub_ps(a.into(), b.into(), c.into()).simd_into(self) } + } + #[inline(always)] + fn floor_f32x8(self, a: f32x8) -> f32x8 { + unsafe { + _mm256_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(self) + } + } + #[inline(always)] + fn 
ceil_f32x8(self, a: f32x8) -> f32x8 { + unsafe { + _mm256_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(self) + } + } + #[inline(always)] + fn round_ties_even_f32x8(self, a: f32x8) -> f32x8 { + unsafe { + _mm256_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(self) + } + } + #[inline(always)] + fn fract_f32x8(self, a: f32x8) -> f32x8 { + a - self.trunc_f32x8(a) + } + #[inline(always)] + fn trunc_f32x8(self, a: f32x8) -> f32x8 { + unsafe { + _mm256_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) + } + } + #[inline(always)] + fn select_f32x8(self, a: mask32x8, b: f32x8, c: f32x8) -> f32x8 { + unsafe { + _mm256_blendv_ps(c.into(), b.into(), _mm256_castsi256_ps(a.into())).simd_into(self) + } + } + #[inline(always)] + fn combine_f32x8(self, a: f32x8, b: f32x8) -> f32x16 { + unsafe { + let lo = _mm512_castps256_ps512(a.into()); + _mm512_insertf32x8::<1>(lo, b.into()).simd_into(self) + } + } + #[inline(always)] + fn split_f32x8(self, a: f32x8) -> (f32x4, f32x4) { + unsafe { + ( + _mm256_extractf128_ps::<0>(a.into()).simd_into(self), + _mm256_extractf128_ps::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn reinterpret_f64_f32x8(self, a: f32x8) -> f64x4 { + unsafe { _mm256_castps_pd(a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_i32_f32x8(self, a: f32x8) -> i32x8 { + unsafe { _mm256_castps_si256(a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_u8_f32x8(self, a: f32x8) -> u8x32 { + unsafe { _mm256_castps_si256(a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_u32_f32x8(self, a: f32x8) -> u32x8 { + unsafe { _mm256_castps_si256(a.into()).simd_into(self) } + } + #[inline(always)] + fn cvt_u32_f32x8(self, a: f32x8) -> u32x8 { + unsafe { + let mut converted = _mm256_cvttps_epi32(a.into()); + let in_range = _mm256_cmp_ps::<17i32>(a.into(), _mm256_set1_ps(2147483648.0)); + let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111; + if !all_in_range { + let excess = _mm256_sub_ps(a.into(), _mm256_set1_ps(2147483648.0)); + let excess_converted = _mm256_cvttps_epi32(_mm256_andnot_ps(in_range, excess)); + converted = _mm256_add_epi32(converted, excess_converted); + } + converted.simd_into(self) + } + } + #[inline(always)] + fn cvt_u32_precise_f32x8(self, a: f32x8) -> u32x8 { + unsafe { + let a = _mm256_max_ps(a.into(), _mm256_setzero_ps()); + let mut converted = _mm256_cvttps_epi32(a); + let in_range = _mm256_cmp_ps::<17i32>(a, _mm256_set1_ps(2147483648.0)); + let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111; + if !all_in_range { + let exceeds_unsigned_range = + _mm256_castps_si256(_mm256_cmp_ps::<17i32>(_mm256_set1_ps(4294967040.0), a)); + let excess = _mm256_sub_ps(a, _mm256_set1_ps(2147483648.0)); + let excess_converted = _mm256_cvttps_epi32(_mm256_andnot_ps(in_range, excess)); + converted = _mm256_add_epi32(converted, excess_converted); + converted = _mm256_blendv_epi8( + converted, + _mm256_set1_epi32(u32::MAX.cast_signed()), + exceeds_unsigned_range, + ); + } + converted.simd_into(self) + } + } + #[inline(always)] + fn cvt_i32_f32x8(self, a: f32x8) -> i32x8 { + unsafe { _mm256_cvttps_epi32(a.into()).simd_into(self) } + } + #[inline(always)] + fn cvt_i32_precise_f32x8(self, a: f32x8) -> i32x8 { + unsafe { + let a = a.into(); + let mut converted = _mm256_cvttps_epi32(a); + let in_range = _mm256_cmp_ps::<17i32>(a, _mm256_set1_ps(2147483648.0)); + let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111; 
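// Editor's note: the fast path above only holds when every lane is strictly below 2^31
// (predicate 17 = _CMP_LT_OQ). The slow path below saturates: lanes at or above 2^31, where
// `_mm256_cvttps_epi32` would have produced the 0x8000_0000 sentinel, are replaced with
// `i32::MAX` by the blend on `in_range`, and NaN lanes (for which predicate 7 = _CMP_ORD_Q
// is false) are zeroed by the final `_mm256_and_si256`.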
+ if !all_in_range { + converted = _mm256_blendv_epi8( + _mm256_set1_epi32(i32::MAX), + converted, + _mm256_castps_si256(in_range), + ); + let is_not_nan = _mm256_castps_si256(_mm256_cmp_ps::<7i32>(a, a)); + converted = _mm256_and_si256(converted, is_not_nan); + } + converted.simd_into(self) + } + } + #[inline(always)] + fn splat_i8x32(self, val: i8) -> i8x32 { + unsafe { _mm256_set1_epi8(val).simd_into(self) } + } + #[inline(always)] + fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32 { + i8x32 { + val: unsafe { core::mem::transmute_copy(&val) }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32 { + i8x32 { + val: unsafe { core::mem::transmute_copy(val) }, + simd: self, + } + } + #[inline(always)] + fn as_array_i8x32(self, a: i8x32) -> [i8; 32usize] { + unsafe { core::mem::transmute::<__m256i, [i8; 32usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_i8x32(self, a: &i8x32) -> &[i8; 32usize] { + unsafe { core::mem::transmute::<&__m256i, &[i8; 32usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_i8x32(self, a: &mut i8x32) -> &mut [i8; 32usize] { + unsafe { core::mem::transmute::<&mut __m256i, &mut [i8; 32usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_i8x32(self, a: i8x32, dest: &mut [i8; 32usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const i8, + dest.as_mut_ptr(), + 32usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_i8x32(self, a: u8x32) -> i8x32 { + unsafe { + i8x32 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_i8x32(self, a: i8x32) -> u8x32 { + unsafe { + u8x32 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + unsafe { + if SHIFT >= 32usize { + return b; + } + let result = cross_block_alignr_256x1( + self.cvt_to_bytes_i8x32(b).val.0, + self.cvt_to_bytes_i8x32(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_i8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i8x32( + self, + a: i8x32, + b: i8x32, + ) -> i8x32 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = dyn_alignr_256( + self.cvt_to_bytes_i8x32(b).val.0, + self.cvt_to_bytes_i8x32(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_i8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn add_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + unsafe { _mm256_add_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + unsafe { _mm256_sub_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + unsafe { + let dst_even = _mm256_mullo_epi16(a.into(), b.into()); + let dst_odd = _mm256_mullo_epi16( + _mm256_srli_epi16::<8>(a.into()), + _mm256_srli_epi16::<8>(b.into()), + ); + _mm256_or_si256( + _mm256_slli_epi16(dst_odd, 8), + _mm256_and_si256(dst_even, _mm256_set1_epi16(0xFF)), + ) + .simd_into(self) + } + } + #[inline(always)] + fn and_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + unsafe { _mm256_xor_si256(a.into(), 
b.into()).simd_into(self) } + } + #[inline(always)] + fn not_i8x32(self, a: i8x32) -> i8x32 { + a ^ !0 + } + #[inline(always)] + fn shl_i8x32(self, a: i8x32, shift: u32) -> i8x32 { + unsafe { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = _mm256_unpacklo_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val)); + let hi_16 = _mm256_unpackhi_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val)); + let lo_shifted = _mm256_sll_epi16(lo_16, shift_count); + let hi_shifted = _mm256_sll_epi16(hi_16, shift_count); + _mm256_packs_epi16(lo_shifted, hi_shifted).simd_into(self) + } + } + #[inline(always)] + fn shlv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn shr_i8x32(self, a: i8x32, shift: u32) -> i8x32 { + unsafe { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = _mm256_unpacklo_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val)); + let hi_16 = _mm256_unpackhi_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val)); + let lo_shifted = _mm256_sra_epi16(lo_16, shift_count); + let hi_shifted = _mm256_sra_epi16(hi_16, shift_count); + _mm256_packs_epi16(lo_shifted, hi_shifted).simd_into(self) + } + } + #[inline(always)] + fn shrv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn simd_eq_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + unsafe { _mm256_cmpeq_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn simd_lt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + unsafe { _mm256_cmpgt_epi8(b.into(), a.into()).simd_into(self) } + } + #[inline(always)] + fn simd_le_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + unsafe { _mm256_cmpeq_epi8(_mm256_min_epi8(a.into(), b.into()), a.into()).simd_into(self) } + } + #[inline(always)] + fn simd_ge_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + unsafe { _mm256_cmpeq_epi8(_mm256_max_epi8(a.into(), b.into()), a.into()).simd_into(self) } + } + #[inline(always)] + fn simd_gt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + unsafe { _mm256_cmpgt_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn zip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + unsafe { + let lo = _mm256_unpacklo_epi8(a.into(), b.into()); + let hi = _mm256_unpackhi_epi8(a.into(), b.into()); + _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self) + } + } + #[inline(always)] + fn zip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + unsafe { + let lo = _mm256_unpacklo_epi8(a.into(), b.into()); + let hi = _mm256_unpackhi_epi8(a.into(), b.into()); + _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self) + } + } + #[inline(always)] + fn unzip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + unsafe { + let mask = _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, + 3, 5, 7, 9, 11, 13, 15, + ); + let a_shuffled = _mm256_shuffle_epi8(a.into(), mask); + let b_shuffled = _mm256_shuffle_epi8(b.into(), mask); + let packed = _mm256_permute2x128_si256::<0b0010_0000>( + _mm256_permute4x64_epi64::<0b11_01_10_00>(a_shuffled), + _mm256_permute4x64_epi64::<0b11_01_10_00>(b_shuffled), + ); + packed.simd_into(self) + } + } + #[inline(always)] + fn unzip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + unsafe { + let mask = _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 
6, 8, 10, 12, 14, 1, + 3, 5, 7, 9, 11, 13, 15, + ); + let a_shuffled = _mm256_shuffle_epi8(a.into(), mask); + let b_shuffled = _mm256_shuffle_epi8(b.into(), mask); + let packed = _mm256_permute2x128_si256::<0b0011_0001>( + _mm256_permute4x64_epi64::<0b11_01_10_00>(a_shuffled), + _mm256_permute4x64_epi64::<0b11_01_10_00>(b_shuffled), + ); + packed.simd_into(self) + } + } + #[inline(always)] + fn select_i8x32(self, a: mask8x32, b: i8x32, c: i8x32) -> i8x32 { + unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + } + #[inline(always)] + fn min_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + unsafe { _mm256_min_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + unsafe { _mm256_max_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn combine_i8x32(self, a: i8x32, b: i8x32) -> i8x64 { + unsafe { + let lo = _mm512_castsi256_si512(a.into()); + _mm512_inserti64x4::<1>(lo, b.into()).simd_into(self) + } + } + #[inline(always)] + fn split_i8x32(self, a: i8x32) -> (i8x16, i8x16) { + unsafe { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(self), + _mm256_extracti128_si256::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn neg_i8x32(self, a: i8x32) -> i8x32 { + unsafe { _mm256_sub_epi8(_mm256_setzero_si256(), a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_u8_i8x32(self, a: i8x32) -> u8x32 { + __m256i::from(a).simd_into(self) + } + #[inline(always)] + fn reinterpret_u32_i8x32(self, a: i8x32) -> u32x8 { + __m256i::from(a).simd_into(self) + } + #[inline(always)] + fn splat_u8x32(self, val: u8) -> u8x32 { + unsafe { _mm256_set1_epi8(val.cast_signed()).simd_into(self) } + } + #[inline(always)] + fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32 { + u8x32 { + val: unsafe { core::mem::transmute_copy(&val) }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32 { + u8x32 { + val: unsafe { core::mem::transmute_copy(val) }, + simd: self, + } + } + #[inline(always)] + fn as_array_u8x32(self, a: u8x32) -> [u8; 32usize] { + unsafe { core::mem::transmute::<__m256i, [u8; 32usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_u8x32(self, a: &u8x32) -> &[u8; 32usize] { + unsafe { core::mem::transmute::<&__m256i, &[u8; 32usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_u8x32(self, a: &mut u8x32) -> &mut [u8; 32usize] { + unsafe { core::mem::transmute::<&mut __m256i, &mut [u8; 32usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_u8x32(self, a: u8x32, dest: &mut [u8; 32usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const u8, + dest.as_mut_ptr(), + 32usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_u8x32(self, a: u8x32) -> u8x32 { + unsafe { + u8x32 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_u8x32(self, a: u8x32) -> u8x32 { + unsafe { + u8x32 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + unsafe { + if SHIFT >= 32usize { + return b; + } + let result = cross_block_alignr_256x1( + self.cvt_to_bytes_u8x32(b).val.0, + self.cvt_to_bytes_u8x32(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_u8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u8x32( + self, + a: u8x32, + b: u8x32, + ) -> u8x32 { + unsafe { + if 
SHIFT >= 16usize { + return b; + } + let result = dyn_alignr_256( + self.cvt_to_bytes_u8x32(b).val.0, + self.cvt_to_bytes_u8x32(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_u8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn add_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + unsafe { _mm256_add_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + unsafe { _mm256_sub_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + unsafe { + let dst_even = _mm256_mullo_epi16(a.into(), b.into()); + let dst_odd = _mm256_mullo_epi16( + _mm256_srli_epi16::<8>(a.into()), + _mm256_srli_epi16::<8>(b.into()), + ); + _mm256_or_si256( + _mm256_slli_epi16(dst_odd, 8), + _mm256_and_si256(dst_even, _mm256_set1_epi16(0xFF)), + ) + .simd_into(self) + } + } + #[inline(always)] + fn and_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_u8x32(self, a: u8x32) -> u8x32 { + a ^ !0 + } + #[inline(always)] + fn shl_u8x32(self, a: u8x32, shift: u32) -> u8x32 { + unsafe { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = _mm256_unpacklo_epi8(val, _mm256_setzero_si256()); + let hi_16 = _mm256_unpackhi_epi8(val, _mm256_setzero_si256()); + let lo_shifted = _mm256_sll_epi16(lo_16, shift_count); + let hi_shifted = _mm256_sll_epi16(hi_16, shift_count); + _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(self) + } + } + #[inline(always)] + fn shlv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn shr_u8x32(self, a: u8x32, shift: u32) -> u8x32 { + unsafe { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = _mm256_unpacklo_epi8(val, _mm256_setzero_si256()); + let hi_16 = _mm256_unpackhi_epi8(val, _mm256_setzero_si256()); + let lo_shifted = _mm256_srl_epi16(lo_16, shift_count); + let hi_shifted = _mm256_srl_epi16(hi_16, shift_count); + _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(self) + } + } + #[inline(always)] + fn shrv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn simd_eq_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + unsafe { _mm256_cmpeq_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn simd_lt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + unsafe { + let sign_bit = _mm256_set1_epi8(0x80u8.cast_signed()); + let a_signed = _mm256_xor_si256(a.into(), sign_bit); + let b_signed = _mm256_xor_si256(b.into(), sign_bit); + _mm256_cmpgt_epi8(b_signed, a_signed).simd_into(self) + } + } + #[inline(always)] + fn simd_le_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + unsafe { _mm256_cmpeq_epi8(_mm256_min_epu8(a.into(), b.into()), a.into()).simd_into(self) } + } + #[inline(always)] + fn simd_ge_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + unsafe { _mm256_cmpeq_epi8(_mm256_max_epu8(a.into(), b.into()), a.into()).simd_into(self) } + } + #[inline(always)] + fn 
simd_gt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + unsafe { + let sign_bit = _mm256_set1_epi8(0x80u8.cast_signed()); + let a_signed = _mm256_xor_si256(a.into(), sign_bit); + let b_signed = _mm256_xor_si256(b.into(), sign_bit); + _mm256_cmpgt_epi8(a_signed, b_signed).simd_into(self) + } + } + #[inline(always)] + fn zip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + unsafe { + let lo = _mm256_unpacklo_epi8(a.into(), b.into()); + let hi = _mm256_unpackhi_epi8(a.into(), b.into()); + _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self) + } + } + #[inline(always)] + fn zip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + unsafe { + let lo = _mm256_unpacklo_epi8(a.into(), b.into()); + let hi = _mm256_unpackhi_epi8(a.into(), b.into()); + _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self) + } + } + #[inline(always)] + fn unzip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + unsafe { + let mask = _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, + 3, 5, 7, 9, 11, 13, 15, + ); + let a_shuffled = _mm256_shuffle_epi8(a.into(), mask); + let b_shuffled = _mm256_shuffle_epi8(b.into(), mask); + let packed = _mm256_permute2x128_si256::<0b0010_0000>( + _mm256_permute4x64_epi64::<0b11_01_10_00>(a_shuffled), + _mm256_permute4x64_epi64::<0b11_01_10_00>(b_shuffled), + ); + packed.simd_into(self) + } + } + #[inline(always)] + fn unzip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + unsafe { + let mask = _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, + 3, 5, 7, 9, 11, 13, 15, + ); + let a_shuffled = _mm256_shuffle_epi8(a.into(), mask); + let b_shuffled = _mm256_shuffle_epi8(b.into(), mask); + let packed = _mm256_permute2x128_si256::<0b0011_0001>( + _mm256_permute4x64_epi64::<0b11_01_10_00>(a_shuffled), + _mm256_permute4x64_epi64::<0b11_01_10_00>(b_shuffled), + ); + packed.simd_into(self) + } + } + #[inline(always)] + fn select_u8x32(self, a: mask8x32, b: u8x32, c: u8x32) -> u8x32 { + unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + } + #[inline(always)] + fn min_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + unsafe { _mm256_min_epu8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + unsafe { _mm256_max_epu8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn combine_u8x32(self, a: u8x32, b: u8x32) -> u8x64 { + unsafe { + let lo = _mm512_castsi256_si512(a.into()); + _mm512_inserti64x4::<1>(lo, b.into()).simd_into(self) + } + } + #[inline(always)] + fn split_u8x32(self, a: u8x32) -> (u8x16, u8x16) { + unsafe { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(self), + _mm256_extracti128_si256::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn widen_u8x32(self, a: u8x32) -> u16x32 { + unsafe { _mm512_cvtepu8_epi16(a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_u32_u8x32(self, a: u8x32) -> u32x8 { + __m256i::from(a).simd_into(self) + } + #[inline(always)] + fn splat_mask8x32(self, val: i8) -> mask8x32 { + unsafe { _mm256_set1_epi8(val).simd_into(self) } + } + #[inline(always)] + fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32 { + mask8x32 { + val: unsafe { core::mem::transmute_copy(&val) }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_mask8x32(self, val: &[i8; 32usize]) -> mask8x32 { + mask8x32 { + val: unsafe { core::mem::transmute_copy(val) }, + simd: self, + } + } + #[inline(always)] + fn 
as_array_mask8x32(self, a: mask8x32) -> [i8; 32usize] { + unsafe { core::mem::transmute::<__m256i, [i8; 32usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_mask8x32(self, a: &mask8x32) -> &[i8; 32usize] { + unsafe { core::mem::transmute::<&__m256i, &[i8; 32usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_mask8x32(self, a: &mut mask8x32) -> &mut [i8; 32usize] { + unsafe { core::mem::transmute::<&mut __m256i, &mut [i8; 32usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_mask8x32(self, a: mask8x32, dest: &mut [i8; 32usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const i8, + dest.as_mut_ptr(), + 32usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_mask8x32(self, a: u8x32) -> mask8x32 { + unsafe { + mask8x32 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_mask8x32(self, a: mask8x32) -> u8x32 { + unsafe { + u8x32 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_mask8x32( + self, + a: mask8x32, + b: mask8x32, + ) -> mask8x32 { + unsafe { + if SHIFT >= 32usize { + return b; + } + let result = cross_block_alignr_256x1( + self.cvt_to_bytes_mask8x32(b).val.0, + self.cvt_to_bytes_mask8x32(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_mask8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask8x32( + self, + a: mask8x32, + b: mask8x32, + ) -> mask8x32 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = dyn_alignr_256( + self.cvt_to_bytes_mask8x32(b).val.0, + self.cvt_to_bytes_mask8x32(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_mask8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { + unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { + unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { + unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_mask8x32(self, a: mask8x32) -> mask8x32 { + a ^ !0 + } + #[inline(always)] + fn select_mask8x32( + self, + a: mask8x32, + b: mask8x32, + c: mask8x32, + ) -> mask8x32 { + unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + } + #[inline(always)] + fn simd_eq_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { + unsafe { _mm256_cmpeq_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn any_true_mask8x32(self, a: mask8x32) -> bool { + unsafe { _mm256_movemask_epi8(a.into()) as u32 != 0 } + } + #[inline(always)] + fn all_true_mask8x32(self, a: mask8x32) -> bool { + unsafe { _mm256_movemask_epi8(a.into()) as u32 == 0xffffffff } + } + #[inline(always)] + fn any_false_mask8x32(self, a: mask8x32) -> bool { + unsafe { _mm256_movemask_epi8(a.into()) as u32 != 0xffffffff } + } + #[inline(always)] + fn all_false_mask8x32(self, a: mask8x32) -> bool { + unsafe { _mm256_movemask_epi8(a.into()) as u32 == 0 } + } + #[inline(always)] + fn combine_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x64 { + unsafe { + let lo = _mm512_castsi256_si512(a.into()); + _mm512_inserti64x4::<1>(lo, b.into()).simd_into(self) + } + } + #[inline(always)] + fn split_mask8x32(self, a: mask8x32) -> (mask8x16, mask8x16) { + 
unsafe { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(self), + _mm256_extracti128_si256::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn splat_i16x16(self, val: i16) -> i16x16 { + unsafe { _mm256_set1_epi16(val).simd_into(self) } + } + #[inline(always)] + fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16 { + i16x16 { + val: unsafe { core::mem::transmute_copy(&val) }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16 { + i16x16 { + val: unsafe { core::mem::transmute_copy(val) }, + simd: self, + } + } + #[inline(always)] + fn as_array_i16x16(self, a: i16x16) -> [i16; 16usize] { + unsafe { core::mem::transmute::<__m256i, [i16; 16usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_i16x16(self, a: &i16x16) -> &[i16; 16usize] { + unsafe { core::mem::transmute::<&__m256i, &[i16; 16usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_i16x16(self, a: &mut i16x16) -> &mut [i16; 16usize] { + unsafe { core::mem::transmute::<&mut __m256i, &mut [i16; 16usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_i16x16(self, a: i16x16, dest: &mut [i16; 16usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const i16, + dest.as_mut_ptr(), + 16usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_i16x16(self, a: u8x32) -> i16x16 { + unsafe { + i16x16 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_i16x16(self, a: i16x16) -> u8x32 { + unsafe { + u8x32 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = cross_block_alignr_256x1( + self.cvt_to_bytes_i16x16(b).val.0, + self.cvt_to_bytes_i16x16(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_i16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i16x16( + self, + a: i16x16, + b: i16x16, + ) -> i16x16 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = dyn_alignr_256( + self.cvt_to_bytes_i16x16(b).val.0, + self.cvt_to_bytes_i16x16(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_i16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn add_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + unsafe { _mm256_add_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + unsafe { _mm256_sub_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + unsafe { _mm256_mullo_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn and_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_i16x16(self, a: i16x16) -> i16x16 { + a ^ !0 + } + #[inline(always)] + fn shl_i16x16(self, a: i16x16, shift: u32) -> i16x16 { + unsafe { + _mm256_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) + } + } + #[inline(always)] + 
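// Editor's note (annotation, not part of the generated diff): AVX2 provides whole-register
// 16-bit shifts (`_mm256_sll_epi16` / `_mm256_sra_epi16`, with the count passed through
// `_mm_cvtsi32_si128`), but no per-lane variable shift for 16-bit elements, so
// `shlv_i16x16` / `shrv_i16x16` below fall back to a scalar loop over the lanes via
// `core::array::from_fn(|i| a[i] << b[i])`; the 32-bit lanes earlier in this impl can use
// `_mm_sllv_epi32` / `_mm_srlv_epi32` directly.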
fn shlv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn shr_i16x16(self, a: i16x16, shift: u32) -> i16x16 { + unsafe { + _mm256_sra_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) + } + } + #[inline(always)] + fn shrv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn simd_eq_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + unsafe { _mm256_cmpeq_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn simd_lt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + unsafe { _mm256_cmpgt_epi16(b.into(), a.into()).simd_into(self) } + } + #[inline(always)] + fn simd_le_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + unsafe { + _mm256_cmpeq_epi16(_mm256_min_epi16(a.into(), b.into()), a.into()).simd_into(self) + } + } + #[inline(always)] + fn simd_ge_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + unsafe { + _mm256_cmpeq_epi16(_mm256_max_epi16(a.into(), b.into()), a.into()).simd_into(self) + } + } + #[inline(always)] + fn simd_gt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + unsafe { _mm256_cmpgt_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn zip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + unsafe { + let lo = _mm256_unpacklo_epi16(a.into(), b.into()); + let hi = _mm256_unpackhi_epi16(a.into(), b.into()); + _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self) + } + } + #[inline(always)] + fn zip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + unsafe { + let lo = _mm256_unpacklo_epi16(a.into(), b.into()); + let hi = _mm256_unpackhi_epi16(a.into(), b.into()); + _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self) + } + } + #[inline(always)] + fn unzip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + unsafe { + let mask = _mm256_setr_epi8( + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, 2, + 3, 6, 7, 10, 11, 14, 15, + ); + let a_shuffled = _mm256_shuffle_epi8(a.into(), mask); + let b_shuffled = _mm256_shuffle_epi8(b.into(), mask); + let packed = _mm256_permute2x128_si256::<0b0010_0000>( + _mm256_permute4x64_epi64::<0b11_01_10_00>(a_shuffled), + _mm256_permute4x64_epi64::<0b11_01_10_00>(b_shuffled), + ); + packed.simd_into(self) + } + } + #[inline(always)] + fn unzip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + unsafe { + let mask = _mm256_setr_epi8( + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, 2, + 3, 6, 7, 10, 11, 14, 15, + ); + let a_shuffled = _mm256_shuffle_epi8(a.into(), mask); + let b_shuffled = _mm256_shuffle_epi8(b.into(), mask); + let packed = _mm256_permute2x128_si256::<0b0011_0001>( + _mm256_permute4x64_epi64::<0b11_01_10_00>(a_shuffled), + _mm256_permute4x64_epi64::<0b11_01_10_00>(b_shuffled), + ); + packed.simd_into(self) + } + } + #[inline(always)] + fn select_i16x16(self, a: mask16x16, b: i16x16, c: i16x16) -> i16x16 { + unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + } + #[inline(always)] + fn min_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + unsafe { _mm256_min_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + unsafe { _mm256_max_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn combine_i16x16(self, a: i16x16, b: i16x16) -> i16x32 { + unsafe { + let lo = 
_mm512_castsi256_si512(a.into()); + _mm512_inserti64x4::<1>(lo, b.into()).simd_into(self) + } + } + #[inline(always)] + fn split_i16x16(self, a: i16x16) -> (i16x8, i16x8) { + unsafe { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(self), + _mm256_extracti128_si256::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn neg_i16x16(self, a: i16x16) -> i16x16 { + unsafe { _mm256_sub_epi16(_mm256_setzero_si256(), a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_u8_i16x16(self, a: i16x16) -> u8x32 { + __m256i::from(a).simd_into(self) + } + #[inline(always)] + fn reinterpret_u32_i16x16(self, a: i16x16) -> u32x8 { + __m256i::from(a).simd_into(self) + } + #[inline(always)] + fn splat_u16x16(self, val: u16) -> u16x16 { + unsafe { _mm256_set1_epi16(val.cast_signed()).simd_into(self) } + } + #[inline(always)] + fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16 { + u16x16 { + val: unsafe { core::mem::transmute_copy(&val) }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16 { + u16x16 { + val: unsafe { core::mem::transmute_copy(val) }, + simd: self, + } + } + #[inline(always)] + fn as_array_u16x16(self, a: u16x16) -> [u16; 16usize] { + unsafe { core::mem::transmute::<__m256i, [u16; 16usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_u16x16(self, a: &u16x16) -> &[u16; 16usize] { + unsafe { core::mem::transmute::<&__m256i, &[u16; 16usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_u16x16(self, a: &mut u16x16) -> &mut [u16; 16usize] { + unsafe { core::mem::transmute::<&mut __m256i, &mut [u16; 16usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_u16x16(self, a: u16x16, dest: &mut [u16; 16usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const u16, + dest.as_mut_ptr(), + 16usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_u16x16(self, a: u8x32) -> u16x16 { + unsafe { + u16x16 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_u16x16(self, a: u16x16) -> u8x32 { + unsafe { + u8x32 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = cross_block_alignr_256x1( + self.cvt_to_bytes_u16x16(b).val.0, + self.cvt_to_bytes_u16x16(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_u16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u16x16( + self, + a: u16x16, + b: u16x16, + ) -> u16x16 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = dyn_alignr_256( + self.cvt_to_bytes_u16x16(b).val.0, + self.cvt_to_bytes_u16x16(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_u16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn add_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + unsafe { _mm256_add_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + unsafe { _mm256_sub_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + unsafe { _mm256_mullo_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn and_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } + } + 
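A note on the unsigned u16x16 comparisons that follow: AVX2 has no unsigned 16-bit compare, so `simd_lt_u16x16` and `simd_gt_u16x16` first XOR both operands with 0x8000 to map unsigned order onto signed order and then use `_mm256_cmpgt_epi16`. A minimal scalar sketch of the same trick (the function name is illustrative, not part of the crate):

    // Flipping the sign bit maps 0..=u16::MAX monotonically onto
    // i16::MIN..=i16::MAX, so a signed compare then yields the unsigned result.
    fn u16_lt_via_sign_flip(a: u16, b: u16) -> bool {
        ((a ^ 0x8000) as i16) < ((b ^ 0x8000) as i16)
    }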
#[inline(always)] + fn or_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_u16x16(self, a: u16x16) -> u16x16 { + a ^ !0 + } + #[inline(always)] + fn shl_u16x16(self, a: u16x16, shift: u32) -> u16x16 { + unsafe { + _mm256_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) + } + } + #[inline(always)] + fn shlv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn shr_u16x16(self, a: u16x16, shift: u32) -> u16x16 { + unsafe { + _mm256_srl_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) + } + } + #[inline(always)] + fn shrv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn simd_eq_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + unsafe { _mm256_cmpeq_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn simd_lt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + unsafe { + let sign_bit = _mm256_set1_epi16(0x8000u16.cast_signed()); + let a_signed = _mm256_xor_si256(a.into(), sign_bit); + let b_signed = _mm256_xor_si256(b.into(), sign_bit); + _mm256_cmpgt_epi16(b_signed, a_signed).simd_into(self) + } + } + #[inline(always)] + fn simd_le_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + unsafe { + _mm256_cmpeq_epi16(_mm256_min_epu16(a.into(), b.into()), a.into()).simd_into(self) + } + } + #[inline(always)] + fn simd_ge_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + unsafe { + _mm256_cmpeq_epi16(_mm256_max_epu16(a.into(), b.into()), a.into()).simd_into(self) + } + } + #[inline(always)] + fn simd_gt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + unsafe { + let sign_bit = _mm256_set1_epi16(0x8000u16.cast_signed()); + let a_signed = _mm256_xor_si256(a.into(), sign_bit); + let b_signed = _mm256_xor_si256(b.into(), sign_bit); + _mm256_cmpgt_epi16(a_signed, b_signed).simd_into(self) + } + } + #[inline(always)] + fn zip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + unsafe { + let lo = _mm256_unpacklo_epi16(a.into(), b.into()); + let hi = _mm256_unpackhi_epi16(a.into(), b.into()); + _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self) + } + } + #[inline(always)] + fn zip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + unsafe { + let lo = _mm256_unpacklo_epi16(a.into(), b.into()); + let hi = _mm256_unpackhi_epi16(a.into(), b.into()); + _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self) + } + } + #[inline(always)] + fn unzip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + unsafe { + let mask = _mm256_setr_epi8( + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, 2, + 3, 6, 7, 10, 11, 14, 15, + ); + let a_shuffled = _mm256_shuffle_epi8(a.into(), mask); + let b_shuffled = _mm256_shuffle_epi8(b.into(), mask); + let packed = _mm256_permute2x128_si256::<0b0010_0000>( + _mm256_permute4x64_epi64::<0b11_01_10_00>(a_shuffled), + _mm256_permute4x64_epi64::<0b11_01_10_00>(b_shuffled), + ); + packed.simd_into(self) + } + } + #[inline(always)] + fn unzip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + unsafe { + let mask = _mm256_setr_epi8( + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, 2, + 
3, 6, 7, 10, 11, 14, 15, + ); + let a_shuffled = _mm256_shuffle_epi8(a.into(), mask); + let b_shuffled = _mm256_shuffle_epi8(b.into(), mask); + let packed = _mm256_permute2x128_si256::<0b0011_0001>( + _mm256_permute4x64_epi64::<0b11_01_10_00>(a_shuffled), + _mm256_permute4x64_epi64::<0b11_01_10_00>(b_shuffled), + ); + packed.simd_into(self) + } + } + #[inline(always)] + fn select_u16x16(self, a: mask16x16, b: u16x16, c: u16x16) -> u16x16 { + unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + } + #[inline(always)] + fn min_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + unsafe { _mm256_min_epu16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + unsafe { _mm256_max_epu16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn combine_u16x16(self, a: u16x16, b: u16x16) -> u16x32 { + unsafe { + let lo = _mm512_castsi256_si512(a.into()); + _mm512_inserti64x4::<1>(lo, b.into()).simd_into(self) + } + } + #[inline(always)] + fn split_u16x16(self, a: u16x16) -> (u16x8, u16x8) { + unsafe { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(self), + _mm256_extracti128_si256::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn narrow_u16x16(self, a: u16x16) -> u8x16 { + unsafe { + let mask = _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1, 0, 2, 4, 6, 8, 10, 12, + 14, -1, -1, -1, -1, -1, -1, -1, -1, + ); + let shuffled = _mm256_shuffle_epi8(a.into(), mask); + let packed = _mm256_permute4x64_epi64::<0b11_01_10_00>(shuffled); + _mm256_castsi256_si128(packed).simd_into(self) + } + } + #[inline(always)] + fn reinterpret_u8_u16x16(self, a: u16x16) -> u8x32 { + __m256i::from(a).simd_into(self) + } + #[inline(always)] + fn reinterpret_u32_u16x16(self, a: u16x16) -> u32x8 { + __m256i::from(a).simd_into(self) + } + #[inline(always)] + fn splat_mask16x16(self, val: i16) -> mask16x16 { + unsafe { _mm256_set1_epi16(val).simd_into(self) } + } + #[inline(always)] + fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16 { + mask16x16 { + val: unsafe { core::mem::transmute_copy(&val) }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_mask16x16(self, val: &[i16; 16usize]) -> mask16x16 { + mask16x16 { + val: unsafe { core::mem::transmute_copy(val) }, + simd: self, + } + } + #[inline(always)] + fn as_array_mask16x16(self, a: mask16x16) -> [i16; 16usize] { + unsafe { core::mem::transmute::<__m256i, [i16; 16usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_mask16x16(self, a: &mask16x16) -> &[i16; 16usize] { + unsafe { core::mem::transmute::<&__m256i, &[i16; 16usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_mask16x16(self, a: &mut mask16x16) -> &mut [i16; 16usize] { + unsafe { core::mem::transmute::<&mut __m256i, &mut [i16; 16usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_mask16x16(self, a: mask16x16, dest: &mut [i16; 16usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const i16, + dest.as_mut_ptr(), + 16usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_mask16x16(self, a: u8x32) -> mask16x16 { + unsafe { + mask16x16 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_mask16x16(self, a: mask16x16) -> u8x32 { + unsafe { + u8x32 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_mask16x16( + self, + a: mask16x16, + b: mask16x16, + ) -> mask16x16 { + unsafe { + if 
SHIFT >= 16usize { + return b; + } + let result = cross_block_alignr_256x1( + self.cvt_to_bytes_mask16x16(b).val.0, + self.cvt_to_bytes_mask16x16(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_mask16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask16x16( + self, + a: mask16x16, + b: mask16x16, + ) -> mask16x16 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = dyn_alignr_256( + self.cvt_to_bytes_mask16x16(b).val.0, + self.cvt_to_bytes_mask16x16(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_mask16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { + unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { + unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { + unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_mask16x16(self, a: mask16x16) -> mask16x16 { + a ^ !0 + } + #[inline(always)] + fn select_mask16x16( + self, + a: mask16x16, + b: mask16x16, + c: mask16x16, + ) -> mask16x16 { + unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + } + #[inline(always)] + fn simd_eq_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { + unsafe { _mm256_cmpeq_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn any_true_mask16x16(self, a: mask16x16) -> bool { + unsafe { _mm256_movemask_epi8(a.into()) as u32 != 0 } + } + #[inline(always)] + fn all_true_mask16x16(self, a: mask16x16) -> bool { + unsafe { _mm256_movemask_epi8(a.into()) as u32 == 0xffffffff } + } + #[inline(always)] + fn any_false_mask16x16(self, a: mask16x16) -> bool { + unsafe { _mm256_movemask_epi8(a.into()) as u32 != 0xffffffff } + } + #[inline(always)] + fn all_false_mask16x16(self, a: mask16x16) -> bool { + unsafe { _mm256_movemask_epi8(a.into()) as u32 == 0 } + } + #[inline(always)] + fn combine_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x32 { + unsafe { + let lo = _mm512_castsi256_si512(a.into()); + _mm512_inserti64x4::<1>(lo, b.into()).simd_into(self) + } + } + #[inline(always)] + fn split_mask16x16(self, a: mask16x16) -> (mask16x8, mask16x8) { + unsafe { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(self), + _mm256_extracti128_si256::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn splat_i32x8(self, val: i32) -> i32x8 { + unsafe { _mm256_set1_epi32(val).simd_into(self) } + } + #[inline(always)] + fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8 { + i32x8 { + val: unsafe { core::mem::transmute_copy(&val) }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8 { + i32x8 { + val: unsafe { core::mem::transmute_copy(val) }, + simd: self, + } + } + #[inline(always)] + fn as_array_i32x8(self, a: i32x8) -> [i32; 8usize] { + unsafe { core::mem::transmute::<__m256i, [i32; 8usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_i32x8(self, a: &i32x8) -> &[i32; 8usize] { + unsafe { core::mem::transmute::<&__m256i, &[i32; 8usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_i32x8(self, a: &mut i32x8) -> &mut [i32; 8usize] { + unsafe { core::mem::transmute::<&mut __m256i, &mut [i32; 8usize]>(&mut a.val.0) } + } + 
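The `slide_*` methods in this file (e.g. `slide_i32x8` below) all follow the same pattern: reinterpret both vectors as bytes, hand them to a `cross_block_alignr_*` helper so that `a` sits in the low bytes and `b` in the high bytes, extract at a byte offset of `SHIFT` times the element size, and return `b` outright once `SHIFT` covers the whole vector. A scalar model of the intended lane semantics, assuming alignr-style extraction (names are illustrative):

    // result[i] = a[i + SHIFT] while that index stays inside `a`,
    // then continues into the low lanes of `b`.
    fn slide_scalar<const SHIFT: usize>(a: [i32; 8], b: [i32; 8]) -> [i32; 8] {
        if SHIFT >= 8 {
            return b;
        }
        core::array::from_fn(|i| {
            let j = i + SHIFT;
            if j < 8 { a[j] } else { b[j - 8] }
        })
    }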
#[inline(always)] + fn store_array_i32x8(self, a: i32x8, dest: &mut [i32; 8usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const i32, + dest.as_mut_ptr(), + 8usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_i32x8(self, a: u8x32) -> i32x8 { + unsafe { + i32x8 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_i32x8(self, a: i32x8) -> u8x32 { + unsafe { + u8x32 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = cross_block_alignr_256x1( + self.cvt_to_bytes_i32x8(b).val.0, + self.cvt_to_bytes_i32x8(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_i32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i32x8( + self, + a: i32x8, + b: i32x8, + ) -> i32x8 { + unsafe { + if SHIFT >= 4usize { + return b; + } + let result = dyn_alignr_256( + self.cvt_to_bytes_i32x8(b).val.0, + self.cvt_to_bytes_i32x8(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_i32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn add_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + unsafe { _mm256_add_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + unsafe { _mm256_sub_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + unsafe { _mm256_mullo_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn and_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_i32x8(self, a: i32x8) -> i32x8 { + a ^ !0 + } + #[inline(always)] + fn shl_i32x8(self, a: i32x8, shift: u32) -> i32x8 { + unsafe { + _mm256_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) + } + } + #[inline(always)] + fn shlv_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + unsafe { _mm256_sllv_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn shr_i32x8(self, a: i32x8, shift: u32) -> i32x8 { + unsafe { + _mm256_sra_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) + } + } + #[inline(always)] + fn shrv_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + unsafe { _mm256_srav_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn simd_eq_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + unsafe { _mm256_cmpeq_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn simd_lt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + unsafe { _mm256_cmpgt_epi32(b.into(), a.into()).simd_into(self) } + } + #[inline(always)] + fn simd_le_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + unsafe { + _mm256_cmpeq_epi32(_mm256_min_epi32(a.into(), b.into()), a.into()).simd_into(self) + } + } + #[inline(always)] + fn simd_ge_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + unsafe { + _mm256_cmpeq_epi32(_mm256_max_epi32(a.into(), b.into()), a.into()).simd_into(self) + } + } + #[inline(always)] + fn 
simd_gt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + unsafe { _mm256_cmpgt_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn zip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + unsafe { + let lo = _mm256_unpacklo_epi32(a.into(), b.into()); + let hi = _mm256_unpackhi_epi32(a.into(), b.into()); + _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self) + } + } + #[inline(always)] + fn zip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + unsafe { + let lo = _mm256_unpacklo_epi32(a.into(), b.into()); + let hi = _mm256_unpackhi_epi32(a.into(), b.into()); + _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self) + } + } + #[inline(always)] + fn unzip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + unsafe { + let t1 = + _mm256_permutevar8x32_epi32(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); + let t2 = + _mm256_permutevar8x32_epi32(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); + _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self) + } + } + #[inline(always)] + fn unzip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + unsafe { + let t1 = + _mm256_permutevar8x32_epi32(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); + let t2 = + _mm256_permutevar8x32_epi32(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); + _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self) + } + } + #[inline(always)] + fn select_i32x8(self, a: mask32x8, b: i32x8, c: i32x8) -> i32x8 { + unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + } + #[inline(always)] + fn min_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + unsafe { _mm256_min_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + unsafe { _mm256_max_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn combine_i32x8(self, a: i32x8, b: i32x8) -> i32x16 { + unsafe { + let lo = _mm512_castsi256_si512(a.into()); + _mm512_inserti64x4::<1>(lo, b.into()).simd_into(self) + } + } + #[inline(always)] + fn split_i32x8(self, a: i32x8) -> (i32x4, i32x4) { + unsafe { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(self), + _mm256_extracti128_si256::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn neg_i32x8(self, a: i32x8) -> i32x8 { + unsafe { _mm256_sub_epi32(_mm256_setzero_si256(), a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_u8_i32x8(self, a: i32x8) -> u8x32 { + __m256i::from(a).simd_into(self) + } + #[inline(always)] + fn reinterpret_u32_i32x8(self, a: i32x8) -> u32x8 { + __m256i::from(a).simd_into(self) + } + #[inline(always)] + fn cvt_f32_i32x8(self, a: i32x8) -> f32x8 { + unsafe { _mm256_cvtepi32_ps(a.into()).simd_into(self) } + } + #[inline(always)] + fn splat_u32x8(self, val: u32) -> u32x8 { + unsafe { _mm256_set1_epi32(val.cast_signed()).simd_into(self) } + } + #[inline(always)] + fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8 { + u32x8 { + val: unsafe { core::mem::transmute_copy(&val) }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8 { + u32x8 { + val: unsafe { core::mem::transmute_copy(val) }, + simd: self, + } + } + #[inline(always)] + fn as_array_u32x8(self, a: u32x8) -> [u32; 8usize] { + unsafe { core::mem::transmute::<__m256i, [u32; 8usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_u32x8(self, a: &u32x8) -> &[u32; 8usize] { + unsafe { core::mem::transmute::<&__m256i, &[u32; 8usize]>(&a.val.0) } + } + #[inline(always)] + fn 
as_array_mut_u32x8(self, a: &mut u32x8) -> &mut [u32; 8usize] { + unsafe { core::mem::transmute::<&mut __m256i, &mut [u32; 8usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_u32x8(self, a: u32x8, dest: &mut [u32; 8usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const u32, + dest.as_mut_ptr(), + 8usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_u32x8(self, a: u8x32) -> u32x8 { + unsafe { + u32x8 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_u32x8(self, a: u32x8) -> u8x32 { + unsafe { + u8x32 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = cross_block_alignr_256x1( + self.cvt_to_bytes_u32x8(b).val.0, + self.cvt_to_bytes_u32x8(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_u32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u32x8( + self, + a: u32x8, + b: u32x8, + ) -> u32x8 { + unsafe { + if SHIFT >= 4usize { + return b; + } + let result = dyn_alignr_256( + self.cvt_to_bytes_u32x8(b).val.0, + self.cvt_to_bytes_u32x8(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_u32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn add_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + unsafe { _mm256_add_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + unsafe { _mm256_sub_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + unsafe { _mm256_mullo_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn and_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_u32x8(self, a: u32x8) -> u32x8 { + a ^ !0 + } + #[inline(always)] + fn shl_u32x8(self, a: u32x8, shift: u32) -> u32x8 { + unsafe { + _mm256_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) + } + } + #[inline(always)] + fn shlv_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + unsafe { _mm256_sllv_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn shr_u32x8(self, a: u32x8, shift: u32) -> u32x8 { + unsafe { + _mm256_srl_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) + } + } + #[inline(always)] + fn shrv_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + unsafe { _mm256_srlv_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn simd_eq_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + unsafe { _mm256_cmpeq_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn simd_lt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + unsafe { + let sign_bit = _mm256_set1_epi32(0x80000000u32.cast_signed()); + let a_signed = _mm256_xor_si256(a.into(), sign_bit); + let b_signed = _mm256_xor_si256(b.into(), sign_bit); + _mm256_cmpgt_epi32(b_signed, a_signed).simd_into(self) + } + } + #[inline(always)] + fn simd_le_u32x8(self, a: u32x8, b: u32x8) -> 
mask32x8 { + unsafe { + _mm256_cmpeq_epi32(_mm256_min_epu32(a.into(), b.into()), a.into()).simd_into(self) + } + } + #[inline(always)] + fn simd_ge_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + unsafe { + _mm256_cmpeq_epi32(_mm256_max_epu32(a.into(), b.into()), a.into()).simd_into(self) + } + } + #[inline(always)] + fn simd_gt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + unsafe { + let sign_bit = _mm256_set1_epi32(0x80000000u32.cast_signed()); + let a_signed = _mm256_xor_si256(a.into(), sign_bit); + let b_signed = _mm256_xor_si256(b.into(), sign_bit); + _mm256_cmpgt_epi32(a_signed, b_signed).simd_into(self) + } + } + #[inline(always)] + fn zip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + unsafe { + let lo = _mm256_unpacklo_epi32(a.into(), b.into()); + let hi = _mm256_unpackhi_epi32(a.into(), b.into()); + _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self) + } + } + #[inline(always)] + fn zip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + unsafe { + let lo = _mm256_unpacklo_epi32(a.into(), b.into()); + let hi = _mm256_unpackhi_epi32(a.into(), b.into()); + _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self) + } + } + #[inline(always)] + fn unzip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + unsafe { + let t1 = + _mm256_permutevar8x32_epi32(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); + let t2 = + _mm256_permutevar8x32_epi32(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); + _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self) + } + } + #[inline(always)] + fn unzip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + unsafe { + let t1 = + _mm256_permutevar8x32_epi32(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); + let t2 = + _mm256_permutevar8x32_epi32(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); + _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self) + } + } + #[inline(always)] + fn select_u32x8(self, a: mask32x8, b: u32x8, c: u32x8) -> u32x8 { + unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + } + #[inline(always)] + fn min_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + unsafe { _mm256_min_epu32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + unsafe { _mm256_max_epu32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn combine_u32x8(self, a: u32x8, b: u32x8) -> u32x16 { + unsafe { + let lo = _mm512_castsi256_si512(a.into()); + _mm512_inserti64x4::<1>(lo, b.into()).simd_into(self) + } + } + #[inline(always)] + fn split_u32x8(self, a: u32x8) -> (u32x4, u32x4) { + unsafe { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(self), + _mm256_extracti128_si256::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn reinterpret_u8_u32x8(self, a: u32x8) -> u8x32 { + __m256i::from(a).simd_into(self) + } + #[inline(always)] + fn cvt_f32_u32x8(self, a: u32x8) -> f32x8 { + unsafe { + let a = a.into(); + let lo = _mm256_blend_epi16::<0xAA>(a, _mm256_set1_epi32(0x4B000000)); + let hi = _mm256_blend_epi16::<0xAA>( + _mm256_srli_epi32::<16>(a), + _mm256_set1_epi32(0x53000000), + ); + let fhi = _mm256_sub_ps( + _mm256_castsi256_ps(hi), + _mm256_set1_ps(f32::from_bits(0x53000080)), + ); + let result = _mm256_add_ps(_mm256_castsi256_ps(lo), fhi); + result.simd_into(self) + } + } + #[inline(always)] + fn splat_mask32x8(self, val: i32) -> mask32x8 { + unsafe { _mm256_set1_epi32(val).simd_into(self) } + } + #[inline(always)] + fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8 { + 
mask32x8 { + val: unsafe { core::mem::transmute_copy(&val) }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_mask32x8(self, val: &[i32; 8usize]) -> mask32x8 { + mask32x8 { + val: unsafe { core::mem::transmute_copy(val) }, + simd: self, + } + } + #[inline(always)] + fn as_array_mask32x8(self, a: mask32x8) -> [i32; 8usize] { + unsafe { core::mem::transmute::<__m256i, [i32; 8usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_mask32x8(self, a: &mask32x8) -> &[i32; 8usize] { + unsafe { core::mem::transmute::<&__m256i, &[i32; 8usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_mask32x8(self, a: &mut mask32x8) -> &mut [i32; 8usize] { + unsafe { core::mem::transmute::<&mut __m256i, &mut [i32; 8usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_mask32x8(self, a: mask32x8, dest: &mut [i32; 8usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const i32, + dest.as_mut_ptr(), + 8usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_mask32x8(self, a: u8x32) -> mask32x8 { + unsafe { + mask32x8 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_mask32x8(self, a: mask32x8) -> u8x32 { + unsafe { + u8x32 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_mask32x8( + self, + a: mask32x8, + b: mask32x8, + ) -> mask32x8 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = cross_block_alignr_256x1( + self.cvt_to_bytes_mask32x8(b).val.0, + self.cvt_to_bytes_mask32x8(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_mask32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask32x8( + self, + a: mask32x8, + b: mask32x8, + ) -> mask32x8 { + unsafe { + if SHIFT >= 4usize { + return b; + } + let result = dyn_alignr_256( + self.cvt_to_bytes_mask32x8(b).val.0, + self.cvt_to_bytes_mask32x8(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_mask32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { + unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { + unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { + unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_mask32x8(self, a: mask32x8) -> mask32x8 { + a ^ !0 + } + #[inline(always)] + fn select_mask32x8( + self, + a: mask32x8, + b: mask32x8, + c: mask32x8, + ) -> mask32x8 { + unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + } + #[inline(always)] + fn simd_eq_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { + unsafe { _mm256_cmpeq_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn any_true_mask32x8(self, a: mask32x8) -> bool { + unsafe { _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 != 0 } + } + #[inline(always)] + fn all_true_mask32x8(self, a: mask32x8) -> bool { + unsafe { _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 == 0b11111111 } + } + #[inline(always)] + fn any_false_mask32x8(self, a: mask32x8) -> bool { + unsafe { _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 != 0b11111111 } + } + #[inline(always)] + fn all_false_mask32x8(self, a: mask32x8) -> bool { 
+ unsafe { _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 == 0 } + } + #[inline(always)] + fn combine_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x16 { + unsafe { + let lo = _mm512_castsi256_si512(a.into()); + _mm512_inserti64x4::<1>(lo, b.into()).simd_into(self) + } + } + #[inline(always)] + fn split_mask32x8(self, a: mask32x8) -> (mask32x4, mask32x4) { + unsafe { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(self), + _mm256_extracti128_si256::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn splat_f64x4(self, val: f64) -> f64x4 { + unsafe { _mm256_set1_pd(val).simd_into(self) } + } + #[inline(always)] + fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4 { + f64x4 { + val: unsafe { core::mem::transmute_copy(&val) }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4 { + f64x4 { + val: unsafe { core::mem::transmute_copy(val) }, + simd: self, + } + } + #[inline(always)] + fn as_array_f64x4(self, a: f64x4) -> [f64; 4usize] { + unsafe { core::mem::transmute::<__m256d, [f64; 4usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_f64x4(self, a: &f64x4) -> &[f64; 4usize] { + unsafe { core::mem::transmute::<&__m256d, &[f64; 4usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_f64x4(self, a: &mut f64x4) -> &mut [f64; 4usize] { + unsafe { core::mem::transmute::<&mut __m256d, &mut [f64; 4usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_f64x4(self, a: f64x4, dest: &mut [f64; 4usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const f64, + dest.as_mut_ptr(), + 4usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_f64x4(self, a: u8x32) -> f64x4 { + unsafe { + f64x4 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_f64x4(self, a: f64x4) -> u8x32 { + unsafe { + u8x32 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + unsafe { + if SHIFT >= 4usize { + return b; + } + let result = cross_block_alignr_256x1( + self.cvt_to_bytes_f64x4(b).val.0, + self.cvt_to_bytes_f64x4(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_f64x4(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_f64x4( + self, + a: f64x4, + b: f64x4, + ) -> f64x4 { + unsafe { + if SHIFT >= 2usize { + return b; + } + let result = dyn_alignr_256( + self.cvt_to_bytes_f64x4(b).val.0, + self.cvt_to_bytes_f64x4(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_f64x4(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn abs_f64x4(self, a: f64x4) -> f64x4 { + unsafe { _mm256_andnot_pd(_mm256_set1_pd(-0.0), a.into()).simd_into(self) } + } + #[inline(always)] + fn neg_f64x4(self, a: f64x4) -> f64x4 { + unsafe { _mm256_xor_pd(a.into(), _mm256_set1_pd(-0.0)).simd_into(self) } + } + #[inline(always)] + fn sqrt_f64x4(self, a: f64x4) -> f64x4 { + unsafe { _mm256_sqrt_pd(a.into()).simd_into(self) } + } + #[inline(always)] + fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + unsafe { _mm256_add_pd(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + unsafe { _mm256_sub_pd(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + unsafe { _mm256_mul_pd(a.into(), b.into()).simd_into(self) } + } + 
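The f64x4 comparisons that follow pass raw immediates to `_mm256_cmp_pd`; these are the ordered, quiet predicate encodings that core::arch also exposes as named constants. A small compile-time check of that mapping (a sketch, assuming an x86_64 target):

    use core::arch::x86_64::{_CMP_EQ_OQ, _CMP_GE_OQ, _CMP_GT_OQ, _CMP_LE_OQ, _CMP_LT_OQ};

    // 0 = EQ_OQ, 17 = LT_OQ, 18 = LE_OQ, 29 = GE_OQ, 30 = GT_OQ.
    const _: () = {
        assert!(_CMP_EQ_OQ == 0);
        assert!(_CMP_LT_OQ == 17);
        assert!(_CMP_LE_OQ == 18);
        assert!(_CMP_GE_OQ == 29);
        assert!(_CMP_GT_OQ == 30);
    };

Farther down, `max_precise_f64x4` and `min_precise_f64x4` use predicate 3 (unordered) to detect a NaN in `b` and fall back to `a`, since the plain x86 min/max would otherwise propagate the second operand.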
#[inline(always)] + fn div_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + unsafe { _mm256_div_pd(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn copysign_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + unsafe { + let mask = _mm256_set1_pd(-0.0); + _mm256_or_pd( + _mm256_and_pd(mask, b.into()), + _mm256_andnot_pd(mask, a.into()), + ) + .simd_into(self) + } + } + #[inline(always)] + fn simd_eq_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + unsafe { _mm256_castpd_si256(_mm256_cmp_pd::<0i32>(a.into(), b.into())).simd_into(self) } + } + #[inline(always)] + fn simd_lt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + unsafe { _mm256_castpd_si256(_mm256_cmp_pd::<17i32>(a.into(), b.into())).simd_into(self) } + } + #[inline(always)] + fn simd_le_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + unsafe { _mm256_castpd_si256(_mm256_cmp_pd::<18i32>(a.into(), b.into())).simd_into(self) } + } + #[inline(always)] + fn simd_ge_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + unsafe { _mm256_castpd_si256(_mm256_cmp_pd::<29i32>(a.into(), b.into())).simd_into(self) } + } + #[inline(always)] + fn simd_gt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + unsafe { _mm256_castpd_si256(_mm256_cmp_pd::<30i32>(a.into(), b.into())).simd_into(self) } + } + #[inline(always)] + fn zip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + unsafe { + let lo = _mm256_unpacklo_pd(a.into(), b.into()); + let hi = _mm256_unpackhi_pd(a.into(), b.into()); + _mm256_permute2f128_pd::<0b0010_0000>(lo, hi).simd_into(self) + } + } + #[inline(always)] + fn zip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + unsafe { + let lo = _mm256_unpacklo_pd(a.into(), b.into()); + let hi = _mm256_unpackhi_pd(a.into(), b.into()); + _mm256_permute2f128_pd::<0b0011_0001>(lo, hi).simd_into(self) + } + } + #[inline(always)] + fn unzip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + unsafe { + let t1 = _mm256_permute4x64_pd::<0b11_01_10_00>(a.into()); + let t2 = _mm256_permute4x64_pd::<0b11_01_10_00>(b.into()); + _mm256_permute2f128_pd::<0b0010_0000>(t1, t2).simd_into(self) + } + } + #[inline(always)] + fn unzip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + unsafe { + let t1 = _mm256_permute4x64_pd::<0b11_01_10_00>(a.into()); + let t2 = _mm256_permute4x64_pd::<0b11_01_10_00>(b.into()); + _mm256_permute2f128_pd::<0b0011_0001>(t1, t2).simd_into(self) + } + } + #[inline(always)] + fn max_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + unsafe { _mm256_max_pd(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn min_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + unsafe { _mm256_min_pd(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + unsafe { + let intermediate = _mm256_max_pd(a.into(), b.into()); + let b_is_nan = _mm256_cmp_pd::<3i32>(b.into(), b.into()); + _mm256_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(self) + } + } + #[inline(always)] + fn min_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + unsafe { + let intermediate = _mm256_min_pd(a.into(), b.into()); + let b_is_nan = _mm256_cmp_pd::<3i32>(b.into(), b.into()); + _mm256_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(self) + } + } + #[inline(always)] + fn mul_add_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { + unsafe { _mm256_fmadd_pd(a.into(), b.into(), c.into()).simd_into(self) } + } + #[inline(always)] + fn mul_sub_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { + unsafe { _mm256_fmsub_pd(a.into(), b.into(), c.into()).simd_into(self) } + } + #[inline(always)] + fn 
floor_f64x4(self, a: f64x4) -> f64x4 { + unsafe { + _mm256_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(self) + } + } + #[inline(always)] + fn ceil_f64x4(self, a: f64x4) -> f64x4 { + unsafe { + _mm256_round_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(self) + } + } + #[inline(always)] + fn round_ties_even_f64x4(self, a: f64x4) -> f64x4 { + unsafe { + _mm256_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(self) + } + } + #[inline(always)] + fn fract_f64x4(self, a: f64x4) -> f64x4 { + a - self.trunc_f64x4(a) + } + #[inline(always)] + fn trunc_f64x4(self, a: f64x4) -> f64x4 { + unsafe { + _mm256_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) + } + } + #[inline(always)] + fn select_f64x4(self, a: mask64x4, b: f64x4, c: f64x4) -> f64x4 { + unsafe { + _mm256_blendv_pd(c.into(), b.into(), _mm256_castsi256_pd(a.into())).simd_into(self) + } + } + #[inline(always)] + fn combine_f64x4(self, a: f64x4, b: f64x4) -> f64x8 { + unsafe { + let lo = _mm512_castpd256_pd512(a.into()); + _mm512_insertf64x4::<1>(lo, b.into()).simd_into(self) + } + } + #[inline(always)] + fn split_f64x4(self, a: f64x4) -> (f64x2, f64x2) { + unsafe { + ( + _mm256_extractf128_pd::<0>(a.into()).simd_into(self), + _mm256_extractf128_pd::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn reinterpret_f32_f64x4(self, a: f64x4) -> f32x8 { + unsafe { _mm256_castpd_ps(a.into()).simd_into(self) } + } + #[inline(always)] + fn splat_mask64x4(self, val: i64) -> mask64x4 { + unsafe { _mm256_set1_epi64x(val).simd_into(self) } + } + #[inline(always)] + fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4 { + mask64x4 { + val: unsafe { core::mem::transmute_copy(&val) }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_mask64x4(self, val: &[i64; 4usize]) -> mask64x4 { + mask64x4 { + val: unsafe { core::mem::transmute_copy(val) }, + simd: self, + } + } + #[inline(always)] + fn as_array_mask64x4(self, a: mask64x4) -> [i64; 4usize] { + unsafe { core::mem::transmute::<__m256i, [i64; 4usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_mask64x4(self, a: &mask64x4) -> &[i64; 4usize] { + unsafe { core::mem::transmute::<&__m256i, &[i64; 4usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_mask64x4(self, a: &mut mask64x4) -> &mut [i64; 4usize] { + unsafe { core::mem::transmute::<&mut __m256i, &mut [i64; 4usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_mask64x4(self, a: mask64x4, dest: &mut [i64; 4usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const i64, + dest.as_mut_ptr(), + 4usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_mask64x4(self, a: u8x32) -> mask64x4 { + unsafe { + mask64x4 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_mask64x4(self, a: mask64x4) -> u8x32 { + unsafe { + u8x32 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_mask64x4( + self, + a: mask64x4, + b: mask64x4, + ) -> mask64x4 { + unsafe { + if SHIFT >= 4usize { + return b; + } + let result = cross_block_alignr_256x1( + self.cvt_to_bytes_mask64x4(b).val.0, + self.cvt_to_bytes_mask64x4(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_mask64x4(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask64x4( + self, + a: mask64x4, + b: mask64x4, + ) -> 
mask64x4 { + unsafe { + if SHIFT >= 2usize { + return b; + } + let result = dyn_alignr_256( + self.cvt_to_bytes_mask64x4(b).val.0, + self.cvt_to_bytes_mask64x4(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_mask64x4(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { + unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { + unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { + unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_mask64x4(self, a: mask64x4) -> mask64x4 { + a ^ !0 + } + #[inline(always)] + fn select_mask64x4( + self, + a: mask64x4, + b: mask64x4, + c: mask64x4, + ) -> mask64x4 { + unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) } + } + #[inline(always)] + fn simd_eq_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { + unsafe { _mm256_cmpeq_epi64(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn any_true_mask64x4(self, a: mask64x4) -> bool { + unsafe { _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 != 0 } + } + #[inline(always)] + fn all_true_mask64x4(self, a: mask64x4) -> bool { + unsafe { _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 == 0b1111 } + } + #[inline(always)] + fn any_false_mask64x4(self, a: mask64x4) -> bool { + unsafe { _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 != 0b1111 } + } + #[inline(always)] + fn all_false_mask64x4(self, a: mask64x4) -> bool { + unsafe { _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 == 0 } + } + #[inline(always)] + fn combine_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x8 { + unsafe { + let lo = _mm512_castsi256_si512(a.into()); + _mm512_inserti64x4::<1>(lo, b.into()).simd_into(self) + } + } + #[inline(always)] + fn split_mask64x4(self, a: mask64x4) -> (mask64x2, mask64x2) { + unsafe { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(self), + _mm256_extracti128_si256::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn splat_f32x16(self, val: f32) -> f32x16 { + unsafe { _mm512_set1_ps(val).simd_into(self) } + } + #[inline(always)] + fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16 { + f32x16 { + val: unsafe { core::mem::transmute_copy(&val) }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16 { + f32x16 { + val: unsafe { core::mem::transmute_copy(val) }, + simd: self, + } + } + #[inline(always)] + fn as_array_f32x16(self, a: f32x16) -> [f32; 16usize] { + unsafe { core::mem::transmute::<__m512, [f32; 16usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_f32x16(self, a: &f32x16) -> &[f32; 16usize] { + unsafe { core::mem::transmute::<&__m512, &[f32; 16usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_f32x16(self, a: &mut f32x16) -> &mut [f32; 16usize] { + unsafe { core::mem::transmute::<&mut __m512, &mut [f32; 16usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const f32, + dest.as_mut_ptr(), + 16usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_f32x16(self, a: u8x64) -> f32x16 { + unsafe { + f32x16 { + val: 
core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_f32x16(self, a: f32x16) -> u8x64 { + unsafe { + u8x64 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = cross_block_alignr_512( + self.cvt_to_bytes_f32x16(b).val.0, + self.cvt_to_bytes_f32x16(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_f32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_f32x16( + self, + a: f32x16, + b: f32x16, + ) -> f32x16 { + unsafe { + if SHIFT >= 4usize { + return b; + } + let result = dyn_alignr_512( + self.cvt_to_bytes_f32x16(b).val.0, + self.cvt_to_bytes_f32x16(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_f32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn abs_f32x16(self, a: f32x16) -> f32x16 { + unsafe { _mm512_andnot_ps(_mm512_set1_ps(-0.0), a.into()).simd_into(self) } + } + #[inline(always)] + fn neg_f32x16(self, a: f32x16) -> f32x16 { + unsafe { _mm512_xor_ps(a.into(), _mm512_set1_ps(-0.0)).simd_into(self) } + } + #[inline(always)] + fn sqrt_f32x16(self, a: f32x16) -> f32x16 { + unsafe { _mm512_sqrt_ps(a.into()).simd_into(self) } + } + #[inline(always)] + fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + unsafe { _mm512_add_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + unsafe { _mm512_sub_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + unsafe { _mm512_mul_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn div_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + unsafe { _mm512_div_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn copysign_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + unsafe { + let mask = _mm512_set1_ps(-0.0); + _mm512_or_ps( + _mm512_and_ps(mask, b.into()), + _mm512_andnot_ps(mask, a.into()), + ) + .simd_into(self) + } + } + #[inline(always)] + fn simd_eq_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + unsafe { + let mask = _mm512_cmp_ps_mask::<0i32>(a.into(), b.into()); + _mm512_movm_epi32(mask).simd_into(self) + } + } + #[inline(always)] + fn simd_lt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + unsafe { + let mask = _mm512_cmp_ps_mask::<17i32>(a.into(), b.into()); + _mm512_movm_epi32(mask).simd_into(self) + } + } + #[inline(always)] + fn simd_le_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + unsafe { + let mask = _mm512_cmp_ps_mask::<18i32>(a.into(), b.into()); + _mm512_movm_epi32(mask).simd_into(self) + } + } + #[inline(always)] + fn simd_ge_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + unsafe { + let mask = _mm512_cmp_ps_mask::<29i32>(a.into(), b.into()); + _mm512_movm_epi32(mask).simd_into(self) + } + } + #[inline(always)] + fn simd_gt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + unsafe { + let mask = _mm512_cmp_ps_mask::<30i32>(a.into(), b.into()); + _mm512_movm_epi32(mask).simd_into(self) + } + } + #[inline(always)] + fn zip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + unsafe { + let idx = _mm512_set_epi32( + 23i32, 7i32, 22i32, 6i32, 21i32, 5i32, 20i32, 4i32, 19i32, 3i32, 18i32, 2i32, + 17i32, 1i32, 16i32, 0i32, + ); + _mm512_permutex2var_ps(a.into(), idx, b.into()).simd_into(self) + } + } + 
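`zip_low_f32x16` above and `zip_high_f32x16` below build the interleave with `_mm512_permutex2var_ps`: an index below 16 selects a lane of `a`, an index of 16 or more selects `b[index - 16]`, and the index vectors alternate between the two sources. A scalar model of the low-half case (the name is illustrative):

    // zip_low interleaves the low halves: [a0, b0, a1, b1, ..., a7, b7].
    fn zip_low_scalar(a: [f32; 16], b: [f32; 16]) -> [f32; 16] {
        core::array::from_fn(|i| if i % 2 == 0 { a[i / 2] } else { b[i / 2] })
    }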
#[inline(always)] + fn zip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + unsafe { + let idx = _mm512_set_epi32( + 31i32, 15i32, 30i32, 14i32, 29i32, 13i32, 28i32, 12i32, 27i32, 11i32, 26i32, 10i32, + 25i32, 9i32, 24i32, 8i32, + ); + _mm512_permutex2var_ps(a.into(), idx, b.into()).simd_into(self) + } + } + #[inline(always)] + fn unzip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + unsafe { + let t1 = _mm512_permutexvar_ps( + _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15), + a.into(), + ); + let t2 = _mm512_permutexvar_ps( + _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15), + b.into(), + ); + _mm512_shuffle_f32x4::<0b01_00_01_00>(t1, t2).simd_into(self) + } + } + #[inline(always)] + fn unzip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + unsafe { + let t1 = _mm512_permutexvar_ps( + _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15), + a.into(), + ); + let t2 = _mm512_permutexvar_ps( + _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15), + b.into(), + ); + _mm512_shuffle_f32x4::<0b11_10_11_10>(t1, t2).simd_into(self) + } + } + #[inline(always)] + fn max_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + unsafe { _mm512_max_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn min_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + unsafe { _mm512_min_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + unsafe { + let intermediate = _mm512_max_ps(a.into(), b.into()); + let b_is_nan = _mm512_cmp_ps_mask::<0x03>(b.into(), b.into()); + _mm512_mask_blend_ps(b_is_nan, intermediate, a.into()).simd_into(self) + } + } + #[inline(always)] + fn min_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + unsafe { + let intermediate = _mm512_min_ps(a.into(), b.into()); + let b_is_nan = _mm512_cmp_ps_mask::<0x03>(b.into(), b.into()); + _mm512_mask_blend_ps(b_is_nan, intermediate, a.into()).simd_into(self) + } + } + #[inline(always)] + fn mul_add_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { + unsafe { _mm512_fmadd_ps(a.into(), b.into(), c.into()).simd_into(self) } + } + #[inline(always)] + fn mul_sub_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { + unsafe { _mm512_fmsub_ps(a.into(), b.into(), c.into()).simd_into(self) } + } + #[inline(always)] + fn floor_f32x16(self, a: f32x16) -> f32x16 { + unsafe { _mm512_roundscale_ps::<_MM_FROUND_TO_NEG_INF>(a.into()).simd_into(self) } + } + #[inline(always)] + fn ceil_f32x16(self, a: f32x16) -> f32x16 { + unsafe { _mm512_roundscale_ps::<_MM_FROUND_TO_POS_INF>(a.into()).simd_into(self) } + } + #[inline(always)] + fn round_ties_even_f32x16(self, a: f32x16) -> f32x16 { + unsafe { _mm512_roundscale_ps::<_MM_FROUND_TO_NEAREST_INT>(a.into()).simd_into(self) } + } + #[inline(always)] + fn fract_f32x16(self, a: f32x16) -> f32x16 { + a - self.trunc_f32x16(a) + } + #[inline(always)] + fn trunc_f32x16(self, a: f32x16) -> f32x16 { + unsafe { _mm512_roundscale_ps::<_MM_FROUND_TO_ZERO>(a.into()).simd_into(self) } + } + #[inline(always)] + fn select_f32x16(self, a: mask32x16, b: f32x16, c: f32x16) -> f32x16 { + unsafe { + let k = _mm512_movepi32_mask(a.into()); + _mm512_mask_blend_ps(k, c.into(), b.into()).simd_into(self) + } + } + #[inline(always)] + fn split_f32x16(self, a: f32x16) -> (f32x8, f32x8) { + unsafe { + ( + _mm512_castps512_ps256(a.into()).simd_into(self), + _mm512_extractf32x8_ps::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn 
reinterpret_f64_f32x16(self, a: f32x16) -> f64x8 { + unsafe { _mm512_castps_pd(a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_i32_f32x16(self, a: f32x16) -> i32x16 { + unsafe { _mm512_castps_si512(a.into()).simd_into(self) } + } + #[inline(always)] + fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { + unsafe { + let v0 = _mm_loadu_ps(src.as_ptr() as *const _); + let v1 = _mm_loadu_ps(src.as_ptr().add(4usize) as *const _); + let v2 = _mm_loadu_ps(src.as_ptr().add(2 * 4usize) as *const _); + let v3 = _mm_loadu_ps(src.as_ptr().add(3 * 4usize) as *const _); + let tmp0 = _mm_unpacklo_ps(v0, v1); + let tmp1 = _mm_unpackhi_ps(v0, v1); + let tmp2 = _mm_unpacklo_ps(v2, v3); + let tmp3 = _mm_unpackhi_ps(v2, v3); + let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); + let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); + let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); + let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); + self.combine_f32x8( + self.combine_f32x4(out0.simd_into(self), out1.simd_into(self)), + self.combine_f32x4(out2.simd_into(self), out3.simd_into(self)), + ) + } + } + #[inline(always)] + fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { + let (v01, v23) = self.split_f32x16(a); + let (v0, v1) = self.split_f32x8(v01); + let (v2, v3) = self.split_f32x8(v23); + let v0 = v0.into(); + let v1 = v1.into(); + let v2 = v2.into(); + let v3 = v3.into(); + unsafe { + let tmp0 = _mm_unpacklo_ps(v0, v1); + let tmp1 = _mm_unpackhi_ps(v0, v1); + let tmp2 = _mm_unpacklo_ps(v2, v3); + let tmp3 = _mm_unpackhi_ps(v2, v3); + let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); + let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); + let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); + let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); + _mm_storeu_ps(dest.as_mut_ptr() as *mut _, out0); + _mm_storeu_ps(dest.as_mut_ptr().add(4usize) as *mut _, out1); + _mm_storeu_ps(dest.as_mut_ptr().add(2 * 4usize) as *mut _, out2); + _mm_storeu_ps(dest.as_mut_ptr().add(3 * 4usize) as *mut _, out3); + } + } + #[inline(always)] + fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64 { + unsafe { _mm512_castps_si512(a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_u32_f32x16(self, a: f32x16) -> u32x16 { + unsafe { _mm512_castps_si512(a.into()).simd_into(self) } + } + #[inline(always)] + fn cvt_u32_f32x16(self, a: f32x16) -> u32x16 { + unsafe { _mm512_cvttps_epu32(a.into()).simd_into(self) } + } + #[inline(always)] + fn cvt_u32_precise_f32x16(self, a: f32x16) -> u32x16 { + unsafe { + let a = _mm512_max_ps(a.into(), _mm512_setzero_ps()); + let mut converted = _mm512_cvttps_epu32(a); + let exceeds_range_mask = + _mm512_cmp_ps_mask::<{ _CMP_GT_OQ }>(a, _mm512_set1_ps(4294967040.0)); + if exceeds_range_mask != 0 { + converted = _mm512_mask_blend_epi32( + exceeds_range_mask, + converted, + _mm512_set1_epi32(u32::MAX.cast_signed()), + ); + } + converted.simd_into(self) + } + } + #[inline(always)] + fn cvt_i32_f32x16(self, a: f32x16) -> i32x16 { + unsafe { _mm512_cvttps_epi32(a.into()).simd_into(self) } + } + #[inline(always)] + fn cvt_i32_precise_f32x16(self, a: f32x16) -> i32x16 { + unsafe { + let a = a.into(); + let mut converted = _mm512_cvttps_epi32(a); + let 
in_range_mask = + _mm512_cmp_ps_mask::<{ _CMP_LT_OQ }>(a, _mm512_set1_ps(2147483648.0)); + let all_in_range = in_range_mask == 0xFFFF; + if !all_in_range { + converted = + _mm512_mask_blend_epi32(in_range_mask, _mm512_set1_epi32(i32::MAX), converted); + let is_not_nan_mask = _mm512_cmp_ps_mask::<{ _CMP_ORD_Q }>(a, a); + converted = _mm512_maskz_mov_epi32(is_not_nan_mask, converted); + } + converted.simd_into(self) + } + } + #[inline(always)] + fn splat_i8x64(self, val: i8) -> i8x64 { + unsafe { _mm512_set1_epi8(val).simd_into(self) } + } + #[inline(always)] + fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64 { + i8x64 { + val: unsafe { core::mem::transmute_copy(&val) }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64 { + i8x64 { + val: unsafe { core::mem::transmute_copy(val) }, + simd: self, + } + } + #[inline(always)] + fn as_array_i8x64(self, a: i8x64) -> [i8; 64usize] { + unsafe { core::mem::transmute::<__m512i, [i8; 64usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_i8x64(self, a: &i8x64) -> &[i8; 64usize] { + unsafe { core::mem::transmute::<&__m512i, &[i8; 64usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_i8x64(self, a: &mut i8x64) -> &mut [i8; 64usize] { + unsafe { core::mem::transmute::<&mut __m512i, &mut [i8; 64usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_i8x64(self, a: i8x64, dest: &mut [i8; 64usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const i8, + dest.as_mut_ptr(), + 64usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_i8x64(self, a: u8x64) -> i8x64 { + unsafe { + i8x64 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_i8x64(self, a: i8x64) -> u8x64 { + unsafe { + u8x64 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + unsafe { + if SHIFT >= 64usize { + return b; + } + let result = cross_block_alignr_512( + self.cvt_to_bytes_i8x64(b).val.0, + self.cvt_to_bytes_i8x64(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_i8x64(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i8x64( + self, + a: i8x64, + b: i8x64, + ) -> i8x64 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = dyn_alignr_512( + self.cvt_to_bytes_i8x64(b).val.0, + self.cvt_to_bytes_i8x64(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_i8x64(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn add_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + unsafe { _mm512_add_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + unsafe { _mm512_sub_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + unsafe { + let dst_even = _mm512_mullo_epi16(a.into(), b.into()); + let dst_odd = _mm512_mullo_epi16( + _mm512_srli_epi16::<8>(a.into()), + _mm512_srli_epi16::<8>(b.into()), + ); + _mm512_or_si512( + _mm512_slli_epi16(dst_odd, 8), + _mm512_and_si512(dst_even, _mm512_set1_epi16(0xFF)), + ) + .simd_into(self) + } + } + #[inline(always)] + fn and_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + unsafe { _mm512_and_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + unsafe { _mm512_or_si512(a.into(), 
b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + unsafe { _mm512_xor_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_i8x64(self, a: i8x64) -> i8x64 { + a ^ !0 + } + #[inline(always)] + fn shl_i8x64(self, a: i8x64, shift: u32) -> i8x64 { + unsafe { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_256 = _mm512_castsi512_si256(val); + let hi_256 = _mm512_extracti64x4_epi64::<1>(val); + let lo_16 = _mm512_cvtepi8_epi16(lo_256); + let hi_16 = _mm512_cvtepi8_epi16(hi_256); + let lo_shifted = _mm512_sll_epi16(lo_16, shift_count); + let hi_shifted = _mm512_sll_epi16(hi_16, shift_count); + const PACK_LO_BYTES: __m512i = unsafe { + core::mem::transmute([ + 0u8, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, + 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, + 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, + 116, 118, 120, 122, 124, 126, + ]) + }; + let result = _mm512_permutex2var_epi8(lo_shifted, PACK_LO_BYTES, hi_shifted); + result.simd_into(self) + } + } + #[inline(always)] + fn shlv_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn shr_i8x64(self, a: i8x64, shift: u32) -> i8x64 { + unsafe { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_256 = _mm512_castsi512_si256(val); + let hi_256 = _mm512_extracti64x4_epi64::<1>(val); + let lo_16 = _mm512_cvtepi8_epi16(lo_256); + let hi_16 = _mm512_cvtepi8_epi16(hi_256); + let lo_shifted = _mm512_sra_epi16(lo_16, shift_count); + let hi_shifted = _mm512_sra_epi16(hi_16, shift_count); + const PACK_LO_BYTES: __m512i = unsafe { + core::mem::transmute([ + 0u8, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, + 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, + 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, + 116, 118, 120, 122, 124, 126, + ]) + }; + let result = _mm512_permutex2var_epi8(lo_shifted, PACK_LO_BYTES, hi_shifted); + result.simd_into(self) + } + } + #[inline(always)] + fn shrv_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn simd_eq_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { + unsafe { + let mask = _mm512_cmpeq_epi8_mask(a.into(), b.into()); + _mm512_movm_epi8(mask).simd_into(self) + } + } + #[inline(always)] + fn simd_lt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { + unsafe { + let mask = _mm512_cmplt_epi8_mask(a.into(), b.into()); + _mm512_movm_epi8(mask).simd_into(self) + } + } + #[inline(always)] + fn simd_le_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { + unsafe { + let mask = _mm512_cmple_epi8_mask(a.into(), b.into()); + _mm512_movm_epi8(mask).simd_into(self) + } + } + #[inline(always)] + fn simd_ge_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { + unsafe { + let mask = _mm512_cmpge_epi8_mask(a.into(), b.into()); + _mm512_movm_epi8(mask).simd_into(self) + } + } + #[inline(always)] + fn simd_gt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { + unsafe { + let mask = _mm512_cmpgt_epi8_mask(a.into(), b.into()); + _mm512_movm_epi8(mask).simd_into(self) + } + } + #[inline(always)] + fn zip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + unsafe { + let idx = _mm512_set_epi8( + 95i8, 31i8, 94i8, 30i8, 93i8, 29i8, 92i8, 
28i8, 91i8, 27i8, 90i8, 26i8, 89i8, 25i8, + 88i8, 24i8, 87i8, 23i8, 86i8, 22i8, 85i8, 21i8, 84i8, 20i8, 83i8, 19i8, 82i8, 18i8, + 81i8, 17i8, 80i8, 16i8, 79i8, 15i8, 78i8, 14i8, 77i8, 13i8, 76i8, 12i8, 75i8, 11i8, + 74i8, 10i8, 73i8, 9i8, 72i8, 8i8, 71i8, 7i8, 70i8, 6i8, 69i8, 5i8, 68i8, 4i8, 67i8, + 3i8, 66i8, 2i8, 65i8, 1i8, 64i8, 0i8, + ); + _mm512_permutex2var_epi8(a.into(), idx, b.into()).simd_into(self) + } + } + #[inline(always)] + fn zip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + unsafe { + let idx = _mm512_set_epi8( + 127i8, 63i8, 126i8, 62i8, 125i8, 61i8, 124i8, 60i8, 123i8, 59i8, 122i8, 58i8, + 121i8, 57i8, 120i8, 56i8, 119i8, 55i8, 118i8, 54i8, 117i8, 53i8, 116i8, 52i8, + 115i8, 51i8, 114i8, 50i8, 113i8, 49i8, 112i8, 48i8, 111i8, 47i8, 110i8, 46i8, + 109i8, 45i8, 108i8, 44i8, 107i8, 43i8, 106i8, 42i8, 105i8, 41i8, 104i8, 40i8, + 103i8, 39i8, 102i8, 38i8, 101i8, 37i8, 100i8, 36i8, 99i8, 35i8, 98i8, 34i8, 97i8, + 33i8, 96i8, 32i8, + ); + _mm512_permutex2var_epi8(a.into(), idx, b.into()).simd_into(self) + } + } + #[inline(always)] + fn unzip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + unsafe { + let mask = _mm512_set4_epi64( + 0x0F0D0B0907050301u64.cast_signed(), + 0x0E0C0A0806040200u64.cast_signed(), + 0x0F0D0B0907050301u64.cast_signed(), + 0x0E0C0A0806040200u64.cast_signed(), + ); + let a_shuffled = _mm512_shuffle_epi8(a.into(), mask); + let b_shuffled = _mm512_shuffle_epi8(b.into(), mask); + let a_packed = _mm512_permutex_epi64::<0b11_01_10_00>(a_shuffled); + let b_packed = _mm512_permutex_epi64::<0b11_01_10_00>(b_shuffled); + _mm512_shuffle_i64x2::<0b01_00_01_00>(a_packed, b_packed).simd_into(self) + } + } + #[inline(always)] + fn unzip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + unsafe { + let mask = _mm512_set4_epi64( + 0x0F0D0B0907050301u64.cast_signed(), + 0x0E0C0A0806040200u64.cast_signed(), + 0x0F0D0B0907050301u64.cast_signed(), + 0x0E0C0A0806040200u64.cast_signed(), + ); + let a_shuffled = _mm512_shuffle_epi8(a.into(), mask); + let b_shuffled = _mm512_shuffle_epi8(b.into(), mask); + let a_packed = _mm512_permutex_epi64::<0b11_01_10_00>(a_shuffled); + let b_packed = _mm512_permutex_epi64::<0b11_01_10_00>(b_shuffled); + _mm512_shuffle_i64x2::<0b11_10_11_10>(a_packed, b_packed).simd_into(self) + } + } + #[inline(always)] + fn select_i8x64(self, a: mask8x64, b: i8x64, c: i8x64) -> i8x64 { + unsafe { + let k = _mm512_movepi8_mask(a.into()); + _mm512_mask_blend_epi8(k, c.into(), b.into()).simd_into(self) + } + } + #[inline(always)] + fn min_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + unsafe { _mm512_min_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + unsafe { _mm512_max_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn split_i8x64(self, a: i8x64) -> (i8x32, i8x32) { + unsafe { + ( + _mm512_castsi512_si256(a.into()).simd_into(self), + _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn neg_i8x64(self, a: i8x64) -> i8x64 { + unsafe { _mm512_sub_epi8(_mm512_setzero_si512(), a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_u8_i8x64(self, a: i8x64) -> u8x64 { + __m512i::from(a).simd_into(self) + } + #[inline(always)] + fn reinterpret_u32_i8x64(self, a: i8x64) -> u32x16 { + __m512i::from(a).simd_into(self) + } + #[inline(always)] + fn splat_u8x64(self, val: u8) -> u8x64 { + unsafe { _mm512_set1_epi8(val.cast_signed()).simd_into(self) } + } + #[inline(always)] + fn load_array_u8x64(self, val: [u8; 64usize]) 
-> u8x64 { + u8x64 { + val: unsafe { core::mem::transmute_copy(&val) }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64 { + u8x64 { + val: unsafe { core::mem::transmute_copy(val) }, + simd: self, + } + } + #[inline(always)] + fn as_array_u8x64(self, a: u8x64) -> [u8; 64usize] { + unsafe { core::mem::transmute::<__m512i, [u8; 64usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_u8x64(self, a: &u8x64) -> &[u8; 64usize] { + unsafe { core::mem::transmute::<&__m512i, &[u8; 64usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_u8x64(self, a: &mut u8x64) -> &mut [u8; 64usize] { + unsafe { core::mem::transmute::<&mut __m512i, &mut [u8; 64usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_u8x64(self, a: u8x64, dest: &mut [u8; 64usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const u8, + dest.as_mut_ptr(), + 64usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_u8x64(self, a: u8x64) -> u8x64 { + unsafe { + u8x64 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_u8x64(self, a: u8x64) -> u8x64 { + unsafe { + u8x64 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_u8x64<const SHIFT: usize>(self, a: u8x64, b: u8x64) -> u8x64 { + unsafe { + if SHIFT >= 64usize { + return b; + } + let result = cross_block_alignr_512( + self.cvt_to_bytes_u8x64(b).val.0, + self.cvt_to_bytes_u8x64(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_u8x64(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u8x64<const SHIFT: usize>( + self, + a: u8x64, + b: u8x64, + ) -> u8x64 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = dyn_alignr_512( + self.cvt_to_bytes_u8x64(b).val.0, + self.cvt_to_bytes_u8x64(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_u8x64(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn add_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + unsafe { _mm512_add_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + unsafe { _mm512_sub_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + unsafe { + let dst_even = _mm512_mullo_epi16(a.into(), b.into()); + let dst_odd = _mm512_mullo_epi16( + _mm512_srli_epi16::<8>(a.into()), + _mm512_srli_epi16::<8>(b.into()), + ); + _mm512_or_si512( + _mm512_slli_epi16::<8>(dst_odd), + _mm512_and_si512(dst_even, _mm512_set1_epi16(0xFF)), + ) + .simd_into(self) + } + } + #[inline(always)] + fn and_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + unsafe { _mm512_and_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + unsafe { _mm512_or_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + unsafe { _mm512_xor_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_u8x64(self, a: u8x64) -> u8x64 { + a ^ !0 + } + #[inline(always)] + fn shl_u8x64(self, a: u8x64, shift: u32) -> u8x64 { + unsafe { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_256 = _mm512_castsi512_si256(val); + let hi_256 = _mm512_extracti64x4_epi64::<1>(val); + let lo_16 = _mm512_cvtepu8_epi16(lo_256); + let hi_16 = _mm512_cvtepu8_epi16(hi_256); + let lo_shifted = 
_mm512_sll_epi16(lo_16, shift_count); + let hi_shifted = _mm512_sll_epi16(hi_16, shift_count); + const PACK_LO_BYTES: __m512i = unsafe { + core::mem::transmute([ + 0u8, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, + 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, + 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, + 116, 118, 120, 122, 124, 126, + ]) + }; + let result = _mm512_permutex2var_epi8(lo_shifted, PACK_LO_BYTES, hi_shifted); + result.simd_into(self) + } + } + #[inline(always)] + fn shlv_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn shr_u8x64(self, a: u8x64, shift: u32) -> u8x64 { + unsafe { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_256 = _mm512_castsi512_si256(val); + let hi_256 = _mm512_extracti64x4_epi64::<1>(val); + let lo_16 = _mm512_cvtepu8_epi16(lo_256); + let hi_16 = _mm512_cvtepu8_epi16(hi_256); + let lo_shifted = _mm512_srl_epi16(lo_16, shift_count); + let hi_shifted = _mm512_srl_epi16(hi_16, shift_count); + const PACK_LO_BYTES: __m512i = unsafe { + core::mem::transmute([ + 0u8, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, + 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, + 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, + 116, 118, 120, 122, 124, 126, + ]) + }; + let result = _mm512_permutex2var_epi8(lo_shifted, PACK_LO_BYTES, hi_shifted); + result.simd_into(self) + } + } + #[inline(always)] + fn shrv_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn simd_eq_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + unsafe { + let mask = _mm512_cmpeq_epi8_mask(a.into(), b.into()); + _mm512_movm_epi8(mask).simd_into(self) + } + } + #[inline(always)] + fn simd_lt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + unsafe { + let mask = _mm512_cmplt_epu8_mask(a.into(), b.into()); + _mm512_movm_epi8(mask).simd_into(self) + } + } + #[inline(always)] + fn simd_le_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + unsafe { + let mask = _mm512_cmple_epu8_mask(a.into(), b.into()); + _mm512_movm_epi8(mask).simd_into(self) + } + } + #[inline(always)] + fn simd_ge_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + unsafe { + let mask = _mm512_cmpge_epu8_mask(a.into(), b.into()); + _mm512_movm_epi8(mask).simd_into(self) + } + } + #[inline(always)] + fn simd_gt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + unsafe { + let mask = _mm512_cmpgt_epu8_mask(a.into(), b.into()); + _mm512_movm_epi8(mask).simd_into(self) + } + } + #[inline(always)] + fn zip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + unsafe { + let idx = _mm512_set_epi8( + 95i8, 31i8, 94i8, 30i8, 93i8, 29i8, 92i8, 28i8, 91i8, 27i8, 90i8, 26i8, 89i8, 25i8, + 88i8, 24i8, 87i8, 23i8, 86i8, 22i8, 85i8, 21i8, 84i8, 20i8, 83i8, 19i8, 82i8, 18i8, + 81i8, 17i8, 80i8, 16i8, 79i8, 15i8, 78i8, 14i8, 77i8, 13i8, 76i8, 12i8, 75i8, 11i8, + 74i8, 10i8, 73i8, 9i8, 72i8, 8i8, 71i8, 7i8, 70i8, 6i8, 69i8, 5i8, 68i8, 4i8, 67i8, + 3i8, 66i8, 2i8, 65i8, 1i8, 64i8, 0i8, + ); + _mm512_permutex2var_epi8(a.into(), idx, b.into()).simd_into(self) + } + } + #[inline(always)] + fn zip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + unsafe { + let idx = _mm512_set_epi8( + 127i8, 63i8, 126i8, 62i8, 125i8, 61i8, 124i8, 60i8, 123i8, 59i8, 122i8, 58i8, 
+ 121i8, 57i8, 120i8, 56i8, 119i8, 55i8, 118i8, 54i8, 117i8, 53i8, 116i8, 52i8, + 115i8, 51i8, 114i8, 50i8, 113i8, 49i8, 112i8, 48i8, 111i8, 47i8, 110i8, 46i8, + 109i8, 45i8, 108i8, 44i8, 107i8, 43i8, 106i8, 42i8, 105i8, 41i8, 104i8, 40i8, + 103i8, 39i8, 102i8, 38i8, 101i8, 37i8, 100i8, 36i8, 99i8, 35i8, 98i8, 34i8, 97i8, + 33i8, 96i8, 32i8, + ); + _mm512_permutex2var_epi8(a.into(), idx, b.into()).simd_into(self) + } + } + #[inline(always)] + fn unzip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + unsafe { + let mask = _mm512_set4_epi64( + 0x0F0D0B0907050301u64.cast_signed(), + 0x0E0C0A0806040200u64.cast_signed(), + 0x0F0D0B0907050301u64.cast_signed(), + 0x0E0C0A0806040200u64.cast_signed(), + ); + let a_shuffled = _mm512_shuffle_epi8(a.into(), mask); + let b_shuffled = _mm512_shuffle_epi8(b.into(), mask); + let a_packed = _mm512_permutex_epi64::<0b11_01_10_00>(a_shuffled); + let b_packed = _mm512_permutex_epi64::<0b11_01_10_00>(b_shuffled); + _mm512_shuffle_i64x2::<0b01_00_01_00>(a_packed, b_packed).simd_into(self) + } + } + #[inline(always)] + fn unzip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + unsafe { + let mask = _mm512_set4_epi64( + 0x0F0D0B0907050301u64.cast_signed(), + 0x0E0C0A0806040200u64.cast_signed(), + 0x0F0D0B0907050301u64.cast_signed(), + 0x0E0C0A0806040200u64.cast_signed(), + ); + let a_shuffled = _mm512_shuffle_epi8(a.into(), mask); + let b_shuffled = _mm512_shuffle_epi8(b.into(), mask); + let a_packed = _mm512_permutex_epi64::<0b11_01_10_00>(a_shuffled); + let b_packed = _mm512_permutex_epi64::<0b11_01_10_00>(b_shuffled); + _mm512_shuffle_i64x2::<0b11_10_11_10>(a_packed, b_packed).simd_into(self) + } + } + #[inline(always)] + fn select_u8x64(self, a: mask8x64, b: u8x64, c: u8x64) -> u8x64 { + unsafe { + let k = _mm512_movepi8_mask(a.into()); + _mm512_mask_blend_epi8(k, c.into(), b.into()).simd_into(self) + } + } + #[inline(always)] + fn min_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + unsafe { _mm512_min_epu8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + unsafe { _mm512_max_epu8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn split_u8x64(self, a: u8x64) -> (u8x32, u8x32) { + unsafe { + ( + _mm512_castsi512_si256(a.into()).simd_into(self), + _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64 { + unsafe { + let v0 = _mm_loadu_si128(src.as_ptr() as *const _); + let v1 = _mm_loadu_si128(src.as_ptr().add(16usize) as *const _); + let v2 = _mm_loadu_si128(src.as_ptr().add(2 * 16usize) as *const _); + let v3 = _mm_loadu_si128(src.as_ptr().add(3 * 16usize) as *const _); + let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); + let v0 = _mm_shuffle_epi8(v0, mask); + let v1 = _mm_shuffle_epi8(v1, mask); + let v2 = _mm_shuffle_epi8(v2, mask); + let v3 = _mm_shuffle_epi8(v3, mask); + let tmp0 = _mm_unpacklo_epi32(v0, v1); + let tmp1 = _mm_unpackhi_epi32(v0, v1); + let tmp2 = _mm_unpacklo_epi32(v2, v3); + let tmp3 = _mm_unpackhi_epi32(v2, v3); + let out0 = _mm_unpacklo_epi64(tmp0, tmp2); + let out1 = _mm_unpackhi_epi64(tmp0, tmp2); + let out2 = _mm_unpacklo_epi64(tmp1, tmp3); + let out3 = _mm_unpackhi_epi64(tmp1, tmp3); + self.combine_u8x32( + self.combine_u8x16(out0.simd_into(self), out1.simd_into(self)), + self.combine_u8x16(out2.simd_into(self), out3.simd_into(self)), + ) + } + } + #[inline(always)] + fn store_interleaved_128_u8x64(self, a: u8x64, dest: 
&mut [u8; 64usize]) -> () { + let (v01, v23) = self.split_u8x64(a); + let (v0, v1) = self.split_u8x32(v01); + let (v2, v3) = self.split_u8x32(v23); + let v0 = v0.into(); + let v1 = v1.into(); + let v2 = v2.into(); + let v3 = v3.into(); + unsafe { + let tmp0 = _mm_unpacklo_epi32(v0, v1); + let tmp1 = _mm_unpackhi_epi32(v0, v1); + let tmp2 = _mm_unpacklo_epi32(v2, v3); + let tmp3 = _mm_unpackhi_epi32(v2, v3); + let out0 = _mm_unpacklo_epi64(tmp0, tmp2); + let out1 = _mm_unpackhi_epi64(tmp0, tmp2); + let out2 = _mm_unpacklo_epi64(tmp1, tmp3); + let out3 = _mm_unpackhi_epi64(tmp1, tmp3); + let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); + let out0 = _mm_shuffle_epi8(out0, mask); + let out1 = _mm_shuffle_epi8(out1, mask); + let out2 = _mm_shuffle_epi8(out2, mask); + let out3 = _mm_shuffle_epi8(out3, mask); + _mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0); + _mm_storeu_si128(dest.as_mut_ptr().add(16usize) as *mut _, out1); + _mm_storeu_si128(dest.as_mut_ptr().add(2 * 16usize) as *mut _, out2); + _mm_storeu_si128(dest.as_mut_ptr().add(3 * 16usize) as *mut _, out3); + } + } + #[inline(always)] + fn reinterpret_u32_u8x64(self, a: u8x64) -> u32x16 { + __m512i::from(a).simd_into(self) + } + #[inline(always)] + fn splat_mask8x64(self, val: i8) -> mask8x64 { + unsafe { _mm512_set1_epi8(val).simd_into(self) } + } + #[inline(always)] + fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64 { + mask8x64 { + val: unsafe { core::mem::transmute_copy(&val) }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_mask8x64(self, val: &[i8; 64usize]) -> mask8x64 { + mask8x64 { + val: unsafe { core::mem::transmute_copy(val) }, + simd: self, + } + } + #[inline(always)] + fn as_array_mask8x64(self, a: mask8x64) -> [i8; 64usize] { + unsafe { core::mem::transmute::<__m512i, [i8; 64usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_mask8x64(self, a: &mask8x64) -> &[i8; 64usize] { + unsafe { core::mem::transmute::<&__m512i, &[i8; 64usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_mask8x64(self, a: &mut mask8x64) -> &mut [i8; 64usize] { + unsafe { core::mem::transmute::<&mut __m512i, &mut [i8; 64usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_mask8x64(self, a: mask8x64, dest: &mut [i8; 64usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const i8, + dest.as_mut_ptr(), + 64usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_mask8x64(self, a: u8x64) -> mask8x64 { + unsafe { + mask8x64 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_mask8x64(self, a: mask8x64) -> u8x64 { + unsafe { + u8x64 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_mask8x64( + self, + a: mask8x64, + b: mask8x64, + ) -> mask8x64 { + unsafe { + if SHIFT >= 64usize { + return b; + } + let result = cross_block_alignr_512( + self.cvt_to_bytes_mask8x64(b).val.0, + self.cvt_to_bytes_mask8x64(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_mask8x64(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask8x64( + self, + a: mask8x64, + b: mask8x64, + ) -> mask8x64 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = dyn_alignr_512( + self.cvt_to_bytes_mask8x64(b).val.0, + self.cvt_to_bytes_mask8x64(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_mask8x64(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + 
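+ // NOTE (reader's sketch, not from the generator): every `slide_*` in this file is generic over a
+ // const lane count `SHIFT` (compared against the lane total and scaled by the element size above).
+ // Both inputs are viewed as bytes and `cross_block_alignr_512` extracts 64 bytes from the
+ // concatenation [b : a] (b in the low bytes), matching the alignr semantics documented on the
+ // cross-block alignr helpers. Lane-wise this appears to compute, for N lanes per vector:
+ //     out[i] = if i + SHIFT < N { a[i + SHIFT] } else { b[i + SHIFT - N] }
+ // with the early `SHIFT >= N` guard returning `b` unchanged. `slide_within_blocks_*` performs the
+ // same extraction independently inside each 16-byte block via `dyn_alignr_512`.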
#[inline(always)] + fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { + unsafe { _mm512_and_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { + unsafe { _mm512_or_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { + unsafe { _mm512_xor_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_mask8x64(self, a: mask8x64) -> mask8x64 { + a ^ !0 + } + #[inline(always)] + fn select_mask8x64( + self, + a: mask8x64, + b: mask8x64, + c: mask8x64, + ) -> mask8x64 { + unsafe { + let k = _mm512_movepi8_mask(a.into()); + _mm512_mask_blend_epi8(k, c.into(), b.into()).simd_into(self) + } + } + #[inline(always)] + fn simd_eq_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { + unsafe { + let mask = _mm512_cmpeq_epi8_mask(a.into(), b.into()); + _mm512_movm_epi8(mask).simd_into(self) + } + } + #[inline(always)] + fn any_true_mask8x64(self, a: mask8x64) -> bool { + unsafe { _mm512_movepi8_mask(a.into()) != 0 } + } + #[inline(always)] + fn all_true_mask8x64(self, a: mask8x64) -> bool { + unsafe { _mm512_movepi8_mask(a.into()) == 0xFFFFFFFFFFFFFFFFu64 } + } + #[inline(always)] + fn any_false_mask8x64(self, a: mask8x64) -> bool { + unsafe { _mm512_movepi8_mask(a.into()) != 0xFFFFFFFFFFFFFFFFu64 } + } + #[inline(always)] + fn all_false_mask8x64(self, a: mask8x64) -> bool { + unsafe { _mm512_movepi8_mask(a.into()) == 0 } + } + #[inline(always)] + fn split_mask8x64(self, a: mask8x64) -> (mask8x32, mask8x32) { + unsafe { + ( + _mm512_castsi512_si256(a.into()).simd_into(self), + _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn splat_i16x32(self, val: i16) -> i16x32 { + unsafe { _mm512_set1_epi16(val).simd_into(self) } + } + #[inline(always)] + fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32 { + i16x32 { + val: unsafe { core::mem::transmute_copy(&val) }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32 { + i16x32 { + val: unsafe { core::mem::transmute_copy(val) }, + simd: self, + } + } + #[inline(always)] + fn as_array_i16x32(self, a: i16x32) -> [i16; 32usize] { + unsafe { core::mem::transmute::<__m512i, [i16; 32usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_i16x32(self, a: &i16x32) -> &[i16; 32usize] { + unsafe { core::mem::transmute::<&__m512i, &[i16; 32usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_i16x32(self, a: &mut i16x32) -> &mut [i16; 32usize] { + unsafe { core::mem::transmute::<&mut __m512i, &mut [i16; 32usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_i16x32(self, a: i16x32, dest: &mut [i16; 32usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const i16, + dest.as_mut_ptr(), + 32usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_i16x32(self, a: u8x64) -> i16x32 { + unsafe { + i16x32 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_i16x32(self, a: i16x32) -> u8x64 { + unsafe { + u8x64 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + unsafe { + if SHIFT >= 32usize { + return b; + } + let result = cross_block_alignr_512( + self.cvt_to_bytes_i16x32(b).val.0, + self.cvt_to_bytes_i16x32(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_i16x32(u8x64 { + val: 
crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i16x32( + self, + a: i16x32, + b: i16x32, + ) -> i16x32 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = dyn_alignr_512( + self.cvt_to_bytes_i16x32(b).val.0, + self.cvt_to_bytes_i16x32(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_i16x32(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn add_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + unsafe { _mm512_add_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + unsafe { _mm512_sub_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + unsafe { _mm512_mullo_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn and_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + unsafe { _mm512_and_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + unsafe { _mm512_or_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + unsafe { _mm512_xor_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_i16x32(self, a: i16x32) -> i16x32 { + a ^ !0 + } + #[inline(always)] + fn shl_i16x32(self, a: i16x32, shift: u32) -> i16x32 { + unsafe { + _mm512_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) + } + } + #[inline(always)] + fn shlv_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn shr_i16x32(self, a: i16x32, shift: u32) -> i16x32 { + unsafe { + _mm512_sra_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) + } + } + #[inline(always)] + fn shrv_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn simd_eq_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + unsafe { + let mask = _mm512_cmpeq_epi16_mask(a.into(), b.into()); + _mm512_movm_epi16(mask).simd_into(self) + } + } + #[inline(always)] + fn simd_lt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + unsafe { + let mask = _mm512_cmplt_epi16_mask(a.into(), b.into()); + _mm512_movm_epi16(mask).simd_into(self) + } + } + #[inline(always)] + fn simd_le_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + unsafe { + let mask = _mm512_cmple_epi16_mask(a.into(), b.into()); + _mm512_movm_epi16(mask).simd_into(self) + } + } + #[inline(always)] + fn simd_ge_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + unsafe { + let mask = _mm512_cmpge_epi16_mask(a.into(), b.into()); + _mm512_movm_epi16(mask).simd_into(self) + } + } + #[inline(always)] + fn simd_gt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + unsafe { + let mask = _mm512_cmpgt_epi16_mask(a.into(), b.into()); + _mm512_movm_epi16(mask).simd_into(self) + } + } + #[inline(always)] + fn zip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + unsafe { + let idx = _mm512_set_epi16( + 47i16, 15i16, 46i16, 14i16, 45i16, 13i16, 44i16, 12i16, 43i16, 11i16, 42i16, 10i16, + 41i16, 9i16, 40i16, 8i16, 39i16, 7i16, 38i16, 6i16, 37i16, 5i16, 36i16, 4i16, + 35i16, 3i16, 34i16, 2i16, 33i16, 1i16, 32i16, 0i16, + ); + _mm512_permutex2var_epi16(a.into(), idx, b.into()).simd_into(self) + } + } + #[inline(always)] + fn 
zip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + unsafe { + let idx = _mm512_set_epi16( + 63i16, 31i16, 62i16, 30i16, 61i16, 29i16, 60i16, 28i16, 59i16, 27i16, 58i16, 26i16, + 57i16, 25i16, 56i16, 24i16, 55i16, 23i16, 54i16, 22i16, 53i16, 21i16, 52i16, 20i16, + 51i16, 19i16, 50i16, 18i16, 49i16, 17i16, 48i16, 16i16, + ); + _mm512_permutex2var_epi16(a.into(), idx, b.into()).simd_into(self) + } + } + #[inline(always)] + fn unzip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + unsafe { + let mask = _mm512_set4_epi64( + 0x0F0E0B0A07060302u64.cast_signed(), + 0x0D0C090805040100u64.cast_signed(), + 0x0F0E0B0A07060302u64.cast_signed(), + 0x0D0C090805040100u64.cast_signed(), + ); + let a_shuffled = _mm512_shuffle_epi8(a.into(), mask); + let b_shuffled = _mm512_shuffle_epi8(b.into(), mask); + let a_packed = _mm512_permutex_epi64::<0b11_01_10_00>(a_shuffled); + let b_packed = _mm512_permutex_epi64::<0b11_01_10_00>(b_shuffled); + _mm512_shuffle_i64x2::<0b01_00_01_00>(a_packed, b_packed).simd_into(self) + } + } + #[inline(always)] + fn unzip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + unsafe { + let mask = _mm512_set4_epi64( + 0x0F0E0B0A07060302u64.cast_signed(), + 0x0D0C090805040100u64.cast_signed(), + 0x0F0E0B0A07060302u64.cast_signed(), + 0x0D0C090805040100u64.cast_signed(), + ); + let a_shuffled = _mm512_shuffle_epi8(a.into(), mask); + let b_shuffled = _mm512_shuffle_epi8(b.into(), mask); + let a_packed = _mm512_permutex_epi64::<0b11_01_10_00>(a_shuffled); + let b_packed = _mm512_permutex_epi64::<0b11_01_10_00>(b_shuffled); + _mm512_shuffle_i64x2::<0b11_10_11_10>(a_packed, b_packed).simd_into(self) + } + } + #[inline(always)] + fn select_i16x32(self, a: mask16x32, b: i16x32, c: i16x32) -> i16x32 { + unsafe { + let k = _mm512_movepi16_mask(a.into()); + _mm512_mask_blend_epi16(k, c.into(), b.into()).simd_into(self) + } + } + #[inline(always)] + fn min_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + unsafe { _mm512_min_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + unsafe { _mm512_max_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn split_i16x32(self, a: i16x32) -> (i16x16, i16x16) { + unsafe { + ( + _mm512_castsi512_si256(a.into()).simd_into(self), + _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn neg_i16x32(self, a: i16x32) -> i16x32 { + unsafe { _mm512_sub_epi16(_mm512_setzero_si512(), a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_u8_i16x32(self, a: i16x32) -> u8x64 { + __m512i::from(a).simd_into(self) + } + #[inline(always)] + fn reinterpret_u32_i16x32(self, a: i16x32) -> u32x16 { + __m512i::from(a).simd_into(self) + } + #[inline(always)] + fn splat_u16x32(self, val: u16) -> u16x32 { + unsafe { _mm512_set1_epi16(val.cast_signed()).simd_into(self) } + } + #[inline(always)] + fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32 { + u16x32 { + val: unsafe { core::mem::transmute_copy(&val) }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32 { + u16x32 { + val: unsafe { core::mem::transmute_copy(val) }, + simd: self, + } + } + #[inline(always)] + fn as_array_u16x32(self, a: u16x32) -> [u16; 32usize] { + unsafe { core::mem::transmute::<__m512i, [u16; 32usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_u16x32(self, a: &u16x32) -> &[u16; 32usize] { + unsafe { core::mem::transmute::<&__m512i, &[u16; 32usize]>(&a.val.0) } + } + 
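+ // NOTE (reader's note): the `load_array_*` / `as_array_*` accessors rely only on `__m512i` and the
+ // matching element array (here `[u16; 32]`) both being 64 bytes, so the `transmute`/`transmute_copy`
+ // calls just reinterpret the register bytes; the `&`/`&mut` variants go from the 64-byte-aligned
+ // register type to a less strictly aligned array type, which is the usual justification for this
+ // reference-transmute pattern.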
#[inline(always)] + fn as_array_mut_u16x32(self, a: &mut u16x32) -> &mut [u16; 32usize] { + unsafe { core::mem::transmute::<&mut __m512i, &mut [u16; 32usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const u16, + dest.as_mut_ptr(), + 32usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_u16x32(self, a: u8x64) -> u16x32 { + unsafe { + u16x32 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_u16x32(self, a: u16x32) -> u8x64 { + unsafe { + u8x64 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + unsafe { + if SHIFT >= 32usize { + return b; + } + let result = cross_block_alignr_512( + self.cvt_to_bytes_u16x32(b).val.0, + self.cvt_to_bytes_u16x32(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_u16x32(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u16x32( + self, + a: u16x32, + b: u16x32, + ) -> u16x32 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = dyn_alignr_512( + self.cvt_to_bytes_u16x32(b).val.0, + self.cvt_to_bytes_u16x32(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_u16x32(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn add_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + unsafe { _mm512_add_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + unsafe { _mm512_sub_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + unsafe { _mm512_mullo_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn and_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + unsafe { _mm512_and_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + unsafe { _mm512_or_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + unsafe { _mm512_xor_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_u16x32(self, a: u16x32) -> u16x32 { + a ^ !0 + } + #[inline(always)] + fn shl_u16x32(self, a: u16x32, shift: u32) -> u16x32 { + unsafe { + _mm512_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) + } + } + #[inline(always)] + fn shlv_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn shr_u16x32(self, a: u16x32, shift: u32) -> u16x32 { + unsafe { + _mm512_srl_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) + } + } + #[inline(always)] + fn shrv_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn simd_eq_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + unsafe { + let mask = _mm512_cmpeq_epi16_mask(a.into(), b.into()); + _mm512_movm_epi16(mask).simd_into(self) + } + } + #[inline(always)] + fn simd_lt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + unsafe { + let mask = _mm512_cmplt_epu16_mask(a.into(), b.into()); + _mm512_movm_epi16(mask).simd_into(self) + } + } + #[inline(always)] + fn 
simd_le_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + unsafe { + let mask = _mm512_cmple_epu16_mask(a.into(), b.into()); + _mm512_movm_epi16(mask).simd_into(self) + } + } + #[inline(always)] + fn simd_ge_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + unsafe { + let mask = _mm512_cmpge_epu16_mask(a.into(), b.into()); + _mm512_movm_epi16(mask).simd_into(self) + } + } + #[inline(always)] + fn simd_gt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + unsafe { + let mask = _mm512_cmpgt_epu16_mask(a.into(), b.into()); + _mm512_movm_epi16(mask).simd_into(self) + } + } + #[inline(always)] + fn zip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + unsafe { + let idx = _mm512_set_epi16( + 47i16, 15i16, 46i16, 14i16, 45i16, 13i16, 44i16, 12i16, 43i16, 11i16, 42i16, 10i16, + 41i16, 9i16, 40i16, 8i16, 39i16, 7i16, 38i16, 6i16, 37i16, 5i16, 36i16, 4i16, + 35i16, 3i16, 34i16, 2i16, 33i16, 1i16, 32i16, 0i16, + ); + _mm512_permutex2var_epi16(a.into(), idx, b.into()).simd_into(self) + } + } + #[inline(always)] + fn zip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + unsafe { + let idx = _mm512_set_epi16( + 63i16, 31i16, 62i16, 30i16, 61i16, 29i16, 60i16, 28i16, 59i16, 27i16, 58i16, 26i16, + 57i16, 25i16, 56i16, 24i16, 55i16, 23i16, 54i16, 22i16, 53i16, 21i16, 52i16, 20i16, + 51i16, 19i16, 50i16, 18i16, 49i16, 17i16, 48i16, 16i16, + ); + _mm512_permutex2var_epi16(a.into(), idx, b.into()).simd_into(self) + } + } + #[inline(always)] + fn unzip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + unsafe { + let mask = _mm512_set4_epi64( + 0x0F0E0B0A07060302u64.cast_signed(), + 0x0D0C090805040100u64.cast_signed(), + 0x0F0E0B0A07060302u64.cast_signed(), + 0x0D0C090805040100u64.cast_signed(), + ); + let a_shuffled = _mm512_shuffle_epi8(a.into(), mask); + let b_shuffled = _mm512_shuffle_epi8(b.into(), mask); + let a_packed = _mm512_permutex_epi64::<0b11_01_10_00>(a_shuffled); + let b_packed = _mm512_permutex_epi64::<0b11_01_10_00>(b_shuffled); + _mm512_shuffle_i64x2::<0b01_00_01_00>(a_packed, b_packed).simd_into(self) + } + } + #[inline(always)] + fn unzip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + unsafe { + let mask = _mm512_set4_epi64( + 0x0F0E0B0A07060302u64.cast_signed(), + 0x0D0C090805040100u64.cast_signed(), + 0x0F0E0B0A07060302u64.cast_signed(), + 0x0D0C090805040100u64.cast_signed(), + ); + let a_shuffled = _mm512_shuffle_epi8(a.into(), mask); + let b_shuffled = _mm512_shuffle_epi8(b.into(), mask); + let a_packed = _mm512_permutex_epi64::<0b11_01_10_00>(a_shuffled); + let b_packed = _mm512_permutex_epi64::<0b11_01_10_00>(b_shuffled); + _mm512_shuffle_i64x2::<0b11_10_11_10>(a_packed, b_packed).simd_into(self) + } + } + #[inline(always)] + fn select_u16x32(self, a: mask16x32, b: u16x32, c: u16x32) -> u16x32 { + unsafe { + let k = _mm512_movepi16_mask(a.into()); + _mm512_mask_blend_epi16(k, c.into(), b.into()).simd_into(self) + } + } + #[inline(always)] + fn min_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + unsafe { _mm512_min_epu16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + unsafe { _mm512_max_epu16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn split_u16x32(self, a: u16x32) -> (u16x16, u16x16) { + unsafe { + ( + _mm512_castsi512_si256(a.into()).simd_into(self), + _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32 { + unsafe { + let v0 = _mm_loadu_si128(src.as_ptr() as 
*const _); + let v1 = _mm_loadu_si128(src.as_ptr().add(8usize) as *const _); + let v2 = _mm_loadu_si128(src.as_ptr().add(2 * 8usize) as *const _); + let v3 = _mm_loadu_si128(src.as_ptr().add(3 * 8usize) as *const _); + let mask = _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15); + let v0 = _mm_shuffle_epi8(v0, mask); + let v1 = _mm_shuffle_epi8(v1, mask); + let v2 = _mm_shuffle_epi8(v2, mask); + let v3 = _mm_shuffle_epi8(v3, mask); + let tmp0 = _mm_unpacklo_epi32(v0, v1); + let tmp1 = _mm_unpackhi_epi32(v0, v1); + let tmp2 = _mm_unpacklo_epi32(v2, v3); + let tmp3 = _mm_unpackhi_epi32(v2, v3); + let out0 = _mm_unpacklo_epi64(tmp0, tmp2); + let out1 = _mm_unpackhi_epi64(tmp0, tmp2); + let out2 = _mm_unpacklo_epi64(tmp1, tmp3); + let out3 = _mm_unpackhi_epi64(tmp1, tmp3); + self.combine_u16x16( + self.combine_u16x8(out0.simd_into(self), out1.simd_into(self)), + self.combine_u16x8(out2.simd_into(self), out3.simd_into(self)), + ) + } + } + #[inline(always)] + fn store_interleaved_128_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> () { + let (v01, v23) = self.split_u16x32(a); + let (v0, v1) = self.split_u16x16(v01); + let (v2, v3) = self.split_u16x16(v23); + let v0 = v0.into(); + let v1 = v1.into(); + let v2 = v2.into(); + let v3 = v3.into(); + unsafe { + let tmp0 = _mm_unpacklo_epi32(v0, v1); + let tmp1 = _mm_unpackhi_epi32(v0, v1); + let tmp2 = _mm_unpacklo_epi32(v2, v3); + let tmp3 = _mm_unpackhi_epi32(v2, v3); + let out0 = _mm_unpacklo_epi64(tmp0, tmp2); + let out1 = _mm_unpackhi_epi64(tmp0, tmp2); + let out2 = _mm_unpacklo_epi64(tmp1, tmp3); + let out3 = _mm_unpackhi_epi64(tmp1, tmp3); + let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); + let out0 = _mm_shuffle_epi8(out0, mask); + let out1 = _mm_shuffle_epi8(out1, mask); + let out2 = _mm_shuffle_epi8(out2, mask); + let out3 = _mm_shuffle_epi8(out3, mask); + _mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0); + _mm_storeu_si128(dest.as_mut_ptr().add(8usize) as *mut _, out1); + _mm_storeu_si128(dest.as_mut_ptr().add(2 * 8usize) as *mut _, out2); + _mm_storeu_si128(dest.as_mut_ptr().add(3 * 8usize) as *mut _, out3); + } + } + #[inline(always)] + fn narrow_u16x32(self, a: u16x32) -> u8x32 { + unsafe { _mm512_cvtepi16_epi8(a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_u8_u16x32(self, a: u16x32) -> u8x64 { + __m512i::from(a).simd_into(self) + } + #[inline(always)] + fn reinterpret_u32_u16x32(self, a: u16x32) -> u32x16 { + __m512i::from(a).simd_into(self) + } + #[inline(always)] + fn splat_mask16x32(self, val: i16) -> mask16x32 { + unsafe { _mm512_set1_epi16(val).simd_into(self) } + } + #[inline(always)] + fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32 { + mask16x32 { + val: unsafe { core::mem::transmute_copy(&val) }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_mask16x32(self, val: &[i16; 32usize]) -> mask16x32 { + mask16x32 { + val: unsafe { core::mem::transmute_copy(val) }, + simd: self, + } + } + #[inline(always)] + fn as_array_mask16x32(self, a: mask16x32) -> [i16; 32usize] { + unsafe { core::mem::transmute::<__m512i, [i16; 32usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_mask16x32(self, a: &mask16x32) -> &[i16; 32usize] { + unsafe { core::mem::transmute::<&__m512i, &[i16; 32usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_mask16x32(self, a: &mut mask16x32) -> &mut [i16; 32usize] { + unsafe { core::mem::transmute::<&mut __m512i, &mut [i16; 32usize]>(&mut a.val.0) } + } + #[inline(always)] + fn 
store_array_mask16x32(self, a: mask16x32, dest: &mut [i16; 32usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const i16, + dest.as_mut_ptr(), + 32usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_mask16x32(self, a: u8x64) -> mask16x32 { + unsafe { + mask16x32 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_mask16x32(self, a: mask16x32) -> u8x64 { + unsafe { + u8x64 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_mask16x32( + self, + a: mask16x32, + b: mask16x32, + ) -> mask16x32 { + unsafe { + if SHIFT >= 32usize { + return b; + } + let result = cross_block_alignr_512( + self.cvt_to_bytes_mask16x32(b).val.0, + self.cvt_to_bytes_mask16x32(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_mask16x32(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask16x32( + self, + a: mask16x32, + b: mask16x32, + ) -> mask16x32 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = dyn_alignr_512( + self.cvt_to_bytes_mask16x32(b).val.0, + self.cvt_to_bytes_mask16x32(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_mask16x32(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { + unsafe { _mm512_and_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { + unsafe { _mm512_or_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { + unsafe { _mm512_xor_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_mask16x32(self, a: mask16x32) -> mask16x32 { + a ^ !0 + } + #[inline(always)] + fn select_mask16x32( + self, + a: mask16x32, + b: mask16x32, + c: mask16x32, + ) -> mask16x32 { + unsafe { + let k = _mm512_movepi16_mask(a.into()); + _mm512_mask_blend_epi16(k, c.into(), b.into()).simd_into(self) + } + } + #[inline(always)] + fn simd_eq_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { + unsafe { + let mask = _mm512_cmpeq_epi16_mask(a.into(), b.into()); + _mm512_movm_epi16(mask).simd_into(self) + } + } + #[inline(always)] + fn any_true_mask16x32(self, a: mask16x32) -> bool { + unsafe { _mm512_movepi16_mask(a.into()) != 0 } + } + #[inline(always)] + fn all_true_mask16x32(self, a: mask16x32) -> bool { + unsafe { _mm512_movepi16_mask(a.into()) == 0xFFFFFFFFu32 } + } + #[inline(always)] + fn any_false_mask16x32(self, a: mask16x32) -> bool { + unsafe { _mm512_movepi16_mask(a.into()) != 0xFFFFFFFFu32 } + } + #[inline(always)] + fn all_false_mask16x32(self, a: mask16x32) -> bool { + unsafe { _mm512_movepi16_mask(a.into()) == 0 } + } + #[inline(always)] + fn split_mask16x32(self, a: mask16x32) -> (mask16x16, mask16x16) { + unsafe { + ( + _mm512_castsi512_si256(a.into()).simd_into(self), + _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn splat_i32x16(self, val: i32) -> i32x16 { + unsafe { _mm512_set1_epi32(val).simd_into(self) } + } + #[inline(always)] + fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16 { + i32x16 { + val: unsafe { core::mem::transmute_copy(&val) }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16 { + i32x16 { + val: unsafe { 
core::mem::transmute_copy(val) }, + simd: self, + } + } + #[inline(always)] + fn as_array_i32x16(self, a: i32x16) -> [i32; 16usize] { + unsafe { core::mem::transmute::<__m512i, [i32; 16usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_i32x16(self, a: &i32x16) -> &[i32; 16usize] { + unsafe { core::mem::transmute::<&__m512i, &[i32; 16usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_i32x16(self, a: &mut i32x16) -> &mut [i32; 16usize] { + unsafe { core::mem::transmute::<&mut __m512i, &mut [i32; 16usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_i32x16(self, a: i32x16, dest: &mut [i32; 16usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const i32, + dest.as_mut_ptr(), + 16usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_i32x16(self, a: u8x64) -> i32x16 { + unsafe { + i32x16 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_i32x16(self, a: i32x16) -> u8x64 { + unsafe { + u8x64 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = cross_block_alignr_512( + self.cvt_to_bytes_i32x16(b).val.0, + self.cvt_to_bytes_i32x16(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_i32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i32x16( + self, + a: i32x16, + b: i32x16, + ) -> i32x16 { + unsafe { + if SHIFT >= 4usize { + return b; + } + let result = dyn_alignr_512( + self.cvt_to_bytes_i32x16(b).val.0, + self.cvt_to_bytes_i32x16(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_i32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn add_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + unsafe { _mm512_add_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + unsafe { _mm512_sub_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + unsafe { _mm512_mullo_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn and_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + unsafe { _mm512_and_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + unsafe { _mm512_or_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + unsafe { _mm512_xor_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_i32x16(self, a: i32x16) -> i32x16 { + a ^ !0 + } + #[inline(always)] + fn shl_i32x16(self, a: i32x16, shift: u32) -> i32x16 { + unsafe { + _mm512_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) + } + } + #[inline(always)] + fn shlv_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + unsafe { _mm512_sllv_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn shr_i32x16(self, a: i32x16, shift: u32) -> i32x16 { + unsafe { + _mm512_sra_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) + } + } + #[inline(always)] + fn shrv_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + unsafe { _mm512_srav_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn simd_eq_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + unsafe { 
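+ // NOTE (reader's note): AVX-512 comparisons return a k-register bitmask (`__mmask16` here) rather
+ // than a full-width vector, so each `simd_*` comparison expands the mask back to lane-wide
+ // all-ones/all-zeros with `_mm512_movm_epi32`, keeping `mask32x16` as a full 512-bit vector like
+ // the other mask operations in this file expect.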
+ let mask = _mm512_cmpeq_epi32_mask(a.into(), b.into()); + _mm512_movm_epi32(mask).simd_into(self) + } + } + #[inline(always)] + fn simd_lt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + unsafe { + let mask = _mm512_cmplt_epi32_mask(a.into(), b.into()); + _mm512_movm_epi32(mask).simd_into(self) + } + } + #[inline(always)] + fn simd_le_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + unsafe { + let mask = _mm512_cmple_epi32_mask(a.into(), b.into()); + _mm512_movm_epi32(mask).simd_into(self) + } + } + #[inline(always)] + fn simd_ge_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + unsafe { + let mask = _mm512_cmpge_epi32_mask(a.into(), b.into()); + _mm512_movm_epi32(mask).simd_into(self) + } + } + #[inline(always)] + fn simd_gt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + unsafe { + let mask = _mm512_cmpgt_epi32_mask(a.into(), b.into()); + _mm512_movm_epi32(mask).simd_into(self) + } + } + #[inline(always)] + fn zip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + unsafe { + let idx = _mm512_set_epi32( + 23i32, 7i32, 22i32, 6i32, 21i32, 5i32, 20i32, 4i32, 19i32, 3i32, 18i32, 2i32, + 17i32, 1i32, 16i32, 0i32, + ); + _mm512_permutex2var_epi32(a.into(), idx, b.into()).simd_into(self) + } + } + #[inline(always)] + fn zip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + unsafe { + let idx = _mm512_set_epi32( + 31i32, 15i32, 30i32, 14i32, 29i32, 13i32, 28i32, 12i32, 27i32, 11i32, 26i32, 10i32, + 25i32, 9i32, 24i32, 8i32, + ); + _mm512_permutex2var_epi32(a.into(), idx, b.into()).simd_into(self) + } + } + #[inline(always)] + fn unzip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + unsafe { + let t1 = _mm512_permutexvar_epi32( + _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15), + a.into(), + ); + let t2 = _mm512_permutexvar_epi32( + _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15), + b.into(), + ); + _mm512_shuffle_i32x4::<0b01_00_01_00>(t1, t2).simd_into(self) + } + } + #[inline(always)] + fn unzip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + unsafe { + let t1 = _mm512_permutexvar_epi32( + _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15), + a.into(), + ); + let t2 = _mm512_permutexvar_epi32( + _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15), + b.into(), + ); + _mm512_shuffle_i32x4::<0b11_10_11_10>(t1, t2).simd_into(self) + } + } + #[inline(always)] + fn select_i32x16(self, a: mask32x16, b: i32x16, c: i32x16) -> i32x16 { + unsafe { + let k = _mm512_movepi32_mask(a.into()); + _mm512_mask_blend_epi32(k, c.into(), b.into()).simd_into(self) + } + } + #[inline(always)] + fn min_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + unsafe { _mm512_min_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + unsafe { _mm512_max_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn split_i32x16(self, a: i32x16) -> (i32x8, i32x8) { + unsafe { + ( + _mm512_castsi512_si256(a.into()).simd_into(self), + _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn neg_i32x16(self, a: i32x16) -> i32x16 { + unsafe { _mm512_sub_epi32(_mm512_setzero_si512(), a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_u8_i32x16(self, a: i32x16) -> u8x64 { + __m512i::from(a).simd_into(self) + } + #[inline(always)] + fn reinterpret_u32_i32x16(self, a: i32x16) -> u32x16 { + __m512i::from(a).simd_into(self) + } + #[inline(always)] + fn cvt_f32_i32x16(self, a: i32x16) 
-> f32x16 { + unsafe { _mm512_cvtepi32_ps(a.into()).simd_into(self) } + } + #[inline(always)] + fn splat_u32x16(self, val: u32) -> u32x16 { + unsafe { _mm512_set1_epi32(val.cast_signed()).simd_into(self) } + } + #[inline(always)] + fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16 { + u32x16 { + val: unsafe { core::mem::transmute_copy(&val) }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16 { + u32x16 { + val: unsafe { core::mem::transmute_copy(val) }, + simd: self, + } + } + #[inline(always)] + fn as_array_u32x16(self, a: u32x16) -> [u32; 16usize] { + unsafe { core::mem::transmute::<__m512i, [u32; 16usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_u32x16(self, a: &u32x16) -> &[u32; 16usize] { + unsafe { core::mem::transmute::<&__m512i, &[u32; 16usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_u32x16(self, a: &mut u32x16) -> &mut [u32; 16usize] { + unsafe { core::mem::transmute::<&mut __m512i, &mut [u32; 16usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const u32, + dest.as_mut_ptr(), + 16usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_u32x16(self, a: u8x64) -> u32x16 { + unsafe { + u32x16 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_u32x16(self, a: u32x16) -> u8x64 { + unsafe { + u8x64 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = cross_block_alignr_512( + self.cvt_to_bytes_u32x16(b).val.0, + self.cvt_to_bytes_u32x16(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_u32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u32x16( + self, + a: u32x16, + b: u32x16, + ) -> u32x16 { + unsafe { + if SHIFT >= 4usize { + return b; + } + let result = dyn_alignr_512( + self.cvt_to_bytes_u32x16(b).val.0, + self.cvt_to_bytes_u32x16(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_u32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn add_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + unsafe { _mm512_add_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + unsafe { _mm512_sub_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + unsafe { _mm512_mullo_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn and_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + unsafe { _mm512_and_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + unsafe { _mm512_or_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + unsafe { _mm512_xor_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_u32x16(self, a: u32x16) -> u32x16 { + a ^ !0 + } + #[inline(always)] + fn shl_u32x16(self, a: u32x16, shift: u32) -> u32x16 { + unsafe { + _mm512_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) + } + } + #[inline(always)] + fn shlv_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + 
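+ // NOTE (reader's note): 32-bit lanes get native AVX-512 shifts, both uniform (`_mm512_sll/srl/sra_epi32`
+ // with the count in an XMM register) and per-lane (`_mm512_sllv/srlv/srav_epi32`, as below). The 8-bit
+ // uniform shifts earlier in the file instead widen each 256-bit half to 16-bit lanes, shift, and re-pack
+ // the low bytes with a precomputed `_mm512_permutex2var_epi8` index, while the per-lane 8- and 16-bit
+ // variants fall back to a scalar `core::array::from_fn` loop.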
unsafe { _mm512_sllv_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn shr_u32x16(self, a: u32x16, shift: u32) -> u32x16 { + unsafe { + _mm512_srl_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) + } + } + #[inline(always)] + fn shrv_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + unsafe { _mm512_srlv_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn simd_eq_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + unsafe { + let mask = _mm512_cmpeq_epi32_mask(a.into(), b.into()); + _mm512_movm_epi32(mask).simd_into(self) + } + } + #[inline(always)] + fn simd_lt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + unsafe { + let mask = _mm512_cmplt_epu32_mask(a.into(), b.into()); + _mm512_movm_epi32(mask).simd_into(self) + } + } + #[inline(always)] + fn simd_le_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + unsafe { + let mask = _mm512_cmple_epu32_mask(a.into(), b.into()); + _mm512_movm_epi32(mask).simd_into(self) + } + } + #[inline(always)] + fn simd_ge_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + unsafe { + let mask = _mm512_cmpge_epu32_mask(a.into(), b.into()); + _mm512_movm_epi32(mask).simd_into(self) + } + } + #[inline(always)] + fn simd_gt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + unsafe { + let mask = _mm512_cmpgt_epu32_mask(a.into(), b.into()); + _mm512_movm_epi32(mask).simd_into(self) + } + } + #[inline(always)] + fn zip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + unsafe { + let idx = _mm512_set_epi32( + 23i32, 7i32, 22i32, 6i32, 21i32, 5i32, 20i32, 4i32, 19i32, 3i32, 18i32, 2i32, + 17i32, 1i32, 16i32, 0i32, + ); + _mm512_permutex2var_epi32(a.into(), idx, b.into()).simd_into(self) + } + } + #[inline(always)] + fn zip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + unsafe { + let idx = _mm512_set_epi32( + 31i32, 15i32, 30i32, 14i32, 29i32, 13i32, 28i32, 12i32, 27i32, 11i32, 26i32, 10i32, + 25i32, 9i32, 24i32, 8i32, + ); + _mm512_permutex2var_epi32(a.into(), idx, b.into()).simd_into(self) + } + } + #[inline(always)] + fn unzip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + unsafe { + let t1 = _mm512_permutexvar_epi32( + _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15), + a.into(), + ); + let t2 = _mm512_permutexvar_epi32( + _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15), + b.into(), + ); + _mm512_shuffle_i32x4::<0b01_00_01_00>(t1, t2).simd_into(self) + } + } + #[inline(always)] + fn unzip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + unsafe { + let t1 = _mm512_permutexvar_epi32( + _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15), + a.into(), + ); + let t2 = _mm512_permutexvar_epi32( + _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15), + b.into(), + ); + _mm512_shuffle_i32x4::<0b11_10_11_10>(t1, t2).simd_into(self) + } + } + #[inline(always)] + fn select_u32x16(self, a: mask32x16, b: u32x16, c: u32x16) -> u32x16 { + unsafe { + let k = _mm512_movepi32_mask(a.into()); + _mm512_mask_blend_epi32(k, c.into(), b.into()).simd_into(self) + } + } + #[inline(always)] + fn min_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + unsafe { _mm512_min_epu32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + unsafe { _mm512_max_epu32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn split_u32x16(self, a: u32x16) -> (u32x8, u32x8) { + unsafe { + ( + _mm512_castsi512_si256(a.into()).simd_into(self), + 
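+ // low half comes from the (free) cast above; the extract below pulls the high 256 bits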
_mm512_extracti64x4_epi64::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16 { + unsafe { + let v0 = _mm_loadu_si128(src.as_ptr() as *const _); + let v1 = _mm_loadu_si128(src.as_ptr().add(4usize) as *const _); + let v2 = _mm_loadu_si128(src.as_ptr().add(2 * 4usize) as *const _); + let v3 = _mm_loadu_si128(src.as_ptr().add(3 * 4usize) as *const _); + let tmp0 = _mm_unpacklo_epi32(v0, v1); + let tmp1 = _mm_unpackhi_epi32(v0, v1); + let tmp2 = _mm_unpacklo_epi32(v2, v3); + let tmp3 = _mm_unpackhi_epi32(v2, v3); + let out0 = _mm_unpacklo_epi64(tmp0, tmp2); + let out1 = _mm_unpackhi_epi64(tmp0, tmp2); + let out2 = _mm_unpacklo_epi64(tmp1, tmp3); + let out3 = _mm_unpackhi_epi64(tmp1, tmp3); + self.combine_u32x8( + self.combine_u32x4(out0.simd_into(self), out1.simd_into(self)), + self.combine_u32x4(out2.simd_into(self), out3.simd_into(self)), + ) + } + } + #[inline(always)] + fn store_interleaved_128_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> () { + let (v01, v23) = self.split_u32x16(a); + let (v0, v1) = self.split_u32x8(v01); + let (v2, v3) = self.split_u32x8(v23); + let v0 = v0.into(); + let v1 = v1.into(); + let v2 = v2.into(); + let v3 = v3.into(); + unsafe { + let tmp0 = _mm_unpacklo_epi32(v0, v1); + let tmp1 = _mm_unpackhi_epi32(v0, v1); + let tmp2 = _mm_unpacklo_epi32(v2, v3); + let tmp3 = _mm_unpackhi_epi32(v2, v3); + let out0 = _mm_unpacklo_epi64(tmp0, tmp2); + let out1 = _mm_unpackhi_epi64(tmp0, tmp2); + let out2 = _mm_unpacklo_epi64(tmp1, tmp3); + let out3 = _mm_unpackhi_epi64(tmp1, tmp3); + _mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0); + _mm_storeu_si128(dest.as_mut_ptr().add(4usize) as *mut _, out1); + _mm_storeu_si128(dest.as_mut_ptr().add(2 * 4usize) as *mut _, out2); + _mm_storeu_si128(dest.as_mut_ptr().add(3 * 4usize) as *mut _, out3); + } + } + #[inline(always)] + fn reinterpret_u8_u32x16(self, a: u32x16) -> u8x64 { + __m512i::from(a).simd_into(self) + } + #[inline(always)] + fn cvt_f32_u32x16(self, a: u32x16) -> f32x16 { + unsafe { _mm512_cvtepu32_ps(a.into()).simd_into(self) } + } + #[inline(always)] + fn splat_mask32x16(self, val: i32) -> mask32x16 { + unsafe { _mm512_set1_epi32(val).simd_into(self) } + } + #[inline(always)] + fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16 { + mask32x16 { + val: unsafe { core::mem::transmute_copy(&val) }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_mask32x16(self, val: &[i32; 16usize]) -> mask32x16 { + mask32x16 { + val: unsafe { core::mem::transmute_copy(val) }, + simd: self, + } + } + #[inline(always)] + fn as_array_mask32x16(self, a: mask32x16) -> [i32; 16usize] { + unsafe { core::mem::transmute::<__m512i, [i32; 16usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_mask32x16(self, a: &mask32x16) -> &[i32; 16usize] { + unsafe { core::mem::transmute::<&__m512i, &[i32; 16usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_mask32x16(self, a: &mut mask32x16) -> &mut [i32; 16usize] { + unsafe { core::mem::transmute::<&mut __m512i, &mut [i32; 16usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_mask32x16(self, a: mask32x16, dest: &mut [i32; 16usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const i32, + dest.as_mut_ptr(), + 16usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_mask32x16(self, a: u8x64) -> mask32x16 { + unsafe { + mask32x16 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn 
cvt_to_bytes_mask32x16(self, a: mask32x16) -> u8x64 { + unsafe { + u8x64 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_mask32x16( + self, + a: mask32x16, + b: mask32x16, + ) -> mask32x16 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = cross_block_alignr_512( + self.cvt_to_bytes_mask32x16(b).val.0, + self.cvt_to_bytes_mask32x16(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_mask32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask32x16( + self, + a: mask32x16, + b: mask32x16, + ) -> mask32x16 { + unsafe { + if SHIFT >= 4usize { + return b; + } + let result = dyn_alignr_512( + self.cvt_to_bytes_mask32x16(b).val.0, + self.cvt_to_bytes_mask32x16(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_mask32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { + unsafe { _mm512_and_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { + unsafe { _mm512_or_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { + unsafe { _mm512_xor_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_mask32x16(self, a: mask32x16) -> mask32x16 { + a ^ !0 + } + #[inline(always)] + fn select_mask32x16( + self, + a: mask32x16, + b: mask32x16, + c: mask32x16, + ) -> mask32x16 { + unsafe { + let k = _mm512_movepi32_mask(a.into()); + _mm512_mask_blend_epi32(k, c.into(), b.into()).simd_into(self) + } + } + #[inline(always)] + fn simd_eq_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { + unsafe { + let mask = _mm512_cmpeq_epi32_mask(a.into(), b.into()); + _mm512_movm_epi32(mask).simd_into(self) + } + } + #[inline(always)] + fn any_true_mask32x16(self, a: mask32x16) -> bool { + unsafe { _mm512_movepi32_mask(a.into()) != 0 } + } + #[inline(always)] + fn all_true_mask32x16(self, a: mask32x16) -> bool { + unsafe { _mm512_movepi32_mask(a.into()) == 0xFFFFu16 } + } + #[inline(always)] + fn any_false_mask32x16(self, a: mask32x16) -> bool { + unsafe { _mm512_movepi32_mask(a.into()) != 0xFFFFu16 } + } + #[inline(always)] + fn all_false_mask32x16(self, a: mask32x16) -> bool { + unsafe { _mm512_movepi32_mask(a.into()) == 0 } + } + #[inline(always)] + fn split_mask32x16(self, a: mask32x16) -> (mask32x8, mask32x8) { + unsafe { + ( + _mm512_castsi512_si256(a.into()).simd_into(self), + _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn splat_f64x8(self, val: f64) -> f64x8 { + unsafe { _mm512_set1_pd(val).simd_into(self) } + } + #[inline(always)] + fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8 { + f64x8 { + val: unsafe { core::mem::transmute_copy(&val) }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8 { + f64x8 { + val: unsafe { core::mem::transmute_copy(val) }, + simd: self, + } + } + #[inline(always)] + fn as_array_f64x8(self, a: f64x8) -> [f64; 8usize] { + unsafe { core::mem::transmute::<__m512d, [f64; 8usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_f64x8(self, a: &f64x8) -> &[f64; 8usize] { + unsafe { core::mem::transmute::<&__m512d, &[f64; 8usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_f64x8(self, a: &mut f64x8) -> &mut [f64; 8usize] { 
+ unsafe { core::mem::transmute::<&mut __m512d, &mut [f64; 8usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_f64x8(self, a: f64x8, dest: &mut [f64; 8usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const f64, + dest.as_mut_ptr(), + 8usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_f64x8(self, a: u8x64) -> f64x8 { + unsafe { + f64x8 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_f64x8(self, a: f64x8) -> u8x64 { + unsafe { + u8x64 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = cross_block_alignr_512( + self.cvt_to_bytes_f64x8(b).val.0, + self.cvt_to_bytes_f64x8(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_f64x8(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_f64x8( + self, + a: f64x8, + b: f64x8, + ) -> f64x8 { + unsafe { + if SHIFT >= 2usize { + return b; + } + let result = dyn_alignr_512( + self.cvt_to_bytes_f64x8(b).val.0, + self.cvt_to_bytes_f64x8(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_f64x8(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn abs_f64x8(self, a: f64x8) -> f64x8 { + unsafe { _mm512_andnot_pd(_mm512_set1_pd(-0.0), a.into()).simd_into(self) } + } + #[inline(always)] + fn neg_f64x8(self, a: f64x8) -> f64x8 { + unsafe { _mm512_xor_pd(a.into(), _mm512_set1_pd(-0.0)).simd_into(self) } + } + #[inline(always)] + fn sqrt_f64x8(self, a: f64x8) -> f64x8 { + unsafe { _mm512_sqrt_pd(a.into()).simd_into(self) } + } + #[inline(always)] + fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + unsafe { _mm512_add_pd(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + unsafe { _mm512_sub_pd(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + unsafe { _mm512_mul_pd(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn div_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + unsafe { _mm512_div_pd(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn copysign_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + unsafe { + let mask = _mm512_set1_pd(-0.0); + _mm512_or_pd( + _mm512_and_pd(mask, b.into()), + _mm512_andnot_pd(mask, a.into()), + ) + .simd_into(self) + } + } + #[inline(always)] + fn simd_eq_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + unsafe { + let mask = _mm512_cmp_pd_mask::<0i32>(a.into(), b.into()); + _mm512_movm_epi64(mask).simd_into(self) + } + } + #[inline(always)] + fn simd_lt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + unsafe { + let mask = _mm512_cmp_pd_mask::<17i32>(a.into(), b.into()); + _mm512_movm_epi64(mask).simd_into(self) + } + } + #[inline(always)] + fn simd_le_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + unsafe { + let mask = _mm512_cmp_pd_mask::<18i32>(a.into(), b.into()); + _mm512_movm_epi64(mask).simd_into(self) + } + } + #[inline(always)] + fn simd_ge_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + unsafe { + let mask = _mm512_cmp_pd_mask::<29i32>(a.into(), b.into()); + _mm512_movm_epi64(mask).simd_into(self) + } + } + #[inline(always)] + fn simd_gt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + unsafe { + let mask = _mm512_cmp_pd_mask::<30i32>(a.into(), b.into()); + 
_mm512_movm_epi64(mask).simd_into(self) + } + } + #[inline(always)] + fn zip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + unsafe { + let idx = _mm512_set_epi64(11i64, 3i64, 10i64, 2i64, 9i64, 1i64, 8i64, 0i64); + _mm512_permutex2var_pd(a.into(), idx, b.into()).simd_into(self) + } + } + #[inline(always)] + fn zip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + unsafe { + let idx = _mm512_set_epi64(15i64, 7i64, 14i64, 6i64, 13i64, 5i64, 12i64, 4i64); + _mm512_permutex2var_pd(a.into(), idx, b.into()).simd_into(self) + } + } + #[inline(always)] + fn unzip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + unsafe { + let t1 = _mm512_permutexvar_pd(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), a.into()); + let t2 = _mm512_permutexvar_pd(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), b.into()); + _mm512_shuffle_f64x2::<0b01_00_01_00>(t1, t2).simd_into(self) + } + } + #[inline(always)] + fn unzip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + unsafe { + let t1 = _mm512_permutexvar_pd(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), a.into()); + let t2 = _mm512_permutexvar_pd(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), b.into()); + _mm512_shuffle_f64x2::<0b11_10_11_10>(t1, t2).simd_into(self) + } + } + #[inline(always)] + fn max_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + unsafe { _mm512_max_pd(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn min_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + unsafe { _mm512_min_pd(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + unsafe { + let intermediate = _mm512_max_pd(a.into(), b.into()); + let b_is_nan = _mm512_cmp_pd_mask::<0x03>(b.into(), b.into()); + _mm512_mask_blend_pd(b_is_nan, intermediate, a.into()).simd_into(self) + } + } + #[inline(always)] + fn min_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + unsafe { + let intermediate = _mm512_min_pd(a.into(), b.into()); + let b_is_nan = _mm512_cmp_pd_mask::<0x03>(b.into(), b.into()); + _mm512_mask_blend_pd(b_is_nan, intermediate, a.into()).simd_into(self) + } + } + #[inline(always)] + fn mul_add_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { + unsafe { _mm512_fmadd_pd(a.into(), b.into(), c.into()).simd_into(self) } + } + #[inline(always)] + fn mul_sub_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { + unsafe { _mm512_fmsub_pd(a.into(), b.into(), c.into()).simd_into(self) } + } + #[inline(always)] + fn floor_f64x8(self, a: f64x8) -> f64x8 { + unsafe { _mm512_roundscale_pd::<_MM_FROUND_TO_NEG_INF>(a.into()).simd_into(self) } + } + #[inline(always)] + fn ceil_f64x8(self, a: f64x8) -> f64x8 { + unsafe { _mm512_roundscale_pd::<_MM_FROUND_TO_POS_INF>(a.into()).simd_into(self) } + } + #[inline(always)] + fn round_ties_even_f64x8(self, a: f64x8) -> f64x8 { + unsafe { _mm512_roundscale_pd::<_MM_FROUND_TO_NEAREST_INT>(a.into()).simd_into(self) } + } + #[inline(always)] + fn fract_f64x8(self, a: f64x8) -> f64x8 { + a - self.trunc_f64x8(a) + } + #[inline(always)] + fn trunc_f64x8(self, a: f64x8) -> f64x8 { + unsafe { _mm512_roundscale_pd::<_MM_FROUND_TO_ZERO>(a.into()).simd_into(self) } + } + #[inline(always)] + fn select_f64x8(self, a: mask64x8, b: f64x8, c: f64x8) -> f64x8 { + unsafe { + let k = _mm512_movepi64_mask(a.into()); + _mm512_mask_blend_pd(k, c.into(), b.into()).simd_into(self) + } + } + #[inline(always)] + fn split_f64x8(self, a: f64x8) -> (f64x4, f64x4) { + unsafe { + ( + _mm512_castpd512_pd256(a.into()).simd_into(self), + _mm512_extractf64x4_pd::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] 
+ fn reinterpret_f32_f64x8(self, a: f64x8) -> f32x16 { + unsafe { _mm512_castpd_ps(a.into()).simd_into(self) } + } + #[inline(always)] + fn splat_mask64x8(self, val: i64) -> mask64x8 { + unsafe { _mm512_set1_epi64(val).simd_into(self) } + } + #[inline(always)] + fn load_array_mask64x8(self, val: [i64; 8usize]) -> mask64x8 { + mask64x8 { + val: unsafe { core::mem::transmute_copy(&val) }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_mask64x8(self, val: &[i64; 8usize]) -> mask64x8 { + mask64x8 { + val: unsafe { core::mem::transmute_copy(val) }, + simd: self, + } + } + #[inline(always)] + fn as_array_mask64x8(self, a: mask64x8) -> [i64; 8usize] { + unsafe { core::mem::transmute::<__m512i, [i64; 8usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_mask64x8(self, a: &mask64x8) -> &[i64; 8usize] { + unsafe { core::mem::transmute::<&__m512i, &[i64; 8usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_mask64x8(self, a: &mut mask64x8) -> &mut [i64; 8usize] { + unsafe { core::mem::transmute::<&mut __m512i, &mut [i64; 8usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_mask64x8(self, a: mask64x8, dest: &mut [i64; 8usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const i64, + dest.as_mut_ptr(), + 8usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_mask64x8(self, a: u8x64) -> mask64x8 { + unsafe { + mask64x8 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_mask64x8(self, a: mask64x8) -> u8x64 { + unsafe { + u8x64 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_mask64x8( + self, + a: mask64x8, + b: mask64x8, + ) -> mask64x8 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = cross_block_alignr_512( + self.cvt_to_bytes_mask64x8(b).val.0, + self.cvt_to_bytes_mask64x8(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_mask64x8(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask64x8( + self, + a: mask64x8, + b: mask64x8, + ) -> mask64x8 { + unsafe { + if SHIFT >= 2usize { + return b; + } + let result = dyn_alignr_512( + self.cvt_to_bytes_mask64x8(b).val.0, + self.cvt_to_bytes_mask64x8(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_mask64x8(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn and_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { + unsafe { _mm512_and_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { + unsafe { _mm512_or_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { + unsafe { _mm512_xor_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_mask64x8(self, a: mask64x8) -> mask64x8 { + a ^ !0 + } + #[inline(always)] + fn select_mask64x8( + self, + a: mask64x8, + b: mask64x8, + c: mask64x8, + ) -> mask64x8 { + unsafe { + let k = _mm512_movepi64_mask(a.into()); + _mm512_mask_blend_epi64(k, c.into(), b.into()).simd_into(self) + } + } + #[inline(always)] + fn simd_eq_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { + unsafe { + let mask = _mm512_cmpeq_epi64_mask(a.into(), b.into()); + _mm512_movm_epi64(mask).simd_into(self) + } + } + #[inline(always)] + fn any_true_mask64x8(self, a: mask64x8) -> bool { + unsafe { _mm512_movepi64_mask(a.into()) != 0 } + } + 
#[inline(always)]
+    fn all_true_mask64x8(self, a: mask64x8<Self>) -> bool {
+        unsafe { _mm512_movepi64_mask(a.into()) == 0xFFu8 }
+    }
+    #[inline(always)]
+    fn any_false_mask64x8(self, a: mask64x8<Self>) -> bool {
+        unsafe { _mm512_movepi64_mask(a.into()) != 0xFFu8 }
+    }
+    #[inline(always)]
+    fn all_false_mask64x8(self, a: mask64x8<Self>) -> bool {
+        unsafe { _mm512_movepi64_mask(a.into()) == 0 }
+    }
+    #[inline(always)]
+    fn split_mask64x8(self, a: mask64x8<Self>) -> (mask64x4<Self>, mask64x4<Self>) {
+        unsafe {
+            (
+                _mm512_castsi512_si256(a.into()).simd_into(self),
+                _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(self),
+            )
+        }
+    }
+}
+impl<S: Simd> SimdFrom<__m512, S> for f32x16<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m512) -> Self {
+        Self {
+            val: unsafe { core::mem::transmute_copy(&arch) },
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<f32x16<S>> for __m512 {
+    #[inline(always)]
+    fn from(value: f32x16<S>) -> Self {
+        unsafe { core::mem::transmute_copy(&value.val) }
+    }
+}
+impl<S: Simd> SimdFrom<__m512i, S> for i8x64<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m512i) -> Self {
+        Self {
+            val: unsafe { core::mem::transmute_copy(&arch) },
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<i8x64<S>> for __m512i {
+    #[inline(always)]
+    fn from(value: i8x64<S>) -> Self {
+        unsafe { core::mem::transmute_copy(&value.val) }
+    }
+}
+impl<S: Simd> SimdFrom<__m512i, S> for u8x64<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m512i) -> Self {
+        Self {
+            val: unsafe { core::mem::transmute_copy(&arch) },
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<u8x64<S>> for __m512i {
+    #[inline(always)]
+    fn from(value: u8x64<S>) -> Self {
+        unsafe { core::mem::transmute_copy(&value.val) }
+    }
+}
+impl<S: Simd> SimdFrom<__m512i, S> for mask8x64<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m512i) -> Self {
+        Self {
+            val: unsafe { core::mem::transmute_copy(&arch) },
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<mask8x64<S>> for __m512i {
+    #[inline(always)]
+    fn from(value: mask8x64<S>) -> Self {
+        unsafe { core::mem::transmute_copy(&value.val) }
+    }
+}
+impl<S: Simd> SimdFrom<__m512i, S> for i16x32<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m512i) -> Self {
+        Self {
+            val: unsafe { core::mem::transmute_copy(&arch) },
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<i16x32<S>> for __m512i {
+    #[inline(always)]
+    fn from(value: i16x32<S>) -> Self {
+        unsafe { core::mem::transmute_copy(&value.val) }
+    }
+}
+impl<S: Simd> SimdFrom<__m512i, S> for u16x32<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m512i) -> Self {
+        Self {
+            val: unsafe { core::mem::transmute_copy(&arch) },
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<u16x32<S>> for __m512i {
+    #[inline(always)]
+    fn from(value: u16x32<S>) -> Self {
+        unsafe { core::mem::transmute_copy(&value.val) }
+    }
+}
+impl<S: Simd> SimdFrom<__m512i, S> for mask16x32<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m512i) -> Self {
+        Self {
+            val: unsafe { core::mem::transmute_copy(&arch) },
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<mask16x32<S>> for __m512i {
+    #[inline(always)]
+    fn from(value: mask16x32<S>) -> Self {
+        unsafe { core::mem::transmute_copy(&value.val) }
+    }
+}
+impl<S: Simd> SimdFrom<__m512i, S> for i32x16<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m512i) -> Self {
+        Self {
+            val: unsafe { core::mem::transmute_copy(&arch) },
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<i32x16<S>> for __m512i {
+    #[inline(always)]
+    fn from(value: i32x16<S>) -> Self {
+        unsafe { core::mem::transmute_copy(&value.val) }
+    }
+}
+impl<S: Simd> SimdFrom<__m512i, S> for u32x16<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m512i) -> Self {
+        Self {
+            val: unsafe { core::mem::transmute_copy(&arch) },
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<u32x16<S>> for __m512i {
+    #[inline(always)]
+    fn from(value: u32x16<S>) -> Self {
+        unsafe { core::mem::transmute_copy(&value.val) }
+    }
+}
+impl<S: Simd> SimdFrom<__m512i, S> for mask32x16<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m512i) -> Self {
+        Self {
+            val: unsafe { core::mem::transmute_copy(&arch) },
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<mask32x16<S>> for __m512i {
+    #[inline(always)]
+    fn from(value: mask32x16<S>) -> Self {
+        unsafe { core::mem::transmute_copy(&value.val) }
+    }
+}
+impl<S: Simd> SimdFrom<__m512d, S> for f64x8<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m512d) -> Self {
+        Self {
+            val: unsafe { core::mem::transmute_copy(&arch) },
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<f64x8<S>> for __m512d {
+    #[inline(always)]
+    fn from(value: f64x8<S>) -> Self {
+        unsafe { core::mem::transmute_copy(&value.val) }
+    }
+}
+impl<S: Simd> SimdFrom<__m512i, S> for mask64x8<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m512i) -> Self {
+        Self {
+            val: unsafe { core::mem::transmute_copy(&arch) },
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<mask64x8<S>> for __m512i {
+    #[inline(always)]
+    fn from(value: mask64x8<S>) -> Self {
+        unsafe { core::mem::transmute_copy(&value.val) }
+    }
+}
+#[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still"]
+#[doc = r" expected to be constant in practice, so the match statement will be optimized out. This exists because"]
+#[doc = r" Rust doesn't currently let you do math on const generics."]
+#[inline(always)]
+unsafe fn dyn_alignr_128(a: __m128i, b: __m128i, shift: usize) -> __m128i {
+    unsafe {
+        match shift {
+            0usize => _mm_alignr_epi8::<0i32>(a, b),
+            1usize => _mm_alignr_epi8::<1i32>(a, b),
+            2usize => _mm_alignr_epi8::<2i32>(a, b),
+            3usize => _mm_alignr_epi8::<3i32>(a, b),
+            4usize => _mm_alignr_epi8::<4i32>(a, b),
+            5usize => _mm_alignr_epi8::<5i32>(a, b),
+            6usize => _mm_alignr_epi8::<6i32>(a, b),
+            7usize => _mm_alignr_epi8::<7i32>(a, b),
+            8usize => _mm_alignr_epi8::<8i32>(a, b),
+            9usize => _mm_alignr_epi8::<9i32>(a, b),
+            10usize => _mm_alignr_epi8::<10i32>(a, b),
+            11usize => _mm_alignr_epi8::<11i32>(a, b),
+            12usize => _mm_alignr_epi8::<12i32>(a, b),
+            13usize => _mm_alignr_epi8::<13i32>(a, b),
+            14usize => _mm_alignr_epi8::<14i32>(a, b),
+            15usize => _mm_alignr_epi8::<15i32>(a, b),
+            _ => unreachable!(),
+        }
+    }
+}
+#[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still"]
+#[doc = r" expected to be constant in practice, so the match statement will be optimized out. This exists because"]
+#[doc = r" Rust doesn't currently let you do math on const generics."]
+#[inline(always)]
+unsafe fn dyn_alignr_256(a: __m256i, b: __m256i, shift: usize) -> __m256i {
+    unsafe {
+        match shift {
+            0usize => _mm256_alignr_epi8::<0i32>(a, b),
+            1usize => _mm256_alignr_epi8::<1i32>(a, b),
+            2usize => _mm256_alignr_epi8::<2i32>(a, b),
+            3usize => _mm256_alignr_epi8::<3i32>(a, b),
+            4usize => _mm256_alignr_epi8::<4i32>(a, b),
+            5usize => _mm256_alignr_epi8::<5i32>(a, b),
+            6usize => _mm256_alignr_epi8::<6i32>(a, b),
+            7usize => _mm256_alignr_epi8::<7i32>(a, b),
+            8usize => _mm256_alignr_epi8::<8i32>(a, b),
+            9usize => _mm256_alignr_epi8::<9i32>(a, b),
+            10usize => _mm256_alignr_epi8::<10i32>(a, b),
+            11usize => _mm256_alignr_epi8::<11i32>(a, b),
+            12usize => _mm256_alignr_epi8::<12i32>(a, b),
+            13usize => _mm256_alignr_epi8::<13i32>(a, b),
+            14usize => _mm256_alignr_epi8::<14i32>(a, b),
+            15usize => _mm256_alignr_epi8::<15i32>(a, b),
+            _ => unreachable!(),
+        }
+    }
+}
+#[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument.
The shift is still"] +#[doc = r" expected to be constant in practice, so the match statement will be optimized out. This exists because"] +#[doc = r" Rust doesn't currently let you do math on const generics."] +#[inline(always)] +unsafe fn dyn_alignr_512(a: __m512i, b: __m512i, shift: usize) -> __m512i { + unsafe { + match shift { + 0usize => _mm512_alignr_epi8::<0i32>(a, b), + 1usize => _mm512_alignr_epi8::<1i32>(a, b), + 2usize => _mm512_alignr_epi8::<2i32>(a, b), + 3usize => _mm512_alignr_epi8::<3i32>(a, b), + 4usize => _mm512_alignr_epi8::<4i32>(a, b), + 5usize => _mm512_alignr_epi8::<5i32>(a, b), + 6usize => _mm512_alignr_epi8::<6i32>(a, b), + 7usize => _mm512_alignr_epi8::<7i32>(a, b), + 8usize => _mm512_alignr_epi8::<8i32>(a, b), + 9usize => _mm512_alignr_epi8::<9i32>(a, b), + 10usize => _mm512_alignr_epi8::<10i32>(a, b), + 11usize => _mm512_alignr_epi8::<11i32>(a, b), + 12usize => _mm512_alignr_epi8::<12i32>(a, b), + 13usize => _mm512_alignr_epi8::<13i32>(a, b), + 14usize => _mm512_alignr_epi8::<14i32>(a, b), + 15usize => _mm512_alignr_epi8::<15i32>(a, b), + _ => unreachable!(), + } + } +} +#[doc = r" Computes one output __m256i for `cross_block_alignr_*` operations."] +#[doc = r""] +#[doc = r" Given an array of registers, each containing two 128-bit blocks, extracts two adjacent blocks (`lo_idx` and"] +#[doc = r" `hi_idx` = `lo_idx + 1`) and performs `alignr` with `intra_shift`."] +#[inline(always)] +unsafe fn cross_block_alignr_one( + regs: &[__m256i], + block_idx: usize, + shift_bytes: usize, +) -> __m256i { + let lo_idx = block_idx + (shift_bytes / 16); + let intra_shift = shift_bytes % 16; + let lo_blocks = if lo_idx & 1 == 0 { + regs[lo_idx / 2] + } else { + unsafe { _mm256_permute2x128_si256::<0x21>(regs[lo_idx / 2], regs[(lo_idx / 2) + 1]) } + }; + let hi_idx = lo_idx + 1; + let hi_blocks = if hi_idx & 1 == 0 { + regs[hi_idx / 2] + } else { + unsafe { _mm256_permute2x128_si256::<0x21>(regs[hi_idx / 2], regs[(hi_idx / 2) + 1]) } + }; + unsafe { dyn_alignr_256(hi_blocks, lo_blocks, intra_shift) } +} +#[doc = r" Concatenates `b` and `a` (each 1 x __m256i = 2 blocks) and extracts 2 blocks starting at byte offset"] +#[doc = r" `shift_bytes`. Extracts from [b : a] (b in low bytes, a in high bytes), matching alignr semantics."] +#[inline(always)] +unsafe fn cross_block_alignr_256x1(a: __m256i, b: __m256i, shift_bytes: usize) -> __m256i { + let regs = [b, a]; + unsafe { cross_block_alignr_one(®s, 0, shift_bytes) } +} +#[doc = r" Concatenates `b` and `a` (each __m512i = 4 x 128-bit blocks) and extracts 4 blocks starting at byte offset"] +#[doc = r" `shift_bytes`. 
Extracts from [b : a] (b in low bytes, a in high bytes), matching alignr semantics."] +#[doc = r" Uses AVX-512 VBMI's permutex2var for efficient cross-lane byte shuffling."] +#[inline(always)] +unsafe fn cross_block_alignr_512(a: __m512i, b: __m512i, shift_bytes: usize) -> __m512i { + let idx = unsafe { + match shift_bytes { + 0usize => _mm512_set_epi8( + 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, 54i8, 53i8, 52i8, 51i8, 50i8, + 49i8, 48i8, 47i8, 46i8, 45i8, 44i8, 43i8, 42i8, 41i8, 40i8, 39i8, 38i8, 37i8, 36i8, + 35i8, 34i8, 33i8, 32i8, 31i8, 30i8, 29i8, 28i8, 27i8, 26i8, 25i8, 24i8, 23i8, 22i8, + 21i8, 20i8, 19i8, 18i8, 17i8, 16i8, 15i8, 14i8, 13i8, 12i8, 11i8, 10i8, 9i8, 8i8, + 7i8, 6i8, 5i8, 4i8, 3i8, 2i8, 1i8, 0i8, + ), + 1usize => _mm512_set_epi8( + 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, 54i8, 53i8, 52i8, 51i8, + 50i8, 49i8, 48i8, 47i8, 46i8, 45i8, 44i8, 43i8, 42i8, 41i8, 40i8, 39i8, 38i8, 37i8, + 36i8, 35i8, 34i8, 33i8, 32i8, 31i8, 30i8, 29i8, 28i8, 27i8, 26i8, 25i8, 24i8, 23i8, + 22i8, 21i8, 20i8, 19i8, 18i8, 17i8, 16i8, 15i8, 14i8, 13i8, 12i8, 11i8, 10i8, 9i8, + 8i8, 7i8, 6i8, 5i8, 4i8, 3i8, 2i8, 1i8, + ), + 2usize => _mm512_set_epi8( + 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, 54i8, 53i8, 52i8, + 51i8, 50i8, 49i8, 48i8, 47i8, 46i8, 45i8, 44i8, 43i8, 42i8, 41i8, 40i8, 39i8, 38i8, + 37i8, 36i8, 35i8, 34i8, 33i8, 32i8, 31i8, 30i8, 29i8, 28i8, 27i8, 26i8, 25i8, 24i8, + 23i8, 22i8, 21i8, 20i8, 19i8, 18i8, 17i8, 16i8, 15i8, 14i8, 13i8, 12i8, 11i8, 10i8, + 9i8, 8i8, 7i8, 6i8, 5i8, 4i8, 3i8, 2i8, + ), + 3usize => _mm512_set_epi8( + 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, 54i8, 53i8, + 52i8, 51i8, 50i8, 49i8, 48i8, 47i8, 46i8, 45i8, 44i8, 43i8, 42i8, 41i8, 40i8, 39i8, + 38i8, 37i8, 36i8, 35i8, 34i8, 33i8, 32i8, 31i8, 30i8, 29i8, 28i8, 27i8, 26i8, 25i8, + 24i8, 23i8, 22i8, 21i8, 20i8, 19i8, 18i8, 17i8, 16i8, 15i8, 14i8, 13i8, 12i8, 11i8, + 10i8, 9i8, 8i8, 7i8, 6i8, 5i8, 4i8, 3i8, + ), + 4usize => _mm512_set_epi8( + 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, 54i8, + 53i8, 52i8, 51i8, 50i8, 49i8, 48i8, 47i8, 46i8, 45i8, 44i8, 43i8, 42i8, 41i8, 40i8, + 39i8, 38i8, 37i8, 36i8, 35i8, 34i8, 33i8, 32i8, 31i8, 30i8, 29i8, 28i8, 27i8, 26i8, + 25i8, 24i8, 23i8, 22i8, 21i8, 20i8, 19i8, 18i8, 17i8, 16i8, 15i8, 14i8, 13i8, 12i8, + 11i8, 10i8, 9i8, 8i8, 7i8, 6i8, 5i8, 4i8, + ), + 5usize => _mm512_set_epi8( + 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, + 54i8, 53i8, 52i8, 51i8, 50i8, 49i8, 48i8, 47i8, 46i8, 45i8, 44i8, 43i8, 42i8, 41i8, + 40i8, 39i8, 38i8, 37i8, 36i8, 35i8, 34i8, 33i8, 32i8, 31i8, 30i8, 29i8, 28i8, 27i8, + 26i8, 25i8, 24i8, 23i8, 22i8, 21i8, 20i8, 19i8, 18i8, 17i8, 16i8, 15i8, 14i8, 13i8, + 12i8, 11i8, 10i8, 9i8, 8i8, 7i8, 6i8, 5i8, + ), + 6usize => _mm512_set_epi8( + 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, + 55i8, 54i8, 53i8, 52i8, 51i8, 50i8, 49i8, 48i8, 47i8, 46i8, 45i8, 44i8, 43i8, 42i8, + 41i8, 40i8, 39i8, 38i8, 37i8, 36i8, 35i8, 34i8, 33i8, 32i8, 31i8, 30i8, 29i8, 28i8, + 27i8, 26i8, 25i8, 24i8, 23i8, 22i8, 21i8, 20i8, 19i8, 18i8, 17i8, 16i8, 15i8, 14i8, + 13i8, 12i8, 11i8, 10i8, 9i8, 8i8, 7i8, 6i8, + ), + 7usize => _mm512_set_epi8( + 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, + 56i8, 55i8, 54i8, 53i8, 52i8, 51i8, 50i8, 49i8, 48i8, 47i8, 46i8, 45i8, 44i8, 43i8, + 42i8, 41i8, 40i8, 39i8, 38i8, 37i8, 36i8, 35i8, 34i8, 33i8, 32i8, 31i8, 30i8, 
29i8, + 28i8, 27i8, 26i8, 25i8, 24i8, 23i8, 22i8, 21i8, 20i8, 19i8, 18i8, 17i8, 16i8, 15i8, + 14i8, 13i8, 12i8, 11i8, 10i8, 9i8, 8i8, 7i8, + ), + 8usize => _mm512_set_epi8( + 71i8, 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, + 57i8, 56i8, 55i8, 54i8, 53i8, 52i8, 51i8, 50i8, 49i8, 48i8, 47i8, 46i8, 45i8, 44i8, + 43i8, 42i8, 41i8, 40i8, 39i8, 38i8, 37i8, 36i8, 35i8, 34i8, 33i8, 32i8, 31i8, 30i8, + 29i8, 28i8, 27i8, 26i8, 25i8, 24i8, 23i8, 22i8, 21i8, 20i8, 19i8, 18i8, 17i8, 16i8, + 15i8, 14i8, 13i8, 12i8, 11i8, 10i8, 9i8, 8i8, + ), + 9usize => _mm512_set_epi8( + 72i8, 71i8, 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, + 58i8, 57i8, 56i8, 55i8, 54i8, 53i8, 52i8, 51i8, 50i8, 49i8, 48i8, 47i8, 46i8, 45i8, + 44i8, 43i8, 42i8, 41i8, 40i8, 39i8, 38i8, 37i8, 36i8, 35i8, 34i8, 33i8, 32i8, 31i8, + 30i8, 29i8, 28i8, 27i8, 26i8, 25i8, 24i8, 23i8, 22i8, 21i8, 20i8, 19i8, 18i8, 17i8, + 16i8, 15i8, 14i8, 13i8, 12i8, 11i8, 10i8, 9i8, + ), + 10usize => _mm512_set_epi8( + 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, + 59i8, 58i8, 57i8, 56i8, 55i8, 54i8, 53i8, 52i8, 51i8, 50i8, 49i8, 48i8, 47i8, 46i8, + 45i8, 44i8, 43i8, 42i8, 41i8, 40i8, 39i8, 38i8, 37i8, 36i8, 35i8, 34i8, 33i8, 32i8, + 31i8, 30i8, 29i8, 28i8, 27i8, 26i8, 25i8, 24i8, 23i8, 22i8, 21i8, 20i8, 19i8, 18i8, + 17i8, 16i8, 15i8, 14i8, 13i8, 12i8, 11i8, 10i8, + ), + 11usize => _mm512_set_epi8( + 74i8, 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, + 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, 54i8, 53i8, 52i8, 51i8, 50i8, 49i8, 48i8, 47i8, + 46i8, 45i8, 44i8, 43i8, 42i8, 41i8, 40i8, 39i8, 38i8, 37i8, 36i8, 35i8, 34i8, 33i8, + 32i8, 31i8, 30i8, 29i8, 28i8, 27i8, 26i8, 25i8, 24i8, 23i8, 22i8, 21i8, 20i8, 19i8, + 18i8, 17i8, 16i8, 15i8, 14i8, 13i8, 12i8, 11i8, + ), + 12usize => _mm512_set_epi8( + 75i8, 74i8, 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, + 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, 54i8, 53i8, 52i8, 51i8, 50i8, 49i8, 48i8, + 47i8, 46i8, 45i8, 44i8, 43i8, 42i8, 41i8, 40i8, 39i8, 38i8, 37i8, 36i8, 35i8, 34i8, + 33i8, 32i8, 31i8, 30i8, 29i8, 28i8, 27i8, 26i8, 25i8, 24i8, 23i8, 22i8, 21i8, 20i8, + 19i8, 18i8, 17i8, 16i8, 15i8, 14i8, 13i8, 12i8, + ), + 13usize => _mm512_set_epi8( + 76i8, 75i8, 74i8, 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, + 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, 54i8, 53i8, 52i8, 51i8, 50i8, 49i8, + 48i8, 47i8, 46i8, 45i8, 44i8, 43i8, 42i8, 41i8, 40i8, 39i8, 38i8, 37i8, 36i8, 35i8, + 34i8, 33i8, 32i8, 31i8, 30i8, 29i8, 28i8, 27i8, 26i8, 25i8, 24i8, 23i8, 22i8, 21i8, + 20i8, 19i8, 18i8, 17i8, 16i8, 15i8, 14i8, 13i8, + ), + 14usize => _mm512_set_epi8( + 77i8, 76i8, 75i8, 74i8, 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, + 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, 54i8, 53i8, 52i8, 51i8, 50i8, + 49i8, 48i8, 47i8, 46i8, 45i8, 44i8, 43i8, 42i8, 41i8, 40i8, 39i8, 38i8, 37i8, 36i8, + 35i8, 34i8, 33i8, 32i8, 31i8, 30i8, 29i8, 28i8, 27i8, 26i8, 25i8, 24i8, 23i8, 22i8, + 21i8, 20i8, 19i8, 18i8, 17i8, 16i8, 15i8, 14i8, + ), + 15usize => _mm512_set_epi8( + 78i8, 77i8, 76i8, 75i8, 74i8, 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, + 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, 54i8, 53i8, 52i8, 51i8, + 50i8, 49i8, 48i8, 47i8, 46i8, 45i8, 44i8, 43i8, 42i8, 41i8, 40i8, 39i8, 38i8, 37i8, + 36i8, 35i8, 34i8, 33i8, 32i8, 31i8, 30i8, 29i8, 28i8, 27i8, 26i8, 25i8, 24i8, 23i8, + 22i8, 21i8, 20i8, 19i8, 18i8, 17i8, 
16i8, 15i8, + ), + 16usize => _mm512_set_epi8( + 79i8, 78i8, 77i8, 76i8, 75i8, 74i8, 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, 67i8, 66i8, + 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, 54i8, 53i8, 52i8, + 51i8, 50i8, 49i8, 48i8, 47i8, 46i8, 45i8, 44i8, 43i8, 42i8, 41i8, 40i8, 39i8, 38i8, + 37i8, 36i8, 35i8, 34i8, 33i8, 32i8, 31i8, 30i8, 29i8, 28i8, 27i8, 26i8, 25i8, 24i8, + 23i8, 22i8, 21i8, 20i8, 19i8, 18i8, 17i8, 16i8, + ), + 17usize => _mm512_set_epi8( + 80i8, 79i8, 78i8, 77i8, 76i8, 75i8, 74i8, 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, 67i8, + 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, 54i8, 53i8, + 52i8, 51i8, 50i8, 49i8, 48i8, 47i8, 46i8, 45i8, 44i8, 43i8, 42i8, 41i8, 40i8, 39i8, + 38i8, 37i8, 36i8, 35i8, 34i8, 33i8, 32i8, 31i8, 30i8, 29i8, 28i8, 27i8, 26i8, 25i8, + 24i8, 23i8, 22i8, 21i8, 20i8, 19i8, 18i8, 17i8, + ), + 18usize => _mm512_set_epi8( + 81i8, 80i8, 79i8, 78i8, 77i8, 76i8, 75i8, 74i8, 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, + 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, 54i8, + 53i8, 52i8, 51i8, 50i8, 49i8, 48i8, 47i8, 46i8, 45i8, 44i8, 43i8, 42i8, 41i8, 40i8, + 39i8, 38i8, 37i8, 36i8, 35i8, 34i8, 33i8, 32i8, 31i8, 30i8, 29i8, 28i8, 27i8, 26i8, + 25i8, 24i8, 23i8, 22i8, 21i8, 20i8, 19i8, 18i8, + ), + 19usize => _mm512_set_epi8( + 82i8, 81i8, 80i8, 79i8, 78i8, 77i8, 76i8, 75i8, 74i8, 73i8, 72i8, 71i8, 70i8, 69i8, + 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, + 54i8, 53i8, 52i8, 51i8, 50i8, 49i8, 48i8, 47i8, 46i8, 45i8, 44i8, 43i8, 42i8, 41i8, + 40i8, 39i8, 38i8, 37i8, 36i8, 35i8, 34i8, 33i8, 32i8, 31i8, 30i8, 29i8, 28i8, 27i8, + 26i8, 25i8, 24i8, 23i8, 22i8, 21i8, 20i8, 19i8, + ), + 20usize => _mm512_set_epi8( + 83i8, 82i8, 81i8, 80i8, 79i8, 78i8, 77i8, 76i8, 75i8, 74i8, 73i8, 72i8, 71i8, 70i8, + 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, + 55i8, 54i8, 53i8, 52i8, 51i8, 50i8, 49i8, 48i8, 47i8, 46i8, 45i8, 44i8, 43i8, 42i8, + 41i8, 40i8, 39i8, 38i8, 37i8, 36i8, 35i8, 34i8, 33i8, 32i8, 31i8, 30i8, 29i8, 28i8, + 27i8, 26i8, 25i8, 24i8, 23i8, 22i8, 21i8, 20i8, + ), + 21usize => _mm512_set_epi8( + 84i8, 83i8, 82i8, 81i8, 80i8, 79i8, 78i8, 77i8, 76i8, 75i8, 74i8, 73i8, 72i8, 71i8, + 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, + 56i8, 55i8, 54i8, 53i8, 52i8, 51i8, 50i8, 49i8, 48i8, 47i8, 46i8, 45i8, 44i8, 43i8, + 42i8, 41i8, 40i8, 39i8, 38i8, 37i8, 36i8, 35i8, 34i8, 33i8, 32i8, 31i8, 30i8, 29i8, + 28i8, 27i8, 26i8, 25i8, 24i8, 23i8, 22i8, 21i8, + ), + 22usize => _mm512_set_epi8( + 85i8, 84i8, 83i8, 82i8, 81i8, 80i8, 79i8, 78i8, 77i8, 76i8, 75i8, 74i8, 73i8, 72i8, + 71i8, 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, + 57i8, 56i8, 55i8, 54i8, 53i8, 52i8, 51i8, 50i8, 49i8, 48i8, 47i8, 46i8, 45i8, 44i8, + 43i8, 42i8, 41i8, 40i8, 39i8, 38i8, 37i8, 36i8, 35i8, 34i8, 33i8, 32i8, 31i8, 30i8, + 29i8, 28i8, 27i8, 26i8, 25i8, 24i8, 23i8, 22i8, + ), + 23usize => _mm512_set_epi8( + 86i8, 85i8, 84i8, 83i8, 82i8, 81i8, 80i8, 79i8, 78i8, 77i8, 76i8, 75i8, 74i8, 73i8, + 72i8, 71i8, 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, + 58i8, 57i8, 56i8, 55i8, 54i8, 53i8, 52i8, 51i8, 50i8, 49i8, 48i8, 47i8, 46i8, 45i8, + 44i8, 43i8, 42i8, 41i8, 40i8, 39i8, 38i8, 37i8, 36i8, 35i8, 34i8, 33i8, 32i8, 31i8, + 30i8, 29i8, 28i8, 27i8, 26i8, 25i8, 24i8, 23i8, + ), + 24usize => _mm512_set_epi8( + 87i8, 86i8, 85i8, 84i8, 83i8, 82i8, 81i8, 80i8, 79i8, 78i8, 77i8, 76i8, 
75i8, 74i8, + 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, + 59i8, 58i8, 57i8, 56i8, 55i8, 54i8, 53i8, 52i8, 51i8, 50i8, 49i8, 48i8, 47i8, 46i8, + 45i8, 44i8, 43i8, 42i8, 41i8, 40i8, 39i8, 38i8, 37i8, 36i8, 35i8, 34i8, 33i8, 32i8, + 31i8, 30i8, 29i8, 28i8, 27i8, 26i8, 25i8, 24i8, + ), + 25usize => _mm512_set_epi8( + 88i8, 87i8, 86i8, 85i8, 84i8, 83i8, 82i8, 81i8, 80i8, 79i8, 78i8, 77i8, 76i8, 75i8, + 74i8, 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, + 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, 54i8, 53i8, 52i8, 51i8, 50i8, 49i8, 48i8, 47i8, + 46i8, 45i8, 44i8, 43i8, 42i8, 41i8, 40i8, 39i8, 38i8, 37i8, 36i8, 35i8, 34i8, 33i8, + 32i8, 31i8, 30i8, 29i8, 28i8, 27i8, 26i8, 25i8, + ), + 26usize => _mm512_set_epi8( + 89i8, 88i8, 87i8, 86i8, 85i8, 84i8, 83i8, 82i8, 81i8, 80i8, 79i8, 78i8, 77i8, 76i8, + 75i8, 74i8, 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, + 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, 54i8, 53i8, 52i8, 51i8, 50i8, 49i8, 48i8, + 47i8, 46i8, 45i8, 44i8, 43i8, 42i8, 41i8, 40i8, 39i8, 38i8, 37i8, 36i8, 35i8, 34i8, + 33i8, 32i8, 31i8, 30i8, 29i8, 28i8, 27i8, 26i8, + ), + 27usize => _mm512_set_epi8( + 90i8, 89i8, 88i8, 87i8, 86i8, 85i8, 84i8, 83i8, 82i8, 81i8, 80i8, 79i8, 78i8, 77i8, + 76i8, 75i8, 74i8, 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, + 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, 54i8, 53i8, 52i8, 51i8, 50i8, 49i8, + 48i8, 47i8, 46i8, 45i8, 44i8, 43i8, 42i8, 41i8, 40i8, 39i8, 38i8, 37i8, 36i8, 35i8, + 34i8, 33i8, 32i8, 31i8, 30i8, 29i8, 28i8, 27i8, + ), + 28usize => _mm512_set_epi8( + 91i8, 90i8, 89i8, 88i8, 87i8, 86i8, 85i8, 84i8, 83i8, 82i8, 81i8, 80i8, 79i8, 78i8, + 77i8, 76i8, 75i8, 74i8, 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, + 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, 54i8, 53i8, 52i8, 51i8, 50i8, + 49i8, 48i8, 47i8, 46i8, 45i8, 44i8, 43i8, 42i8, 41i8, 40i8, 39i8, 38i8, 37i8, 36i8, + 35i8, 34i8, 33i8, 32i8, 31i8, 30i8, 29i8, 28i8, + ), + 29usize => _mm512_set_epi8( + 92i8, 91i8, 90i8, 89i8, 88i8, 87i8, 86i8, 85i8, 84i8, 83i8, 82i8, 81i8, 80i8, 79i8, + 78i8, 77i8, 76i8, 75i8, 74i8, 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, + 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, 54i8, 53i8, 52i8, 51i8, + 50i8, 49i8, 48i8, 47i8, 46i8, 45i8, 44i8, 43i8, 42i8, 41i8, 40i8, 39i8, 38i8, 37i8, + 36i8, 35i8, 34i8, 33i8, 32i8, 31i8, 30i8, 29i8, + ), + 30usize => _mm512_set_epi8( + 93i8, 92i8, 91i8, 90i8, 89i8, 88i8, 87i8, 86i8, 85i8, 84i8, 83i8, 82i8, 81i8, 80i8, + 79i8, 78i8, 77i8, 76i8, 75i8, 74i8, 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, 67i8, 66i8, + 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, 54i8, 53i8, 52i8, + 51i8, 50i8, 49i8, 48i8, 47i8, 46i8, 45i8, 44i8, 43i8, 42i8, 41i8, 40i8, 39i8, 38i8, + 37i8, 36i8, 35i8, 34i8, 33i8, 32i8, 31i8, 30i8, + ), + 31usize => _mm512_set_epi8( + 94i8, 93i8, 92i8, 91i8, 90i8, 89i8, 88i8, 87i8, 86i8, 85i8, 84i8, 83i8, 82i8, 81i8, + 80i8, 79i8, 78i8, 77i8, 76i8, 75i8, 74i8, 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, 67i8, + 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, 54i8, 53i8, + 52i8, 51i8, 50i8, 49i8, 48i8, 47i8, 46i8, 45i8, 44i8, 43i8, 42i8, 41i8, 40i8, 39i8, + 38i8, 37i8, 36i8, 35i8, 34i8, 33i8, 32i8, 31i8, + ), + 32usize => _mm512_set_epi8( + 95i8, 94i8, 93i8, 92i8, 91i8, 90i8, 89i8, 88i8, 87i8, 86i8, 85i8, 84i8, 83i8, 82i8, + 81i8, 80i8, 79i8, 78i8, 77i8, 76i8, 75i8, 74i8, 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, + 67i8, 66i8, 65i8, 
64i8, 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, 54i8, + 53i8, 52i8, 51i8, 50i8, 49i8, 48i8, 47i8, 46i8, 45i8, 44i8, 43i8, 42i8, 41i8, 40i8, + 39i8, 38i8, 37i8, 36i8, 35i8, 34i8, 33i8, 32i8, + ), + 33usize => _mm512_set_epi8( + 96i8, 95i8, 94i8, 93i8, 92i8, 91i8, 90i8, 89i8, 88i8, 87i8, 86i8, 85i8, 84i8, 83i8, + 82i8, 81i8, 80i8, 79i8, 78i8, 77i8, 76i8, 75i8, 74i8, 73i8, 72i8, 71i8, 70i8, 69i8, + 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, + 54i8, 53i8, 52i8, 51i8, 50i8, 49i8, 48i8, 47i8, 46i8, 45i8, 44i8, 43i8, 42i8, 41i8, + 40i8, 39i8, 38i8, 37i8, 36i8, 35i8, 34i8, 33i8, + ), + 34usize => _mm512_set_epi8( + 97i8, 96i8, 95i8, 94i8, 93i8, 92i8, 91i8, 90i8, 89i8, 88i8, 87i8, 86i8, 85i8, 84i8, + 83i8, 82i8, 81i8, 80i8, 79i8, 78i8, 77i8, 76i8, 75i8, 74i8, 73i8, 72i8, 71i8, 70i8, + 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, + 55i8, 54i8, 53i8, 52i8, 51i8, 50i8, 49i8, 48i8, 47i8, 46i8, 45i8, 44i8, 43i8, 42i8, + 41i8, 40i8, 39i8, 38i8, 37i8, 36i8, 35i8, 34i8, + ), + 35usize => _mm512_set_epi8( + 98i8, 97i8, 96i8, 95i8, 94i8, 93i8, 92i8, 91i8, 90i8, 89i8, 88i8, 87i8, 86i8, 85i8, + 84i8, 83i8, 82i8, 81i8, 80i8, 79i8, 78i8, 77i8, 76i8, 75i8, 74i8, 73i8, 72i8, 71i8, + 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, + 56i8, 55i8, 54i8, 53i8, 52i8, 51i8, 50i8, 49i8, 48i8, 47i8, 46i8, 45i8, 44i8, 43i8, + 42i8, 41i8, 40i8, 39i8, 38i8, 37i8, 36i8, 35i8, + ), + 36usize => _mm512_set_epi8( + 99i8, 98i8, 97i8, 96i8, 95i8, 94i8, 93i8, 92i8, 91i8, 90i8, 89i8, 88i8, 87i8, 86i8, + 85i8, 84i8, 83i8, 82i8, 81i8, 80i8, 79i8, 78i8, 77i8, 76i8, 75i8, 74i8, 73i8, 72i8, + 71i8, 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, + 57i8, 56i8, 55i8, 54i8, 53i8, 52i8, 51i8, 50i8, 49i8, 48i8, 47i8, 46i8, 45i8, 44i8, + 43i8, 42i8, 41i8, 40i8, 39i8, 38i8, 37i8, 36i8, + ), + 37usize => _mm512_set_epi8( + 100i8, 99i8, 98i8, 97i8, 96i8, 95i8, 94i8, 93i8, 92i8, 91i8, 90i8, 89i8, 88i8, + 87i8, 86i8, 85i8, 84i8, 83i8, 82i8, 81i8, 80i8, 79i8, 78i8, 77i8, 76i8, 75i8, 74i8, + 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, + 59i8, 58i8, 57i8, 56i8, 55i8, 54i8, 53i8, 52i8, 51i8, 50i8, 49i8, 48i8, 47i8, 46i8, + 45i8, 44i8, 43i8, 42i8, 41i8, 40i8, 39i8, 38i8, 37i8, + ), + 38usize => _mm512_set_epi8( + 101i8, 100i8, 99i8, 98i8, 97i8, 96i8, 95i8, 94i8, 93i8, 92i8, 91i8, 90i8, 89i8, + 88i8, 87i8, 86i8, 85i8, 84i8, 83i8, 82i8, 81i8, 80i8, 79i8, 78i8, 77i8, 76i8, 75i8, + 74i8, 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, + 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, 54i8, 53i8, 52i8, 51i8, 50i8, 49i8, 48i8, 47i8, + 46i8, 45i8, 44i8, 43i8, 42i8, 41i8, 40i8, 39i8, 38i8, + ), + 39usize => _mm512_set_epi8( + 102i8, 101i8, 100i8, 99i8, 98i8, 97i8, 96i8, 95i8, 94i8, 93i8, 92i8, 91i8, 90i8, + 89i8, 88i8, 87i8, 86i8, 85i8, 84i8, 83i8, 82i8, 81i8, 80i8, 79i8, 78i8, 77i8, 76i8, + 75i8, 74i8, 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, + 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, 54i8, 53i8, 52i8, 51i8, 50i8, 49i8, 48i8, + 47i8, 46i8, 45i8, 44i8, 43i8, 42i8, 41i8, 40i8, 39i8, + ), + 40usize => _mm512_set_epi8( + 103i8, 102i8, 101i8, 100i8, 99i8, 98i8, 97i8, 96i8, 95i8, 94i8, 93i8, 92i8, 91i8, + 90i8, 89i8, 88i8, 87i8, 86i8, 85i8, 84i8, 83i8, 82i8, 81i8, 80i8, 79i8, 78i8, 77i8, + 76i8, 75i8, 74i8, 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, + 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, 
54i8, 53i8, 52i8, 51i8, 50i8, 49i8, + 48i8, 47i8, 46i8, 45i8, 44i8, 43i8, 42i8, 41i8, 40i8, + ), + 41usize => _mm512_set_epi8( + 104i8, 103i8, 102i8, 101i8, 100i8, 99i8, 98i8, 97i8, 96i8, 95i8, 94i8, 93i8, 92i8, + 91i8, 90i8, 89i8, 88i8, 87i8, 86i8, 85i8, 84i8, 83i8, 82i8, 81i8, 80i8, 79i8, 78i8, + 77i8, 76i8, 75i8, 74i8, 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, + 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, 54i8, 53i8, 52i8, 51i8, 50i8, + 49i8, 48i8, 47i8, 46i8, 45i8, 44i8, 43i8, 42i8, 41i8, + ), + 42usize => _mm512_set_epi8( + 105i8, 104i8, 103i8, 102i8, 101i8, 100i8, 99i8, 98i8, 97i8, 96i8, 95i8, 94i8, 93i8, + 92i8, 91i8, 90i8, 89i8, 88i8, 87i8, 86i8, 85i8, 84i8, 83i8, 82i8, 81i8, 80i8, 79i8, + 78i8, 77i8, 76i8, 75i8, 74i8, 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, + 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, 54i8, 53i8, 52i8, 51i8, + 50i8, 49i8, 48i8, 47i8, 46i8, 45i8, 44i8, 43i8, 42i8, + ), + 43usize => _mm512_set_epi8( + 106i8, 105i8, 104i8, 103i8, 102i8, 101i8, 100i8, 99i8, 98i8, 97i8, 96i8, 95i8, + 94i8, 93i8, 92i8, 91i8, 90i8, 89i8, 88i8, 87i8, 86i8, 85i8, 84i8, 83i8, 82i8, 81i8, + 80i8, 79i8, 78i8, 77i8, 76i8, 75i8, 74i8, 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, 67i8, + 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, 54i8, 53i8, + 52i8, 51i8, 50i8, 49i8, 48i8, 47i8, 46i8, 45i8, 44i8, 43i8, + ), + 44usize => _mm512_set_epi8( + 107i8, 106i8, 105i8, 104i8, 103i8, 102i8, 101i8, 100i8, 99i8, 98i8, 97i8, 96i8, + 95i8, 94i8, 93i8, 92i8, 91i8, 90i8, 89i8, 88i8, 87i8, 86i8, 85i8, 84i8, 83i8, 82i8, + 81i8, 80i8, 79i8, 78i8, 77i8, 76i8, 75i8, 74i8, 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, + 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, 54i8, + 53i8, 52i8, 51i8, 50i8, 49i8, 48i8, 47i8, 46i8, 45i8, 44i8, + ), + 45usize => _mm512_set_epi8( + 108i8, 107i8, 106i8, 105i8, 104i8, 103i8, 102i8, 101i8, 100i8, 99i8, 98i8, 97i8, + 96i8, 95i8, 94i8, 93i8, 92i8, 91i8, 90i8, 89i8, 88i8, 87i8, 86i8, 85i8, 84i8, 83i8, + 82i8, 81i8, 80i8, 79i8, 78i8, 77i8, 76i8, 75i8, 74i8, 73i8, 72i8, 71i8, 70i8, 69i8, + 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, + 54i8, 53i8, 52i8, 51i8, 50i8, 49i8, 48i8, 47i8, 46i8, 45i8, + ), + 46usize => _mm512_set_epi8( + 109i8, 108i8, 107i8, 106i8, 105i8, 104i8, 103i8, 102i8, 101i8, 100i8, 99i8, 98i8, + 97i8, 96i8, 95i8, 94i8, 93i8, 92i8, 91i8, 90i8, 89i8, 88i8, 87i8, 86i8, 85i8, 84i8, + 83i8, 82i8, 81i8, 80i8, 79i8, 78i8, 77i8, 76i8, 75i8, 74i8, 73i8, 72i8, 71i8, 70i8, + 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, + 55i8, 54i8, 53i8, 52i8, 51i8, 50i8, 49i8, 48i8, 47i8, 46i8, + ), + 47usize => _mm512_set_epi8( + 110i8, 109i8, 108i8, 107i8, 106i8, 105i8, 104i8, 103i8, 102i8, 101i8, 100i8, 99i8, + 98i8, 97i8, 96i8, 95i8, 94i8, 93i8, 92i8, 91i8, 90i8, 89i8, 88i8, 87i8, 86i8, 85i8, + 84i8, 83i8, 82i8, 81i8, 80i8, 79i8, 78i8, 77i8, 76i8, 75i8, 74i8, 73i8, 72i8, 71i8, + 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, + 56i8, 55i8, 54i8, 53i8, 52i8, 51i8, 50i8, 49i8, 48i8, 47i8, + ), + 48usize => _mm512_set_epi8( + 111i8, 110i8, 109i8, 108i8, 107i8, 106i8, 105i8, 104i8, 103i8, 102i8, 101i8, 100i8, + 99i8, 98i8, 97i8, 96i8, 95i8, 94i8, 93i8, 92i8, 91i8, 90i8, 89i8, 88i8, 87i8, 86i8, + 85i8, 84i8, 83i8, 82i8, 81i8, 80i8, 79i8, 78i8, 77i8, 76i8, 75i8, 74i8, 73i8, 72i8, + 71i8, 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, + 57i8, 56i8, 55i8, 
54i8, 53i8, 52i8, 51i8, 50i8, 49i8, 48i8, + ), + 49usize => _mm512_set_epi8( + 112i8, 111i8, 110i8, 109i8, 108i8, 107i8, 106i8, 105i8, 104i8, 103i8, 102i8, 101i8, + 100i8, 99i8, 98i8, 97i8, 96i8, 95i8, 94i8, 93i8, 92i8, 91i8, 90i8, 89i8, 88i8, + 87i8, 86i8, 85i8, 84i8, 83i8, 82i8, 81i8, 80i8, 79i8, 78i8, 77i8, 76i8, 75i8, 74i8, + 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, + 59i8, 58i8, 57i8, 56i8, 55i8, 54i8, 53i8, 52i8, 51i8, 50i8, 49i8, + ), + 50usize => _mm512_set_epi8( + 113i8, 112i8, 111i8, 110i8, 109i8, 108i8, 107i8, 106i8, 105i8, 104i8, 103i8, 102i8, + 101i8, 100i8, 99i8, 98i8, 97i8, 96i8, 95i8, 94i8, 93i8, 92i8, 91i8, 90i8, 89i8, + 88i8, 87i8, 86i8, 85i8, 84i8, 83i8, 82i8, 81i8, 80i8, 79i8, 78i8, 77i8, 76i8, 75i8, + 74i8, 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, + 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, 54i8, 53i8, 52i8, 51i8, 50i8, + ), + 51usize => _mm512_set_epi8( + 114i8, 113i8, 112i8, 111i8, 110i8, 109i8, 108i8, 107i8, 106i8, 105i8, 104i8, 103i8, + 102i8, 101i8, 100i8, 99i8, 98i8, 97i8, 96i8, 95i8, 94i8, 93i8, 92i8, 91i8, 90i8, + 89i8, 88i8, 87i8, 86i8, 85i8, 84i8, 83i8, 82i8, 81i8, 80i8, 79i8, 78i8, 77i8, 76i8, + 75i8, 74i8, 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, + 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, 54i8, 53i8, 52i8, 51i8, + ), + 52usize => _mm512_set_epi8( + 115i8, 114i8, 113i8, 112i8, 111i8, 110i8, 109i8, 108i8, 107i8, 106i8, 105i8, 104i8, + 103i8, 102i8, 101i8, 100i8, 99i8, 98i8, 97i8, 96i8, 95i8, 94i8, 93i8, 92i8, 91i8, + 90i8, 89i8, 88i8, 87i8, 86i8, 85i8, 84i8, 83i8, 82i8, 81i8, 80i8, 79i8, 78i8, 77i8, + 76i8, 75i8, 74i8, 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, + 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, 54i8, 53i8, 52i8, + ), + 53usize => _mm512_set_epi8( + 116i8, 115i8, 114i8, 113i8, 112i8, 111i8, 110i8, 109i8, 108i8, 107i8, 106i8, 105i8, + 104i8, 103i8, 102i8, 101i8, 100i8, 99i8, 98i8, 97i8, 96i8, 95i8, 94i8, 93i8, 92i8, + 91i8, 90i8, 89i8, 88i8, 87i8, 86i8, 85i8, 84i8, 83i8, 82i8, 81i8, 80i8, 79i8, 78i8, + 77i8, 76i8, 75i8, 74i8, 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, + 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, 54i8, 53i8, + ), + 54usize => _mm512_set_epi8( + 117i8, 116i8, 115i8, 114i8, 113i8, 112i8, 111i8, 110i8, 109i8, 108i8, 107i8, 106i8, + 105i8, 104i8, 103i8, 102i8, 101i8, 100i8, 99i8, 98i8, 97i8, 96i8, 95i8, 94i8, 93i8, + 92i8, 91i8, 90i8, 89i8, 88i8, 87i8, 86i8, 85i8, 84i8, 83i8, 82i8, 81i8, 80i8, 79i8, + 78i8, 77i8, 76i8, 75i8, 74i8, 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, + 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, 54i8, + ), + 55usize => _mm512_set_epi8( + 118i8, 117i8, 116i8, 115i8, 114i8, 113i8, 112i8, 111i8, 110i8, 109i8, 108i8, 107i8, + 106i8, 105i8, 104i8, 103i8, 102i8, 101i8, 100i8, 99i8, 98i8, 97i8, 96i8, 95i8, + 94i8, 93i8, 92i8, 91i8, 90i8, 89i8, 88i8, 87i8, 86i8, 85i8, 84i8, 83i8, 82i8, 81i8, + 80i8, 79i8, 78i8, 77i8, 76i8, 75i8, 74i8, 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, 67i8, + 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, 55i8, + ), + 56usize => _mm512_set_epi8( + 119i8, 118i8, 117i8, 116i8, 115i8, 114i8, 113i8, 112i8, 111i8, 110i8, 109i8, 108i8, + 107i8, 106i8, 105i8, 104i8, 103i8, 102i8, 101i8, 100i8, 99i8, 98i8, 97i8, 96i8, + 95i8, 94i8, 93i8, 92i8, 91i8, 90i8, 89i8, 88i8, 87i8, 86i8, 85i8, 84i8, 83i8, 82i8, + 81i8, 80i8, 79i8, 78i8, 77i8, 76i8, 75i8, 74i8, 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, + 67i8, 66i8, 65i8, 
64i8, 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, 56i8, + ), + 57usize => _mm512_set_epi8( + 120i8, 119i8, 118i8, 117i8, 116i8, 115i8, 114i8, 113i8, 112i8, 111i8, 110i8, 109i8, + 108i8, 107i8, 106i8, 105i8, 104i8, 103i8, 102i8, 101i8, 100i8, 99i8, 98i8, 97i8, + 96i8, 95i8, 94i8, 93i8, 92i8, 91i8, 90i8, 89i8, 88i8, 87i8, 86i8, 85i8, 84i8, 83i8, + 82i8, 81i8, 80i8, 79i8, 78i8, 77i8, 76i8, 75i8, 74i8, 73i8, 72i8, 71i8, 70i8, 69i8, + 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, 57i8, + ), + 58usize => _mm512_set_epi8( + 121i8, 120i8, 119i8, 118i8, 117i8, 116i8, 115i8, 114i8, 113i8, 112i8, 111i8, 110i8, + 109i8, 108i8, 107i8, 106i8, 105i8, 104i8, 103i8, 102i8, 101i8, 100i8, 99i8, 98i8, + 97i8, 96i8, 95i8, 94i8, 93i8, 92i8, 91i8, 90i8, 89i8, 88i8, 87i8, 86i8, 85i8, 84i8, + 83i8, 82i8, 81i8, 80i8, 79i8, 78i8, 77i8, 76i8, 75i8, 74i8, 73i8, 72i8, 71i8, 70i8, + 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, 58i8, + ), + 59usize => _mm512_set_epi8( + 122i8, 121i8, 120i8, 119i8, 118i8, 117i8, 116i8, 115i8, 114i8, 113i8, 112i8, 111i8, + 110i8, 109i8, 108i8, 107i8, 106i8, 105i8, 104i8, 103i8, 102i8, 101i8, 100i8, 99i8, + 98i8, 97i8, 96i8, 95i8, 94i8, 93i8, 92i8, 91i8, 90i8, 89i8, 88i8, 87i8, 86i8, 85i8, + 84i8, 83i8, 82i8, 81i8, 80i8, 79i8, 78i8, 77i8, 76i8, 75i8, 74i8, 73i8, 72i8, 71i8, + 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, 59i8, + ), + 60usize => _mm512_set_epi8( + 123i8, 122i8, 121i8, 120i8, 119i8, 118i8, 117i8, 116i8, 115i8, 114i8, 113i8, 112i8, + 111i8, 110i8, 109i8, 108i8, 107i8, 106i8, 105i8, 104i8, 103i8, 102i8, 101i8, 100i8, + 99i8, 98i8, 97i8, 96i8, 95i8, 94i8, 93i8, 92i8, 91i8, 90i8, 89i8, 88i8, 87i8, 86i8, + 85i8, 84i8, 83i8, 82i8, 81i8, 80i8, 79i8, 78i8, 77i8, 76i8, 75i8, 74i8, 73i8, 72i8, + 71i8, 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, 60i8, + ), + 61usize => _mm512_set_epi8( + 124i8, 123i8, 122i8, 121i8, 120i8, 119i8, 118i8, 117i8, 116i8, 115i8, 114i8, 113i8, + 112i8, 111i8, 110i8, 109i8, 108i8, 107i8, 106i8, 105i8, 104i8, 103i8, 102i8, 101i8, + 100i8, 99i8, 98i8, 97i8, 96i8, 95i8, 94i8, 93i8, 92i8, 91i8, 90i8, 89i8, 88i8, + 87i8, 86i8, 85i8, 84i8, 83i8, 82i8, 81i8, 80i8, 79i8, 78i8, 77i8, 76i8, 75i8, 74i8, + 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, 61i8, + ), + 62usize => _mm512_set_epi8( + 125i8, 124i8, 123i8, 122i8, 121i8, 120i8, 119i8, 118i8, 117i8, 116i8, 115i8, 114i8, + 113i8, 112i8, 111i8, 110i8, 109i8, 108i8, 107i8, 106i8, 105i8, 104i8, 103i8, 102i8, + 101i8, 100i8, 99i8, 98i8, 97i8, 96i8, 95i8, 94i8, 93i8, 92i8, 91i8, 90i8, 89i8, + 88i8, 87i8, 86i8, 85i8, 84i8, 83i8, 82i8, 81i8, 80i8, 79i8, 78i8, 77i8, 76i8, 75i8, + 74i8, 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, 62i8, + ), + 63usize => _mm512_set_epi8( + 126i8, 125i8, 124i8, 123i8, 122i8, 121i8, 120i8, 119i8, 118i8, 117i8, 116i8, 115i8, + 114i8, 113i8, 112i8, 111i8, 110i8, 109i8, 108i8, 107i8, 106i8, 105i8, 104i8, 103i8, + 102i8, 101i8, 100i8, 99i8, 98i8, 97i8, 96i8, 95i8, 94i8, 93i8, 92i8, 91i8, 90i8, + 89i8, 88i8, 87i8, 86i8, 85i8, 84i8, 83i8, 82i8, 81i8, 80i8, 79i8, 78i8, 77i8, 76i8, + 75i8, 74i8, 73i8, 72i8, 71i8, 70i8, 69i8, 68i8, 67i8, 66i8, 65i8, 64i8, 63i8, + ), + _ => unreachable!(), + } + }; + unsafe { _mm512_permutex2var_epi8(b, idx, a) } +} diff --git a/fearless_simd/src/lib.rs b/fearless_simd/src/lib.rs index 9a078dfa5..e9a8a0575 100644 --- a/fearless_simd/src/lib.rs +++ b/fearless_simd/src/lib.rs @@ -179,6 +179,7 @@ pub mod wasm32 { #[cfg(any(target_arch = "x86", target_arch 
= "x86_64"))] pub mod x86 { pub use crate::generated::Avx2; + pub use crate::generated::Avx512; pub use crate::generated::Sse4_2; } @@ -245,8 +246,47 @@ pub enum Level { Sse4_2(Sse4_2), /// The AVX2 and FMA instruction set on (32 and 64 bit) x86, plus the other instructions /// guaranteed to be available on AVX2+FMA CPUs. Also known as x86-64-v3. - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + // We don't need to support this if the compilation target definitely supports something better. + #[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + not(all( + target_feature = "adx", + target_feature = "aes", + target_feature = "avx512bitalg", + target_feature = "avx512bw", + target_feature = "avx512cd", + target_feature = "avx512dq", + target_feature = "avx512f", + target_feature = "avx512ifma", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2", + target_feature = "avx512vl", + target_feature = "avx512vnni", + target_feature = "avx512vpopcntdq", + target_feature = "bmi1", + target_feature = "bmi2", + target_feature = "cmpxchg16b", + target_feature = "fma", + target_feature = "gfni", + target_feature = "lzcnt", + target_feature = "movbe", + target_feature = "pclmulqdq", + target_feature = "popcnt", + target_feature = "rdrand", + target_feature = "rdseed", + target_feature = "sha", + target_feature = "vaes", + target_feature = "vpclmulqdq", + target_feature = "xsave", + target_feature = "xsavec", + target_feature = "xsaveopt", + target_feature = "xsaves", + )) + ))] Avx2(Avx2), + /// The AVX-512 instruction set on (32 and 64 bit) x86 with the Ice Lake feature set. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Avx512(Avx512), // If new variants are added, make sure to handle them in `Level::dispatch` // and `dispatch!()` } @@ -295,12 +335,47 @@ impl Level { } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + // Ice Lake AVX-512 feature set. 
The following features are implied by avx512f + // and do not need to be spelled out: avx, avx2, f16c, fxsr, sse, sse2, sse3, sse4.1, sse4.2, ssse3 + if std::arch::is_x86_feature_detected!("adx") + && std::arch::is_x86_feature_detected!("aes") + && std::arch::is_x86_feature_detected!("avx512bitalg") + && std::arch::is_x86_feature_detected!("avx512bw") + && std::arch::is_x86_feature_detected!("avx512cd") + && std::arch::is_x86_feature_detected!("avx512dq") + && std::arch::is_x86_feature_detected!("avx512f") + && std::arch::is_x86_feature_detected!("avx512ifma") + && std::arch::is_x86_feature_detected!("avx512vbmi") + && std::arch::is_x86_feature_detected!("avx512vbmi2") + && std::arch::is_x86_feature_detected!("avx512vl") + && std::arch::is_x86_feature_detected!("avx512vnni") + && std::arch::is_x86_feature_detected!("avx512vpopcntdq") + && std::arch::is_x86_feature_detected!("bmi1") + && std::arch::is_x86_feature_detected!("bmi2") + && std::arch::is_x86_feature_detected!("cmpxchg16b") + && std::arch::is_x86_feature_detected!("fma") + && std::arch::is_x86_feature_detected!("gfni") + && std::arch::is_x86_feature_detected!("lzcnt") + && std::arch::is_x86_feature_detected!("movbe") + && std::arch::is_x86_feature_detected!("pclmulqdq") + && std::arch::is_x86_feature_detected!("popcnt") + && std::arch::is_x86_feature_detected!("rdrand") + && std::arch::is_x86_feature_detected!("rdseed") + && std::arch::is_x86_feature_detected!("sha") + && std::arch::is_x86_feature_detected!("vaes") + && std::arch::is_x86_feature_detected!("vpclmulqdq") + && std::arch::is_x86_feature_detected!("xsave") + && std::arch::is_x86_feature_detected!("xsavec") + && std::arch::is_x86_feature_detected!("xsaveopt") + && std::arch::is_x86_feature_detected!("xsaves") + { + return unsafe { Self::Avx512(Avx512::new_unchecked()) }; // Feature list sourced from `rustc --print=cfg --target x86_64-unknown-linux-gnu -C target-cpu=x86-64-v3` // However, the following features are implied by avx2 and do not need to be spelled out: // avx,fxsr,sse,sse2,sse3,sse4.1,sse4.2,ssse3 // This can be verified by running: // rustc --print=cfg --target x86_64-unknown-linux-gnu -C target-feature='+avx2' - if std::arch::is_x86_feature_detected!("avx2") + } else if std::arch::is_x86_feature_detected!("avx2") && std::arch::is_x86_feature_detected!("bmi1") && std::arch::is_x86_feature_detected!("bmi2") && std::arch::is_x86_feature_detected!("cmpxchg16b") @@ -506,11 +581,36 @@ impl Level { reason = "On machines which statically support `avx2`, there is only one variant." )] match self { + // Safety: The Avx512 struct represents AVX-512 target features being enabled. + // AVX-512 implicitly enables the "avx2" target feature, which is + // the only target feature required to make our Avx2 token. + Self::Avx512(_avx512) => unsafe { Some(Avx2::new_unchecked()) }, Self::Avx2(avx2) => Some(avx2), _ => None, } } + /// If this is a proof that AVX-512 (or better) is available, access that instruction set. + /// + /// This method should be preferred over matching against the `Avx512` variant of self, + /// because if Fearless SIMD gets support for an instruction set which is a superset of AVX-512, + /// this method will return a value even if that "better" instruction set is available. + /// + /// This can be used in combination with the `safe_wrappers` feature to gain checked access to + /// the level-specific SIMD capabilities. 
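Illustrative usage sketch (not part of the diff) for the accessor documented above; it assumes the runtime-detection constructor is `Level::new()` and uses only API introduced or re-exported by this patch.

use fearless_simd::Level;

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn describe_level() -> &'static str {
    // Runtime detection; `Level::new()` is assumed to be the detection constructor.
    let level = Level::new();
    // Prefer the accessors over matching enum variants, as the doc comment above advises.
    if level.as_avx512().is_some() {
        "AVX-512 (Ice Lake feature set)"
    } else if level.as_avx2().is_some() {
        "AVX2 + FMA (x86-64-v3)"
    } else {
        "SSE4.2 or fallback"
    }
}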
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + #[inline] + pub fn as_avx512(self) -> Option<Avx512> { + #[allow( + unreachable_patterns, + reason = "On machines which statically support `avx512f`, there is only one variant." + )] + match self { + Self::Avx512(avx512) => Some(avx512), + _ => None, + } + } + /// Get the strongest statically supported SIMD level. /// /// That is, if your compilation run ambiently declares that a target feature is enabled, @@ -553,6 +653,8 @@ impl Level { } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + #[cfg(target_feature = "avx512f")] + return unsafe { Self::Avx512(Avx512::new_unchecked()) }; #[cfg(all( target_feature = "avx2", target_feature = "bmi1", @@ -563,7 +665,7 @@ impl Level { target_feature = "lzcnt", target_feature = "movbe", target_feature = "popcnt", - target_feature = "xsave" + target_feature = "xsave", ))] return unsafe { Self::Avx2(Avx2::new_unchecked()) }; #[cfg(all( diff --git a/fearless_simd/src/macros.rs b/fearless_simd/src/macros.rs index e7227aeb5..f81cfbe29 100644 --- a/fearless_simd/src/macros.rs +++ b/fearless_simd/src/macros.rs @@ -91,7 +91,9 @@ macro_rules! dispatch { || $op, ) } - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + #[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + ))] $crate::Level::Avx2(avx2) => { let $simd = launder(avx2); $crate::Simd::vectorize( @@ -100,6 +102,15 @@ macro_rules! dispatch { || $op, ) } + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + $crate::Level::Avx512(avx512) => { + let $simd = launder(avx512); + $crate::Simd::vectorize( + avx512, + #[inline(always)] + || $op, + ) + } #[cfg(any( all(target_arch = "aarch64", not(target_feature = "neon")), all( diff --git a/fearless_simd_dev_macros/src/lib.rs b/fearless_simd_dev_macros/src/lib.rs index f43ca87c3..e028b2644 100644 --- a/fearless_simd_dev_macros/src/lib.rs +++ b/fearless_simd_dev_macros/src/lib.rs @@ -21,6 +21,7 @@ pub fn simd_test(_: TokenStream, item: TokenStream) -> TokenStream { let neon_name = get_ident("neon"); let sse4_name = get_ident("sse4"); let avx2_name = get_ident("avx2"); + let avx512_name = get_ident("avx512"); let wasm_name = get_ident("wasm"); let ignore_attr = |f: fn(&str) -> bool| { @@ -40,6 +41,7 @@ pub fn simd_test(_: TokenStream, item: TokenStream) -> TokenStream { let ignore_neon = ignore_attr(exclude_neon); let ignore_sse4 = ignore_attr(exclude_sse4); let ignore_avx2 = ignore_attr(exclude_avx2); + let ignore_avx512 = ignore_attr(exclude_avx512); let ignore_wasm = ignore_attr(exclude_wasm); let fallback_snippet = quote! { @@ -105,6 +107,36 @@ pub fn simd_test(_: TokenStream, item: TokenStream) -> TokenStream { } }; + // Ice Lake AVX-512 feature set + let avx512_snippet = quote!
{ + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + #[test] + #ignore_avx512 + fn #avx512_name() { + if std::arch::is_x86_feature_detected!("avx512f") + && std::arch::is_x86_feature_detected!("avx512bw") + && std::arch::is_x86_feature_detected!("avx512cd") + && std::arch::is_x86_feature_detected!("avx512dq") + && std::arch::is_x86_feature_detected!("avx512vl") + && std::arch::is_x86_feature_detected!("avx512bitalg") + && std::arch::is_x86_feature_detected!("avx512ifma") + && std::arch::is_x86_feature_detected!("avx512vbmi") + && std::arch::is_x86_feature_detected!("avx512vbmi2") + && std::arch::is_x86_feature_detected!("avx512vnni") + && std::arch::is_x86_feature_detected!("avx512vpopcntdq") + && std::arch::is_x86_feature_detected!("gfni") + && std::arch::is_x86_feature_detected!("vaes") + && std::arch::is_x86_feature_detected!("vpclmulqdq") + { + let avx512 = unsafe { fearless_simd::x86::Avx512::new_unchecked() }; + avx512.vectorize( + #[inline(always)] + || #input_fn_name(avx512) + ); + } + } + }; + let wasm_snippet = quote! { #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] #[test] @@ -124,6 +156,7 @@ pub fn simd_test(_: TokenStream, item: TokenStream) -> TokenStream { #wasm_snippet #sse4_snippet #avx2_snippet + #avx512_snippet } .into() } @@ -147,6 +180,10 @@ fn exclude_avx2(_test_name: &str) -> bool { false } +fn exclude_avx512(_test_name: &str) -> bool { + false +} + fn exclude_wasm(_test_name: &str) -> bool { false } diff --git a/fearless_simd_gen/src/arch/x86.rs b/fearless_simd_gen/src/arch/x86.rs index 210fd5c1a..625b78173 100644 --- a/fearless_simd_gen/src/arch/x86.rs +++ b/fearless_simd_gen/src/arch/x86.rs @@ -43,7 +43,6 @@ pub(crate) fn expr(op: &str, ty: &VecType, args: &[TokenStream]) -> TokenStream let suffix = op_suffix(ty.scalar, ty.scalar_bits, true); match op { "floor" | "ceil" | "round_ties_even" | "trunc" => { - let intrinsic = intrinsic_ident("round", suffix, ty.n_bits()); let rounding_mode = match op { "floor" => quote! { _MM_FROUND_TO_NEG_INF }, "ceil" => quote! { _MM_FROUND_TO_POS_INF }, @@ -51,7 +50,15 @@ pub(crate) fn expr(op: &str, ty: &VecType, args: &[TokenStream]) -> TokenStream "trunc" => quote! { _MM_FROUND_TO_ZERO }, _ => unreachable!(), }; - quote! { #intrinsic::<{#rounding_mode | _MM_FROUND_NO_EXC}>( #( #args, )* ) } + if ty.n_bits() == 512 { + // AVX-512 uses _mm512_roundscale_ps/pd with imm8 encoding: + // bits 7:4 = scale (0 for integer rounding), bits 3:0 = rounding mode + let intrinsic = intrinsic_ident("roundscale", suffix, ty.n_bits()); + quote! { #intrinsic::<#rounding_mode>( #( #args, )* ) } + } else { + let intrinsic = intrinsic_ident("round", suffix, ty.n_bits()); + quote! { #intrinsic::<{#rounding_mode | _MM_FROUND_NO_EXC}>( #( #args, )* ) } + } } "neg" => match ty.scalar { ScalarType::Float => { @@ -112,19 +119,46 @@ pub(crate) fn expr(op: &str, ty: &VecType, args: &[TokenStream]) -> TokenStream suffix, ty.n_bits(), ); - let cmpunord = float_compare_method("unord", ty); - let blend = intrinsic_ident("blendv", suffix, ty.n_bits()); let a = &args[0]; let b = &args[1]; - quote! { - let intermediate = #intrinsic(#a, #b); - // The x86 min/max intrinsics behave like `a < b ? a : b` and `a > b ? a : b` respectively. That - // means that if either `a` or `b` is NaN, they return the second argument `b`. So to implement a - // min/max where we always return the non-NaN argument, we add an additional check if `b` is NaN, - // and select `a` if so. 
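Illustrative scalar model (not part of the diff) of the NaN policy described in the comment above: lane-wise min/max should always return the non-NaN operand, so the generated code additionally selects `a` wherever `b` is NaN.

/// Scalar model of one lane of the vectorized min: `intermediate` mirrors what the
/// x86 min intrinsics compute (`a < b ? a : b`, i.e. `b` whenever either input is NaN),
/// and the extra select returns `a` where `b` is NaN.
fn min_prefer_non_nan(a: f32, b: f32) -> f32 {
    let intermediate = if a < b { a } else { b };
    if b.is_nan() { a } else { intermediate }
}

fn main() {
    assert_eq!(min_prefer_non_nan(1.0, f32::NAN), 1.0); // b is NaN -> take a
    assert_eq!(min_prefer_non_nan(f32::NAN, 1.0), 1.0); // a is NaN -> intermediate is b
    assert_eq!(min_prefer_non_nan(1.0, 2.0), 1.0);
}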
- let b_is_nan = #cmpunord(#b, #b); - #blend(intermediate, #a, b_is_nan) + if ty.n_bits() == 512 { + // AVX-512 uses mask registers for comparisons and mask-based blending + // _mm512_cmp_ps_mask returns __mmask16/__mmask8, _mm512_mask_blend_ps uses it + let cmp_mask = match ty.scalar_bits { + 32 => format_ident!("_mm512_cmp_ps_mask"), + 64 => format_ident!("_mm512_cmp_pd_mask"), + _ => unreachable!(), + }; + let blend = match ty.scalar_bits { + 32 => format_ident!("_mm512_mask_blend_ps"), + 64 => format_ident!("_mm512_mask_blend_pd"), + _ => unreachable!(), + }; + // CMP_UNORD_Q predicate = 0x03 + quote! { + let intermediate = #intrinsic(#a, #b); + // The x86 min/max intrinsics behave like `a < b ? a : b` and `a > b ? a : b` respectively. That + // means that if either `a` or `b` is NaN, they return the second argument `b`. So to implement a + // min/max where we always return the non-NaN argument, we add an additional check if `b` is NaN, + // and select `a` if so. + let b_is_nan = #cmp_mask::<0x03>(#b, #b); + // mask_blend: where mask bit is 0, take from first arg; where 1, take from second + #blend(b_is_nan, intermediate, #a) + } + } else { + let cmpunord = float_compare_method("unord", ty); + let blend = intrinsic_ident("blendv", suffix, ty.n_bits()); + + quote! { + let intermediate = #intrinsic(#a, #b); + // The x86 min/max intrinsics behave like `a < b ? a : b` and `a > b ? a : b` respectively. That + // means that if either `a` or `b` is NaN, they return the second argument `b`. So to implement a + // min/max where we always return the non-NaN argument, we add an additional check if `b` is NaN, + // and select `a` if so. + let b_is_nan = #cmpunord(#b, #b); + #blend(intermediate, #a, b_is_nan) + } } } _ => unimplemented!("{}", op), @@ -166,9 +200,11 @@ pub(crate) fn coarse_type(vec_ty: &VecType) -> &'static str { pub(crate) fn set1_intrinsic(vec_ty: &VecType) -> Ident { use ScalarType::*; - let suffix = match (vec_ty.scalar, vec_ty.scalar_bits) { - (Int | Unsigned | Mask, 64) => "epi64x", - (scalar, bits) => op_suffix(scalar, bits, false), + let suffix = match (vec_ty.scalar, vec_ty.scalar_bits, vec_ty.n_bits()) { + // For 128/256-bit, use epi64x; for 512-bit, use epi64 (AVX-512 naming) + (Int | Unsigned | Mask, 64, 512) => "epi64", + (Int | Unsigned | Mask, 64, _) => "epi64x", + (scalar, bits, _) => op_suffix(scalar, bits, false), }; intrinsic_ident("set1", suffix, vec_ty.n_bits()) @@ -273,7 +309,7 @@ pub(crate) fn float_compare_method(method: &str, vec_ty: &VecType) -> TokenStrea }; quote! { #ident } } - 256 => { + 256 | 512 => { // For AVX2 and up, Intel gives us a generic comparison intrinsic that takes a predicate. There are 32, // of which only a few are useful and the rest will violate IEEE754 and/or raise a SIGFPE on NaN. 
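Illustrative scalar model (not part of the diff) of the mask-blend semantics the 512-bit branch relies on: in `_mm512_mask_blend_ps(k, a, b)`, lane `i` comes from `a` when bit `i` of `k` is clear and from `b` when it is set.

fn mask_blend_ps_model(k: u16, a: [f32; 16], b: [f32; 16]) -> [f32; 16] {
    // Bit i of the mask register selects the source for lane i.
    core::array::from_fn(|i| if (k >> i) & 1 == 0 { a[i] } else { b[i] })
}

fn main() {
    let a = [0.0_f32; 16];
    let b = [1.0_f32; 16];
    let out = mask_blend_ps_model(0b0101, a, b);
    assert_eq!(out[0], 1.0); // bit 0 set -> lane from b
    assert_eq!(out[1], 0.0); // bit 1 clear -> lane from a
    assert_eq!(out[2], 1.0);
    assert_eq!(out[3], 0.0);
}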
// diff --git a/fearless_simd_gen/src/main.rs b/fearless_simd_gen/src/main.rs index 10efdfd99..57df1ba3a 100644 --- a/fearless_simd_gen/src/main.rs +++ b/fearless_simd_gen/src/main.rs @@ -36,6 +36,7 @@ enum Module { Fallback, Sse4_2, Avx2, + Avx512, } #[derive(Parser)] @@ -66,6 +67,7 @@ impl Module { Self::Fallback => mk_fallback::Fallback.make_module(), Self::Sse4_2 => mk_x86::X86::Sse4_2.make_module(), Self::Avx2 => mk_x86::X86::Avx2.make_module(), + Self::Avx512 => mk_x86::X86::Avx512.make_module(), } } @@ -105,6 +107,7 @@ impl Module { Self::Wasm => "wasm", Self::Sse4_2 => "sse4_2", Self::Avx2 => "avx2", + Self::Avx512 => "avx512", } } } @@ -118,6 +121,7 @@ const MODULES: &[Module] = &[ Module::Wasm, Module::Sse4_2, Module::Avx2, + Module::Avx512, ]; const FILE_BASE: &str = "./fearless_simd/src/generated"; diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index 883bc1ffd..b6ceae2f7 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs @@ -20,6 +20,7 @@ use quote::{ToTokens as _, format_ident, quote}; pub(crate) enum X86 { Sse4_2, Avx2, + Avx512, } impl Level for X86 { @@ -27,6 +28,7 @@ impl Level for X86 { match self { Self::Sse4_2 => "Sse4_2", Self::Avx2 => "Avx2", + Self::Avx512 => "Avx512", } } @@ -34,6 +36,7 @@ impl Level for X86 { match self { Self::Sse4_2 => 128, Self::Avx2 => 256, + Self::Avx512 => 512, } } @@ -45,6 +48,10 @@ impl Level for X86 { Some(match self { Self::Sse4_2 => "sse4.2,cmpxchg16b,popcnt", Self::Avx2 => "avx2,bmi1,bmi2,cmpxchg16b,f16c,fma,lzcnt,movbe,popcnt,xsave", + // Ice Lake feature set (avx512f implies avx, avx2, f16c, fxsr, sse, sse2, sse3, sse4.1, sse4.2, ssse3) + Self::Avx512 => { + "adx,aes,avx512bitalg,avx512bw,avx512cd,avx512dq,avx512f,avx512ifma,avx512vbmi,avx512vbmi2,avx512vl,avx512vnni,avx512vpopcntdq,bmi1,bmi2,cmpxchg16b,fma,gfni,lzcnt,movbe,pclmulqdq,popcnt,rdrand,rdseed,sha,vaes,vpclmulqdq,xsave,xsavec,xsaveopt,xsaves" + } }) } @@ -63,6 +70,7 @@ impl Level for X86 { match self { Self::Sse4_2 => r#"The SIMD token for the "SSE4.2" level."#, Self::Avx2 => r#"The SIMD token for the "AVX2" and "FMA" level."#, + Self::Avx512 => r#"The SIMD token for the "AVX-512" level (Ice Lake feature set)."#, } } @@ -70,6 +78,7 @@ impl Level for X86 { match self { Self::Sse4_2 => quote!(crate::core_arch::x86::Sse4_2), Self::Avx2 => quote!(crate::core_arch::x86::Avx2), + Self::Avx512 => quote!(crate::core_arch::x86::Avx512), } } @@ -87,6 +96,14 @@ impl Level for X86 { let slide_helpers = match self { Self::Sse4_2 => Self::sse42_slide_helpers(), Self::Avx2 => Self::avx2_slide_helpers(), + Self::Avx512 => { + let avx2_common_helpers = Self::avx2_slide_helpers_common(); + let avx512_helpers = Self::avx512_slide_helpers(); + quote! { + #avx2_common_helpers + #avx512_helpers + } + } }; quote! { @@ -109,6 +126,9 @@ impl Level for X86 { Self::Avx2 => quote! { Level::#level_tok(self) }, + Self::Avx512 => quote! { + Level::#level_tok(self) + }, } } @@ -140,6 +160,19 @@ impl Level for X86 { } } }, + Self::Avx512 => quote! { + /// Create a SIMD token. + /// + /// # Safety + /// + /// The AVX-512 (Ice Lake feature set) CPU features must be available. 
+ #[inline] + pub const unsafe fn new_unchecked() -> Self { + Self { + avx512: unsafe { crate::core_arch::x86::Avx512::new_unchecked() }, + } + } + }, } } @@ -219,6 +252,12 @@ impl X86 { method: &str, vec_ty: &VecType, ) -> TokenStream { + // AVX-512 has native comparison intrinsics that return masks + // We then convert the mask back to a vector using movm intrinsics + if *self == Self::Avx512 && vec_ty.n_bits() == 512 { + return self.handle_compare_avx512(method_sig, method, vec_ty); + } + let args = [quote! { a.into() }, quote! { b.into() }]; let expr = if vec_ty.scalar != ScalarType::Float { @@ -294,6 +333,142 @@ impl X86 { } } + /// Handle AVX-512 comparisons using native mask-returning intrinsics + fn handle_compare_avx512( + &self, + method_sig: TokenStream, + method: &str, + vec_ty: &VecType, + ) -> TokenStream { + // AVX-512 comparisons return __mmask* types + // We need to convert back to vector masks using _mm512_movm_epi* + + // Get the movm intrinsic to convert mask register to vector + let movm = match vec_ty.scalar_bits { + 8 => format_ident!("_mm512_movm_epi8"), + 16 => format_ident!("_mm512_movm_epi16"), + 32 => format_ident!("_mm512_movm_epi32"), + 64 => format_ident!("_mm512_movm_epi64"), + _ => unreachable!(), + }; + + let expr = if vec_ty.scalar == ScalarType::Float { + // Float comparisons use _mm512_cmp_ps_mask / _mm512_cmp_pd_mask with predicates + let cmp_mask = match vec_ty.scalar_bits { + 32 => format_ident!("_mm512_cmp_ps_mask"), + 64 => format_ident!("_mm512_cmp_pd_mask"), + _ => unreachable!(), + }; + // Predicate values from Intel docs (same as used in float_compare_method) + let predicate = match method { + "simd_eq" => 0x00, + "simd_lt" => 0x11, + "simd_le" => 0x12, + "simd_ge" => 0x1D, + "simd_gt" => 0x1E, + _ => unreachable!(), + }; + // The mask type is stored as __m512i internally, so movm gives us what we need + quote! { + let mask = #cmp_mask::<#predicate>(a.into(), b.into()); + #movm(mask) + } + } else { + // Integer comparisons + let suffix = match vec_ty.scalar_bits { + 8 => "epi8", + 16 => "epi16", + 32 => "epi32", + 64 => "epi64", + _ => unreachable!(), + }; + let unsigned_suffix = match vec_ty.scalar_bits { + 8 => "epu8", + 16 => "epu16", + 32 => "epu32", + 64 => "epu64", + _ => unreachable!(), + }; + + match method { + "simd_eq" => { + let cmp = format_ident!("_mm512_cmpeq_{}_mask", suffix); + quote! { + let mask = #cmp(a.into(), b.into()); + #movm(mask) + } + } + "simd_lt" => { + if vec_ty.scalar == ScalarType::Unsigned { + let cmp = format_ident!("_mm512_cmplt_{}_mask", unsigned_suffix); + quote! { + let mask = #cmp(a.into(), b.into()); + #movm(mask) + } + } else { + let cmp = format_ident!("_mm512_cmplt_{}_mask", suffix); + quote! { + let mask = #cmp(a.into(), b.into()); + #movm(mask) + } + } + } + "simd_le" => { + if vec_ty.scalar == ScalarType::Unsigned { + let cmp = format_ident!("_mm512_cmple_{}_mask", unsigned_suffix); + quote! { + let mask = #cmp(a.into(), b.into()); + #movm(mask) + } + } else { + let cmp = format_ident!("_mm512_cmple_{}_mask", suffix); + quote! { + let mask = #cmp(a.into(), b.into()); + #movm(mask) + } + } + } + "simd_gt" => { + if vec_ty.scalar == ScalarType::Unsigned { + let cmp = format_ident!("_mm512_cmpgt_{}_mask", unsigned_suffix); + quote! { + let mask = #cmp(a.into(), b.into()); + #movm(mask) + } + } else { + let cmp = format_ident!("_mm512_cmpgt_{}_mask", suffix); + quote! 
{ + let mask = #cmp(a.into(), b.into()); + #movm(mask) + } + } + } + "simd_ge" => { + if vec_ty.scalar == ScalarType::Unsigned { + let cmp = format_ident!("_mm512_cmpge_{}_mask", unsigned_suffix); + quote! { + let mask = #cmp(a.into(), b.into()); + #movm(mask) + } + } else { + let cmp = format_ident!("_mm512_cmpge_{}_mask", suffix); + quote! { + let mask = #cmp(a.into(), b.into()); + #movm(mask) + } + } + } + _ => unreachable!(), + } + }; + + quote! { + #method_sig { + unsafe { #expr.simd_into(self) } + } + } + } + pub(crate) fn handle_unary( &self, method_sig: TokenStream, @@ -339,7 +514,7 @@ impl X86 { let expr = match method { "widen" => { match (self, dst_width, vec_ty.n_bits()) { - (Self::Avx2, 256, 128) => { + (Self::Avx2 | Self::Avx512, 256, 128) => { let extend = extend_intrinsic( vec_ty.scalar, vec_ty.scalar_bits, @@ -352,6 +527,20 @@ impl X86 { } } } + (Self::Avx512, 512, 256) => { + // AVX-512 has native _mm512_cvt* intrinsics that extend 256-bit to 512-bit directly + let extend = extend_intrinsic( + vec_ty.scalar, + vec_ty.scalar_bits, + target_ty.scalar_bits, + dst_width, // Use 512-bit intrinsic directly + ); + quote! { + unsafe { + #extend(a.into()).simd_into(self) + } + } + } (Self::Avx2, 512, 256) => { let extend = extend_intrinsic( vec_ty.scalar, @@ -400,7 +589,7 @@ impl X86 { } "narrow" => { match (self, dst_width, vec_ty.n_bits()) { - (Self::Avx2, 128, 256) => { + (Self::Avx2 | Self::Avx512, 128, 256) => { let mask = match target_ty.scalar_bits { 8 => { quote! { 0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1 } @@ -418,6 +607,25 @@ impl X86 { } } } + (Self::Avx512, 256, 512) => { + // AVX-512 has native truncation intrinsics: _mm512_cvtepi16_epi8 etc. + // These directly narrow 512-bit to 256-bit with truncation + let narrow = match (vec_ty.scalar_bits, target_ty.scalar_bits) { + (16, 8) => format_ident!("_mm512_cvtepi16_epi8"), + (32, 16) => format_ident!("_mm512_cvtepi32_epi16"), + (64, 32) => format_ident!("_mm512_cvtepi64_epi32"), + _ => unimplemented!( + "narrow from {} to {} bits", + vec_ty.scalar_bits, + target_ty.scalar_bits + ), + }; + quote! { + unsafe { + #narrow(a.into()).simd_into(self) + } + } + } (Self::Avx2, 256, 512) => { let mask = set1_intrinsic(&VecType::new( vec_ty.scalar, @@ -506,7 +714,9 @@ impl X86 { } } } - "shlv" | "shrv" if *self == Self::Avx2 && vec_ty.scalar_bits >= 32 => { + "shlv" | "shrv" + if (*self == Self::Avx2 || *self == Self::Avx512) && vec_ty.scalar_bits >= 32 => + { let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, false); let name = match (method, vec_ty.scalar) { ("shrv", ScalarType::Int) => "srav", @@ -557,6 +767,54 @@ impl X86 { if vec_ty.scalar_bits == 8 { // x86 doesn't have shifting for 8-bit, so we first convert into 16-bit, shift, and then back to 8-bit. + // AVX-512 uses different intrinsics - cmpgt returns a mask, not a vector + if *self == Self::Avx512 && ty_bits == 512 { + // For AVX-512, use cvtepi8_epi16 for sign extension (simpler than unpack + cmpgt) + // We split 512-bit into two 256-bit halves, extend each to 512-bit, shift, then narrow back + // Use VBMI's permutex2var_epi8 to pack the results efficiently (faster than cvtepi16_epi8) + // See: https://github.com/llvm/llvm-project/issues/34219 + + let extend = match vec_ty.scalar { + ScalarType::Unsigned => format_ident!("_mm512_cvtepu8_epi16"), + ScalarType::Int => format_ident!("_mm512_cvtepi8_epi16"), + _ => unimplemented!(), + }; + + return quote! 
{ + #method_sig { + unsafe { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + + // Split into low and high 256-bit halves + let lo_256 = _mm512_castsi512_si256(val); + let hi_256 = _mm512_extracti64x4_epi64::<1>(val); + + // Extend each half from 8-bit to 16-bit (256-bit -> 512-bit) + let lo_16 = #extend(lo_256); + let hi_16 = #extend(hi_256); + + // Shift + let lo_shifted = #shift_intrinsic(lo_16, shift_count); + let hi_shifted = #shift_intrinsic(hi_16, shift_count); + + // Truncate back to 8-bit using permutex2var to select low bytes from each 16-bit element. + // This is more efficient than cvtepi16_epi8 which is 2 uops producing only 256-bit output. + // Index vector: select bytes 0,2,4,...,62 from lo_shifted (indices 0-31 in output) + // and bytes 0,2,4,...,62 from hi_shifted (indices 64,66,... -> output indices 32-63) + const PACK_LO_BYTES: __m512i = unsafe { core::mem::transmute([ + 0u8, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, + 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, + 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, + ]) }; + let result = _mm512_permutex2var_epi8(lo_shifted, PACK_LO_BYTES, hi_shifted); + result.simd_into(self) + } + } + }; + } + let unpack_hi = unpack_intrinsic(ScalarType::Int, 8, false, ty_bits); let unpack_lo = unpack_intrinsic(ScalarType::Int, 8, true, ty_bits); @@ -610,7 +868,7 @@ impl X86 { vec_ty: &VecType, ) -> TokenStream { match method { - "mul_add" if *self == Self::Avx2 => { + "mul_add" if *self == Self::Avx2 || *self == Self::Avx512 => { let intrinsic = simple_intrinsic("fmadd", vec_ty); quote! { #method_sig { @@ -618,7 +876,7 @@ impl X86 { } } } - "mul_sub" if *self == Self::Avx2 => { + "mul_sub" if *self == Self::Avx2 || *self == Self::Avx512 => { let intrinsic = simple_intrinsic("fmsub", vec_ty); quote! { #method_sig { @@ -658,6 +916,56 @@ impl X86 { } pub(crate) fn handle_select(&self, method_sig: TokenStream, vec_ty: &VecType) -> TokenStream { + // AVX-512 uses mask registers instead of blendv + if *self == Self::Avx512 && vec_ty.n_bits() == 512 { + // Convert vector mask to __mmask* using movepi*_mask, then use mask_blend + // select(mask, a, b) = where mask is true, take a; else take b + // _mm512_mask_blend_*(k, a, b) = where k bit is 0, take a; where 1, take b + // So we need: mask_blend(mask, c, b) where c is "false" value, b is "true" value + let (move_mask, blend) = match (vec_ty.scalar, vec_ty.scalar_bits) { + (ScalarType::Float, 32) => ( + format_ident!("_mm512_movepi32_mask"), + format_ident!("_mm512_mask_blend_ps"), + ), + (ScalarType::Float, 64) => ( + format_ident!("_mm512_movepi64_mask"), + format_ident!("_mm512_mask_blend_pd"), + ), + (_, 8) => ( + format_ident!("_mm512_movepi8_mask"), + format_ident!("_mm512_mask_blend_epi8"), + ), + (_, 16) => ( + format_ident!("_mm512_movepi16_mask"), + format_ident!("_mm512_mask_blend_epi16"), + ), + (_, 32) => ( + format_ident!("_mm512_movepi32_mask"), + format_ident!("_mm512_mask_blend_epi32"), + ), + (_, 64) => ( + format_ident!("_mm512_movepi64_mask"), + format_ident!("_mm512_mask_blend_epi64"), + ), + _ => unreachable!(), + }; + + // The mask 'a' is a mask type (mask32x16, etc.) which stores as __m512i + // We just need to convert it to __m512i and call movepi*_mask + let mask_expr = quote! { #move_mask(a.into()) }; + + return quote! 
{ + #method_sig { + unsafe { + // a is the mask, b is the "true" value, c is the "false" value + // mask_blend: where mask bit is 0, take first arg; where 1, take second + let k = #mask_expr; + #blend(k, c.into(), b.into()).simd_into(self) + } + } + }; + } + // Our select ops' argument order is mask, a, b; Intel's intrinsics are b, a, mask let args = [ quote! { c.into() }, @@ -691,7 +999,7 @@ impl X86 { vec_ty: &VecType, half_ty: &VecType, ) -> TokenStream { - if *self == Self::Avx2 && half_ty.n_bits() == 128 { + if (*self == Self::Avx2 || *self == Self::Avx512) && half_ty.n_bits() == 128 { let extract_op = match vec_ty.scalar { ScalarType::Float => "extractf128", _ => "extracti128", @@ -707,6 +1015,32 @@ impl X86 { } } } + } else if *self == Self::Avx512 && half_ty.n_bits() == 256 { + // Split a 512-bit vector into two 256-bit halves using AVX-512 intrinsics + let (cast_intrinsic, extract_intrinsic) = match vec_ty.scalar { + ScalarType::Float if vec_ty.scalar_bits == 32 => ( + format_ident!("_mm512_castps512_ps256"), + format_ident!("_mm512_extractf32x8_ps"), + ), + ScalarType::Float => ( + format_ident!("_mm512_castpd512_pd256"), + format_ident!("_mm512_extractf64x4_pd"), + ), + _ => ( + format_ident!("_mm512_castsi512_si256"), + format_ident!("_mm512_extracti64x4_epi64"), + ), + }; + quote! { + #method_sig { + unsafe { + ( + #cast_intrinsic(a.into()).simd_into(self), + #extract_intrinsic::<1>(a.into()).simd_into(self), + ) + } + } + } } else { generic_block_split(method_sig, half_ty, self.max_block_size()) } @@ -718,7 +1052,7 @@ impl X86 { vec_ty: &VecType, combined_ty: &VecType, ) -> TokenStream { - if *self == Self::Avx2 && combined_ty.n_bits() == 256 { + if (*self == Self::Avx2 || *self == Self::Avx512) && combined_ty.n_bits() == 256 { let suffix = match (vec_ty.scalar, vec_ty.scalar_bits) { (ScalarType::Float, 32) => "m128", (ScalarType::Float, 64) => "m128d", @@ -732,6 +1066,30 @@ impl X86 { } } } + } else if *self == Self::Avx512 && combined_ty.n_bits() == 512 { + // Combine two 256-bit vectors into one 512-bit vector using AVX-512 intrinsics + let (cast_intrinsic, insert_intrinsic) = match (vec_ty.scalar, vec_ty.scalar_bits) { + (ScalarType::Float, 32) => ( + format_ident!("_mm512_castps256_ps512"), + format_ident!("_mm512_insertf32x8"), + ), + (ScalarType::Float, 64) => ( + format_ident!("_mm512_castpd256_pd512"), + format_ident!("_mm512_insertf64x4"), + ), + _ => ( + format_ident!("_mm512_castsi256_si512"), + format_ident!("_mm512_inserti64x4"), + ), + }; + quote! { + #method_sig { + unsafe { + let lo = #cast_intrinsic(a.into()); + #insert_intrinsic::<1>(lo, b.into()).simd_into(self) + } + } + } } else { generic_block_combine(method_sig, combined_ty, self.max_block_size()) } @@ -781,6 +1139,99 @@ impl X86 { } } } + 512 => { + // AVX-512 zip using permutex2var instructions for efficient single-instruction interleaving. + // The control mask selects elements from `a` (indices 0..n-1) and `b` (indices n..2n-1). + let half_len = vec_ty.len / 2; + + // Generate the index pattern for zip_low or zip_high + // For zip_low with 16 elements: interleave elements 0-7 from each vector + // For zip_high with 16 elements: interleave elements 8-15 from each vector + let base_offset = if select_low { 0_usize } else { half_len }; + let b_offset = vec_ty.len; // b's elements are at indices n..2n-1 + + // Build index array: [base, base+b_offset, base+1, base+1+b_offset, ...] 
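Illustrative sketch (not part of the diff) of the permutex2var index pattern described above, computed for a 16-lane zip; indices `0..len` select from `a` and `len..2*len` select from `b`.

fn zip_indices(len: usize, low_half: bool) -> Vec<usize> {
    let half = len / 2;
    let base = if low_half { 0 } else { half };
    // [base, base + len, base + 1, base + 1 + len, ...] as built by the generator above.
    (0..half).flat_map(|i| [base + i, base + i + len]).collect()
}

fn main() {
    // zip_low of two 16-lane vectors interleaves lanes 0..8 of each input.
    assert_eq!(zip_indices(16, true)[..6], [0, 16, 1, 17, 2, 18]);
    // zip_high interleaves lanes 8..16 of each input.
    assert_eq!(zip_indices(16, false)[..4], [8, 24, 9, 25]);
}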
+ let indices: Vec<_> = (0..half_len) + .flat_map(|i| { + let a_idx = base_offset + i; + let b_idx = base_offset + i + b_offset; + [a_idx, b_idx] + }) + .collect(); + + // Choose the appropriate permutex2var intrinsic based on element type + // Note: for floats, use the epi32/epi64 set intrinsic since idx is always __m512i + let (permute_intrinsic, set_intrinsic, index_bits) = + match (vec_ty.scalar, vec_ty.scalar_bits) { + (ScalarType::Float, 32) => ( + format_ident!("_mm512_permutex2var_ps"), + format_ident!("_mm512_set_epi32"), + 32_usize, + ), + (ScalarType::Float, 64) => ( + format_ident!("_mm512_permutex2var_pd"), + format_ident!("_mm512_set_epi64"), + 64_usize, + ), + (_, 8) => ( + format_ident!("_mm512_permutex2var_epi8"), + format_ident!("_mm512_set_epi8"), + 8_usize, + ), + (_, 16) => ( + format_ident!("_mm512_permutex2var_epi16"), + format_ident!("_mm512_set_epi16"), + 16_usize, + ), + (_, 32) => ( + format_ident!("_mm512_permutex2var_epi32"), + format_ident!("_mm512_set_epi32"), + 32_usize, + ), + (_, 64) => ( + format_ident!("_mm512_permutex2var_epi64"), + format_ident!("_mm512_set_epi64"), + 64_usize, + ), + _ => unreachable!(), + }; + + // _mm512_set_* takes arguments from most-significant to least-significant, + // so we need to reverse the indices + let reversed_indices: Vec<_> = indices.iter().rev().copied().collect(); + let index_literals: Vec<_> = reversed_indices + .iter() + .map(|&i| { + // Use the appropriate integer type for each set intrinsic + match index_bits { + 8 => { + let i: i8 = i.try_into().unwrap(); + quote! { #i } + } + 16 => { + let i: i16 = i.try_into().unwrap(); + quote! { #i } + } + 32 => { + let i: i32 = i.try_into().unwrap(); + quote! { #i } + } + 64 => { + let i: i64 = i.try_into().unwrap(); + quote! { #i } + } + _ => unreachable!(), + } + }) + .collect(); + + quote! { + unsafe { + let idx = #set_intrinsic(#(#index_literals),*); + #permute_intrinsic(a.into(), idx, b.into()).simd_into(self) + } + } + } _ => unreachable!(), }; @@ -940,6 +1391,119 @@ impl X86 { } } } + (ScalarType::Float, 512, 32) => { + // 512-bit shuffle of 32-bit floats + // First permute within each 256-bit half to group evens/odds + let permute_mask = quote! { _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15) }; + let shuffle_immediate = if select_even { + quote! { 0b01_00_01_00 } + } else { + quote! { 0b11_10_11_10 } + }; + + quote! { + unsafe { + let t1 = _mm512_permutexvar_ps(#permute_mask, a.into()); + let t2 = _mm512_permutexvar_ps(#permute_mask, b.into()); + _mm512_shuffle_f32x4::<#shuffle_immediate>(t1, t2).simd_into(self) + } + } + } + (ScalarType::Float, 512, 64) => { + // 512-bit shuffle of 64-bit floats + let permute_mask = quote! { _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7) }; + let shuffle_immediate = if select_even { + quote! { 0b01_00_01_00 } + } else { + quote! { 0b11_10_11_10 } + }; + + quote! { + unsafe { + let t1 = _mm512_permutexvar_pd(#permute_mask, a.into()); + let t2 = _mm512_permutexvar_pd(#permute_mask, b.into()); + _mm512_shuffle_f64x2::<#shuffle_immediate>(t1, t2).simd_into(self) + } + } + } + (_, 512, 64) => { + // 512-bit shuffle of 64-bit integers + let permute_mask = quote! { _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7) }; + let shuffle_immediate = if select_even { + quote! { 0b01_00_01_00 } + } else { + quote! { 0b11_10_11_10 } + }; + + quote! 
{ + unsafe { + let t1 = _mm512_permutexvar_epi64(#permute_mask, a.into()); + let t2 = _mm512_permutexvar_epi64(#permute_mask, b.into()); + _mm512_shuffle_i64x2::<#shuffle_immediate>(t1, t2).simd_into(self) + } + } + } + (_, 512, 32) => { + // 512-bit shuffle of 32-bit integers + let permute_mask = quote! { _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15) }; + let shuffle_immediate = if select_even { + quote! { 0b01_00_01_00 } + } else { + quote! { 0b11_10_11_10 } + }; + + quote! { + unsafe { + let t1 = _mm512_permutexvar_epi32(#permute_mask, a.into()); + let t2 = _mm512_permutexvar_epi32(#permute_mask, b.into()); + _mm512_shuffle_i32x4::<#shuffle_immediate>(t1, t2).simd_into(self) + } + } + } + (_, 512, 16 | 8) => { + // 512-bit shuffle of 8 or 16-bit integers + // Separate out the even-indexed and odd-indexed elements within each 128-bit lane + let mask = match vec_ty.scalar_bits { + 8 => { + // For 8-bit elements: move even indices to low half, odd to high half + quote! { + _mm512_set4_epi64( + 0x0F0D0B0907050301u64.cast_signed(), 0x0E0C0A0806040200u64.cast_signed(), + 0x0F0D0B0907050301u64.cast_signed(), 0x0E0C0A0806040200u64.cast_signed() + ) + } + } + 16 => { + // For 16-bit elements: move even indices to low half, odd to high half + quote! { + _mm512_set4_epi64( + 0x0F0E0B0A07060302u64.cast_signed(), 0x0D0C090805040100u64.cast_signed(), + 0x0F0E0B0A07060302u64.cast_signed(), 0x0D0C090805040100u64.cast_signed() + ) + } + } + _ => unreachable!(), + }; + + let shuffle_immediate = if select_even { + quote! { 0b01_00_01_00 } + } else { + quote! { 0b11_10_11_10 } + }; + + quote! { + unsafe { + let mask = #mask; + let a_shuffled = _mm512_shuffle_epi8(a.into(), mask); + let b_shuffled = _mm512_shuffle_epi8(b.into(), mask); + + let a_packed = _mm512_permutex_epi64::<0b11_01_10_00>(a_shuffled); + let b_packed = _mm512_permutex_epi64::<0b11_01_10_00>(b_shuffled); + + _mm512_shuffle_i64x2::<#shuffle_immediate>(a_packed, b_packed).simd_into(self) + } + } + } _ => unimplemented!(), }; @@ -967,6 +1531,32 @@ impl X86 { }; let to_bytes = generic_op_name("cvt_to_bytes", vec_ty); let from_bytes = generic_op_name("cvt_from_bytes", vec_ty); + let byte_shift = if scalar_bytes == 1 { + quote! { SHIFT } + } else { + quote! { SHIFT * #scalar_bytes } + }; + + // For AVX-512 with 512-bit AcrossBlocks, we need special handling since we have __m512i directly + if *self == Self::Avx512 && vec_ty.n_bits() == 512 && granularity == AcrossBlocks { + return quote! 
{ + #method_sig { + unsafe { + if SHIFT >= #max_shift { + return b; + } + + // b and a are swapped here to match ARM's vext semantics + let result = cross_block_alignr_512( + self.#to_bytes(b).val.0, + self.#to_bytes(a).val.0, + #byte_shift + ); + self.#from_bytes(#combined_bytes { val: #block_wrapper(result), simd: self }) + } + } + }; + } let alignr_op = match (granularity, vec_ty.n_bits(), self) { (WithinBlocks, 128, _) => { @@ -976,21 +1566,23 @@ impl X86 { // For WithinBlocks, use elements per 128-bit block; for 128-bit vectors, use total elements format_ident!("dyn_alignr_{}", vec_ty.n_bits()) } - (AcrossBlocks, 256 | 512, Self::Sse4_2) => { + (AcrossBlocks, 256, Self::Sse4_2) => { // Inter-block shift or rotate in SSE4.2: use cross_block_alignr - format_ident!("cross_block_alignr_128x{}", vec_ty.n_bits() / 128) } - (AcrossBlocks, 256 | 512, Self::Avx2) => { + (AcrossBlocks, 256, Self::Avx2 | Self::Avx512) => { + format_ident!("cross_block_alignr_256x{}", vec_ty.n_bits() / 256) + } + (AcrossBlocks, 512, Self::Sse4_2) => { + // 512-bit AcrossBlocks for SSE4.2: use 128-bit blocks + format_ident!("cross_block_alignr_128x{}", vec_ty.n_bits() / 128) + } + (AcrossBlocks, 512, Self::Avx2) => { + // 512-bit AcrossBlocks for AVX2: use 256-bit blocks format_ident!("cross_block_alignr_256x{}", vec_ty.n_bits() / 256) } _ => unimplemented!(), }; - let byte_shift = if scalar_bytes == 1 { - quote! { SHIFT } - } else { - quote! { SHIFT * #scalar_bytes } - }; quote! { #method_sig { @@ -1021,6 +1613,104 @@ impl X86 { vec_ty.scalar_bits, target_scalar_bits, "we currently only support converting between types of the same width" ); + + // AVX-512 has native unsigned conversion intrinsics, so handle it specially + if *self == Self::Avx512 && vec_ty.n_bits() == 512 { + let expr = match (vec_ty.scalar, target_scalar, precise) { + (ScalarType::Float, ScalarType::Int, false) => { + // f32 -> i32: _mm512_cvttps_epi32 + quote! { + unsafe { + _mm512_cvttps_epi32(a.into()).simd_into(self) + } + } + } + (ScalarType::Float, ScalarType::Int, true) => { + // f32 -> i32 precise: handle out-of-range values and NaN + // Saturate to i32::MAX if >= 2147483648.0, NaN becomes 0 + quote! { + unsafe { + let a = a.into(); + let mut converted = _mm512_cvttps_epi32(a); + + // In the common case where everything is in range, we don't need to do anything else. + let in_range_mask = _mm512_cmp_ps_mask::<{ _CMP_LT_OQ }>(a, _mm512_set1_ps(2147483648.0)); + let all_in_range = in_range_mask == 0xFFFF; + + if !all_in_range { + // If we are above i32::MAX (2147483647), clamp to it. + converted = _mm512_mask_blend_epi32(in_range_mask, _mm512_set1_epi32(i32::MAX), converted); + // Set NaN to 0. + let is_not_nan_mask = _mm512_cmp_ps_mask::<{ _CMP_ORD_Q }>(a, a); + converted = _mm512_maskz_mov_epi32(is_not_nan_mask, converted); + // We don't need to handle negative overflow because Intel's "invalid result" sentinel + // value is -2147483648, which is what we want anyway. + } + + converted.simd_into(self) + } + } + } + (ScalarType::Float, ScalarType::Unsigned, false) => { + // f32 -> u32: _mm512_cvttps_epu32 (native unsigned support!) + quote! { + unsafe { + _mm512_cvttps_epu32(a.into()).simd_into(self) + } + } + } + (ScalarType::Float, ScalarType::Unsigned, true) => { + // f32 -> u32 precise: handle out-of-range values and NaN + // Saturate to u32::MAX if >= 4294967040.0, NaN and negative become 0 + quote! { + unsafe { + // Clamp negative values and NaN to 0. Intel's `_mm512_max_ps` always takes the second + // operand if the first is NaN. 
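Illustrative scalar model (not part of the diff) of the "precise" f32 -> u32 policy implemented by this conversion arm: NaN and negative inputs become 0, values above the largest f32 below 2^32 saturate to `u32::MAX`, and everything else truncates toward zero.

fn cvt_f32_to_u32_precise_model(x: f32) -> u32 {
    if x.is_nan() || x <= 0.0 {
        0
    } else if x > 4294967040.0 {
        // 4294967040.0 is the largest f32 strictly below 2^32.
        u32::MAX
    } else {
        x as u32 // truncation toward zero
    }
}

fn main() {
    assert_eq!(cvt_f32_to_u32_precise_model(f32::NAN), 0);
    assert_eq!(cvt_f32_to_u32_precise_model(-1.5), 0);
    assert_eq!(cvt_f32_to_u32_precise_model(3.9), 3);
    assert_eq!(cvt_f32_to_u32_precise_model(1e10), u32::MAX);
}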
+ let a = _mm512_max_ps(a.into(), _mm512_setzero_ps()); + let mut converted = _mm512_cvttps_epu32(a); + + // Check if any value exceeds u32::MAX representable in f32 (4294967040.0) + let exceeds_range_mask = _mm512_cmp_ps_mask::<{ _CMP_GT_OQ }>(a, _mm512_set1_ps(4294967040.0)); + + if exceeds_range_mask != 0 { + // Clamp to u32::MAX. + converted = _mm512_mask_blend_epi32(exceeds_range_mask, converted, _mm512_set1_epi32(u32::MAX.cast_signed())); + } + + converted.simd_into(self) + } + } + } + (ScalarType::Int, ScalarType::Float, _) => { + // i32 -> f32: _mm512_cvtepi32_ps + quote! { + unsafe { + _mm512_cvtepi32_ps(a.into()).simd_into(self) + } + } + } + (ScalarType::Unsigned, ScalarType::Float, _) => { + // u32 -> f32: _mm512_cvtepu32_ps (native unsigned support!) + quote! { + unsafe { + _mm512_cvtepu32_ps(a.into()).simd_into(self) + } + } + } + _ => unimplemented!( + "512-bit conversion from {:?} to {:?}", + vec_ty.scalar, + target_scalar + ), + }; + + return quote! { + #method_sig { + #expr + } + }; + } + let expr = match (vec_ty.scalar, target_scalar) { (ScalarType::Float, ScalarType::Int | ScalarType::Unsigned) => { let target_ty = vec_ty.reinterpret(target_scalar, target_scalar_bits); @@ -1245,6 +1935,44 @@ impl X86 { "mask reduce ops only operate on masks" ); + // AVX-512 uses mask registers instead of movemask + if *self == Self::Avx512 && vec_ty.n_bits() == 512 { + // Use _mm512_movepi*_mask to convert to a mask register, then check the mask + // The mask return type depends on the element size: + // - 8-bit elements: __mmask64 (u64) + // - 16-bit elements: __mmask32 (u32) + // - 32-bit elements: __mmask16 (u16) + // - 64-bit elements: __mmask8 (u8) + let (move_mask, all_ones) = match vec_ty.scalar_bits { + 8 => ( + format_ident!("_mm512_movepi8_mask"), + quote! { 0xFFFFFFFFFFFFFFFFu64 }, + ), + 16 => ( + format_ident!("_mm512_movepi16_mask"), + quote! { 0xFFFFFFFFu32 }, + ), + 32 => (format_ident!("_mm512_movepi32_mask"), quote! { 0xFFFFu16 }), + 64 => (format_ident!("_mm512_movepi64_mask"), quote! { 0xFFu8 }), + _ => unreachable!(), + }; + + let op = match (quantifier, condition) { + (Quantifier::Any, true) => quote! { != 0 }, + (Quantifier::Any, false) => quote! { != #all_ones }, + (Quantifier::All, true) => quote! { == #all_ones }, + (Quantifier::All, false) => quote! { == 0 }, + }; + + return quote! { + #method_sig { + unsafe { + #move_mask(a.into()) #op + } + } + }; + } + let (movemask, all_ones) = match vec_ty.scalar_bits { 32 | 64 => { let float_ty = vec_ty.cast(ScalarType::Float); @@ -1568,6 +2296,7 @@ impl X86 { let vec_widths: &[usize] = match self { Self::Sse4_2 => &[128], Self::Avx2 => &[128, 256], + Self::Avx512 => &[128, 256, 512], }; for vec_ty in vec_widths @@ -1629,7 +2358,8 @@ impl X86 { } } - fn avx2_slide_helpers() -> TokenStream { + /// Helpers shared between AVX2 and AVX-512 (`cross_block_alignr_one` and `cross_block_alignr_256x1`). + fn avx2_slide_helpers_common() -> TokenStream { quote! { /// Computes one output __m256i for `cross_block_alignr_*` operations. /// @@ -1656,6 +2386,25 @@ impl X86 { unsafe { dyn_alignr_256(hi_blocks, lo_blocks, intra_shift) } } + /// Concatenates `b` and `a` (each 1 x __m256i = 2 blocks) and extracts 2 blocks starting at byte offset + /// `shift_bytes`. Extracts from [b : a] (b in low bytes, a in high bytes), matching alignr semantics. 
+ #[inline(always)] + unsafe fn cross_block_alignr_256x1(a: __m256i, b: __m256i, shift_bytes: usize) -> __m256i { + // Concatenation is [b : a], so b comes first + let regs = [b, a]; + + unsafe { + cross_block_alignr_one(&regs, 0, shift_bytes) + } + } + } + } + + fn avx2_slide_helpers() -> TokenStream { + let common = Self::avx2_slide_helpers_common(); + quote! { + #common + /// Concatenates `b` and `a` (each 2 x __m256i = 4 blocks) and extracts 4 blocks starting at byte offset /// `shift_bytes`. Extracts from [b : a] (b in low bytes, a in high bytes), matching alignr semantics. #[inline(always)] @@ -1670,17 +2419,47 @@ impl X86 { ] } } + } + } - /// Concatenates `b` and `a` (each 1 x __m256i = 2 blocks) and extracts 2 blocks starting at byte offset + fn avx512_slide_helpers() -> TokenStream { + // Generate index vectors for each possible shift amount (0..64). + // For shift S, we want to extract bytes [S..S+64) from the concatenation [b : a]. + // In _mm512_permutex2var_epi8, indices 0-63 select from the first arg, 64-127 from the second. + // Since we want [b : a] with b in low bytes: b is first arg, a is second arg. + // So index i in result should get byte (S + i) from [b : a]: + // - if (S + i) < 64: from b at position (S + i), index = S + i + // - if (S + i) >= 64: from a at position (S + i - 64), index = 64 + (S + i - 64) = S + i + // So the index is simply (S + i) for all cases, which works because indices wrap at 128. + let match_arms: Vec<_> = (0_usize..64) + .map(|shift| { + let shift_u8: u8 = shift.try_into().unwrap(); + let indices: Vec<i8> = (0_u8..64) + .map(|i| shift_u8.wrapping_add(i).cast_signed()) + .collect(); + // _mm512_set_epi8 takes arguments in reverse order (element 63 first, element 0 last) + let indices_rev: Vec<_> = indices.into_iter().rev().collect(); + quote! { + #shift => _mm512_set_epi8(#(#indices_rev),*) + } + }) + .collect(); + + quote! { + /// Concatenates `b` and `a` (each __m512i = 4 x 128-bit blocks) and extracts 4 blocks starting at byte offset /// `shift_bytes`. Extracts from [b : a] (b in low bytes, a in high bytes), matching alignr semantics. + /// Uses AVX-512 VBMI's permutex2var for efficient cross-lane byte shuffling. #[inline(always)] - unsafe fn cross_block_alignr_256x1(a: __m256i, b: __m256i, shift_bytes: usize) -> __m256i { - // Concatenation is [b : a], so b comes first - let regs = [b, a]; - - unsafe { - cross_block_alignr_one(&regs, 0, shift_bytes) - } + unsafe fn cross_block_alignr_512(a: __m512i, b: __m512i, shift_bytes: usize) -> __m512i { + // Use _mm512_permutex2var_epi8 (VBMI) to select 64 bytes from the 128-byte concatenation. + // The index vector specifies which byte to select: 0-63 from b, 64-127 from a. + let idx = unsafe { + match shift_bytes { + #(#match_arms,)* + _ => unreachable!() + } + }; + unsafe { _mm512_permutex2var_epi8(b, idx, a) } } } }
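Illustrative scalar model (not part of the diff) of the byte selection `cross_block_alignr_512` performs: with `b` as the low 64 bytes and `a` as the high 64 bytes of the 128-byte concatenation, output byte `i` is `concat[shift + i]`, which is exactly the index the generated permutex2var table encodes.

fn alignr_512_model(a: [u8; 64], b: [u8; 64], shift_bytes: usize) -> [u8; 64] {
    assert!(shift_bytes < 64);
    // Concatenation is [b : a]: b occupies bytes 0..64, a occupies bytes 64..128.
    let mut concat = [0u8; 128];
    concat[..64].copy_from_slice(&b);
    concat[64..].copy_from_slice(&a);
    core::array::from_fn(|i| concat[shift_bytes + i])
}

fn main() {
    let a = [1u8; 64];
    let b = [2u8; 64];
    let out = alignr_512_model(a, b, 60);
    assert!(out[..4].iter().all(|&x| x == 2)); // the last 4 bytes of b
    assert!(out[4..].iter().all(|&x| x == 1)); // followed by the first 60 bytes of a
}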