From ac145c4f22e715bce2727d29c4140709bfbd4226 Mon Sep 17 00:00:00 2001 From: Austin Orr Date: Sun, 24 Aug 2025 23:29:46 -0700 Subject: [PATCH 1/3] fix macro fall-through calls --- src/invoking.rs | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/invoking.rs b/src/invoking.rs index 27c19c4..430c284 100644 --- a/src/invoking.rs +++ b/src/invoking.rs @@ -99,7 +99,7 @@ macro_rules! simd_compiletime_select { } }; ($(#[$meta:meta])* $vis:vis fn $fn_name:ident ($($arg:ident:$typ:ty),* $(,)? ) $body:block ) => { - simd_runtime_generate!($(#[$meta])* $vis fn $fn_name ($($arg:$typ),*) -> () $body); + simd_compiletime_select!($(#[$meta])* $vis fn $fn_name ($($arg:$typ),*) -> () $body); }; } @@ -107,6 +107,13 @@ macro_rules! simd_compiletime_select { macro_rules! simd_unsafe_generate_all { ($(#[$meta:meta])* $vis:vis fn $fn_name:ident $(<$($lt:lifetime),+>)? ($($arg:ident:$typ:ty),* $(,)? ) -> $rt:ty $body:block ) => { simdeez_paste_item! { + $(#[$meta])* + #[inline(always)] + $vis fn $fn_name $(<$($lt),+>)?($($arg:$typ,)*) -> $rt { + let args_tuple = ($($arg,)*); + __run_simd_runtime_decide::<[<__ $fn_name _dispatch_struct>], fix_tuple_type!(($($typ),*)), $rt>(args_tuple) + } + $(#[$meta])* #[inline(always)] #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] @@ -159,7 +166,7 @@ macro_rules! simd_unsafe_generate_all { } }; ($(#[$meta:meta])* $vis:vis fn $fn_name:ident ($($arg:ident:$typ:ty),* $(,)? ) $body:block ) => { - simd_runtime_generate!($(#[$meta])* $vis fn $fn_name ($($arg:$typ),*) -> () $body); + simd_unsafe_generate_all!($(#[$meta])* $vis fn $fn_name ($($arg:$typ),*) -> () $body); }; } From 235b5c6ce3ce400bc7c8dc8cdc8e330e99338306 Mon Sep 17 00:00:00 2001 From: Austin Orr Date: Mon, 25 Aug 2025 11:09:57 -0700 Subject: [PATCH 2/3] fix f32 & f64 EqPrecision::almost implementation, esp for near-zeros --- src/tests/lib/numbers.rs | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/tests/lib/numbers.rs b/src/tests/lib/numbers.rs index 912cad8..fbcf517 100644 --- a/src/tests/lib/numbers.rs +++ b/src/tests/lib/numbers.rs @@ -136,10 +136,13 @@ impl ScalarNumber for f32 { match precision { EqPrecision::Exact => self == other, EqPrecision::Almost { figs } => { + let epsilon = 10.0f32.powi(-(figs as i32)); + if (self - other).abs() < epsilon { + return true; + } let bigger = self.max(other); let norm_diff = (self / bigger) - (other / bigger); - let epsilon = 10.0f32.powi(-(figs as i32)); - norm_diff < epsilon + norm_diff.abs() < epsilon } } } @@ -189,10 +192,13 @@ impl ScalarNumber for f64 { match precision { EqPrecision::Exact => self == other, EqPrecision::Almost { figs } => { + let epsilon = 10.0f64.powi(-(figs as i32)); + if (self - other).abs() < epsilon { + return true; + } let bigger = self.max(other); let norm_diff = (self / bigger) - (other / bigger); - let epsilon = 10.0f64.powi(-(figs as i32)); - norm_diff < epsilon + norm_diff.abs() < epsilon } } } From df4fdc1e7e166010431a7f1f823a7a12a767fc7f Mon Sep 17 00:00:00 2001 From: Austin Orr Date: Sun, 24 Aug 2025 23:18:48 -0700 Subject: [PATCH 3/3] fix avx, sse41, sse2, and neon horizontal add impl --- src/ops/f64.rs | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/src/ops/f64.rs b/src/ops/f64.rs index b4ff33d..afcb969 100644 --- a/src/ops/f64.rs +++ b/src/ops/f64.rs @@ -654,35 +654,23 @@ impl_op! { fn horizontal_add { for Avx2(a: __m256d) -> f64 { let a = _mm256_hadd_pd(a, a); - let b = _mm256_hadd_pd(a, a); - - let first = _mm_cvtsd_f64(_mm256_extractf128_pd(b, 0)); - let second = _mm_cvtsd_f64(_mm256_extractf128_pd(b, 1)); - + let first = _mm_cvtsd_f64(_mm256_extractf128_pd(a, 0)); + let second = _mm_cvtsd_f64(_mm256_extractf128_pd(a, 1)); first + second } for Sse41(a: __m128d) -> f64 { - let a = _mm_hadd_pd(a, a); - - let first = _mm_cvtsd_f64(a); - let second = _mm_cvtsd_f64(_mm_shuffle_pd(a, a, 1)); - - first + second + _mm_cvtsd_f64(_mm_hadd_pd(a, a)) } for Sse2(a: __m128d) -> f64 { let a = _mm_add_pd(a, _mm_shuffle_pd(a, a, 1)); - - let first = _mm_cvtsd_f64(a); - let second = _mm_cvtsd_f64(_mm_shuffle_pd(a, a, 1)); - - first + second + _mm_cvtsd_f64(a) } for Scalar(a: f64) -> f64 { a } for Neon(a: float64x2_t) -> f64 { let a = vpaddq_f64(a, a); - vgetq_lane_f64(a, 0) + vgetq_lane_f64(a, 1) + vgetq_lane_f64(a, 0) } for Wasm(a: v128) -> f64 { let l0 = f64x2_extract_lane::<0>(a);