feat: add comprehensive special-value tests + fix NaN/signed-zero in all AVX-512 paths

peng.li24 · peng.li24 · commit 8e45c518e193 · 2026-06-06T00:43:58.000+08:00
- avx512_loops.h: add explicit NaN passthrough blend to exp<f32>, sin<f32>, cos<f32>
  after polynomial computation, guaranteeing same NaN bit pattern as numpy's
  scalar npy_expf / npy_sinf / npy_cosf (which return x unchanged for NaN input)

- svml_bridge.h: fix sin(±0)=±0 for both f32 and f64 scalar paths
  * sin_f32(): npy_sinf polynomial fma(sp,r,r) with r=±0 gives +0 by IEEE 754 RN
  * sin_f64(): SVML broadcast scalar path __svml_sin8(-0) returns +0
  Both fixed with a cheap branch: if (x==0 && r==0) return x

- test_all.py section 16 – 206 new bit-exact special-value tests:
  * NaN passthrough: all 21 unary math functions × f32/f64 × sizes 1,16,17
  * Mixed NaN/finite (17 elements): NaN must not corrupt neighbours in SIMD path
  * Signed zero: sin(±0)=±0, cos(±0)=1, log(-0)=-inf, exp(±0)=1
  * Infinity: exp(±inf), log(+inf), sqrt(+inf), sin/cos(±inf)→NaN
  * Domain errors: log(neg), sqrt(neg), arcsin/arccos(|x|>1) → NaN bit-exact
  * sign(NaN)=NaN, sign(±inf)=±1, sign(±0)=0
  * unwrap NaN propagation (mid, leading, all-NaN)
  * linalg: norm/dot with NaN or Inf inputs
  * AVX-512 boundary sizes 15/16/17/32 for exp/log/sin/cos

- test_all.py check_bit_aligned: upgraded to uint-view bit comparison for float
  arrays with matching dtype so NaN==NaN passes at bit level; dtype-mismatch
  case (C++ returns float64 for float32 input) falls back to numeric equality
diff --git a/numpy/avx512_loops.h b/numpy/avx512_loops.h
@@ -200,6 +200,11 @@ inline void exp<float>(const float* __restrict__ s,
                                     poly, Vinf);
         poly = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(x, VXmin, _CMP_LE_OQ),
                                     poly, _mm512_setzero_ps());
+        // NaN passthrough: ordered comparisons above return false for NaN → poly holds
+        // polynomial-derived NaN; blend back original x to guarantee bit-exact match
+        // with numpy's scalar npy_expf which returns x unchanged for NaN input.
+        __mmask16 is_nan_e = _mm512_cmp_ps_mask(x, x, _CMP_UNORD_Q);
+        poly = _mm512_mask_blend_ps(is_nan_e, poly, x);
         _mm512_storeu_ps(d + i, poly);
     }
     for (; i < n; ++i) d[i] = detail::exp_npy_f32(s[i]);
@@ -366,9 +371,14 @@ inline void sin<float>(const float* __restrict__ s,
                 if (!((inr >> j) & 1)) rt[j] = std::sin(xt[j]);
             result = _mm512_loadu_ps(rt);
         }
+        // NaN passthrough: blend back original x after fallback so NaN output = NaN
+        // input (bit-exact with numpy's scalar npy_sinf which returns x for NaN).
+        __mmask16 is_nan_s = _mm512_cmp_ps_mask(x, x, _CMP_UNORD_Q);
+        result = _mm512_mask_blend_ps(is_nan_s, result, x);
         _mm512_storeu_ps(d + i, result);
     }
-    for (; i < n; ++i) d[i] = detail::sin_npy_f32(s[i]);
+    // sin_f32 adds signed-zero fix: sin(±0)=±0 (npy_sinf polynomial gives +0 for -0).
+    for (; i < n; ++i) d[i] = detail::sin_f32(s[i]);
 }
 
 // ----------------------------------------------------------------------------
@@ -434,6 +444,9 @@ inline void cos<float>(const float* __restrict__ s,
                 if (!((inr >> j) & 1)) rt[j] = std::cos(xt[j]);
             result = _mm512_loadu_ps(rt);
         }
+        // NaN passthrough: blend back original x (bit-exact with numpy scalar npy_cosf).
+        __mmask16 is_nan_c = _mm512_cmp_ps_mask(x, x, _CMP_UNORD_Q);
+        result = _mm512_mask_blend_ps(is_nan_c, result, x);
         _mm512_storeu_ps(d + i, result);
     }
     for (; i < n; ++i) d[i] = detail::cos_npy_f32(s[i]);
diff --git a/numpy/svml_bridge.h b/numpy/svml_bridge.h
@@ -274,7 +274,13 @@ inline float atan2_npy_f32(float y, float x) {
 
 DISPATCH_F64(exp)
 DISPATCH_F64(log)
-DISPATCH_F64(sin)
+// sin_f64: hand-written instead of DISPATCH_F64(sin) — SVML scalar broadcast path loses
+// signed zero (sin(-0)→+0). IEEE 754 requires sin(±0) = ±0; preserve sign of zero explicitly.
+inline double sin_f64(double x) {
+    double r = cpu_has_avx512f() ? sin_svml_f64(x) : sin_npy_f64(x);
+    if (__builtin_expect(x == 0.0 && r == 0.0, 0)) return x;  // zero in, zero out: return x so the sign of ±0 survives
+    return r;
+}
 DISPATCH_F64(cos)
 DISPATCH_F64(tan)
 DISPATCH_F64(asin)
@@ -301,7 +307,13 @@ DISPATCH_F32(log1p)
 // (npy_math_float.h), NOT SVML. These are bit-exact on all architectures.
 inline float exp_f32(float x)  { return exp_npy_f32(x); }
 inline float log_f32(float x)  { return log_npy_f32(x); }
-inline float sin_f32(float x)  { return sin_npy_f32(x); }
+// sin_f32: npy_sinf polynomial computes fma(sp,r,r) with r=±0 → +0 (IEEE RN rule),
+// losing the sign of a -0 input.  Restore it: IEEE 754 mandates sin(±0) = ±0.
+inline float sin_f32(float x) {
+    float r = sin_npy_f32(x);
+    if (__builtin_expect(x == 0.0f && r == 0.0f, 0)) return x;  // zero in, zero out: return x so the sign of ±0 survives
+    return r;
+}
 inline float cos_f32(float x)  { return cos_npy_f32(x); }
 
 // pow / atan2 dispatchers
diff --git a/tests/test_all.py b/tests/test_all.py