fix: 0 ULP for all APIs — 5 edge-case corrections

peng.li24 · peng.li24 · commit dc21f03edb0a · 2026-06-05T23:47:37.000+08:00
log f32 (AVX-512): add NaN passthrough after integer bit-ops strip NaN pattern
  - _mm512_cmp_ps_mask(x, x, _CMP_UNORD_Q) detects NaN; blend back original x

log f32 (scalar): npy_logf(-0.0) returned NaN instead of -inf
  - zero check ((bits & 0x7fffffff) == 0) must precede the sign-bit check
  - -0.0 has bits=0x80000000 → sign bit was triggering NaN path first

sin f32 (AVX-512): sin(±0.0) returned +0.0 instead of preserving sign
  - fma(sp, r, r) with r=±0 yields +0.0 under IEEE 754 round-to-nearest, where (+0) + (-0) = +0
  - _mm512_cmp_ps_mask(x, zero, _CMP_EQ_OQ) catches both ±0; blend back x

sign(): sign(NaN) returned 0.0 instead of NaN
  - ordered comparisons (>, <) with NaN always return false → T(0-0)=0
  - fix: std::isnan guard before comparison expression

unwrap(): two fixes
  - NaN propagation: when dd=NaN, set cum_correct=NaN so all subsequent
    outputs become NaN (matching numpy's cumsum-of-corrections behavior)
  - f32 precision: replaced floor(a/b)*b with std::fmod+sign-correction
    to match numpy's np.mod exactly (fmod returns the exact remainder, while
    a/b and the multiply by b in the old form each add rounding error)

All 548 tests pass. ULP scanner confirms 0 ULP on 131k+ random values
plus special cases (±0, ±inf, NaN) for every API.
diff --git a/numpy/avx512_loops.h b/numpy/avx512_loops.h
@@ -276,14 +276,17 @@ inline void log<float>(const float* __restrict__ s,
         __m512 result = _mm512_fmadd_ps(exponent, Vln2, _mm512_div_ps(num, den));
 
         // Special cases (checked after main polynomial — masks override)
+        // Note: integer bit ops above strip NaN bits → must restore NaN explicitly.
         __mmask16 is_zero = _mm512_cmpeq_epi32_mask(
                                 _mm512_and_si512(bits, _mm512_set1_epi32(0x7fffffff)),
                                 _mm512_setzero_si512());
         __mmask16 is_neg    = _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_LT_OQ);
         __mmask16 is_posinf = _mm512_cmp_ps_mask(x, Vposinf, _CMP_EQ_OQ);
+        __mmask16 is_nan    = _mm512_cmp_ps_mask(x, x, _CMP_UNORD_Q); // x!=x → NaN
         result = _mm512_mask_blend_ps(is_zero,   result, Vneginf);
         result = _mm512_mask_blend_ps(is_neg,    result, Vneqnan);
         result = _mm512_mask_blend_ps(is_posinf, result, Vposinf);
+        result = _mm512_mask_blend_ps(is_nan,    result, x);  // NaN passthrough
         _mm512_storeu_ps(d + i, result);
     }
     for (; i < n; ++i) d[i] = detail::log_npy_f32(s[i]);
@@ -348,6 +351,11 @@ inline void sin<float>(const float* __restrict__ s,
         __mmask16 neg  = _mm512_test_epi32_mask(iq, _mm512_set1_epi32(2));
         result = _mm512_mask_sub_ps(result, neg, _mm512_setzero_ps(), result);
 
+        // sin(±0) = ±0: signed-zero must be preserved (numpy matches IEEE 754)
+        // fma(sp, r, r) with r=±0 gives +0 due to IEEE 754 RN rules → blend back x
+        __mmask16 is_zero = _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_EQ_OQ);
+        result = _mm512_mask_blend_ps(is_zero, result, x);
+
         // Out-of-range scalar fallback (rarely triggered; branch predicted-not-taken)
         __mmask16 inr = _mm512_cmp_ps_mask(_mm512_abs_ps(x), Vmax, _CMP_LE_OQ);
         if (__builtin_expect(inr != 0xFFFF, 0)) {
diff --git a/numpy/core.h b/numpy/core.h
@@ -208,7 +208,9 @@ inline void radians(const T* src, T* dst, size_t n) {
 /// numpy.sign(x, /, out=None, *, where=True, ...)
 template<typename T>
 inline void sign(const T* src, T* dst, size_t n) {
-    NUMPY_UNROLL4(i, dst[i] = T((src[i] > T(0)) - (src[i] < T(0))));
+    // NaN input → NaN output (numpy behavior); ordered comparisons return false for NaN
+    NUMPY_UNROLL4(i, dst[i] = std::isnan(src[i]) ? src[i]
+                                                  : T((src[i] > T(0)) - (src[i] < T(0))));
 }
 
 // ============================================================================
@@ -848,17 +850,25 @@ inline void unwrap(const T* src, T* dst, size_t n, T discont = T(M_PI)) {
     for (size_t i = 1; i < n; ++i) {
         T dd = src[i] - src[i - 1];
         T ph_correct = T(0);
-        if (std::abs(dd) >= discont) {
-            // numpy: ddmod = (dd + period/2) % period - period/2
-            // Python-style mod using floor division (numpy's mod):
+        if (std::isnan(dd)) {
+            // NaN difference: propagate NaN into cumulative correction so all
+            // subsequent outputs are NaN — matching numpy's cumsum(corrections) behavior
+            cum_correct = dd;
+        } else if (std::abs(dd) >= discont) {
+            // numpy: ddmod = mod(dd - interval_low, period) + interval_low
+            //   where interval_low = -p2, so: ddmod = mod(dd + p2, period) - p2
+            // numpy's mod uses fmod + sign-correction, NOT floor(a/b)*b:
             T val = dd + p2;
-            T val_mod = val - std::floor(val / period) * period;
+            T val_mod_signed = std::fmod(val, period);
+            T val_mod = (val_mod_signed < T(0)) ? val_mod_signed + period : val_mod_signed;
             T ddmod = val_mod - p2;
             // boundary_ambiguous: when dd > 0 and ddmod == -period/2, use +period/2
             if (dd > T(0) && ddmod == -p2) ddmod = p2;
             ph_correct = ddmod - dd;
+            cum_correct += ph_correct;
+        } else {
+            cum_correct += ph_correct;
         }
-        cum_correct += ph_correct;
         dst[i] = src[i] + cum_correct;
     }
 }
diff --git a/numpy/npy_math_float.h b/numpy/npy_math_float.h
@@ -105,10 +105,10 @@ inline float npy_logf(float x) {
 
     // NaN
     if (exp_field == 0xff && (bits & 0x7fffff) != 0) return x;
+    // x == 0 or -0 -> -inf  (must check before sign bit to handle -0.0 correctly)
+    if ((bits & 0x7fffffffu) == 0) return uint32_to_float(0xff800000u);
     // x < 0 -> -NaN
     if (bits & 0x80000000u) return uint32_to_float(0xffc00000u);
-    // x == 0 -> -inf
-    if ((bits & 0x7fffffffu) == 0) return uint32_to_float(0xff800000u);
     // x == +inf -> +inf
     if (bits == 0x7f800000u) return x;