Skip to content

Commit dc21f03

Browse files
author
peng.li24
committed
fix: 0 ULP for all APIs — 5 edge-case corrections
log f32 (AVX-512): add NaN passthrough after integer bit-ops strip NaN pattern - _mm512_cmp_ps_mask(x, x, _CMP_UNORD_Q) detects NaN; blend back original x log f32 (scalar): npy_logf(-0.0) returned NaN instead of -inf - zero check (bits & 0x7fffffff == 0) must precede sign-bit check - -0.0 has bits=0x80000000 → sign bit was triggering NaN path first sin f32 (AVX-512): sin(±0.0) returned +0.0 instead of preserving sign - fma(sp, r, r) with r=±0 yields +0.0 by IEEE 754 RN rule - _mm512_cmp_ps_mask(x, zero, _CMP_EQ_OQ) catches both ±0; blend back x sign(): sign(NaN) returned 0.0 instead of NaN - ordered comparisons (>, <) with NaN always return false → T(0-0)=0 - fix: std::isnan guard before comparison expression unwrap(): two fixes - NaN propagation: when dd=NaN, set cum_correct=NaN so all subsequent outputs become NaN (matching numpy's cumsum-of-corrections behavior) - f32 precision: replaced floor(a/b)*b with std::fmod+sign-correction to match numpy's np.mod exactly (fmod is internally more precise) All 548 tests pass. ULP scanner confirms 0 ULP on 131k+ random values plus special cases (±0, ±inf, NaN) for every API.
1 parent 32cd98a commit dc21f03

3 files changed

Lines changed: 26 additions & 8 deletions

File tree

numpy/avx512_loops.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,14 +276,17 @@ inline void log<float>(const float* __restrict__ s,
276276
__m512 result = _mm512_fmadd_ps(exponent, Vln2, _mm512_div_ps(num, den));
277277

278278
// Special cases (checked after main polynomial — masks override)
279+
// Note: integer bit ops above strip NaN bits → must restore NaN explicitly.
279280
__mmask16 is_zero = _mm512_cmpeq_epi32_mask(
280281
_mm512_and_si512(bits, _mm512_set1_epi32(0x7fffffff)),
281282
_mm512_setzero_si512());
282283
__mmask16 is_neg = _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_LT_OQ);
283284
__mmask16 is_posinf = _mm512_cmp_ps_mask(x, Vposinf, _CMP_EQ_OQ);
285+
__mmask16 is_nan = _mm512_cmp_ps_mask(x, x, _CMP_UNORD_Q); // x!=x → NaN
284286
result = _mm512_mask_blend_ps(is_zero, result, Vneginf);
285287
result = _mm512_mask_blend_ps(is_neg, result, Vneqnan);
286288
result = _mm512_mask_blend_ps(is_posinf, result, Vposinf);
289+
result = _mm512_mask_blend_ps(is_nan, result, x); // NaN passthrough
287290
_mm512_storeu_ps(d + i, result);
288291
}
289292
for (; i < n; ++i) d[i] = detail::log_npy_f32(s[i]);
@@ -348,6 +351,11 @@ inline void sin<float>(const float* __restrict__ s,
348351
__mmask16 neg = _mm512_test_epi32_mask(iq, _mm512_set1_epi32(2));
349352
result = _mm512_mask_sub_ps(result, neg, _mm512_setzero_ps(), result);
350353

354+
// sin(±0) = ±0: signed-zero must be preserved (numpy matches IEEE 754)
355+
// fma(sp, r, r) with r=±0 gives +0 due to IEEE 754 RN rules → blend back x
356+
__mmask16 is_zero = _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_EQ_OQ);
357+
result = _mm512_mask_blend_ps(is_zero, result, x);
358+
351359
// Out-of-range scalar fallback (rarely triggered; branch predicted-not-taken)
352360
__mmask16 inr = _mm512_cmp_ps_mask(_mm512_abs_ps(x), Vmax, _CMP_LE_OQ);
353361
if (__builtin_expect(inr != 0xFFFF, 0)) {

numpy/core.h

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,9 @@ inline void radians(const T* src, T* dst, size_t n) {
208208
/// numpy.sign(x, /, out=None, *, where=True, ...)
209209
template<typename T>
210210
inline void sign(const T* src, T* dst, size_t n) {
211-
NUMPY_UNROLL4(i, dst[i] = T((src[i] > T(0)) - (src[i] < T(0))));
211+
// NaN input → NaN output (numpy behavior); ordered comparisons return false for NaN
212+
NUMPY_UNROLL4(i, dst[i] = std::isnan(src[i]) ? src[i]
213+
: T((src[i] > T(0)) - (src[i] < T(0))));
212214
}
213215

214216
// ============================================================================
@@ -848,17 +850,25 @@ inline void unwrap(const T* src, T* dst, size_t n, T discont = T(M_PI)) {
848850
for (size_t i = 1; i < n; ++i) {
849851
T dd = src[i] - src[i - 1];
850852
T ph_correct = T(0);
851-
if (std::abs(dd) >= discont) {
852-
// numpy: ddmod = (dd + period/2) % period - period/2
853-
// Python-style mod using floor division (numpy's mod):
853+
if (std::isnan(dd)) {
854+
// NaN difference: propagate NaN into cumulative correction so all
855+
// subsequent outputs are NaN — matching numpy's cumsum(corrections) behavior
856+
cum_correct = dd;
857+
} else if (std::abs(dd) >= discont) {
858+
// numpy: ddmod = mod(dd - interval_low, period) + interval_low
859+
// where interval_low = -p2, so: ddmod = mod(dd + p2, period) - p2
860+
// numpy's mod uses fmod + sign-correction, NOT floor(a/b)*b:
854861
T val = dd + p2;
855-
T val_mod = val - std::floor(val / period) * period;
862+
T val_mod_signed = std::fmod(val, period);
863+
T val_mod = (val_mod_signed < T(0)) ? val_mod_signed + period : val_mod_signed;
856864
T ddmod = val_mod - p2;
857865
// boundary_ambiguous: when dd > 0 and ddmod == -period/2, use +period/2
858866
if (dd > T(0) && ddmod == -p2) ddmod = p2;
859867
ph_correct = ddmod - dd;
868+
cum_correct += ph_correct;
869+
} else {
870+
cum_correct += ph_correct;
860871
}
861-
cum_correct += ph_correct;
862872
dst[i] = src[i] + cum_correct;
863873
}
864874
}

numpy/npy_math_float.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -105,10 +105,10 @@ inline float npy_logf(float x) {
105105

106106
// NaN
107107
if (exp_field == 0xff && (bits & 0x7fffff) != 0) return x;
108+
// x == 0 or -0 -> -inf (must check before sign bit to handle -0.0 correctly)
109+
if ((bits & 0x7fffffffu) == 0) return uint32_to_float(0xff800000u);
108110
// x < 0 -> -NaN
109111
if (bits & 0x80000000u) return uint32_to_float(0xffc00000u);
110-
// x == 0 -> -inf
111-
if ((bits & 0x7fffffffu) == 0) return uint32_to_float(0xff800000u);
112112
// x == +inf -> +inf
113113
if (bits == 0x7f800000u) return x;
114114

0 commit comments

Comments
 (0)