fix: linalg dot/norm — 0 ULP via OpenBLAS bridge

peng.li24 · peng.li24 · commit 55fb05482c4e · 2026-06-06T00:09:49.000+08:00
np.dot(a,b) and np.linalg.norm(a) internally call BLAS (OpenBLAS ILP64
sdot_64_/ddot_64_) — our previous pairwise_sum implementation gave the
same mathematical result but different bit patterns (~70% mismatch rate).

Changes:
- numpy/blas_bridge.h (new): auto-discovers libopenblas64_p*.so from
  /proc/self/maps (same pattern as svml_bridge.h); provides
  detail::blas_ops<T>::dot/norm using ILP64 Fortran calling convention
- numpy/core.h: numpy::dot<T> now calls blas_ops<T>::dot
- numpy/linalg.h: linalg::norm(axis=None) now calls blas_ops<T>::norm
  = sqrt(blas_dot(x, x)) — matches np.linalg.norm exactly
- linalg::norm_axis already 0 ULP (uses numpy pairwise sum) — unchanged
- tests/test_all.py: test_dot and test_norm_* now compare against
  np.dot / np.linalg.norm (correct BLAS references)

ULP scan (1000 random arrays, sizes 1-300): 0/1000 mismatches for
dot f32/f64, norm(axis=None) f32/f64, norm(axis=1) f32/f64.
All 548 tests pass.
diff --git a/numpy/blas_bridge.h b/numpy/blas_bridge.h
@@ -0,0 +1,112 @@
+// INTERNAL HEADER — auto-included by core.h and linalg.h.
+// DO NOT include directly.
+//
+// BLAS bridge — bit-exact dot/norm vs numpy's OpenBLAS-backed np.dot /
+// np.linalg.norm (without axis).
+//
+// numpy routes 1-D dot and Frobenius norm through BLAS (OpenBLAS ILP64):
+//   np.dot(a, b)      → sdot_64_ / ddot_64_
+//   np.linalg.norm(a) → sqrt(x.dot(x)) → same sdot_64_ / ddot_64_
+//
+// np.linalg.norm(a, axis=k) uses numpy's own pairwise sum — already
+// handled by norm_axis() in core.h, no BLAS needed.
+//
+// The OpenBLAS library path is auto-discovered from /proc/self/maps
+// (numpy loads it when imported), so no compile-time link flag is needed.
+//
+// ILP64 Fortran calling convention (OpenBLAS built with BLAS_SYMBOL_SUFFIX=64_):
+//   sdot_64_(n*, x*, incx*, y*, incy*)  → float   (return in xmm0)
+//   ddot_64_(n*, x*, incx*, y*, incy*)  → double  (return in xmm0)
+//
+// Fallback (if OpenBLAS is not discovered or the symbol cannot be resolved): sequential accumulation, which may not be bit-exact with numpy.
+
+#pragma once
+
+#include <cstdint>
+#include <cmath>
+#include <dlfcn.h>
+#include <fstream>
+#include <string>
+
+namespace numpy {
+namespace detail {
+
+inline void* g_blas_handle = nullptr;  // dlopen() handle for OpenBLAS; stays nullptr until resolve_blas() obtains one
+
+// Path of the first libopenblas*.so mapping in /proc/self/maps, or nullptr; scanned once, then cached.
+inline const char* find_openblas_path() {
+    static std::string path;
+    static bool tried = false;
+    if (tried) return path.empty() ? nullptr : path.c_str();
+    tried = true;
+
+    std::ifstream maps("/proc/self/maps");
+    std::string line;
+    while (std::getline(maps, line)) {
+        if (line.find("libopenblas") == std::string::npos ||
+            line.find(".so")         == std::string::npos) continue;
+        // The pathname is the only maps field containing '/'. Starting at its
+        // first '/' keeps directory names that contain spaces in one piece.
+        const auto start = line.find('/');
+        if (start == std::string::npos) continue;
+        path = line.substr(start);
+        // trim trailing whitespace / newline
+        while (!path.empty() && (path.back() == '\n' || path.back() == '\r'
+                                 || path.back() == ' '))
+            path.pop_back();
+        break;
+    }
+    return path.empty() ? nullptr : path.c_str();
+}
+
+inline void* resolve_blas(const char* sym) {  // address of `sym` in the discovered OpenBLAS, or nullptr
+    if (!g_blas_handle) {  // no handle yet: first call, or every earlier attempt failed
+        const char* path = find_openblas_path();
+        if (path) g_blas_handle = dlopen(path, RTLD_NOLOAD | RTLD_LAZY);  // RTLD_NOLOAD: get a handle only if the library is already resident
+    }
+    return g_blas_handle ? dlsym(g_blas_handle, sym) : nullptr;  // dlsym result is returned unchecked; may itself be nullptr
+}
+
+// ILP64 Fortran-style signatures (n, x, incx, y, incy): every integer argument is passed as const int64_t*.
+using sdot64_fn = float  (const int64_t*, const float*,  const int64_t*,
+                           const float*,  const int64_t*);
+using ddot64_fn = double (const int64_t*, const double*, const int64_t*,
+                           const double*, const int64_t*);
+
+inline float blas_sdot(const float* x, const float* y, size_t n) {  // float dot product over the first n elements of x and y
+    static auto fn = (sdot64_fn*)resolve_blas("sdot_64_");  // looked up once on first call; a failed lookup is not retried
+    if (__builtin_expect(fn != nullptr, 1)) {  // branch hint: the BLAS path is the expected one
+        const int64_t ni = static_cast<int64_t>(n), inc = 1;  // inc = 1: unit stride for both vectors
+        return fn(&ni, x, &inc, y, &inc);
+    }
+    // Fallback when sdot_64_ was not resolved: sequential accumulation in float
+    float r = 0.0f;
+    for (size_t i = 0; i < n; ++i) r += x[i] * y[i];
+    return r;
+}
+
+inline double blas_ddot(const double* x, const double* y, size_t n) {  // double dot product over the first n elements of x and y
+    static auto fn = (ddot64_fn*)resolve_blas("ddot_64_");  // looked up once on first call; a failed lookup is not retried
+    if (__builtin_expect(fn != nullptr, 1)) {  // branch hint: the BLAS path is the expected one
+        const int64_t ni = static_cast<int64_t>(n), inc = 1;  // inc = 1: unit stride for both vectors
+        return fn(&ni, x, &inc, y, &inc);
+    }
+    double r = 0.0;  // Fallback when ddot_64_ was not resolved: sequential accumulation in double
+    for (size_t i = 0; i < n; ++i) r += x[i] * y[i];
+    return r;
+}
+
+// Template dispatcher: blas_ops<T> forwards to the float or double wrapper; the primary template is declared but not defined.
+template<typename T> struct blas_ops;
+
+template<> struct blas_ops<float> {
+    static float  dot (const float*  x, const float*  y, size_t n) { return blas_sdot(x, y, n); }
+    static float  norm(const float*  x,                  size_t n) { return std::sqrt(blas_sdot(x, x, n)); }  // sqrt(dot(x, x))
+};
+template<> struct blas_ops<double> {
+    static double dot (const double* x, const double* y, size_t n) { return blas_ddot(x, y, n); }
+    static double norm(const double* x,                  size_t n) { return std::sqrt(blas_ddot(x, x, n)); }  // sqrt(dot(x, x))
+};
+
+} // namespace detail
+} // namespace numpy
diff --git a/numpy/core.h b/numpy/core.h
@@ -23,6 +23,7 @@
 #include <stdexcept>
 
 #include "svml_bridge.h"
+#include "blas_bridge.h"
 
 namespace numpy {
 
@@ -974,15 +975,12 @@ inline T norm_sq(const T* data, size_t n) {
     return result;
 }
 
-/// numpy.dot(a, b, out=None) — pairwise sum, matches np.sum(a*b)
+/// numpy.dot(a, b, out=None)
+/// Routes through OpenBLAS sdot_64_/ddot_64_ (auto-discovered via /proc/self/maps)
+/// for bit-exact match with np.dot(a, b) which calls BLAS internally.
 template<typename T>
 inline T dot(const T* a, const T* b, size_t n) {
-    T buf[NUMPY_SMALL_STACK];
-    T* prods = (n <= NUMPY_SMALL_STACK) ? buf : new T[n];
-    for (size_t i = 0; i < n; ++i) prods[i] = a[i] * b[i];
-    T result = pairwise_sum(prods, n);
-    if (n > NUMPY_SMALL_STACK) delete[] prods;
-    return result;
+    return detail::blas_ops<T>::dot(a, b, n);
 }
 
 /// numpy.linalg.norm(x, ord=None, axis=N, keepdims=False) — N-D
diff --git a/numpy/linalg.h b/numpy/linalg.h
@@ -9,13 +9,12 @@
 namespace numpy {
 namespace linalg {
 
-/// numpy.linalg.norm(x, ord=None, axis=None, keepdims=False) — frobenius/vector
-//  Uses norm_sq (pairwise sum) → matches np.sqrt(np.sum(x**2)).
-//  For float32, norm_sq() and sqrt() stay in float32.
+/// numpy.linalg.norm(x, ord=None, axis=None, keepdims=False) — vector / Frobenius
+//  np.linalg.norm(a) internally computes sqrt(a.dot(a)) via BLAS sdot/ddot.
+//  We call the same OpenBLAS routine (auto-discovered) for bit-exact match.
 template<typename T>
 inline T norm(const T* data, size_t n) {
-    T sqnorm = numpy::norm_sq(data, n);  // pairwise sum of squares
-    return std::sqrt(sqnorm);
+    return numpy::detail::blas_ops<T>::norm(data, n);
 }
 
 /// numpy.linalg.norm(x, ord=None, axis=N, keepdims=False) — N-D
diff --git a/tests/test_all.py b/tests/test_all.py
@@ -958,11 +958,13 @@ def test_to_vector_bool(cpp):
 
 def test_norm_1d(cpp, dtype):
     a = random_array((100,), dtype=dtype)
-    assert_bit_aligned(dtype(cpp.linalg.norm(a)), np.sqrt(np.sum(a * a)), "linalg.norm 1d")
+    # np.linalg.norm internally computes sqrt(a.dot(a)) via BLAS
+    assert_bit_aligned(dtype(cpp.linalg.norm(a)), dtype(np.linalg.norm(a)), "linalg.norm 1d")
 
 def test_norm_2d(cpp, dtype):
     a = random_array((5, 4), dtype=dtype)
-    assert_bit_aligned(dtype(cpp.linalg.norm(a)), np.sqrt(np.sum(a * a)), "linalg.norm 2d")
+    # Frobenius norm: same BLAS path as 1d
+    assert_bit_aligned(dtype(cpp.linalg.norm(a)), dtype(np.linalg.norm(a)), "linalg.norm 2d")
 
 def test_norm_zero(cpp, dtype):
     a = np.zeros((100,), dtype=dtype)
@@ -981,12 +983,13 @@ def test_norm_1d_fallback(cpp, dtype):
 def test_dot(cpp, dtype):
     a = random_array((5,), dtype=dtype)
     b = random_array((5,), seed=99, dtype=dtype)
-    assert_bit_aligned(cpp.dot(a, b), np.sum(a * b), "dot")
+    # np.dot routes through BLAS sdot/ddot
+    assert_bit_aligned(cpp.dot(a, b), np.dot(a, b), "dot")
 
 def test_dot_orthogonal(cpp, dtype):
     a = np.array([1.0, 0.0], dtype=dtype)
     b = np.array([0.0, 1.0], dtype=dtype)
-    assert_bit_aligned(cpp.dot(a, b), np.sum(a * b), "dot orthogonal")
+    assert_bit_aligned(cpp.dot(a, b), np.dot(a, b), "dot orthogonal")
 
 
 # ============================================================================