enforce: lock internal headers behind NUMPYCPP_INTERNAL_INCLUDE guard

peng.li24 · peng.li24 · commit aaa6322b3ad0 · 2026-06-06T01:18:29.000+08:00
All 4 arch/OS-specific implementation headers now cause a hard compile error
if included directly — external callers must only use #include "numpy/core.h".

Files locked:
  npy_math_float.h  — float32 numpy polynomial kernels (numpy-internal constants)
  svml_bridge.h     — SVML/npy scalar bridge (x86_64 + Linux only)
  blas_bridge.h     — OpenBLAS ILP64 bridge (x86_64 + Linux only)
  avx512_loops.h    — AVX-512F template specializations

Mechanism:
  core.h defines NUMPYCPP_INTERNAL_INCLUDE before pulling in the internal
  headers, then #undef-s it at the end so the macro cannot leak into the
  caller's translation unit.  Each internal header opens with:
    #ifndef NUMPYCPP_INTERNAL_INCLUDE
    #  error "... do not include directly. Use #include \"numpy/core.h\" instead."
    #endif
  This is a common pattern for API boundary enforcement in C/C++ headers
  (for example, GCC's intrinsic sub-headers such as avx512fintrin.h #error
  unless they are reached through <immintrin.h>).

core.h file comment updated to list all 4 locked headers and clarify:
  Public API:   namespace numpy::
  Internal API: namespace numpy::detail:: — DO NOT CALL DIRECTLY
diff --git a/numpy/avx512_loops.h b/numpy/avx512_loops.h
@@ -1,5 +1,13 @@
-// INTERNAL HEADER — included at the bottom of core.h, inside namespace numpy.
-// DO NOT include directly.
+// ╔══════════════════════════════════════════════════════════════════════════╗
+// ║  INTERNAL HEADER — DIRECT INCLUSION IS A COMPILE ERROR                 ║
+// ║                                                                          ║
+// ║  This file contains AVX-512 template specializations that override the  ║
+// ║  generic loops in core.h.  It is x86_64 + AVX-512F specific and must   ║
+// ║  be included INSIDE namespace numpy at the end of core.h — nowhere else.║
+// ║                                                                          ║
+// ║  ✗  #include "numpy/avx512_loops.h"     ← compile error                ║
+// ║  ✓  #include "numpy/core.h"             ← only correct entry point      ║
+// ╚══════════════════════════════════════════════════════════════════════════╝
 //
 // AVX-512 wide-loop specializations for array math functions.
 //
@@ -18,6 +26,12 @@
 //     Previously these called noinline helpers → 32768 call/returns per 524k array.
 
 #pragma once
+
+#ifndef NUMPYCPP_INTERNAL_INCLUDE
+#  error "avx512_loops.h is an internal header — do not include directly. \
+Use #include \"numpy/core.h\" instead."
+#endif
+
 #ifdef __AVX512F__
 #include <immintrin.h>
 
diff --git a/numpy/blas_bridge.h b/numpy/blas_bridge.h
@@ -1,5 +1,15 @@
-// INTERNAL HEADER — auto-included by core.h and linalg.h.
-// DO NOT include directly.
+// ╔══════════════════════════════════════════════════════════════════════════╗
+// ║  INTERNAL HEADER — DIRECT INCLUSION IS A COMPILE ERROR                 ║
+// ║                                                                          ║
+// ║  This file wraps OpenBLAS ILP64 (Linux x86_64 only) via dlsym/dlopen.  ║
+// ║  All symbols live in numpy::detail — an implementation namespace that   ║
+// ║  external code must never reference.                                     ║
+// ║                                                                          ║
+// ║  ✗  #include "numpy/blas_bridge.h"      ← compile error                ║
+// ║  ✗  numpy::detail::blas_sdot(...)       ← unsupported internal API     ║
+// ║  ✓  #include "numpy/core.h"             ← only correct entry point      ║
+// ║  ✓  numpy::dot(a, b, n)                 ← public API                    ║
+// ╚══════════════════════════════════════════════════════════════════════════╝
 //
 // BLAS bridge — bit-exact dot/norm vs numpy's OpenBLAS-backed np.dot /
 // np.linalg.norm (without axis).
@@ -22,6 +32,11 @@
 
 #pragma once
 
+#ifndef NUMPYCPP_INTERNAL_INCLUDE
+#  error "blas_bridge.h is an internal header — do not include directly. \
+Use #include \"numpy/core.h\" instead."
+#endif
+
 #include <cstdint>
 #include <cmath>
 #include <dlfcn.h>
diff --git a/numpy/core.h b/numpy/core.h
@@ -1,16 +1,29 @@
-// Native C++ implementations — zero pybind11 dependency.
-// All functions operate on raw pointers + sizes.
+// ════════════════════════════════════════════════════════════════════════════
+//  numpycpp — public C++ API  (zero pybind11 dependency)
+//  The ONLY header external code should include:
 //
-// Usable by any C++ project via #include "numpy/core.h"
+//      #include "numpy/core.h"
 //
-// Convention: each function is annotated with its Python numpy equivalent,
-// e.g. /// numpy.sqrt(x, /, out=None, *, where=True, ...)
+//  Public namespace:   numpy::           e.g.  numpy::exp(src, dst, n)
+//  Internal namespace: numpy::detail::   ← DO NOT CALL DIRECTLY
 //
-// Acceleration (安全优化，保持 bit-exact 对齐):
-//   - Loop unrolling (4x) for element-wise functions
-//   - Stack allocation for small buffers (n ≤ 128)
-//   - Reusable fiber buffer in axis reductions
-//   - Fused multiply-accumulate in norm_sq/dot
+//  The four internal headers pulled in below are LOCKED behind
+//  NUMPYCPP_INTERNAL_INCLUDE and will cause a #error if included directly:
+//    • svml_bridge.h   — SVML/npy scalar bridge  (x86_64 + Linux)
+//    • blas_bridge.h   — OpenBLAS ILP64 bridge   (x86_64 + Linux)
+//    • npy_math_float.h— float32 poly kernels    (numpy internal constants)
+//    • avx512_loops.h  — AVX-512 specializations (compiled only if __AVX512F__ is defined)
+//
+//  All functions operate on raw pointers + sizes.
+//  Each function is annotated with its Python numpy equivalent,
+//  e.g. /// numpy.sqrt(x, /, out=None, *, where=True, ...)
+//
+//  Acceleration (safe optimizations that keep results bit-exact):
+//    - Loop unrolling (4x) for element-wise functions
+//    - Stack allocation for small buffers (n ≤ 128)
+//    - Reusable fiber buffer in axis reductions
+//    - Fused multiply-accumulate in norm_sq/dot
+// ════════════════════════════════════════════════════════════════════════════
 
 #pragma once
 
@@ -22,8 +35,15 @@
 #include <cstddef>
 #include <stdexcept>
 
-#include "svml_bridge.h"
-#include "blas_bridge.h"
+// ── Internal headers ─────────────────────────────────────────────────────────
+// These files contain arch/OS-specific implementations (SVML/AVX-512/BLAS/npy).
+// They MUST NOT be included directly by external code.
+// The macro below is the compile-time lock; it is #undef-ed at the end of this
+// file so it cannot "leak" into translation units that include core.h.
+#define NUMPYCPP_INTERNAL_INCLUDE
+#include "svml_bridge.h"   // numpy::detail::{exp,log,sin,...}_f32/f64 — SVML/npy
+#include "blas_bridge.h"   // numpy::detail::blas_ops<T> — OpenBLAS ILP64
+// avx512_loops.h is included just before namespace numpy closes (end of this file); it is guarded too.
 
 namespace numpy {
 
@@ -1003,4 +1023,7 @@ inline void norm_axis(const T* src, T* dst, const ptrdiff_t* shape, int ndim, in
 // ============================================================================
 #include "avx512_loops.h"
 
+// Release the internal-include lock so it does not pollute the includer's TU.
+#undef NUMPYCPP_INTERNAL_INCLUDE
+
 } // namespace numpy
diff --git a/numpy/npy_math_float.h b/numpy/npy_math_float.h
@@ -1,14 +1,21 @@
-// INTERNAL HEADER — DO NOT INCLUDE DIRECTLY.
-// Use #include "numpy/core.h" which pulls this in automatically.
-//
-// All functions live in numpy::detail — do not call directly.
-// Use numpy::exp() etc. from core.h.
-//
-// Bit-exact float32 math matching numpy 1.23.5 SIMD polynomial approximations.
-// Replicates numpy's simd_exp_FLOAT, simd_log_FLOAT, simd_sincos_f32 algorithms.
+// ╔══════════════════════════════════════════════════════════════════════════╗
+// ║  INTERNAL HEADER — DIRECT INCLUSION IS A COMPILE ERROR                 ║
+// ║                                                                          ║
+// ║  This file implements arch/OS-specific float32 polynomial kernels that  ║
+// ║  are tied to numpy's internal SIMD constants.  The API is UNSTABLE and  ║
+// ║  subject to change without notice.                                       ║
+// ║                                                                          ║
+// ║  ✗  #include "numpy/npy_math_float.h"   ← compile error                ║
+// ║  ✓  #include "numpy/core.h"             ← only correct entry point      ║
+// ╚══════════════════════════════════════════════════════════════════════════╝
 
 #pragma once
 
+#ifndef NUMPYCPP_INTERNAL_INCLUDE
+#  error "npy_math_float.h is an internal header — do not include directly. \
+Use #include \"numpy/core.h\" instead."
+#endif
+
 #include <cstdint>
 #include <cstring>
 #include <cmath>
diff --git a/numpy/svml_bridge.h b/numpy/svml_bridge.h
@@ -1,23 +1,31 @@
-// INTERNAL HEADER — DO NOT INCLUDE DIRECTLY.
-// Use #include "numpy/core.h" which pulls this in automatically.
-//
-// All functions live in numpy::detail — do not call numpy::detail::exp()
-// directly. Use numpy::exp() from core.h.
+// ╔══════════════════════════════════════════════════════════════════════════╗
+// ║  INTERNAL HEADER — DIRECT INCLUSION IS A COMPILE ERROR                 ║
+// ║                                                                          ║
+// ║  This file bridges numpycpp to numpy's SVML / npy_* scalar kernels.    ║
+// ║  It is x86_64 + Linux specific (dlsym, /proc/self/maps, AVX-512).      ║
+// ║  All symbols live in numpy::detail — an implementation namespace that   ║
+// ║  external code must never reference.                                     ║
+// ║                                                                          ║
+// ║  ✗  #include "numpy/svml_bridge.h"      ← compile error                ║
+// ║  ✗  numpy::detail::exp_svml_f64(x)      ← unsupported internal API     ║
+// ║  ✓  #include "numpy/core.h"             ← only correct entry point      ║
+// ║  ✓  numpy::exp(src, dst, n)             ← public API                    ║
+// ╚══════════════════════════════════════════════════════════════════════════╝
 //
 // SVML/npy bridge — bit-exact math on every x86_64 architecture.
-//
-// numpy uses different math implementations depending on CPU features:
-//   AVX-512 HW → __svml_exp8 (SVML vector) → resolves via dlsym
-//   non-AVX-512 → npy_exp (scalar)        → resolves via dlsym
-//
-// This header detects CPU features at RUNTIME and selects the matching path.
-// AVX-512 intrinsics are isolated behind __attribute__((target("avx512f")))
-// so the binary is safe on non-AVX-512 CPUs — no SIGILL.
-//
+//   AVX-512 HW → __svml_exp8 (SVML vector) → resolved via dlsym
+//   non-AVX-512 → npy_exp (scalar)         → resolved via dlsym
+// CPU feature detection is at RUNTIME; AVX-512 intrinsics are isolated behind
+// __attribute__((target("avx512f"))) — safe on non-AVX-512 CPUs (no SIGILL).
 // The .so path is auto-discovered via /proc/self/maps — no manual init needed.
 
 #pragma once
 
+#ifndef NUMPYCPP_INTERNAL_INCLUDE
+#  error "svml_bridge.h is an internal header — do not include directly. \
+Use #include \"numpy/core.h\" instead."
+#endif
+
 #include <cmath>
 #include <cstdio>
 #include <dlfcn.h>