|
1 | | -// Native C++ implementations — zero pybind11 dependency. |
2 | | -// All functions operate on raw pointers + sizes. |
| 1 | +// ════════════════════════════════════════════════════════════════════════════ |
| 2 | +// numpycpp — public C++ API (zero pybind11 dependency) |
| 3 | +// The ONLY header external code should include: |
3 | 4 | // |
4 | | -// Usable by any C++ project via #include "numpy/core.h" |
| 5 | +// #include "numpy/core.h" |
5 | 6 | // |
6 | | -// Convention: each function is annotated with its Python numpy equivalent, |
7 | | -// e.g. /// numpy.sqrt(x, /, out=None, *, where=True, ...) |
| 7 | +// Public namespace: numpy:: e.g. numpy::exp(src, dst, n) |
| 8 | +// Internal namespace: numpy::detail:: ← DO NOT CALL DIRECTLY |
8 | 9 | // |
9 | | -// Acceleration (安全优化,保持 bit-exact 对齐): |
10 | | -// - Loop unrolling (4x) for element-wise functions |
11 | | -// - Stack allocation for small buffers (n ≤ 128) |
12 | | -// - Reusable fiber buffer in axis reductions |
13 | | -// - Fused multiply-accumulate in norm_sq/dot |
| 10 | +// The four internal headers pulled in below are LOCKED behind |
| 11 | +// NUMPYCPP_INTERNAL_INCLUDE and will cause a #error if included directly: |
| 12 | +// • svml_bridge.h    — SVML/npy scalar bridge (x86_64 + Linux) |
| 13 | +// • blas_bridge.h    — OpenBLAS ILP64 bridge (x86_64 + Linux) |
| 14 | +// • npy_math_float.h — float32 poly kernels (numpy internal constants) |
| 15 | +// • avx512_loops.h   — AVX-512 specializations (requires AVX-512F CPU) |
| 16 | +// |
| 17 | +// All functions operate on raw pointers + sizes. |
| 18 | +// Each function is annotated with its Python numpy equivalent, |
| 19 | +// e.g. /// numpy.sqrt(x, /, out=None, *, where=True, ...) |
| 20 | +// |
| 21 | +// Acceleration (safe optimizations only; bit-exact parity is preserved): |
| 22 | +// - Loop unrolling (4x) for element-wise functions |
| 23 | +// - Stack allocation for small buffers (n ≤ 128) |
| 24 | +// - Reusable fiber buffer in axis reductions |
| 25 | +// - Fused multiply-accumulate in norm_sq/dot |
| 26 | +// ════════════════════════════════════════════════════════════════════════════ |
14 | 27 |
|
15 | 28 | #pragma once |
16 | 29 |
|
|
22 | 35 | #include <cstddef> |
23 | 36 | #include <stdexcept> |
24 | 37 |
|
25 | | -#include "svml_bridge.h" |
26 | | -#include "blas_bridge.h" |
| 38 | +// ── Internal headers ───────────────────────────────────────────────────────── |
| 39 | +// These files contain arch/OS-specific implementations (SVML/AVX-512/BLAS/npy). |
| 40 | +// They MUST NOT be included directly by external code. |
| 41 | +// The macro below is the compile-time lock; it is #undef-ed at the end of this |
| 42 | +// file so it cannot "leak" into translation units that include core.h. |
| 43 | +#define NUMPYCPP_INTERNAL_INCLUDE |
| 44 | +#include "svml_bridge.h" // numpy::detail::{exp,log,sin,...}_f32/f64 — SVML/npy |
| 45 | +#include "blas_bridge.h" // numpy::detail::blas_ops<T> — OpenBLAS ILP64 |
| 46 | +// avx512_loops.h is included just before namespace numpy closes (line ~1024); it is guarded by the same lock.
27 | 47 |
|
28 | 48 | namespace numpy { |
29 | 49 |
|
@@ -1003,4 +1023,7 @@ inline void norm_axis(const T* src, T* dst, const ptrdiff_t* shape, int ndim, in |
1003 | 1023 | // ============================================================================ |
1004 | 1024 | #include "avx512_loops.h" |
1005 | 1025 |
|
| 1026 | +// Release the internal-include lock so it does not pollute the includer's TU. |
| 1027 | +#undef NUMPYCPP_INTERNAL_INCLUDE |
| 1028 | + |
1006 | 1029 | } // namespace numpy |
0 commit comments