@@ -53,272 +53,144 @@ inline void full_like(T* dst, size_t n, T fill_value) {
5353
5454// ============================================================================
5555// Element-wise math — T in → T out
56- // All unrolled 4x to reduce loop overhead on small arrays.
57- // ============================================================================
56+ //
57+ // NUMPY_UNROLL4 macro: 4x loop unrolling to reduce branch overhead.
58+ // All calls are inlined → identical codegen to hand-written unrolled loops.
59+ // ============================================================================
60+
61+ #define NUMPY_UNROLL4 (dst_i, body ) \
62+ do { size_t _i = 0 ; \
63+ for (; _i + 3 < n; _i += 4 ) { \
64+ size_t dst_i = _i + 0 ; body; \
65+ dst_i = _i + 1 ; body; \
66+ dst_i = _i + 2 ; body; \
67+ dst_i = _i + 3 ; body; \
68+ } \
69+ for (; _i < n; ++_i) { \
70+ size_t dst_i = _i; body; \
71+ } \
72+ } while (0 )
5873
5974// / numpy.sqrt(x, /, out=None, *, where=True, ...)
6075template <typename T>
6176inline void sqrt (const T* src, T* dst, size_t n) {
62- size_t i = 0 ;
63- for (; i + 3 < n; i += 4 ) {
64- dst[i+0 ] = std::sqrt (src[i+0 ]);
65- dst[i+1 ] = std::sqrt (src[i+1 ]);
66- dst[i+2 ] = std::sqrt (src[i+2 ]);
67- dst[i+3 ] = std::sqrt (src[i+3 ]);
68- }
69- for (; i < n; ++i) dst[i] = std::sqrt (src[i]);
77+ NUMPY_UNROLL4 (i, dst[i] = std::sqrt (src[i]));
7078}
7179
7280// / numpy.abs(x, /, out=None, *, where=True, ...)
7381template <typename T>
7482inline void abs (const T* src, T* dst, size_t n) {
75- size_t i = 0 ;
76- for (; i + 3 < n; i += 4 ) {
77- dst[i+0 ] = std::abs (src[i+0 ]);
78- dst[i+1 ] = std::abs (src[i+1 ]);
79- dst[i+2 ] = std::abs (src[i+2 ]);
80- dst[i+3 ] = std::abs (src[i+3 ]);
81- }
82- for (; i < n; ++i) dst[i] = std::abs (src[i]);
83+ NUMPY_UNROLL4 (i, dst[i] = std::abs (src[i]));
8384}
8485
8586// / numpy.exp(x, /, out=None, *, where=True, ...)
8687template <typename T>
8788inline void exp (const T* src, T* dst, size_t n) {
88- size_t i = 0 ;
89- for (; i + 3 < n; i += 4 ) {
90- dst[i+0 ] = svml::exp (src[i+0 ]);
91- dst[i+1 ] = svml::exp (src[i+1 ]);
92- dst[i+2 ] = svml::exp (src[i+2 ]);
93- dst[i+3 ] = svml::exp (src[i+3 ]);
94- }
95- for (; i < n; ++i) dst[i] = svml::exp (src[i]);
89+ NUMPY_UNROLL4 (i, dst[i] = svml::exp (src[i]));
9690}
9791
9892// / numpy.log(x, /, out=None, *, where=True, ...)
9993template <typename T>
10094inline void log (const T* src, T* dst, size_t n) {
101- size_t i = 0 ;
102- for (; i + 3 < n; i += 4 ) {
103- dst[i+0 ] = svml::log (src[i+0 ]);
104- dst[i+1 ] = svml::log (src[i+1 ]);
105- dst[i+2 ] = svml::log (src[i+2 ]);
106- dst[i+3 ] = svml::log (src[i+3 ]);
107- }
108- for (; i < n; ++i) dst[i] = svml::log (src[i]);
95+ NUMPY_UNROLL4 (i, dst[i] = svml::log (src[i]));
10996}
11097
11198// / numpy.sin(x, /, out=None, *, where=True, ...)
11299template <typename T>
113100inline void sin (const T* src, T* dst, size_t n) {
114- size_t i = 0 ;
115- for (; i + 3 < n; i += 4 ) {
116- dst[i+0 ] = svml::sin (src[i+0 ]);
117- dst[i+1 ] = svml::sin (src[i+1 ]);
118- dst[i+2 ] = svml::sin (src[i+2 ]);
119- dst[i+3 ] = svml::sin (src[i+3 ]);
120- }
121- for (; i < n; ++i) dst[i] = svml::sin (src[i]);
101+ NUMPY_UNROLL4 (i, dst[i] = svml::sin (src[i]));
122102}
123103
124104// / numpy.cos(x, /, out=None, *, where=True, ...)
125105template <typename T>
126106inline void cos (const T* src, T* dst, size_t n) {
127- size_t i = 0 ;
128- for (; i + 3 < n; i += 4 ) {
129- dst[i+0 ] = svml::cos (src[i+0 ]);
130- dst[i+1 ] = svml::cos (src[i+1 ]);
131- dst[i+2 ] = svml::cos (src[i+2 ]);
132- dst[i+3 ] = svml::cos (src[i+3 ]);
133- }
134- for (; i < n; ++i) dst[i] = svml::cos (src[i]);
107+ NUMPY_UNROLL4 (i, dst[i] = svml::cos (src[i]));
135108}
136109
137110// / numpy.tan(x, /, out=None, *, where=True, ...)
138111template <typename T>
139112inline void tan (const T* src, T* dst, size_t n) {
140- size_t i = 0 ;
141- for (; i + 3 < n; i += 4 ) {
142- dst[i+0 ] = svml::tan (src[i+0 ]);
143- dst[i+1 ] = svml::tan (src[i+1 ]);
144- dst[i+2 ] = svml::tan (src[i+2 ]);
145- dst[i+3 ] = svml::tan (src[i+3 ]);
146- }
147- for (; i < n; ++i) dst[i] = svml::tan (src[i]);
113+ NUMPY_UNROLL4 (i, dst[i] = svml::tan (src[i]));
148114}
149115
150116// / numpy.power(x1, x2, /, out=None, *, where=True, ...)
151117template <typename T>
152118inline void power (const T* src, T* dst, size_t n, T exponent) {
153- size_t i = 0 ;
154- for (; i + 3 < n; i += 4 ) {
155- dst[i+0 ] = svml::pow (src[i+0 ], exponent);
156- dst[i+1 ] = svml::pow (src[i+1 ], exponent);
157- dst[i+2 ] = svml::pow (src[i+2 ], exponent);
158- dst[i+3 ] = svml::pow (src[i+3 ], exponent);
159- }
160- for (; i < n; ++i) dst[i] = svml::pow (src[i], exponent);
119+ NUMPY_UNROLL4 (i, dst[i] = svml::pow (src[i], exponent));
161120}
162121
163122// / numpy.clip(a, a_min, a_max, out=None, **kwargs)
164123template <typename T>
165124inline void clip (const T* src, T* dst, size_t n, T min_val, T max_val) {
166- size_t i = 0 ;
167- for (; i + 3 < n; i += 4 ) {
168- dst[i+0 ] = std::max (min_val, std::min (max_val, src[i+0 ]));
169- dst[i+1 ] = std::max (min_val, std::min (max_val, src[i+1 ]));
170- dst[i+2 ] = std::max (min_val, std::min (max_val, src[i+2 ]));
171- dst[i+3 ] = std::max (min_val, std::min (max_val, src[i+3 ]));
172- }
173- for (; i < n; ++i)
174- dst[i] = std::max (min_val, std::min (max_val, src[i]));
125+ NUMPY_UNROLL4 (i, dst[i] = std::max (min_val, std::min (max_val, src[i])));
175126}
176127
177128// / numpy.log10(x, /, out=None, *, where=True, ...)
178129template <typename T>
179130inline void log10 (const T* src, T* dst, size_t n) {
180- size_t i = 0 ;
181- for (; i + 3 < n; i += 4 ) {
182- dst[i+0 ] = svml::log10 (src[i+0 ]);
183- dst[i+1 ] = svml::log10 (src[i+1 ]);
184- dst[i+2 ] = svml::log10 (src[i+2 ]);
185- dst[i+3 ] = svml::log10 (src[i+3 ]);
186- }
187- for (; i < n; ++i) dst[i] = svml::log10 (src[i]);
131+ NUMPY_UNROLL4 (i, dst[i] = svml::log10 (src[i]));
188132}
189133
190134// / numpy.log2(x, /, out=None, *, where=True, ...)
191135template <typename T>
192136inline void log2 (const T* src, T* dst, size_t n) {
193- size_t i = 0 ;
194- for (; i + 3 < n; i += 4 ) {
195- dst[i+0 ] = svml::log2 (src[i+0 ]);
196- dst[i+1 ] = svml::log2 (src[i+1 ]);
197- dst[i+2 ] = svml::log2 (src[i+2 ]);
198- dst[i+3 ] = svml::log2 (src[i+3 ]);
199- }
200- for (; i < n; ++i) dst[i] = svml::log2 (src[i]);
137+ NUMPY_UNROLL4 (i, dst[i] = svml::log2 (src[i]));
201138}
202139
203140// / numpy.arcsin(x, /, out=None, *, where=True, ...)
204141template <typename T>
205142inline void arcsin (const T* src, T* dst, size_t n) {
206- size_t i = 0 ;
207- for (; i + 3 < n; i += 4 ) {
208- dst[i+0 ] = svml::asin (src[i+0 ]);
209- dst[i+1 ] = svml::asin (src[i+1 ]);
210- dst[i+2 ] = svml::asin (src[i+2 ]);
211- dst[i+3 ] = svml::asin (src[i+3 ]);
212- }
213- for (; i < n; ++i) dst[i] = svml::asin (src[i]);
143+ NUMPY_UNROLL4 (i, dst[i] = svml::asin (src[i]));
214144}
215145
216146// / numpy.arccos(x, /, out=None, *, where=True, ...)
217147template <typename T>
218148inline void arccos (const T* src, T* dst, size_t n) {
219- size_t i = 0 ;
220- for (; i + 3 < n; i += 4 ) {
221- dst[i+0 ] = svml::acos (src[i+0 ]);
222- dst[i+1 ] = svml::acos (src[i+1 ]);
223- dst[i+2 ] = svml::acos (src[i+2 ]);
224- dst[i+3 ] = svml::acos (src[i+3 ]);
225- }
226- for (; i < n; ++i) dst[i] = svml::acos (src[i]);
149+ NUMPY_UNROLL4 (i, dst[i] = svml::acos (src[i]));
227150}
228151
229152// / numpy.arctan(x, /, out=None, *, where=True, ...)
230153template <typename T>
231154inline void arctan (const T* src, T* dst, size_t n) {
232- size_t i = 0 ;
233- for (; i + 3 < n; i += 4 ) {
234- dst[i+0 ] = svml::atan (src[i+0 ]);
235- dst[i+1 ] = svml::atan (src[i+1 ]);
236- dst[i+2 ] = svml::atan (src[i+2 ]);
237- dst[i+3 ] = svml::atan (src[i+3 ]);
238- }
239- for (; i < n; ++i) dst[i] = svml::atan (src[i]);
155+ NUMPY_UNROLL4 (i, dst[i] = svml::atan (src[i]));
240156}
241157
242158// / numpy.round(a, decimals=0, out=None)
243159template <typename T>
244160inline void round (const T* src, T* dst, size_t n) {
245- size_t i = 0 ;
246- for (; i + 3 < n; i += 4 ) {
247- dst[i+0 ] = std::round (src[i+0 ]);
248- dst[i+1 ] = std::round (src[i+1 ]);
249- dst[i+2 ] = std::round (src[i+2 ]);
250- dst[i+3 ] = std::round (src[i+3 ]);
251- }
252- for (; i < n; ++i) dst[i] = std::round (src[i]);
161+ NUMPY_UNROLL4 (i, dst[i] = std::round (src[i]));
253162}
254163
255164// / numpy.floor(x, /, out=None, *, where=True, ...)
256165template <typename T>
257166inline void floor (const T* src, T* dst, size_t n) {
258- size_t i = 0 ;
259- for (; i + 3 < n; i += 4 ) {
260- dst[i+0 ] = std::floor (src[i+0 ]);
261- dst[i+1 ] = std::floor (src[i+1 ]);
262- dst[i+2 ] = std::floor (src[i+2 ]);
263- dst[i+3 ] = std::floor (src[i+3 ]);
264- }
265- for (; i < n; ++i) dst[i] = std::floor (src[i]);
167+ NUMPY_UNROLL4 (i, dst[i] = std::floor (src[i]));
266168}
267169
268170// / numpy.ceil(x, /, out=None, *, where=True, ...)
269171template <typename T>
270172inline void ceil (const T* src, T* dst, size_t n) {
271- size_t i = 0 ;
272- for (; i + 3 < n; i += 4 ) {
273- dst[i+0 ] = std::ceil (src[i+0 ]);
274- dst[i+1 ] = std::ceil (src[i+1 ]);
275- dst[i+2 ] = std::ceil (src[i+2 ]);
276- dst[i+3 ] = std::ceil (src[i+3 ]);
277- }
278- for (; i < n; ++i) dst[i] = std::ceil (src[i]);
173+ NUMPY_UNROLL4 (i, dst[i] = std::ceil (src[i]));
279174}
280175
281176// / numpy.degrees(x, /, out=None, *, where=True, ...)
282- // Must use native-type division to match numpy's compute path:
283- // numpy does float32(180.0) / float32(pi), NOT float32(double(180) / double(pi)).
284177template <typename T>
285178inline void degrees (const T* src, T* dst, size_t n) {
286179 T factor = T (180.0 ) / T (M_PI );
287- size_t i = 0 ;
288- for (; i + 3 < n; i += 4 ) {
289- dst[i+0 ] = src[i+0 ] * factor;
290- dst[i+1 ] = src[i+1 ] * factor;
291- dst[i+2 ] = src[i+2 ] * factor;
292- dst[i+3 ] = src[i+3 ] * factor;
293- }
294- for (; i < n; ++i) dst[i] = src[i] * factor;
180+ NUMPY_UNROLL4 (i, dst[i] = src[i] * factor);
295181}
296182
297183// / numpy.radians(x, /, out=None, *, where=True, ...)
298184template <typename T>
299185inline void radians (const T* src, T* dst, size_t n) {
300186 T factor = T (M_PI ) / T (180.0 );
301- size_t i = 0 ;
302- for (; i + 3 < n; i += 4 ) {
303- dst[i+0 ] = src[i+0 ] * factor;
304- dst[i+1 ] = src[i+1 ] * factor;
305- dst[i+2 ] = src[i+2 ] * factor;
306- dst[i+3 ] = src[i+3 ] * factor;
307- }
308- for (; i < n; ++i) dst[i] = src[i] * factor;
187+ NUMPY_UNROLL4 (i, dst[i] = src[i] * factor);
309188}
310189
311190// / numpy.sign(x, /, out=None, *, where=True, ...)
312191template <typename T>
313192inline void sign (const T* src, T* dst, size_t n) {
314- size_t i = 0 ;
315- for (; i + 3 < n; i += 4 ) {
316- dst[i+0 ] = T ((src[i+0 ] > T (0 )) - (src[i+0 ] < T (0 )));
317- dst[i+1 ] = T ((src[i+1 ] > T (0 )) - (src[i+1 ] < T (0 )));
318- dst[i+2 ] = T ((src[i+2 ] > T (0 )) - (src[i+2 ] < T (0 )));
319- dst[i+3 ] = T ((src[i+3 ] > T (0 )) - (src[i+3 ] < T (0 )));
320- }
321- for (; i < n; ++i) dst[i] = T ((src[i] > T (0 )) - (src[i] < T (0 )));
193+ NUMPY_UNROLL4 (i, dst[i] = T ((src[i] > T (0 )) - (src[i] < T (0 ))));
322194}
323195
324196// ============================================================================
@@ -931,43 +803,26 @@ inline void mean_axis(const T* src, T* dst, const ptrdiff_t* shape, int ndim, in
931803// norm, dot — used by linalg
932804// ============================================================================
933805
934- // / squared L2 norm (internal helper for linalg.norm)
935- // Stack allocation for n ≤ NUMPY_SMALL_STACK (128).
806+ // / squared L2 norm / dot — stack allocation for n ≤ 128
936807template <typename T>
937808inline T norm_sq (const T* data, size_t n) {
938- T stack_buf[NUMPY_SMALL_STACK ];
939- T* squares;
940- std::vector<T> heap_buf;
941- if (n <= NUMPY_SMALL_STACK ) {
942- squares = stack_buf;
943- } else {
944- heap_buf.resize (n);
945- squares = heap_buf.data ();
946- }
947- for (size_t i = 0 ; i < n; ++i) {
948- T v = data[i];
949- squares[i] = v * v;
950- }
951- return pairwise_sum (squares, n);
809+ T buf[NUMPY_SMALL_STACK ];
810+ T* squares = (n <= NUMPY_SMALL_STACK ) ? buf : new T[n];
811+ for (size_t i = 0 ; i < n; ++i) squares[i] = data[i] * data[i];
812+ T result = pairwise_sum (squares, n);
813+ if (n > NUMPY_SMALL_STACK ) delete[] squares;
814+ return result;
952815}
953816
954- // / numpy.dot(a, b, out=None) — 1D vector dot product (pairwise sum)
955- // Matches numpy's np.sum(a * b) bit-exactly.
956- // Stack allocation for n ≤ NUMPY_SMALL_STACK (128).
817+ // / numpy.dot(a, b, out=None) — pairwise sum, matches np.sum(a*b)
957818template <typename T>
958819inline T dot (const T* a, const T* b, size_t n) {
959- T stack_buf[NUMPY_SMALL_STACK ];
960- T* products;
961- std::vector<T> heap_buf;
962- if (n <= NUMPY_SMALL_STACK ) {
963- products = stack_buf;
964- } else {
965- heap_buf.resize (n);
966- products = heap_buf.data ();
967- }
968- for (size_t i = 0 ; i < n; ++i)
969- products[i] = a[i] * b[i];
970- return pairwise_sum (products, n);
820+ T buf[NUMPY_SMALL_STACK ];
821+ T* prods = (n <= NUMPY_SMALL_STACK ) ? buf : new T[n];
822+ for (size_t i = 0 ; i < n; ++i) prods[i] = a[i] * b[i];
823+ T result = pairwise_sum (prods, n);
824+ if (n > NUMPY_SMALL_STACK ) delete[] prods;
825+ return result;
971826}
972827
973828// / numpy.linalg.norm(x, ord=None, axis=N, keepdims=False) — N-D
0 commit comments